Parser.java

// Copyright (c) 2023 Tobias Briones. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause
// This file is part of https://github.com/tobiasbriones/blog

package engineer.mathsoftware.blog.slides.lang;

import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * Splits source code into tokens and classifies single tokens as string
 * literals, comments, types, keywords, symbols, numbers, or other text.
 *
 * @param <K> enum whose constant names, in lower case, are the keywords
 *            recognized by {@link #parseToken(String)}
 */
public class Parser<K extends Enum<?>> {
    // A quoted run: opening quote, content free of quotes, closing quote.
    // The opening and closing quotes are not required to be of the same kind.
    private static final Pattern STRING_PATTERN
        = Pattern
        .compile("(['\"])([^'\"]*)(['\"])");

    // "//" up to the end of the line. The line terminator belongs to the
    // match, and "\z" additionally accepts a comment that closes the input
    // without a trailing line terminator.
    private static final Pattern SINGLE_LINE_COMMENT_PATTERN
        = Pattern
        .compile("(//)(.*)(\\r\\n|\\r|\\n|\\z)");

    private static final String PASCAL_CASE_GROUP_REGEX
        = "([A-Z][a-zA-Z0-9]*)";

    // Matches a token that is, as a whole, a PascalCase identifier.
    private static final Pattern PASCAL_CASE_PATTERN
        = Pattern
        .compile(PASCAL_CASE_GROUP_REGEX);

    // A PascalCase identifier together with the delimiter right before it and
    // the delimiter right after it (three groups).
    private static final Pattern PASCAL_CASE_TYPE_PATTERN
        = Pattern
        .compile("([ (.,:]{1}|:{2}|::)"
            + PASCAL_CASE_GROUP_REGEX
            + "([ (.,:\\[]{1}|:{2}|::)"
        );

    private static final Pattern SYMBOL_PATTERN
        = Pattern
        .compile("[,=+\\-*/;&|?:!<>]+");

    private static final Pattern NUMBER_PATTERN
        = Pattern
        .compile("-?\\d+(\\.\\d+)?");

    private static final List<String> DELIMITERS = List.of(
        "\n",
        " ",
        ";",
        "=",
        "+",
        "-",
        "*",
        "/",
        ",",
        ":",
        "?",
        "(",
        ")",
        "<",
        ">",
        "[",
        "]",
        "."
    );

    // Zero-width boundaries right after and right before every delimiter, so
    // that splitting keeps each delimiter as a token of its own.
    private static final Pattern DELIMITER_BOUNDARY_PATTERN
        = delimiterBoundaryPattern();

    private final Map<String, K> keywordMap;

    /**
     * Creates a parser whose keywords are the lower-case names of the
     * constants of the given enum type.
     *
     * @param keywordType enum class that provides the keywords
     * @throws NullPointerException if {@code keywordType} is {@code null}
     */
    public Parser(Class<K> keywordType) {
        Objects.requireNonNull(keywordType, "keywordType");

        var keywords = keywordType.getEnumConstants();
        keywordMap = new HashMap<>(keywords.length);

        for (var keyword : keywords) {
            var key = keyword.name().toLowerCase(Locale.ROOT);
            keywordMap.put(key, keyword);
        }
    }

    /**
     * Splits the given code into tokens in four passes: quoted strings are
     * isolated first, then single-line comments, then PascalCase type names
     * with their surrounding delimiters, and whatever remains is split around
     * the delimiter characters. Strings, comments, and type names found by an
     * earlier pass are never split again.
     *
     * <p>No character is dropped: concatenating the returned tokens in order
     * yields {@code code}. The list may contain empty tokens, e.g., when the
     * code begins or ends with a string.
     *
     * @param code source code to split
     * @return the tokens in order of appearance
     * @throws NullPointerException if {@code code} is {@code null}
     */
    public static List<String> tokens(String code) {
        Objects.requireNonNull(code, "code");

        var afterStrings = tokensEnclosedBy(STRING_PATTERN, code);
        var afterComments = new ArrayList<String>();

        for (var token : afterStrings) {
            if (isString(token)) {
                afterComments.add(token);
            }
            else {
                afterComments.addAll(
                    tokensEnclosedBy(SINGLE_LINE_COMMENT_PATTERN, token)
                );
            }
        }

        var afterTypes = new ArrayList<String>();

        for (var token : afterComments) {
            if (isString(token) || isComment(token)) {
                afterTypes.add(token);
            }
            else {
                afterTypes.addAll(
                    tokensSurroundedBy(PASCAL_CASE_TYPE_PATTERN, token)
                );
            }
        }

        var result = new ArrayList<String>();

        for (var token : afterTypes) {
            if (isString(token) || isComment(token) || isPascalCase(token)) {
                result.add(token);
            }
            else {
                Collections.addAll(
                    result,
                    DELIMITER_BOUNDARY_PATTERN.split(token)
                );
            }
        }
        return result;
    }

    /**
     * Classifies a token. Surrounding whitespace is ignored for the
     * classification, but the resulting element carries {@code value} as
     * given. The checks run in this order, and the first one that applies
     * wins: string literal, comment, PascalCase type, keyword, symbol, number;
     * anything else is classified as other.
     *
     * @param value token to classify
     * @return the parsing that wraps the element built from {@code value}
     */
    public Element.TokenParsing parseToken(String value) {
        var token = value.trim();

        if (isString(token)) {
            return new Element.TokenParsing(new Element.StringLiteral(value));
        }
        if (token.startsWith("//")) {
            return new Element.TokenParsing(new Element.Comment(value));
        }
        if (isPascalCase(token)) {
            return new Element.TokenParsing(new Element.Type(value));
        }
        if (keywordMap.containsKey(token)) {
            return new Element.TokenParsing(new Element.Keyword(value));
        }
        if (SYMBOL_PATTERN.matcher(token).matches()) {
            return new Element.TokenParsing(new Element.Symbol(value));
        }
        if (NUMBER_PATTERN.matcher(token).matches()) {
            return new Element.TokenParsing(new Element.Number(value));
        }
        return new Element.TokenParsing(new Element.Other(value));
    }

    // Builds the look-behind/look-ahead alternation over all the delimiters.
    private static Pattern delimiterBoundaryPattern() {
        var alternatives = DELIMITERS
            .stream()
            .map(Pattern::quote)
            .collect(Collectors.joining("|"));

        return Pattern.compile(
            "(?<=" + alternatives + ")|(?=" + alternatives + ")"
        );
    }

    // Tells whether the token contains a quoted string.
    private static boolean isString(String token) {
        return STRING_PATTERN.matcher(token).find();
    }

    // Tells whether the token contains a single-line comment.
    private static boolean isComment(String token) {
        return SINGLE_LINE_COMMENT_PATTERN.matcher(token).find();
    }

    // Tells whether the whole token is a PascalCase identifier.
    private static boolean isPascalCase(String token) {
        return PASCAL_CASE_PATTERN.matcher(token).matches();
    }

    /**
     * Splits the code so that every match of the pattern becomes one token
     * and the text between matches becomes the tokens in between. The pattern
     * must define three groups (opening, content, closing) that together
     * cover the whole match.
     *
     * @param pattern three-group pattern of the enclosed construct
     * @param code    text to split
     * @return the text before each match followed by the match, and lastly
     *     the text after the last match (each of them possibly empty)
     */
    private static List<String> tokensEnclosedBy(
        Pattern pattern,
        String code
    ) {
        var matcher = pattern.matcher(code);
        var result = new ArrayList<String>();
        int lastIndex = 0;

        while (matcher.find()) {
            var enclosed
                = matcher.group(1) + matcher.group(2) + matcher.group(3);

            result.add(code.substring(lastIndex, matcher.start()));
            result.add(enclosed);

            lastIndex = matcher.end();
        }
        result.add(code.substring(lastIndex));
        return result;
    }

    /**
     * Splits the code so that every group of every match of the pattern
     * becomes a token of its own, and the text between matches becomes the
     * tokens in between.
     *
     * @param pattern pattern whose groups are emitted one by one
     * @param code    text to split
     * @return the text before each match followed by the groups of the match,
     *     and lastly the text after the last match
     */
    private static List<String> tokensSurroundedBy(
        Pattern pattern,
        String code
    ) {
        var matcher = pattern.matcher(code);
        var result = new ArrayList<String>();
        int lastIndex = 0;

        while (matcher.find()) {
            result.add(code.substring(lastIndex, matcher.start()));

            for (int i = 1; i <= matcher.groupCount(); i++) {
                result.add(matcher.group(i));
            }

            lastIndex = matcher.end();
        }
        result.add(code.substring(lastIndex));
        return result;
    }
}