diff --git a/LanguageServer/src/main/java/org/nittalab/dtram/languageserver/utils/Tokenizer.java b/LanguageServer/src/main/java/org/nittalab/dtram/languageserver/utils/Tokenizer.java index 3e448bc..9ba0dae 100644 --- a/LanguageServer/src/main/java/org/nittalab/dtram/languageserver/utils/Tokenizer.java +++ b/LanguageServer/src/main/java/org/nittalab/dtram/languageserver/utils/Tokenizer.java @@ -8,8 +8,7 @@ import java.io.IOException; import java.io.StringReader; import java.util.ArrayList; -import java.util.regex.Matcher; -import java.util.regex.Pattern; +import java.util.List; /** * Tokenizes the model file @@ -48,13 +47,13 @@ /** * Tokenizes all texts in {@link BufferedReader} * - * @return All tokens in {@link ArrayList} + * @return All tokens in {@link List} * @throws IOException Be thrown if {@link BufferedReader} fails to read text lines * @author Shohei Yamagiwa * @since 0.1 */ - public final ArrayList execute() throws IOException { - ArrayList allTokens = new ArrayList<>(); + public final List execute() throws IOException { + List allTokens = new ArrayList<>(); int line = 1; int column = 1; while (true) { @@ -80,8 +79,39 @@ allTokens = splitBySpace(allTokens); allTokens = extractMultilineComments(allTokens); allTokens = extractComments(allTokens); - allTokens = splitBySymbol(allTokens, Tokens.LEFT_BRACKET, Tokens.LEFT_BRACKET_REGX); - allTokens = splitBySymbol(allTokens, Tokens.RIGHT_BRACKET, Tokens.RIGHT_BRACKET_REGX); + + allTokens = splitTokens(allTokens, Tokens.EQ); + allTokens = splitTokens(allTokens, Tokens.NEQ); + allTokens = splitTokens(allTokens, Tokens.GE); + allTokens = splitTokens(allTokens, Tokens.LE); + allTokens = splitTokens(allTokens, Tokens.AND); + allTokens = splitTokens(allTokens, Tokens.OR); + + allTokens = splitTokens(allTokens, Tokens.LEFT_BRACKET); + allTokens = splitTokens(allTokens, Tokens.RIGHT_BRACKET); + allTokens = splitTokens(allTokens, Tokens.LEFT_CURLY_BRACKET); + allTokens = splitTokens(allTokens, 
Tokens.RIGHT_CURLY_BRACKET); + allTokens = splitTokens(allTokens, Tokens.LEFT_SQUARE_BRACKET); + allTokens = splitTokens(allTokens, Tokens.RIGHT_SQUARE_BRACKET); + + allTokens = splitTokens(allTokens, Tokens.ADD); + allTokens = splitTokens(allTokens, Tokens.SUB); + allTokens = splitTokens(allTokens, Tokens.MUL); + allTokens = splitTokens(allTokens, Tokens.DIV); + allTokens = splitTokens(allTokens, Tokens.MOD); + + allTokens = splitTokens(allTokens, Tokens.MINUS); + + allTokens = splitTokens(allTokens, Tokens.GT); + allTokens = splitTokens(allTokens, Tokens.LT); + allTokens = splitTokens(allTokens, Tokens.NEG); + + allTokens = splitTokens(allTokens, Tokens.EQUALS); + allTokens = splitTokens(allTokens, Tokens.ASSIGNMENT); + allTokens = splitTokens(allTokens, Tokens.COLON); + allTokens = splitTokens(allTokens, Tokens.COMMA); + allTokens = splitTokens(allTokens, Tokens.DOT); + allTokens = splitTokens(allTokens, Tokens.DOUBLE_QUOT); return allTokens; } @@ -89,11 +119,11 @@ * Splits tokens by half-width space. * * @param original Original tokens - * @return {@link ArrayList} of {@link Token} extracted by splitting + * @return {@link List} of {@link Token} extracted by splitting * @author Shohei Yamagiwa * @since 0.1 */ - protected static ArrayList splitBySpace(ArrayList original) { + protected static List splitBySpace(List original) { ArrayList newTokens = new ArrayList<>(); original.forEach(originalToken -> { @@ -131,11 +161,11 @@ * Extracts multiline comment from given tokens. 
* * @param original Original tokens - * @return {@link ArrayList} of {@link Token} with extracted multiline comments + * @return {@link List} of {@link Token} with extracted multiline comments * @author Shohei Yamagiwa * @since 0.1 */ - protected static ArrayList extractMultilineComments(ArrayList original) { + protected static List extractMultilineComments(List original) { ArrayList newTokens = new ArrayList<>(); Token commentToken = new Token(); @@ -177,11 +207,11 @@ * Extracts single-line comment from given tokens. * * @param original Original tokens - * @return New {@link ArrayList} of {@link Token} with extracted single-line comments + * @return New {@link List} of {@link Token} with extracted single-line comments * @author Shohei Yamagiwa * @since 0.1 */ - protected static ArrayList extractComments(ArrayList original) { + protected static List extractComments(List original) { ArrayList newTokens = new ArrayList<>(); int commentLine = -1; @@ -219,94 +249,84 @@ } /** - * FIXME: Output string is not correct. - *

- * Splits tokens into several new tokens with given symbol. + * Splits tokens into several new tokens by given symbol. * - * @param original Original tokens - * @param symbolStr String of the symbol. - * @param symbolRegex Regex of the symbol - * @return New split {@link ArrayList} of {@link Token} + * @param tokens Original tokens + * @param symbol String of the symbol. + * @return New split {@link List} of {@link Token} * @author Shohei Yamagiwa * @since 0.1 */ - protected static ArrayList splitBySymbol(final ArrayList original, final String symbolStr, final String symbolRegex) { - Pattern pattern = Pattern.compile(symbolRegex); // Compile regex to use in the loop. - ArrayList newTokens = new ArrayList<>(); + protected static List splitTokens(List tokens, String symbol) { + List newTokens = new ArrayList<>(); - for (Token originalToken : original) { - if (originalToken.isAtomic()) { // Token is atomic so no more splits are needed - newTokens.add(originalToken); + for (Token token : tokens) { + /* Atomic tokens are already minimal, so carry them over unchanged */ + if (token.isAtomic()) { + newTokens.add(token); continue; } - /* Specified symbol isn't contained in the token */ - Matcher matcher = pattern.matcher(originalToken.getText()); - if (!matcher.find()) { - newTokens.add(originalToken); - continue; - } - - /* Split into several tokens */ - String[] tokens = originalToken.getText().split(symbolRegex, -1); - ArrayList splitTokens = new ArrayList<>(); - Position lastTokenPos = originalToken.getStartPos(); - - for (int i = 0; i < tokens.length; i++) { - String tokenStr = tokens[i]; - - if (i == 0) { - if (tokenStr.isEmpty()) { - Token symbol = new Token(symbolStr, lastTokenPos, true); - splitTokens.add(symbol); - lastTokenPos = symbol.getEndPos(); - } else { - Token newToken = new Token(tokenStr, lastTokenPos, false); - splitTokens.add(newToken); - lastTokenPos = newToken.getEndPos(); - - Token symbol = new Token(symbolStr, lastTokenPos.move(0, 1), true); - 
splitTokens.add(symbol); - lastTokenPos = symbol.getEndPos(); - } - continue; - } - if (i == tokens.length - 1 && !tokenStr.isEmpty()) { - Token newToken = new Token(tokenStr, lastTokenPos.move(0, 1), false); - splitTokens.add(newToken); - break; - } - if (tokenStr.isEmpty()) { - Token symbol = new Token(symbolStr, lastTokenPos.move(0, 1), true); - splitTokens.add(symbol); - lastTokenPos = symbol.getEndPos(); - } else { - Token newToken = new Token(tokenStr, lastTokenPos.move(0, 1), false); - splitTokens.add(newToken); - lastTokenPos = newToken.getEndPos(); - - Token symbol = new Token(symbolStr, lastTokenPos.move(0, 1), true); - splitTokens.add(symbol); - lastTokenPos = symbol.getEndPos(); - } - } - newTokens.addAll(splitTokens); + /* Non-atomic token: delegate to splitToken and append every piece it returns */ + newTokens.addAll(splitToken(token, symbol)); } - System.out.println(constructTextFromTokens(newTokens)); - System.out.println(); return newTokens; } - private static String constructTextFromTokens(ArrayList tokens) { - int lastLine = 1; - StringBuilder builder = new StringBuilder(); - for (Token token : tokens) { - if (token.getStartPos().getLine() != lastLine) { - builder.append(System.lineSeparator()); - } - builder.append(token.getText()); - lastLine = token.getEndPos().getLine(); + /** + * Splits a single token into several tokens around each occurrence of the given symbol. + * + * @param token Original token; handed back as the only result when it is atomic or does not contain the symbol + * @param symbol The symbol text to split on. 
+     * @return The resulting tokens in source order; a {@link List} holding only the given token when there is nothing to split
+     * @author Shohei Yamagiwa
+     * @since 0.1
+     */
+    protected static List splitToken(Token token, String symbol) {
+        /* An atomic token is already minimal, so hand it back untouched */
+        if (token.isAtomic()) {
+            return List.of(token);
         }
-        return builder.toString();
+
+        String tokenText = token.getText();
+        int symbolIndex = tokenText.indexOf(symbol);
+
+        /* Nothing to split on: the symbol is absent, or it is empty (an empty symbol matches at index 0 on every call and would recurse without end on non-empty text) */
+        if (symbolIndex == -1 || symbol.isEmpty()) {
+            return List.of(token);
+        }
+
+        List tokens = new ArrayList<>();
+        if (symbolIndex == 0) { // Symbol at the start: emit it, then keep splitting whatever follows it
+            String symbolStr = tokenText.substring(symbolIndex, symbolIndex + symbol.length());
+            Token symbolToken = new Token(symbolStr, token.getStartPos(), true);
+            tokens.add(symbolToken);
+
+            String restStr = tokenText.substring(symbolIndex + symbol.length());
+            if (!restStr.isEmpty()) {
+                Token restToken = new Token(restStr, symbolToken.getEndPos().move(0, 1), false);
+                tokens.addAll(splitToken(restToken, symbol));
+            }
+        } else if (symbolIndex + symbol.length() == tokenText.length()) { // Symbol at the very end; measured with symbol.length() so a multi-character symbol leaves no empty trailing token
+            String restStr = tokenText.substring(0, symbolIndex);
+            String symbolStr = tokenText.substring(symbolIndex);
+            Token restToken = new Token(restStr, token.getStartPos(), false);
+            Token symbolToken = new Token(symbolStr, restToken.getEndPos().move(0, 1), true);
+
+            tokens.addAll(splitToken(restToken, symbol));
+            tokens.add(symbolToken);
+        } else { // Symbol in the middle: both sides are non-empty and are split recursively
+            String leftStr = tokenText.substring(0, symbolIndex);
+            String symbolStr = tokenText.substring(symbolIndex, symbolIndex + symbol.length());
+            String rightStr = tokenText.substring(symbolIndex + symbol.length());
+            Token leftToken = new Token(leftStr, token.getStartPos(), false);
+            Token symbolToken = new Token(symbolStr, leftToken.getEndPos().move(0, 1), true);
+            Token rightToken = new Token(rightStr, symbolToken.getEndPos().move(0, 1), false);
+
+            tokens.addAll(splitToken(leftToken, symbol));
+            tokens.add(symbolToken);
+            tokens.addAll(splitToken(rightToken, symbol));
+        }
+        return tokens;
     }
 }