From 6c5657a8d7f1a42f1ed222c113ade1405e131e7c Mon Sep 17 00:00:00 2001 From: Malte Schmitz Date: Sat, 26 Nov 2016 20:15:30 +0100 Subject: [PATCH] Parser documentation --- src/main/java/expression/Addition.java | 5 +- src/main/java/expression/Expression.java | 14 +- src/main/java/expression/Identifier.java | 5 +- src/main/java/expression/Int.java | 8 +- src/main/java/expression/Subtraction.java | 5 +- src/main/java/parser/Parser.java | 395 ++++++++++++++++++++---------- src/main/java/program/Assignment.java | 5 +- src/main/java/program/Composition.java | 5 +- src/main/java/program/Conditional.java | 5 +- src/main/java/program/Loop.java | 5 +- src/main/java/program/Program.java | 16 +- 11 files changed, 329 insertions(+), 139 deletions(-) diff --git a/src/main/java/expression/Addition.java b/src/main/java/expression/Addition.java index 893f72e..0cefcd4 100644 --- a/src/main/java/expression/Addition.java +++ b/src/main/java/expression/Addition.java @@ -1,6 +1,9 @@ /*!! Expression */ -/*! # Addition */ +/*! +Addition +======== +*/ /*!- Header */ package expression; diff --git a/src/main/java/expression/Expression.java b/src/main/java/expression/Expression.java index ebabd60..63c3b31 100644 --- a/src/main/java/expression/Expression.java +++ b/src/main/java/expression/Expression.java @@ -1,9 +1,21 @@ /*!! Expression*/ -/*! # Expression*/ +/*! +Expression +============== +*/ /*!- Header */ package expression; /*! `Expression` is the common abstract class for Expressions that can be evaluated using the `Evaluator`. */ abstract public class Expression { } + +/*! 
Expression can be written as the following +[Algebraic Data Type (ADT)](https://en.wikipedia.org/wiki/Algebraic_data_type) + + Expression = Addition(leftHandSide: Expression, rightHandSide: Expression) + | Subtraction(leftHandSide: Expression, rightHandSide: Expression) + | Identifier(name: String) + | Int(value: int) +*/ \ No newline at end of file diff --git a/src/main/java/expression/Identifier.java b/src/main/java/expression/Identifier.java index 40f2b71..7dced40 100644 --- a/src/main/java/expression/Identifier.java +++ b/src/main/java/expression/Identifier.java @@ -1,6 +1,9 @@ /*!! Expression*/ -/*! # Identifier*/ +/*! +Identifier +========== +*/ /*! Header*/ package expression; diff --git a/src/main/java/expression/Int.java b/src/main/java/expression/Int.java index dcd9bd1..b026fc6 100644 --- a/src/main/java/expression/Int.java +++ b/src/main/java/expression/Int.java @@ -1,6 +1,12 @@ /*!! Expression*/ -/*! # Int */ +/*! +Int_(eger)_ +=========== + +In order to avoid confusion with Java's `Integer` auto-boxing class for the primitive `int` this wrapper is called +`Int` instead of `Integer`. +*/ /*!- Header */ package expression; diff --git a/src/main/java/expression/Subtraction.java b/src/main/java/expression/Subtraction.java index 0383c61..8de951a 100644 --- a/src/main/java/expression/Subtraction.java +++ b/src/main/java/expression/Subtraction.java @@ -1,6 +1,9 @@ /*!! Expression */ -/*! # Subtraction */ +/*! +Subtraction +=========== +*/ /*!- Header */ package expression; diff --git a/src/main/java/parser/Parser.java b/src/main/java/parser/Parser.java index 7da6534..a03ee84 100644 --- a/src/main/java/parser/Parser.java +++ b/src/main/java/parser/Parser.java @@ -1,7 +1,8 @@ /*!! Parser */ -/*! -# Parser +/*! +Parser +====== In order to parse simple while programs we use a [Recursive descent parser](https://en.wikipedia.org/wiki/Recursive_descent_parser). 
The syntax of our while programs @@ -54,83 +55,84 @@ public class Parser { public Parser(String input) { this.input = input; } - - public Program parse() { - position = 0; - Program program = program(); - whitespace(); - if (position < input.length()) { - throw new SyntaxException("End of input", position); + + /*! + The Basics + ---------- + */ + + /*! We start with defining a helper function that consumes whitespaces, by incrementing the `position` until + the current character is not a whitespace. + + Such a function is necessary, because we do the tokenization on the fly during the parsing. In more complex + projects the [tokenization](https://en.wikipedia.org/wiki/Lexical_analysis#Tokenization) would be an extra + pre-processing step which handles the whitespace removal and creates a stream of tokens out of the input string.*/ + private void whitespace() { + while(position < input.length() && Character.isWhitespace(input.charAt(position))) { + position += 1; } - return program; } - Program program() { - Program firstStatement = statement(); - List moreStatements = new ArrayList(); - while (test(";")) { - consume(";"); - Program statement = statement(); - moreStatements.add(statement); - } - Program program = firstStatement; - for (Program statement: moreStatements) { - program = new Composition(program, statement); + /*! Our parsing functions always want to parse something at the current position in the input and raise an + exception if not possible. In order to implement the rules containing _or_ we need either look-ahead or back + tracking in order to decide which branch to take. In those cases we catch the exceptions raised by the called + sub-parsers. + + The `consume` method consumes the given string by incrementing the `position`. 
It raises a `SyntaxException` + if the given string is not the next token in the `input` at the current `position`.*/ + private void consume(String token) { + whitespace(); + if (position + token.length() <= input.length() && input.substring(position, position + token.length()).equals(token)) { + position += token.length(); + } else { + throw new SyntaxException(token, position); } - return program; } - Program statement() { + /*! In some situations we want to perform a look-ahead: We want to test if the next token is the given one or not. + The `test` function calls the `consume` function defined above and returns if it raised an exception or not. */ + private boolean test(String token) { int start = position; - Program statement; + boolean success; try { - statement = assignment(); + consume(token); + success = true; } catch (SyntaxException se) { - position = start; - try { - statement = conditional(); - } catch (SyntaxException se2) { - position = start; - statement = loop(); - } + success = false; } - return statement; + position = start; + return success; } - Program loop() { - consume("while"); - consume("("); - Expression condition = expression(); - consume(")"); - consume("{"); - Program program = program(); - consume("}"); - return new Loop(condition, program); - } + /*! 
+ Expression + ---------- - Program conditional() { - consume("if"); - consume("("); - Expression condition = expression(); - consume(")"); - consume("then"); - consume("{"); - Program thenCase = program(); - consume("}"); - consume("else"); - consume("{"); - Program elseCase = program(); - consume("}"); - return new Conditional(condition, thenCase, elseCase); - } + Using the basic mechanisms defined above we can implement parsing functions for expressions as defined in our + grammar: - Program assignment() { - Identifier identifier = identifier(); - consume(":="); - Expression expression = expression(); - return new Assignment(identifier, expression); - } + Expr = Expr "+" Atom | + Expr "-" Atom | + Atom + This rule is left recursive: If we want to parse an expression, we try to parse an addition first, which starts with + an expression, so we parse an expression by trying to parse an addition first, which starts with an expression, + so we parse an expression by ... In order to implement this rule we need to change it, so that the parsing process + terminates: + + Expr = Atom { ("+" | "-") Atom } + + In the rule above we replaced the recursion with the repetition indicated by `{` and `}`. Parsing this + repetitive rule involves two steps: Parsing the sequence of atoms and operators into a list + + List + + and translate this list into the real data structure afterwards. + + An operator is either `PLUS` or `MINUS`. */ + private enum Operator { PLUS, MINUS } + + /*! `OperatorWithExpression` stores a pair of an operator and the atom immediately following the operator. 
*/ private static class OperatorWithExpression { private final Operator operator; private final Expression expression; @@ -141,45 +143,21 @@ public class Parser { } } - private enum Operator { PLUS, MINUS } - - private boolean testOperator() { - int start = position; - boolean result; - try { - operator(); - result = true; - } catch (SyntaxException se) { - result = false; - } - position = start; - return result; - } - - private Operator operator() { - whitespace(); - char next = (char) 0; - if (position < input.length()) { - next = input.charAt(position); - position += 1; - } - if (next == '+') { - return Operator.PLUS; - } else if (next == '-') { - return Operator.MINUS; - } else { - throw new SyntaxException("Operator", position); - } - } - + /*! Using this data structure we now can define the expression parser: */ Expression expression() { + /*! Parse the first atom */ Expression firstAtom = atom(); List moreAtoms = new ArrayList(); + /*! Parse more operators and atoms while the helper function `testOperator()` indicates that the `operator()` parser + will succeed (without raising an exception). */ while(testOperator()) { Operator operator = operator(); Expression expression = atom(); moreAtoms.add(new OperatorWithExpression(operator, expression)); } + /*! Translate the sequence of operators and atoms into the inductive `Addition` and `Subtraction` data + structure. We start with the `firstAtom` and replace the `expression` for every element of the list + with an `Addition` or `Subtraction` combining the old `expression` and the current list element. */ Expression expression = firstAtom; for (OperatorWithExpression atom: moreAtoms) { switch (atom.operator) { @@ -194,40 +172,115 @@ public class Parser { return expression; } + /*! The `expression` parser above uses the `operator` parser defined below. */ + + private Operator operator() { + whitespace(); + /*! 
Only check the character at the current position in the input if the current + position is a valid position in the input (and not after the end of the input).*/ + char next = (char) 0; + if (position < input.length()) { + next = input.charAt(position); + position += 1; + } + if (next == '+') { + return Operator.PLUS; + } else if (next == '-') { + return Operator.MINUS; + } else { + throw new SyntaxException("Operator", position); + } + } + + /*! In the `expression` parser above we used the method `testOperator` defined below which tests + if the `operator` parser would succeed without throwing a `SyntaxException`. The implementation calls + the `operator` parser in a `try` block and returns `false` if the exception was caught and `true` + otherwise. */ + private boolean testOperator() { + int start = position; + boolean result; + try { + operator(); + result = true; + } catch (SyntaxException se) { + result = false; + } + position = start; + return result; + } + + /*! The rule for the non-terminal Atom + + Atom = Id | Num | "(" Expr ")" + + can be directly translated into the following `atom` parser. We start by parsing an identifier. + If this succeeds we are done. If this parser throws a `SyntaxException` we try the next option + of the _or_: We parse a numeric literal. If the corresponding parser raises a `SyntaxException`, too, + we continue with the atom in parentheses. If this still raises a `SyntaxException` we pass on this exception + to our caller (by not catching it). In this case we failed parsing an atom. */ Expression atom() { int start = position; Expression result; try { - consume("("); - result = expression(); - consume(")"); + result = identifier(); } catch (SyntaxException se) { + /*! Reset the position. 
The `identifier` parser has failed, but it might have changed the global + `position` before raising the `SyntaxException` so we need to reset the position before trying + another parser.*/ position = start; try { result = integer(); } catch (SyntaxException se2) { - result = identifier(); + position = start; + consume("("); + result = expression(); + consume(")"); } } return result; } + /*! An identifier is a sequence of lower case letters. This helper function checks if the given character + could be part of an identifier. */ private boolean isLowerLetter(char ch) { return ch >= 'a' && ch <= 'z'; } + /*! In order to parse an identifier we increment the current `position` while the current character could be part + of an identifier. */ + Identifier identifier() { + whitespace(); + int start = position; + while (position < input.length() && isLowerLetter(input.charAt(position))) { + position += 1; + } + + /*! We need to make sure that the identifier is not empty. */ + if (position > start) { + return new Identifier(input.substring(start, position)); + } else { + throw new SyntaxException("Identifier", position); + } + } + + /*! Parsing an integer follows more or less the same pattern as parsing an identifier (see above).*/ Expression integer() { whitespace(); int start = position; + /*! We check for a unary prefix minus first.*/ boolean minus = position < input.length() && input.charAt(position) == '-'; if (minus) { position += 1; } + /*! Now we check for at least one digit. */ boolean digitsFound = false; while (position < input.length() && Character.isDigit(input.charAt(position))) { position += 1; digitsFound = true; } + /*! In the end we rely on `Integer.parseInt` for translating the string that we found into + a real integer and wrap the returned value in an `Int` to create an element of the `Expression` + data structure. 
*/ if (digitsFound) { return new Int(Integer.parseInt(input.substring(start, position))); } else { @@ -235,44 +288,130 @@ public class Parser { } } - Identifier identifier() { - whitespace(); - int start = position; - while (position < input.length() && isLowerLetter(input.charAt(position))) { - position += 1; + /*! + Program + ------- + + Parsing a program is pretty straightforward if we are able to parse tokens, identifiers and expressions + using the parser functions defined in the last sections. There is only one problem left, that needs to + be solved first. The rule + + Prog = Id ":=" Expr | + Prog ";" Prog | + "if" "(" Expr ")" "then" "{" Prog "}" "else" "{" Prog "}" | + "while" "(" Expr ")" "{" Prog "}" + + is again recursive in a way that leads to an endless recursion. We basically apply the same rewriting as + for the Expr rule and end up with + + Prog = Stmt { ";" Stmt } + Stmt = Id ":=" Expr | + "if" "(" Expr ")" "then" "{" Prog "}" "else" "{" Prog "}" | + "while" "(" Expr ")" "{" Prog "}" + + We start with parsing this new Prog non-terminal in the same way as we parse Expr in `expression`. + In this case there is only one possible operator between the statements, the sequential operator `;`. + This simplifies the situation as we do not need to store the operator. Apart from that simplification + we apply the same idea with its two steps: 1) parsing the sequence and 2) creating the `Program` data structure from the + list. */ + + Program program() { + /*! Parsing the first statement which must be there */ + Program firstStatement = statement(); + /*! Parsing optional following statements separated by `;` */ + List moreStatements = new ArrayList(); + while (test(";")) { + consume(";"); + Program statement = statement(); + moreStatements.add(statement); } - if (position > start) { - return new Identifier(input.substring(start, position)); - } else { - throw new SyntaxException("Identifier", position); + /*! 
We use the first statement as initial result */ + Program program = firstStatement; + /*! and then replace the result with a `Composition` combining the old result and the new statement.*/ + for (Program statement: moreStatements) { + program = new Composition(program, statement); } + return program; } - private void whitespace() { - while(position < input.length() && Character.isWhitespace(input.charAt(position))) { - position += 1; + /*! Parsing a statement boils down to trying to parse + - an assignment and if that fails + - a conditional and if that fails + - a loop and if that fails + - fail completely.*/ + Program statement() { + int start = position; + Program statement; + try { + statement = assignment(); + } catch (SyntaxException se) { + position = start; + try { + statement = conditional(); + } catch (SyntaxException se2) { + position = start; + statement = loop(); + } } + return statement; } - private void consume(String token) { - whitespace(); - if (position + token.length() <= input.length() && input.substring(position, position + token.length()).equals(token)) { - position += token.length(); - } else { - throw new SyntaxException(token, position); - } + /*! Parsing a loop is very straight forward and just follows the rule + `"while" "(" Expr ")" "{" Prog "}"`.*/ + Program loop() { + consume("while"); + consume("("); + Expression condition = expression(); + consume(")"); + consume("{"); + Program program = program(); + consume("}"); + return new Loop(condition, program); } - private boolean test(String token) { - int start = position; - boolean success; - try { - consume(token); - success = true; - } catch (SyntaxException se) { - success = false; + /*! 
Parsing a conditional simply follows the rule + `"if" "(" Expr ")" "then" "{" Prog "}" "else" "{" Prog "}"`.*/ + Program conditional() { + consume("if"); + consume("("); + Expression condition = expression(); + consume(")"); + consume("then"); + consume("{"); + Program thenCase = program(); + consume("}"); + consume("else"); + consume("{"); + Program elseCase = program(); + consume("}"); + return new Conditional(condition, thenCase, elseCase); + } + + /*! Parsing an assignment simply follows the rule `Id ":=" Expr`.*/ + Program assignment() { + Identifier identifier = identifier(); + consume(":="); + Expression expression = expression(); + return new Assignment(identifier, expression); + } + + /*! + Checking The End + ---------------- + + Everything that remains to be done is checking that we reached the end of the input after we + are done. As every parser only consumes as much from the input as needed, the `program` parser + might end in the middle of the input string. In the following public interface method we call + the `program` parser and check that we have reached the end of the input afterwards.*/ + + public Program parse() { + position = 0; + Program program = program(); + /*! Whitespace is the only thing allowed after the program.*/ + whitespace(); + if (position < input.length()) { + throw new SyntaxException("End of input", position); } - position = start; - return success; + return program; } } diff --git a/src/main/java/program/Assignment.java b/src/main/java/program/Assignment.java index 4bd2038..6c7e4ed 100644 --- a/src/main/java/program/Assignment.java +++ b/src/main/java/program/Assignment.java @@ -1,6 +1,9 @@ /*!! Program*/ -/*! # Program */ +/*! +Assignment +========== +*/ /*!- Header */ diff --git a/src/main/java/program/Composition.java b/src/main/java/program/Composition.java index fc06491..370a111 100644 --- a/src/main/java/program/Composition.java +++ b/src/main/java/program/Composition.java @@ -1,6 +1,9 @@ /*!! Program*/ -/*! 
# Composition*/ +/*! +Composition +=========== +*/ /*!- Header*/ package program; diff --git a/src/main/java/program/Conditional.java b/src/main/java/program/Conditional.java index 8f7dfb2..100a354 100644 --- a/src/main/java/program/Conditional.java +++ b/src/main/java/program/Conditional.java @@ -1,6 +1,9 @@ /*!! Program*/ -/*! # Conditional*/ +/*! +Conditional +=========== +*/ /*!- Header*/ package program; diff --git a/src/main/java/program/Loop.java b/src/main/java/program/Loop.java index f21f3ce..ead2a92 100644 --- a/src/main/java/program/Loop.java +++ b/src/main/java/program/Loop.java @@ -1,6 +1,9 @@ /*!! Program*/ -/*! # Loop */ +/*! +Loop +==== +*/ /*!- Header*/ package program; diff --git a/src/main/java/program/Program.java b/src/main/java/program/Program.java index cd45b44..97883fa 100644 --- a/src/main/java/program/Program.java +++ b/src/main/java/program/Program.java @@ -1,9 +1,21 @@ /*!! Program*/ -/*! # Program */ +/*! +Program +======= +*/ /*!- Header*/ package program; -/*! `Program` is the abstract common class for programs that can be exeuted using the `Interpreter`. */ +/*! `Program` is the abstract common class for programs that can be executed using the `Interpreter`. */ abstract public class Program { } + +/*! Program can be written as the following +[Algebraic Data Type (ADT)](https://en.wikipedia.org/wiki/Algebraic_data_type) + + Program = Assignment(identifier: Identifier, expression: Expression) + | Composition(first: Program, second: Program) + | Loop(condition: Expression, program: Program) + | Conditional(condition: Expression, thenCase: Program, elseCase: Program) +*/ \ No newline at end of file