From 6c5657a8d7f1a42f1ed222c113ade1405e131e7c Mon Sep 17 00:00:00 2001
From: Malte Schmitz <malte@schmitz-sh.de>
Date: Sat, 26 Nov 2016 20:15:30 +0100
Subject: [PATCH] Parser documentation

---
 src/main/java/expression/Addition.java    |   5 +-
 src/main/java/expression/Expression.java  |  14 +-
 src/main/java/expression/Identifier.java  |   5 +-
 src/main/java/expression/Int.java         |   8 +-
 src/main/java/expression/Subtraction.java |   5 +-
 src/main/java/parser/Parser.java          | 395 ++++++++++++++++++++----------
 src/main/java/program/Assignment.java     |   5 +-
 src/main/java/program/Composition.java    |   5 +-
 src/main/java/program/Conditional.java    |   5 +-
 src/main/java/program/Loop.java           |   5 +-
 src/main/java/program/Program.java        |  16 +-
 11 files changed, 329 insertions(+), 139 deletions(-)

diff --git a/src/main/java/expression/Addition.java b/src/main/java/expression/Addition.java
index 893f72e..0cefcd4 100644
--- a/src/main/java/expression/Addition.java
+++ b/src/main/java/expression/Addition.java
@@ -1,6 +1,9 @@
 /*!! Expression */
 
-/*! # Addition */
+/*!
+Addition
+========
+*/
 
 /*!- Header */
 package expression;
diff --git a/src/main/java/expression/Expression.java b/src/main/java/expression/Expression.java
index ebabd60..63c3b31 100644
--- a/src/main/java/expression/Expression.java
+++ b/src/main/java/expression/Expression.java
@@ -1,9 +1,21 @@
 /*!! Expression*/
 
-/*! # Expression*/
+/*!
+Expression
+==============
+*/
 
 /*!- Header */
 package expression;
 
 /*! `Expression` is the common abstract class for Expressions that can be evaluated using the `Evaluator`. */
 abstract public class Expression { }
+
+/*! Expression can be written as the following
+[Algebraic Data Type (ADT)](https://en.wikipedia.org/wiki/Algebraic_data_type)
+
+    Expression = Addition(leftHandSide: Expression, rightHandSide: Expression)
+               | Subtraction(leftHandSide: Expression, rightHandSide: Expression)
+               | Identifier(name: String)
+               | Int(value: int)
+*/
\ No newline at end of file
diff --git a/src/main/java/expression/Identifier.java b/src/main/java/expression/Identifier.java
index 40f2b71..7dced40 100644
--- a/src/main/java/expression/Identifier.java
+++ b/src/main/java/expression/Identifier.java
@@ -1,6 +1,9 @@
 /*!! Expression*/
 
-/*! # Identifier*/
+/*!
+Identifier
+==========
+*/
 
 /*! Header*/
 package expression;
diff --git a/src/main/java/expression/Int.java b/src/main/java/expression/Int.java
index dcd9bd1..b026fc6 100644
--- a/src/main/java/expression/Int.java
+++ b/src/main/java/expression/Int.java
@@ -1,6 +1,12 @@
 /*!! Expression*/
 
-/*! # Int */
+/*!
+Int_(eger)_
+===========
+
+In order to avoid confusion with Java's `Integer` auto-boxing class for the primitive `int` this wrapper is called
+`Int` instead of `Integer`.
+*/
 
 /*!- Header */
 package expression;
diff --git a/src/main/java/expression/Subtraction.java b/src/main/java/expression/Subtraction.java
index 0383c61..8de951a 100644
--- a/src/main/java/expression/Subtraction.java
+++ b/src/main/java/expression/Subtraction.java
@@ -1,6 +1,9 @@
 /*!! Expression */
 
-/*! # Subtraction */
+/*!
+Subtraction
+===========
+*/
 
 /*!- Header */
 package expression;
diff --git a/src/main/java/parser/Parser.java b/src/main/java/parser/Parser.java
index 7da6534..a03ee84 100644
--- a/src/main/java/parser/Parser.java
+++ b/src/main/java/parser/Parser.java
@@ -1,7 +1,8 @@
 /*!! Parser */
-/*!
 
-# Parser
+/*!
+Parser
+======
 
 In order to parse simple while programs we use a
 [Recursive descent parser](https://en.wikipedia.org/wiki/Recursive_descent_parser). The syntax of our while programs
@@ -54,83 +55,84 @@ public class Parser {
     public Parser(String input) {
         this.input = input;
     }
-    
-    public Program parse() {
-        position = 0;
-        Program program = program();
-        whitespace();
-        if (position < input.length()) {
-            throw new SyntaxException("End of input", position);
+
+    /*!
+    The Basics
+    ----------
+    */
+
+    /*! We start with defining a helper function that consumes whitespaces, by incrementing the `position` until
+    the current character is not a whitespace.
+
+    Such a function is necessary, because we do the tokenization on the fly during the parsing. In more complex
+    projects the [tokenization](https://en.wikipedia.org/wiki/Lexical_analysis#Tokenization) would be an extra
+    pre-processing step which handles the whitespace removal and creates a stream of tokens out of the input string.*/
+    private void whitespace() {
+        while(position < input.length() && Character.isWhitespace(input.charAt(position))) {
+            position += 1;
         }
-        return program;
     }
 
-    Program program() {
-        Program firstStatement = statement();
-        List<Program> moreStatements = new ArrayList<Program>();
-        while (test(";")) {
-            consume(";");
-            Program statement = statement();
-            moreStatements.add(statement);
-        }
-        Program program = firstStatement;
-        for (Program statement: moreStatements) {
-            program = new Composition(program, statement);
+    /*! Our parsing functions always want to parse something at the current position in the input and raise an
+    exception if not possible. In order to implement the rules containing _or_ we need either look-ahead or
+    backtracking in order to decide which branch to take. In those cases we catch the exceptions raised by the called
+    sub-parsers.
+
+    The `consume` method consumes the given string by incrementing the `position`. It raises a `SyntaxException`
+    if the given string is not the next token in the `input` at the current `position`.*/
+    private void consume(String token) {
+        whitespace();
+        if (position + token.length() <= input.length() && input.substring(position, position + token.length()).equals(token)) {
+            position += token.length();
+        } else {
+            throw new SyntaxException(token, position);
         }
-        return program;
     }
 
-    Program statement() {
+    /*! In some situations we want to perform a look-ahead: We want to test if the next token is the given one or not.
+    The `test` function calls the `consume` function defined above and returns whether it succeeded (`true`) or raised an exception (`false`). */
+    private boolean test(String token) {
         int start = position;
-        Program statement;
+        boolean success;
         try {
-            statement = assignment();
+            consume(token);
+            success = true;
         } catch (SyntaxException se) {
-            position = start;
-            try {
-                statement = conditional();
-            } catch (SyntaxException se2) {
-                position = start;
-                statement = loop();
-            }
+            success = false;
         }
-        return statement;
+        position = start;
+        return success;
     }
 
-    Program loop() {
-        consume("while");
-        consume("(");
-        Expression condition = expression();
-        consume(")");
-        consume("{");
-        Program program = program();
-        consume("}");
-        return new Loop(condition, program);
-    }
+    /*!
+    Expression
+    ----------
 
-    Program conditional() {
-        consume("if");
-        consume("(");
-        Expression condition = expression();
-        consume(")");
-        consume("then");
-        consume("{");
-        Program thenCase = program();
-        consume("}");
-        consume("else");
-        consume("{");
-        Program elseCase = program();
-        consume("}");
-        return new Conditional(condition, thenCase, elseCase);
-    }
+    Using the basic mechanisms defined above we can implement parsing functions for expressions as defined in our
+    grammar:
 
-    Program assignment() {
-        Identifier identifier = identifier();
-        consume(":=");
-        Expression expression = expression();
-        return new Assignment(identifier, expression);
-    }
+        Expr = Expr "+" Atom |
+               Expr "-" Atom |
+               Atom
 
+    This rule is left recursive: If we want to parse an expression, we try to parse an addition first, which starts with
+    an expression, so we parse an expression by trying to parse an addition first, which starts with an expression,
+    so we parse an expression by ... In order to implement this rule we need to change it, so that the parsing process
+    terminates:
+
+        Expr = Atom { ("+" | "-") Atom }
+
+    In the rule above we replaced the recursion with the repetition indicated by `{` and `}`. Parsing this
+    repetitive rule involves two steps: Parsing the sequence of atoms and operators into a list
+
+        List<OperatorWithExpression>
+
+    and translating this list into the real data structure afterwards.
+
+    An operator is either `PLUS` or `MINUS`. */
+    private enum Operator { PLUS, MINUS }
+
+    /*! `OperatorWithExpression` stores a pair of an operator and the atom immediately following the operator. */
     private static class OperatorWithExpression {
         private final Operator operator;
         private final Expression expression;
@@ -141,45 +143,21 @@ public class Parser {
         }
     }
 
-    private enum Operator { PLUS, MINUS }
-
-    private boolean testOperator() {
-        int start = position;
-        boolean result;
-        try {
-            operator();
-            result = true;
-        } catch (SyntaxException se) {
-            result = false;
-        }
-        position = start;
-        return result;
-    }
-
-    private Operator operator() {
-        whitespace();
-        char next = (char) 0;
-        if (position < input.length()) {
-            next = input.charAt(position);
-            position += 1;
-        }
-        if (next == '+') {
-            return Operator.PLUS;
-        } else if (next == '-') {
-            return Operator.MINUS;
-        } else {
-            throw new SyntaxException("Operator", position);
-        }
-    }
-
+    /*! Using this data structure we now can define the expression parser: */
     Expression expression() {
+        /*! Parse the first atom */
         Expression firstAtom = atom();
         List<OperatorWithExpression> moreAtoms = new ArrayList<OperatorWithExpression>();
+        /*! Parse more operators and atoms while the helper function `testOperator()` indicates that the `operator()` parser
+        will succeed (without raising an exception). */
         while(testOperator()) {
             Operator operator = operator();
             Expression expression = atom();
             moreAtoms.add(new OperatorWithExpression(operator, expression));
         }
+        /*! Translate the sequence of operator and atoms into the inductive `Addition` and `Subtraction` data
+        structure. We start with the `firstAtom` and replace the `expression` for every element of the list
+        with an `Addition` or `Subtraction` combining the old `expression` and the current list element. */
         Expression expression = firstAtom;
         for (OperatorWithExpression atom: moreAtoms) {
             switch (atom.operator) {
@@ -194,40 +172,115 @@ public class Parser {
         return expression;
     }
 
+    /*! The `expression` parser above uses the `operator` parser defined below. */
+
+    private Operator operator() {
+        whitespace();
+        /*! Only check the character at the current position in the input if the current
+        position is a valid position in the input (and not after the end of the input).*/
+        char next = (char) 0;
+        if (position < input.length()) {
+            next = input.charAt(position);
+            position += 1;
+        }
+        if (next == '+') {
+            return Operator.PLUS;
+        } else if (next == '-') {
+            return Operator.MINUS;
+        } else {
+            throw new SyntaxException("Operator", position);
+        }
+    }
+
+    /*! In the `expression` parser above we used the method `testOperator` defined below which tests
+    if the `operator` parser would succeed without throwing a `SyntaxException`. The implementation calls
+    the `operator` parser in a `try` block and returns `false` if the exception was caught and `true`
+    otherwise. */
+    private boolean testOperator() {
+        int start = position;
+        boolean result;
+        try {
+            operator();
+            result = true;
+        } catch (SyntaxException se) {
+            result = false;
+        }
+        position = start;
+        return result;
+    }
+
+    /*! The rule for the non-terminal Atom
+
+        Atom = Id | Num | "(" Expr ")"
+
+    can be directly translated into the following `atom` parser. We start by parsing an identifier.
+    If this succeeds we are done. If this parser throws a `SyntaxException` we try the next option
+    of the _or_: We parse a numeric literal. If the corresponding parser raises a `SyntaxException`, too,
+    we continue with the expression in parentheses. If this still raises a `SyntaxException` we pass on this exception
+    to our caller (by not catching it). In this case we failed parsing an atom. */
     Expression atom() {
         int start = position;
         Expression result;
         try {
-            consume("(");
-            result = expression();
-            consume(")");
+            result = identifier();
         } catch (SyntaxException se) {
+            /*! Reset the position. The `identifier` parser has failed, but it might have changed the global
+            `position` before raising the `SyntaxException` so we need to reset the position before trying
+            another parser.*/
             position = start;
             try {
                 result = integer();
             } catch (SyntaxException se2) {
-                result = identifier();
+                position = start;
+                consume("(");
+                result = expression();
+                consume(")");
             }
         }
         return result;
     }
 
+    /*! An identifier is a sequence of lower case letters. This helper function checks if the given character
+    could be part of an identifier. */
     private boolean isLowerLetter(char ch) {
         return ch >= 'a' && ch <= 'z';
     }
 
+    /*! In order to parse an identifier we increment the current `position` while the current character could be part
+    of an identifier. */
+    Identifier identifier() {
+        whitespace();
+        int start = position;
+        while (position < input.length() && isLowerLetter(input.charAt(position))) {
+            position += 1;
+        }
+
+        /*! We need to make sure that the identifier is not empty. */
+        if (position > start) {
+            return new Identifier(input.substring(start, position));
+        } else {
+            throw new SyntaxException("Identifier", position);
+        }
+    }
+
+    /*! Parsing an integer follows more or less the same pattern as parsing an identifier (see above).*/
     Expression integer() {
         whitespace();
         int start = position;
+        /*! We check for a unary prefix minus first.*/
         boolean minus = position < input.length() && input.charAt(position) == '-';
         if (minus) {
             position += 1;
         }
+        /*! Now we check for at least one digit. */
         boolean digitsFound = false;
         while (position < input.length() && Character.isDigit(input.charAt(position))) {
             position += 1;
             digitsFound = true;
         }
+        /*! In the end we rely on `Integer.parseInt` for translating the string that we found into
+        a real integer and wrap the returned value in an `Int` to create an element of the `Expression`
+        data structure. */
         if (digitsFound) {
             return new Int(Integer.parseInt(input.substring(start, position)));
         } else {
@@ -235,44 +288,130 @@ public class Parser {
         }
     }
 
-    Identifier identifier() {
-        whitespace();
-        int start = position;
-        while (position < input.length() && isLowerLetter(input.charAt(position))) {
-            position += 1;
+    /*!
+    Program
+    -------
+
+    Parsing a program is pretty straightforward if we are able to parse tokens, identifiers and expressions
+    using the parser functions defined in the last sections. There is only one problem left, that needs to
+    be solved first. The rule
+
+        Prog = Id ":=" Expr |
+               Prog ";" Prog |
+              "if" "(" Expr ")" "then" "{" Prog "}" "else" "{" Prog "}" |
+              "while" "(" Expr ")" "{" Prog "}"
+
+    is again recursive in a way that leads to an endless recursion. We basically apply the same rewriting as
+    for the Expr rule and end up with
+
+        Prog = Stmt { ";" Stmt }
+        Stmt = Id ":=" Expr |
+              "if" "(" Expr ")" "then" "{" Prog "}" "else" "{" Prog "}" |
+              "while" "(" Expr ")" "{" Prog "}"
+
+    We start with parsing this new Prog non-terminal in the same way as we parse Expr in `expression`.
+    In this case there is only one possible operator between the statements, the sequential operator `;`.
+    This simplifies the situation as we do not need to store the operator. Apart from that simplification
+    we apply the same idea with its two steps: 1) parsing the sequence and 2) creating the `Program` data structure from the
+    list. */
+
+    Program program() {
+        /*! Parsing the first statement which must be there */
+        Program firstStatement = statement();
+        /*! Parsing optional following statements separated with `;` */
+        List<Program> moreStatements = new ArrayList<Program>();
+        while (test(";")) {
+            consume(";");
+            Program statement = statement();
+            moreStatements.add(statement);
         }
-        if (position > start) {
-            return new Identifier(input.substring(start, position));
-        } else {
-            throw new SyntaxException("Identifier", position);
+        /*! We use the first statement as initial result */
+        Program program = firstStatement;
+        /*! and then replace the result with a `Composition` combining the old result and the new statement.*/
+        for (Program statement: moreStatements) {
+            program = new Composition(program, statement);
         }
+        return program;
     }
 
-    private void whitespace() {
-        while(position < input.length() && Character.isWhitespace(input.charAt(position))) {
-            position += 1;
+    /*! Parsing a statement boils down to trying to parse
+    - an assignment and if that fails
+    - a conditional and if that fails
+    - a loop and if that fails
+    - fail completely.*/
+    Program statement() {
+        int start = position;
+        Program statement;
+        try {
+            statement = assignment();
+        } catch (SyntaxException se) {
+            position = start;
+            try {
+                statement = conditional();
+            } catch (SyntaxException se2) {
+                position = start;
+                statement = loop();
+            }
         }
+        return statement;
     }
 
-    private void consume(String token) {
-        whitespace();
-        if (position + token.length() <= input.length() && input.substring(position, position + token.length()).equals(token)) {
-            position += token.length();
-        } else {
-            throw new SyntaxException(token, position);
-        }
+    /*! Parsing a loop is very straightforward and just follows the rule
+    `"while" "(" Expr ")" "{" Prog "}"`.*/
+    Program loop() {
+        consume("while");
+        consume("(");
+        Expression condition = expression();
+        consume(")");
+        consume("{");
+        Program program = program();
+        consume("}");
+        return new Loop(condition, program);
     }
 
-    private boolean test(String token) {
-        int start = position;
-        boolean success;
-        try {
-            consume(token);
-            success = true;
-        } catch (SyntaxException se) {
-            success = false;
+    /*! Parsing a conditional simply follows the rule
+    `"if" "(" Expr ")" "then" "{" Prog "}" "else" "{" Prog "}"`.*/
+    Program conditional() {
+        consume("if");
+        consume("(");
+        Expression condition = expression();
+        consume(")");
+        consume("then");
+        consume("{");
+        Program thenCase = program();
+        consume("}");
+        consume("else");
+        consume("{");
+        Program elseCase = program();
+        consume("}");
+        return new Conditional(condition, thenCase, elseCase);
+    }
+
+    /*! Parsing an assignment simply follows the rule `Id ":=" Expr`.*/
+    Program assignment() {
+        Identifier identifier = identifier();
+        consume(":=");
+        Expression expression = expression();
+        return new Assignment(identifier, expression);
+    }
+
+    /*!
+    Checking The End
+    ----------------
+
+    Everything that remains to be done is checking that we reached the end of the input after we
+    are done. As every parser only consumes as much from the input as needed, the `program` parser
+    might end in the middle of the input string. In the following public interface method we call
+    the `program` parser and check that we have reached the end of the input afterwards.*/
+
+    public Program parse() {
+        position = 0;
+        Program program = program();
+        /*! Whitespace is the only thing allowed after the program.*/
+        whitespace();
+        if (position < input.length()) {
+            throw new SyntaxException("End of input", position);
         }
-        position = start;
-        return success;
+        return program;
     }
 }
diff --git a/src/main/java/program/Assignment.java b/src/main/java/program/Assignment.java
index 4bd2038..6c7e4ed 100644
--- a/src/main/java/program/Assignment.java
+++ b/src/main/java/program/Assignment.java
@@ -1,6 +1,9 @@
 /*!! Program*/
 
-/*! # Program */
+/*!
+Assignment
+==========
+*/
 
 /*!- Header */
 
diff --git a/src/main/java/program/Composition.java b/src/main/java/program/Composition.java
index fc06491..370a111 100644
--- a/src/main/java/program/Composition.java
+++ b/src/main/java/program/Composition.java
@@ -1,6 +1,9 @@
 /*!! Program*/
 
-/*! # Composition*/
+/*!
+Composition
+===========
+*/
 
 /*!- Header*/
 package program;
diff --git a/src/main/java/program/Conditional.java b/src/main/java/program/Conditional.java
index 8f7dfb2..100a354 100644
--- a/src/main/java/program/Conditional.java
+++ b/src/main/java/program/Conditional.java
@@ -1,6 +1,9 @@
 /*!! Program*/
 
-/*! # Conditional*/
+/*!
+Conditional
+===========
+*/
 
 /*!- Header*/
 package program;
diff --git a/src/main/java/program/Loop.java b/src/main/java/program/Loop.java
index f21f3ce..ead2a92 100644
--- a/src/main/java/program/Loop.java
+++ b/src/main/java/program/Loop.java
@@ -1,6 +1,9 @@
 /*!! Program*/
 
-/*! # Loop */
+/*!
+Loop
+====
+*/
 
 /*!- Header*/
 package program;
diff --git a/src/main/java/program/Program.java b/src/main/java/program/Program.java
index cd45b44..97883fa 100644
--- a/src/main/java/program/Program.java
+++ b/src/main/java/program/Program.java
@@ -1,9 +1,21 @@
 /*!! Program*/
 
-/*! # Program */
+/*!
+Program
+=======
+*/
 
 /*!- Header*/
 package program;
 
-/*! `Program` is the abstract common class for programs that can be exeuted using the `Interpreter`. */
+/*! `Program` is the abstract common class for programs that can be executed using the `Interpreter`. */
 abstract public class Program { }
+
+/*! Program can be written as the following
+[Algebraic Data Type (ADT)](https://en.wikipedia.org/wiki/Algebraic_data_type)
+
+    Program = Assignment(identifier: Identifier, expression: Expression)
+            | Composition(first: Program, second: Program)
+            | Loop(condition: Expression, program: Program)
+            | Conditional(condition: Expression, thenCase: Program, elseCase: Program)
+*/
\ No newline at end of file