浏览代码

Parser documentation

pull/1/head
Malte Schmitz 9 年前
父节点
当前提交
6c5657a8d7
共有 11 个文件被更改,包括 329 次插入139 次删除
  1. +4
    -1
      src/main/java/expression/Addition.java
  2. +13
    -1
      src/main/java/expression/Expression.java
  3. +4
    -1
      src/main/java/expression/Identifier.java
  4. +7
    -1
      src/main/java/expression/Int.java
  5. +4
    -1
      src/main/java/expression/Subtraction.java
  6. +267
    -128
      src/main/java/parser/Parser.java
  7. +4
    -1
      src/main/java/program/Assignment.java
  8. +4
    -1
      src/main/java/program/Composition.java
  9. +4
    -1
      src/main/java/program/Conditional.java
  10. +4
    -1
      src/main/java/program/Loop.java
  11. +14
    -2
      src/main/java/program/Program.java

+ 4
- 1
src/main/java/expression/Addition.java 查看文件

@@ -1,6 +1,9 @@
/*!! Expression */

/*! # Addition */
/*!
Addition
========
*/

/*!- Header */
package expression;


+ 13
- 1
src/main/java/expression/Expression.java 查看文件

@@ -1,9 +1,21 @@
/*!! Expression*/

/*! # Expression*/
/*!
Expression
==============
*/

/*!- Header */
package expression;

/*! `Expression` is the common abstract base class of all expressions that can be evaluated using the `Evaluator`. */
public abstract class Expression {
}

/*! Expression can be written as the following
[Algebraic Data Type (ADT)](https://en.wikipedia.org/wiki/Algebraic_data_type)

Expression = Addition(leftHandSide: Expression, rightHandSide: Expression)
| Subtraction(leftHandSide: Expression, rightHandSide: Expression)
| Identifier(name: String)
| Int(value: int)
*/

+ 4
- 1
src/main/java/expression/Identifier.java 查看文件

@@ -1,6 +1,9 @@
/*!! Expression*/

/*! # Identifier*/
/*!
Identifier
==========
*/

/*!- Header*/
package expression;


+ 7
- 1
src/main/java/expression/Int.java 查看文件

@@ -1,6 +1,12 @@
/*!! Expression*/

/*! # Int */
/*!
Int_(eger)_
===========

In order to avoid confusion with Java's `Integer` auto-boxing class for the primitive `int` this wrapper is called
`Int` instead of `Integer`.
*/

/*!- Header */
package expression;


+ 4
- 1
src/main/java/expression/Subtraction.java 查看文件

@@ -1,6 +1,9 @@
/*!! Expression */

/*! # Subtraction */
/*!
Subtraction
===========
*/

/*!- Header */
package expression;


+ 267
- 128
src/main/java/parser/Parser.java 查看文件

@@ -1,7 +1,8 @@
/*!! Parser */
/*!

# Parser
/*!
Parser
======

In order to parse simple while programs we use a
[Recursive descent parser](https://en.wikipedia.org/wiki/Recursive_descent_parser). The syntax of our while programs
@@ -54,83 +55,84 @@ public class Parser {
public Parser(String input) {
/*! Only remember the input here; the actual parsing is started by calling `parse()`. */
this.input = input;
}
public Program parse() {
position = 0;
Program program = program();
whitespace();
if (position < input.length()) {
throw new SyntaxException("End of input", position);

/*!
The Basics
----------
*/

/*! We start with defining a helper function that consumes whitespaces, by incrementing the `position` until
the current character is not a whitespace.

Such a function is necessary, because we do the tokenization on the fly during the parsing. In more complex
projects the [tokenization](https://en.wikipedia.org/wiki/Lexical_analysis#Tokenization) would be an extra
pre-processing step which handles the whitespace removal and creates a stream of tokens out of the input string.*/
private void whitespace() {
while(position < input.length() && Character.isWhitespace(input.charAt(position))) {
position += 1;
}
return program;
}

Program program() {
Program firstStatement = statement();
List<Program> moreStatements = new ArrayList<Program>();
while (test(";")) {
consume(";");
Program statement = statement();
moreStatements.add(statement);
}
Program program = firstStatement;
for (Program statement: moreStatements) {
program = new Composition(program, statement);
/*! Our parsing functions always try to parse something at the current position in the input and raise an
exception if that is not possible. In order to implement the rules containing _or_ we need either look-ahead or
backtracking to decide which branch to take. In those cases we catch the exceptions raised by the called
sub-parsers.

The `consume` method consumes the given string by incrementing the `position`. It raises a `SyntaxException`
if the given string is not the next token in the `input` at the current `position`.*/
private void consume(String token) {
whitespace();
if (position + token.length() <= input.length() && input.substring(position, position + token.length()).equals(token)) {
position += token.length();
} else {
throw new SyntaxException(token, position);
}
return program;
}

Program statement() {
/*! In some situations we want to perform a look-ahead: We want to test if the next token is the given one or not.
The `test` function calls the `consume` function defined above, restores the `position` afterwards and returns
`true` if `consume` succeeded and `false` if it raised a `SyntaxException`. */
private boolean test(String token) {
int start = position;
Program statement;
boolean success;
try {
statement = assignment();
consume(token);
success = true;
} catch (SyntaxException se) {
position = start;
try {
statement = conditional();
} catch (SyntaxException se2) {
position = start;
statement = loop();
}
success = false;
}
return statement;
position = start;
return success;
}

/*! Parses a loop following the rule `"while" "(" Expr ")" "{" Prog "}"`. */
Program loop() {
    /*! The loop header with its condition */
    consume("while");
    consume("(");
    final Expression guard = expression();
    consume(")");
    /*! The loop body enclosed in curly braces */
    consume("{");
    final Program body = program();
    consume("}");
    return new Loop(guard, body);
}
/*!
Expression
----------

Program conditional() {
consume("if");
consume("(");
Expression condition = expression();
consume(")");
consume("then");
consume("{");
Program thenCase = program();
consume("}");
consume("else");
consume("{");
Program elseCase = program();
consume("}");
return new Conditional(condition, thenCase, elseCase);
}
Using the basic mechanisms defined above we can implement parsing functions for expressions as defined in our
grammar:

Program assignment() {
Identifier identifier = identifier();
consume(":=");
Expression expression = expression();
return new Assignment(identifier, expression);
}
Expr = Expr "+" Atom |
Expr "-" Atom |
Atom

This rule is left recursive: If we want to parse an expression, we try to parse an addition first, which starts with
an expression, so we parse an expression by trying to parse an addition first, which starts with an expression,
so we parse an expression by ... In order to implement this rule we need to change it, so that the parsing process
terminates:

Expr = Atom { ("+" | "-") Atom }

In the rule above we replaced the recursion with the repetition indicated by `{` and `}`. Parsing this
repetitive rule involves two steps: Parsing the sequence of atoms and operators into a list

List<OperatorWithExpression>

and translating this list into the real data structure afterwards.

An operator is either `PLUS` or `MINUS`. */
private enum Operator {
    PLUS,
    MINUS
}

/*! `OperatorWithExpression` stores a pair of an operator and the atom immediately following the operator. */
private static class OperatorWithExpression {
private final Operator operator;
private final Expression expression;
@@ -141,45 +143,21 @@ public class Parser {
}
}

private enum Operator {
    PLUS,
    MINUS
}

/*! Look-ahead helper: reports whether the `operator` parser would succeed at the current position.
The `position` is restored afterwards, so nothing is consumed. */
private boolean testOperator() {
    final int rewindTo = position;
    boolean operatorAhead = true;
    try {
        operator();
    } catch (SyntaxException notAnOperator) {
        operatorAhead = false;
    }
    /*! Undo whatever `operator()` consumed, no matter whether it succeeded or failed. */
    position = rewindTo;
    return operatorAhead;
}

/*! Skips leading whitespace and consumes exactly one `+` or `-` character, returning the matching
`Operator`. Raises a `SyntaxException` if the next character is not an operator or the input has ended. */
private Operator operator() {
    whitespace();
    /*! Only look at the current character if the current position is a valid position in the input
    (and not after the end of the input). The position is advanced only after an operator has actually
    been recognized, so a failing call leaves `position` on the offending character and the
    `SyntaxException` reports that character's position instead of the one after it. */
    if (position < input.length()) {
        switch (input.charAt(position)) {
            case '+':
                position += 1;
                return Operator.PLUS;
            case '-':
                position += 1;
                return Operator.MINUS;
            default:
                break;
        }
    }
    throw new SyntaxException("Operator", position);
}

/*! Using this data structure we now can define the expression parser: */
Expression expression() {
/*! Parse the first atom */
Expression firstAtom = atom();
List<OperatorWithExpression> moreAtoms = new ArrayList<OperatorWithExpression>();
/*! Parse more operators and atoms while the helper function `testOperator()` indicates that the `operator()` parser
will succeed (without raising an exception). */
while(testOperator()) {
Operator operator = operator();
Expression expression = atom();
moreAtoms.add(new OperatorWithExpression(operator, expression));
}
/*! Translate the sequence of operator and atoms into the inductive `Addition` and `Subtraction` data
structure. We start with the `firstAtom` and replace the `expression` for every element of the list
with an `Addition` or `Subtraction` combining the old `expression` and the current list element. */
Expression expression = firstAtom;
for (OperatorWithExpression atom: moreAtoms) {
switch (atom.operator) {
@@ -194,40 +172,115 @@ public class Parser {
return expression;
}

/*! The `expression` parser above uses the `operator` parser defined below. It skips leading whitespace and
consumes exactly one `+` or `-` character, returning the matching `Operator`. It raises a `SyntaxException`
if the next character is not an operator or the input has ended. */
private Operator operator() {
    whitespace();
    /*! Only check the character at the current position in the input if the current
    position is a valid position in the input (and not after the end of the input). The position is
    advanced only after an operator has actually been recognized, so a failing call leaves `position`
    on the offending character and the `SyntaxException` reports that character's position instead of
    the one after it. */
    if (position < input.length()) {
        switch (input.charAt(position)) {
            case '+':
                position += 1;
                return Operator.PLUS;
            case '-':
                position += 1;
                return Operator.MINUS;
            default:
                break;
        }
    }
    throw new SyntaxException("Operator", position);
}

/*! In the `expression` parser above we used the method `testOperator` defined below, which tests
whether the `operator` parser would succeed without throwing a `SyntaxException`. The implementation calls
the `operator` parser in a `try` block, restores the `position` afterwards and returns `false` if the
exception was caught and `true` otherwise. */
private boolean testOperator() {
    final int rewindTo = position;
    boolean operatorAhead = true;
    try {
        operator();
    } catch (SyntaxException notAnOperator) {
        operatorAhead = false;
    }
    /*! Undo whatever `operator()` consumed, no matter whether it succeeded or failed. */
    position = rewindTo;
    return operatorAhead;
}

/*! The rule for the non-terminal Atom

Atom = Id | Num | "(" Expr ")"

can be directly translated into the following `atom` parser. We start by parsing an identifier.
If this succeeds we are done. If this parser throws a `SyntaxException` we try the next option
of the _or_: We parse a numeric literal. If the corresponding parser raises a `SyntaxException`, too,
we continue with the atom in parentheses. If this still raises a `SyntaxException` we pass on this exception
to our caller (by not catching it). In this case we failed parsing an atom. */
Expression atom() {
int start = position;
Expression result;
try {
consume("(");
result = expression();
consume(")");
result = identifier();
} catch (SyntaxException se) {
/*! Reset the position. The `identifier` parser has failed, but it might have changed the global
`position` before raising the `SyntaxException` so we need to reset the position before trying
another parser.*/
position = start;
try {
result = integer();
} catch (SyntaxException se2) {
result = identifier();
position = start;
consume("(");
result = expression();
consume(")");
}
}
return result;
}

/*! An identifier is a sequence of lower case letters. This helper function checks whether the given character
could be part of an identifier, i.e. whether it lies in the range `a` to `z`. */
private boolean isLowerLetter(char ch) {
    return 'a' <= ch && ch <= 'z';
}

/*! In order to parse an identifier we skip leading whitespace and then scan forward while the current
character could be part of an identifier. */
Identifier identifier() {
    whitespace();
    final int begin = position;
    int end = begin;
    while (end < input.length() && isLowerLetter(input.charAt(end))) {
        end += 1;
    }
    position = end;

    /*! We need to make sure that the identifier is not empty. */
    if (end == begin) {
        throw new SyntaxException("Identifier", position);
    }
    return new Identifier(input.substring(begin, end));
}

/*! Parsing an integer follows more or less the same pattern as parsing an identifier (see above).*/
Expression integer() {
whitespace();
int start = position;
/*! We check for a unary prefix minus first.*/
boolean minus = position < input.length() && input.charAt(position) == '-';
if (minus) {
position += 1;
}
/*! Now we check for at least one digit. */
boolean digitsFound = false;
while (position < input.length() && Character.isDigit(input.charAt(position))) {
position += 1;
digitsFound = true;
}
/*! In the end we rely on `Integer.parseInt` for translating the string that we found into
a real integer and wrap the returned value in an `Int` to create an element of the `Expression`
data structure. */
if (digitsFound) {
return new Int(Integer.parseInt(input.substring(start, position)));
} else {
@@ -235,44 +288,130 @@ public class Parser {
}
}

Identifier identifier() {
whitespace();
int start = position;
while (position < input.length() && isLowerLetter(input.charAt(position))) {
position += 1;
/*!
Program
-------

Parsing a program is pretty straightforward if we are able to parse tokens, identifiers and expressions
using the parser functions defined in the last sections. There is only one problem left that needs to
be solved first. The rule

Prog = Id ":=" Expr |
Prog ";" Prog |
"if" "(" Expr ")" "then" "{" Prog "}" "else" "{" Prog "}" |
"while" "(" Expr ")" "{" Prog "}"

is again recursive in a way that leads to an endless recursion. We basically apply the same rewriting as
for the Expr rule and end up with

Prog = Stmt { ";" Stmt }
Stmt = Id ":=" Expr |
"if" "(" Expr ")" "then" "{" Prog "}" "else" "{" Prog "}" |
"while" "(" Expr ")" "{" Prog "}"

We start with parsing this new Prog non-terminal in the same way as we parse Expr in `expression`.
In this case there is only one possible operator between the statements, the sequential operator `;`.
This simplifies the situation as we do not need to store the operator. Apart from that simplification
we apply the same idea with its two steps: 1) parsing the sequence and 2) creating the `Program` data structure from the
list. */

Program program() {
/*! Parsing the first statement which must be there */
Program firstStatement = statement();
/*! Parsing optional following statements separated with `;` */
List<Program> moreStatements = new ArrayList<Program>();
while (test(";")) {
consume(";");
Program statement = statement();
moreStatements.add(statement);
}
if (position > start) {
return new Identifier(input.substring(start, position));
} else {
throw new SyntaxException("Identifier", position);
/*! We use the first statement as initial result */
Program program = firstStatement;
/*! and then replace the result with a `Composition` combining the old result and the new statement.*/
for (Program statement: moreStatements) {
program = new Composition(program, statement);
}
return program;
}

private void whitespace() {
while(position < input.length() && Character.isWhitespace(input.charAt(position))) {
position += 1;
/*! Parsing a statement boils down to trying to parse

- an assignment and if that fails
- a conditional and if that fails
- a loop and if that fails
- fail completely.

Every alternative is tried from the same position: before trying the next one the `position` is reset to
where the statement began, because a failing sub-parser may already have consumed part of the input. */
Program statement() {
    final int rewindTo = position;
    try {
        return assignment();
    } catch (SyntaxException notAnAssignment) {
        position = rewindTo;
    }
    try {
        return conditional();
    } catch (SyntaxException notAConditional) {
        position = rewindTo;
    }
    /*! The loop is the last alternative, so its `SyntaxException` is passed on to the caller. */
    return loop();
}

private void consume(String token) {
whitespace();
if (position + token.length() <= input.length() && input.substring(position, position + token.length()).equals(token)) {
position += token.length();
} else {
throw new SyntaxException(token, position);
}
/*! Parsing a loop is very straightforward and just follows the rule
`"while" "(" Expr ")" "{" Prog "}"`.*/
Program loop() {
    /*! The loop header with its condition */
    consume("while");
    consume("(");
    final Expression guard = expression();
    consume(")");
    /*! The loop body enclosed in curly braces */
    consume("{");
    final Program body = program();
    consume("}");
    return new Loop(guard, body);
}

private boolean test(String token) {
int start = position;
boolean success;
try {
consume(token);
success = true;
} catch (SyntaxException se) {
success = false;
/*! Parsing a conditional simply follows the rule
`"if" "(" Expr ")" "then" "{" Prog "}" "else" "{" Prog "}"`.*/
Program conditional() {
    /*! The condition in parentheses */
    consume("if");
    consume("(");
    final Expression guard = expression();
    consume(")");
    /*! The then branch enclosed in curly braces */
    consume("then");
    consume("{");
    final Program thenBranch = program();
    consume("}");
    /*! The else branch enclosed in curly braces; it is not optional */
    consume("else");
    consume("{");
    final Program elseBranch = program();
    consume("}");
    return new Conditional(guard, thenBranch, elseBranch);
}

/*! Parsing an assignment simply follows the rule `Id ":=" Expr`: the target identifier, the
assignment token `:=` and the expression whose value is assigned. */
Program assignment() {
    final Identifier target = identifier();
    consume(":=");
    final Expression value = expression();
    return new Assignment(target, value);
}

/*!
Checking The End
----------------

Everything that remains to be done is checking that we reached the end of the input after we
are done. As every parser only consumes as much from the input as needed, the `program` parser
might end in the middle of the input string. In the following public interface method we call
the `program` parser and check that we have reached the end of the input afterwards.*/

public Program parse() {
position = 0;
Program program = program();
/*! Whitespace is the only thing allowed after the program.*/
whitespace();
if (position < input.length()) {
throw new SyntaxException("End of input", position);
}
position = start;
return success;
return program;
}
}

+ 4
- 1
src/main/java/program/Assignment.java 查看文件

@@ -1,6 +1,9 @@
/*!! Program*/

/*! # Program */
/*!
Assignment
==========
*/

/*!- Header */



+ 4
- 1
src/main/java/program/Composition.java 查看文件

@@ -1,6 +1,9 @@
/*!! Program*/

/*! # Composition*/
/*!
Composition
===========
*/

/*!- Header*/
package program;


+ 4
- 1
src/main/java/program/Conditional.java 查看文件

@@ -1,6 +1,9 @@
/*!! Program*/

/*! # Conditional*/
/*!
Conditional
===========
*/

/*!- Header*/
package program;


+ 4
- 1
src/main/java/program/Loop.java 查看文件

@@ -1,6 +1,9 @@
/*!! Program*/

/*! # Loop */
/*!
Loop
====
*/

/*!- Header*/
package program;


+ 14
- 2
src/main/java/program/Program.java 查看文件

@@ -1,9 +1,21 @@
/*!! Program*/

/*! # Program */
/*!
Program
=======
*/

/*!- Header*/
package program;

/*! `Program` is the abstract common class for programs that can be exeuted using the `Interpreter`. */
/*! `Program` is the abstract common base class of all programs that can be executed using the `Interpreter`. */
public abstract class Program {
}

/*! Program can be written as the following
[Algebraic Data Type (ADT)](https://en.wikipedia.org/wiki/Algebraic_data_type)

Program = Assignment(identifier: Identifier, expression: Expression)
| Composition(first: Program, second: Program)
| Loop(condition: Expression, program: Program)
| Conditional(condition: Expression, thenCase: Program, elseCase: Program)
*/

正在加载...
取消
保存