I have always been fascinated by compilers, interpreters, parsers and lexers, and I was wondering how I could implement such things in Python. So I decided to write a simple lexer for algebraic expressions that converts a sequence of characters into a sequence of tokens. Until now I had only implemented such things in C or C++, so I started in the most straightforward way, the one I learned from reading the Dragon Book.
First of all I created a small enum helper that lets me define enums the way I am used to from languages like C++ and Java. Then I implemented a Token class that holds all information about a token and lets me print a token easily via Python's magic __str__ method. Finally I added a Lexer class that processes the input character by character and returns the tokens.
This resulted in the following code:
#!/usr/bin/python
import sys
import os
import re
def enum(*sequential, **named):
    # Build a simple enum class with a reverse lookup stored under 'key'.
    enums = dict(zip(sequential, range(len(sequential))), **named)
    reverse = dict((value, key) for key, value in enums.iteritems())
    enums['key'] = reverse
    return type('Enum', (), enums)

Type = enum(
    'PLUS', 'MINUS', 'TIMES', 'DIV', 'MOD', 'EXP', 'FAC',
    'FUNC', 'VAR', 'NUMBER', 'LBRACE', 'RBRACE'
)

class Token(object):
    def __init__(self, type, alias, value):
        self.type = type
        self.alias = alias
        self.value = value

    def __str__(self):
        return '{2} <{1}>'.format(self.type, self.alias, self.value)

class Lexer(object):
    def __init__(self, stream):
        # Initialize class attributes ...
        self.stream = stream
        self.current = None
        self.offset = -1
        self.matchRes = None
        # ... and get the first character.
        self.getChar()

    def nextToken(self):
        if self.current is None:
            return None
        self.skipWhitespaces()
        if self.current is None:
            return None
        if self.current == '+' and self.lookbefore() not in ('+', '-', '^', '/', '%'):
            return Token(Type.PLUS, Type.key[Type.PLUS], self.getChar())
        elif self.current == '-' and self.lookbefore() not in ('+', '-', '^', '/', '%'):
            return Token(Type.MINUS, Type.key[Type.MINUS], self.getChar())
        elif self.current == '*':
            return Token(Type.TIMES, Type.key[Type.TIMES], self.getChar())
        elif self.current == '/':
            return Token(Type.DIV, Type.key[Type.DIV], self.getChar())
        elif self.current == '%':
            return Token(Type.MOD, Type.key[Type.MOD], self.getChar())
        elif self.current == '^':
            return Token(Type.EXP, Type.key[Type.EXP], self.getChar())
        elif self.current == '!':
            return Token(Type.FAC, Type.key[Type.FAC], self.getChar())
        elif self.current == '(':
            return Token(Type.LBRACE, Type.key[Type.LBRACE], self.getChar())
        elif self.current == ')':
            return Token(Type.RBRACE, Type.key[Type.RBRACE], self.getChar())
        elif self.match(r'[a-zA-Z_][a-zA-Z0-9_]*(?=([ \t]+)?\()'):
            return Token(Type.FUNC, Type.key[Type.FUNC], self.getMatch())
        elif self.match(r'[+-]?[a-zA-Z_][a-zA-Z0-9_]*(?!([ \t]+)?\()'):
            return Token(Type.VAR, Type.key[Type.VAR], self.getMatch())
        elif self.match(r'[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?'):
            return Token(Type.NUMBER, Type.key[Type.NUMBER], self.getMatch())
        else:
            print 'No match found'
            return None

    def skipWhitespaces(self):
        while self.current is not None and self.current.isspace():
            self.getChar()

    def getChar(self):
        # Return the current character and advance to the next one
        # (or to None once the stream is exhausted).
        result = self.current
        self.offset += 1
        if self.offset < len(self.stream):
            self.current = self.stream[self.offset]
        else:
            self.current = None
        return result

    def getMatch(self):
        if self.matchRes is None:
            return None
        # Consume the whole match at once.
        result = self.matchRes.group(0)
        self.offset += len(result)
        if self.offset >= len(self.stream):
            self.current = None
        else:
            self.current = self.stream[self.offset]
        return result

    def match(self, pattern):
        # Match the given pattern against the remaining stream.
        self.matchRes = re.match(pattern, self.stream[self.offset:])
        return self.matchRes is not None

    def lookahead(self):
        return self.stream[self.offset + 1:self.offset + 2]

    def lookbefore(self):
        return self.stream[self.offset - 1:self.offset]

def main(argv):
    form = argv[0] if argv else None
    # Tokenize the given expression.
    if form is not None:
        lex = Lexer(form)
        token = lex.nextToken()
        while token is not None:
            print token
            token = lex.nextToken()
    else:
        print 'Invalid parameters.'
        sys.exit(1)

if __name__ == "__main__":
    main(sys.argv[1:])
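The enum helper at the top is worth a second look: it builds a throwaway class whose attributes are consecutive integers and stores the reverse mapping under key, which is exactly what the lexer uses to turn a numeric token type back into a readable name. A quick interactive sketch (the Color enum is made up for illustration; the values simply follow the declaration order):

>>> Color = enum('RED', 'GREEN', 'BLUE')
>>> Color.GREEN
1
>>> Color.key[Color.GREEN]
'GREEN'
>>> Type.key[Type.PLUS]
'PLUS'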
But this code is quite long, and it took some time until it worked. Fortunately, I stumbled over the documentation of Python's re module, where I found a simple tokenizer based on regular expressions. So I wrote a new script that uses this technique to tokenize algebraic expressions, which resulted in a much smaller script:
#!/usr/bin/python
import collections
import sys
import re

class TokenizerException(Exception):
    # Raised when the tokenizer hits a character it cannot match.
    pass

def tokenize(stream):
    Token = collections.namedtuple('Token', ['typ', 'value', 'line', 'column'])
    tokenSpec = [
        # Arithmetic operators
        ('OPERATOR', r'(?<![\+\-\^\*/%])[\+\-]|[\^\*/%!]'),
        # Function identifiers
        ('FUNCTIONID', r'[a-zA-Z_][a-zA-Z0-9_]*(?=([ \t]+)?\()'),
        # Variable identifiers
        ('VARIABLEID', r'[+-]?[a-zA-Z_][a-zA-Z0-9_]*(?!([ \t]+)?\()'),
        # Any numeric value (decimal or floating point)
        ('NUMBER', r'[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?'),
        # Left brace
        ('LBRACE', r'[(]'),
        # Right brace
        ('RBRACE', r'[)]'),
        # Assignment operator
        ('ASSIGN', r':='),
        # Line endings
        ('NEWLINE', r'\n'),
        # Skip over spaces and tabs
        ('SKIP', r'[ \t]'),
    ]
    tokenRegex = '|'.join('(?P<%s>%s)' % pair for pair in tokenSpec)
    keywords = {
        '+': 'PLUS', '-': 'MINUS', '*': 'TIMES', '/': 'DIV',
        '^': 'EXP', '%': 'MOD', '!': 'FAC'
    }
    nextToken = re.compile(tokenRegex).match
    # Setup the line start and current position ...
    pos = lineStart = 0
    # ... as well as the current line.
    line = 1
    # Fetch the first token ...
    token = nextToken(stream)
    # ... and start the processing.
    while token is not None:
        # Fetch the token type ...
        typ = token.lastgroup
        # ... and increment the line counter if it is a newline.
        if typ == 'NEWLINE':
            lineStart = pos
            line += 1
        elif typ != 'SKIP':
            # Fetch the token value ...
            value = token.group(typ)
            # ... and map operator characters to their symbolic names.
            if typ == 'OPERATOR' and value in keywords:
                typ = keywords[value]
            yield Token(typ, value, line, token.start() - lineStart)
        pos = token.end()
        token = nextToken(stream, pos)
    if pos != len(stream):
        raise TokenizerException('Unexpected character %r on line %d' % (stream[pos], line))

def main(argv):
    form = argv[0] if argv else None
    # Tokenize the given expression.
    if form is not None:
        for token in tokenize(form):
            print token
    else:
        print 'Invalid parameters.'
        sys.exit(1)

if __name__ == "__main__":
    main(sys.argv[1:])
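As a quick sanity check, the generator can also be driven directly. The expression and the printed tuples below are only meant to illustrate the shape of the output:

for token in tokenize('x := 3.5 + foo(2)'):
    print token

# This should print something along the lines of:
#   Token(typ='VARIABLEID', value='x', line=1, column=0)
#   Token(typ='ASSIGN', value=':=', line=1, column=2)
#   Token(typ='NUMBER', value='3.5', line=1, column=5)
#   Token(typ='PLUS', value='+', line=1, column=9)
#   Token(typ='FUNCTIONID', value='foo', line=1, column=11)
#   Token(typ='LBRACE', value='(', line=1, column=14)
#   Token(typ='NUMBER', value='2', line=1, column=15)
#   Token(typ='RBRACE', value=')', line=1, column=16)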
But as life goes, once you have a running solution, a friend tells you that there is a far simpler and more efficient way to implement it. You guessed it: René told me about the pyparsing library. I couldn't resist, so I implemented the same process with yet another approach, just for fun. This script turned out to be a little shorter than the regular-expression-based solution and brings another benefit: not only can you implement the lexical analysis in just a few lines, you could also do the actual parsing with this library by adding only a few more.
#!/usr/bin/env python
from pyparsing import Word, Regex, Literal, OneOrMore, ParseException
import sys

def main(argv):
    form = argv[0] if argv else None
    # Tokenize the given expression.
    if form is not None:
        try:
            operator = Regex(r'(?<![\+\-\^\*/%])[\+\-]|[\^\*/%!]')
            function = Regex(r'[a-zA-Z_][a-zA-Z0-9_]*(?=([ \t]+)?\()')
            variable = Regex(r'[+-]?[a-zA-Z_][a-zA-Z0-9_]*(?!([ \t]+)?\()')
            number = Regex(r'[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?')
            lbrace = Word('(')
            rbrace = Word(')')
            assign = Literal(':=')
            linebreak = Word('\n')
            skip = Word(' \t')
            lexOnly = operator | function | variable | number | lbrace \
                | rbrace | assign | linebreak | skip
            lexAllOnly = OneOrMore(lexOnly)
            print lexAllOnly.parseString(form)
        except ParseException as err:
            print err.line
            print " " * (err.column - 1) + "^"
            print err
    else:
        print 'Invalid parameters.'
        sys.exit(1)

if __name__ == "__main__":
    main(sys.argv[1:])
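To back up the claim that pyparsing can also handle the parsing itself, here is a minimal, purely illustrative sketch using pyparsing's operatorPrecedence helper (known as infixNotation in newer releases). The operand regexes are deliberately simplified compared to the ones above, and the precedence table only covers the binary operators:

#!/usr/bin/env python
from pyparsing import Regex, Literal, operatorPrecedence, opAssoc

# Simplified operands: plain numbers and identifiers.
number = Regex(r'(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?')
identifier = Regex(r'[a-zA-Z_][a-zA-Z0-9_]*')
operand = number | identifier

# Build an expression grammar from a precedence table; parenthesized
# sub-expressions are handled automatically by operatorPrecedence.
expression = operatorPrecedence(operand, [
    (Literal('^'), 2, opAssoc.RIGHT),    # exponentiation binds tightest
    (Regex(r'[*/%]'), 2, opAssoc.LEFT),  # multiplicative operators
    (Regex(r'[+-]'), 2, opAssoc.LEFT),   # additive operators
])

# Prints a nested structure that already reflects operator precedence,
# roughly [['3', '+', ['4', '*', ['x', '^', '2']]]].
print expression.parseString('3 + 4 * x ^ 2')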
For me it was fascinating to see how many possibilities Python offers for lexical analysis and how powerful regular expressions are. What started as a day of fun turned into a day of many insights. Maybe I will delve deeper into this topic when I have a bit more time.
I hope you enjoyed this short overview of the different ways Python can tokenize a sequence of characters. I am looking forward to seeing your implementations and reading your recommendations. Until next time, happy coding.