The grammar simply parses words or phrases in a file with white space as a delimiter and requiring that all words or phrases start with a letter and are followed by any combination of letters and digits.
BNF:
G[<Word>]
<Word> ::= <Letter> | <Letter> <LetterDigit>
<LetterDigit> ::= <Letter> | <Digit> | <LetterDigit> <Letter> | <LetterDigit> <Digit>
<Letter> ::= a | b | c | d | e | f | g | h | i | j | k | l | m | n | o | p | q | r | s | t | u | v | w | x | y | z
<Digit> ::= 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9
A graphical representation of the grammar:
Finally, the code ... in python of course :)
UPDATE: Now featuring code indentation goodness.
###############################
###############################
#!/usr/bin/python
import sys
class SimpleLex:
"""@Author: Adam Miller - Simple lexical anyzer"""
#allowed alphabetic characters in grammar
alpha = 'abcdefghijklmnopqrstuvwxyz'
#allowed digits in grammar
digit = '0123456789'
#white space symbolic representation
wSpace = ' \n \t'
#grammar table of relations
table = [
[1,3,0],
[1,1,2],
[2,2,2],
[3,3,3],
]
#relations of type and their state index
relations = {
"alpha":0,
"digit":1,
"wSpace":2
}
def __init__(self, file_name):
"""open a file, extract contents, close file,
initialize character list, and state variable"""
self.f = open(file_name, 'r')
self.lines = self.f.readlines()
self.f.close()
self.chars = []
self.state = 0
def scan(self):
"""process the file's contents one character at a time"""
for line in self.lines:
for c in line:
# 3 is the error state
if self.state == 3:
print "Error"
self.state = 0
self.chars = []
# 2 is the success state
elif self.state == 2:
print ''.join(self.chars)
self.state = 0
self.chars = []
#let table drive state transitions
self.state = self.table[self.state][self.lex(c)]
"""print final character buffer because the loop will end
execution and not let the last success state be checked"""
print ''.join(self.chars)
def lex(self, c):
"""return the correct relational index to the type of c"""
self.chars.append(c)
if c in self.alpha:
return self.relations["alpha"]
elif c in self.digit:
return self.relations["digit"]
elif c in self.wSpace:
return self.relations["wSpace"]
else:
return 0
#use the code i just wrote
if len(sys.argv) < 2 or len(sys.argv) > 2:
sys.stderr.write("Usage: sampleLex\n")
elif len(sys.argv) == 2:
lexer = SimpleLex(sys.argv[1])
lexer.scan()
For your viewing pleasure, a small example of the "little lexer" in action:
###############################
adam@pseudogen:~$ ./simpleLex.py lexFile.txt
hello
world
from
my
ub3r
l33t
lexical
analyzer
###############################
The code, the diagrams and the file used in this example are all available here.