# simple_lexer.py

# --------------------------------------------------------------------------


class Lexer(object):
    """Hand-written lexer for a small C-like token syntax.

    The lexer always holds one character of look-ahead in ``self.ch`` (the
    integer ``0`` marks end of source) and one current token described by
    ``self.token`` (one of the kind constants below) and ``self.tokenText``.

    Recognised input:
      * white space (every character <= ' ') and ``//`` / ``/* */`` comments
        are skipped,
      * identifiers: runs of ASCII letters and ``_`` (digits are not part of
        an identifier),
      * numbers: runs of decimal digits,
      * character literals in single quotes and string literals in double
        quotes, with C-style backslash escapes,
      * any other single character is a ``separator`` token.

    Errors are reported by raising ``Exception`` through :meth:`error`.
    """

    # Token kinds stored in self.token.
    eos = 0
    identifier = 1
    number = 2
    real = 3
    character = 4
    string = 5
    separator = 6

    # Single-letter escapes handled by stringChar(); any other escaped
    # character (quotes, '?', backslash, ...) stands for itself.
    _SIMPLE_ESCAPES = {
        'a': '\a',
        'b': '\b',
        'f': '\f',
        'n': '\n',
        'r': '\r',
        't': '\t',
        'v': '\v',
    }

    def __init__(self):
        """Create a lexer over an empty source (current token is ``eos``)."""
        self.source = ""
        self.reset()

    def reset(self):
        """Restart scanning ``self.source`` and read the first token."""
        # Position of the next character to be read.
        self.byteOfs = 0   # byte offset (from 0)
        self.fileInx = 0   # file index
        self.lineNum = 1   # line number (from 1)
        self.colNum = 1    # column number (from 1)

        # Position of the look-ahead character (in self.ch).
        self.charByteOfs = 0
        self.charFileInx = 0
        self.charLineNum = 1
        self.charColNum = 1

        # Position of the current token (in self.token).
        self.tokenByteOfs = 0
        self.tokenFileInx = 0
        self.tokenLineNum = 1
        self.tokenColNum = 1

        self.ch = 0
        self.token = self.eos
        self.tokenText = ""

        self.sourceLen = len(self.source)
        self.nextChar()
        self.nextToken()

    # -----------------------------------------------------------------------

    def openFile(self, fileName):
        """Read the whole file *fileName* and start lexing its contents."""
        # The context manager closes the handle even if read() fails.
        with open(fileName, "r") as f:
            self.source = f.read()
        self.reset()

    def openString(self, sourceText):
        """Start lexing the text *sourceText*."""
        self.source = sourceText
        self.reset()

    # -----------------------------------------------------------------------

    def error(self, text):
        """Raise ``Exception`` with *text* and the current read position."""
        raise Exception(text + ", line " + str(self.lineNum) +
                        ", column " + str(self.colNum))

    # -----------------------------------------------------------------------

    def nextChar(self):
        """Advance the look-ahead: load the next character into ``self.ch``.

        ``self.ch`` becomes the integer ``0`` once the source is exhausted.
        The ``char*`` fields record where the loaded character came from.
        """
        self.charByteOfs = self.byteOfs
        self.charFileInx = self.fileInx
        self.charLineNum = self.lineNum
        self.charColNum = self.colNum

        if self.byteOfs < self.sourceLen:
            self.ch = self.source[self.byteOfs]
            self.byteOfs += 1
            if self.ch == '\n':
                self.lineNum += 1
                self.colNum = 1  # columns restart on every new line
            elif self.ch != '\r':
                # '\r' occupies no column so "\r\n" counts as one line end
                self.colNum += 1
        else:
            self.ch = 0

    def mark(self):
        """Return an opaque position of the current token for rewind()."""
        return (self.tokenByteOfs, self.tokenFileInx,
                self.tokenLineNum, self.tokenColNum)

    def rewind(self, params):
        """Go back to a position from mark() and re-read the token there."""
        self.byteOfs = params[0]
        self.fileInx = params[1]
        self.lineNum = params[2]
        self.colNum = params[3]
        self.nextChar()
        self.nextToken()

    # -----------------------------------------------------------------------

    # The predicates below accept the end-of-source sentinel 0 and answer
    # False for it; comparing the integer with a string would raise
    # TypeError on Python 3.

    def isLetter(self, c):
        """Return True if *c* is an ASCII letter or underscore."""
        return c != 0 and ('A' <= c <= 'Z' or 'a' <= c <= 'z' or c == '_')

    def isDigit(self, c):
        """Return True if *c* is a decimal digit."""
        return c != 0 and '0' <= c <= '9'

    def isHexDigit(self, c):
        """Return True if *c* is a hexadecimal digit."""
        return c != 0 and ('A' <= c <= 'F' or 'a' <= c <= 'f' or
                           '0' <= c <= '9')

    # -----------------------------------------------------------------------

    def comment(self, mark1):
        """Skip input up to and including the terminator character *mark1*.

        Raises an error if the source ends first.
        """
        while self.ch != 0 and self.ch != mark1:
            self.nextChar()
        if self.ch == mark1:
            self.nextChar()  # skip mark1
        else:
            self.error("Unterminated comment")

    def comment2(self, mark1, mark2):
        """Skip input up to and including the two-character terminator.

        The terminator is *mark1* immediately followed by *mark2*.
        Raises an error if the source ends first.
        """
        prev = ' '
        while self.ch != 0 and not (prev == mark1 and self.ch == mark2):
            prev = self.ch
            self.nextChar()
        if self.ch == mark2:
            self.nextChar()  # skip mark2
        else:
            self.error("Unterminated comment")

    def checkDigit(self):
        """Raise an error unless the look-ahead character is a digit."""
        if not self.isDigit(self.ch):
            self.error("Digit expected")

    def digits(self):
        """Append the run of digits at the look-ahead to ``tokenText``."""
        while self.isDigit(self.ch):
            self.tokenText = self.tokenText + self.ch
            self.nextChar()

    def stringChar(self):
        """Consume one literal character, decoding a backslash escape.

        Supported escapes: up to three octal digits, ``\\x``/``\\X`` followed
        by hexadecimal digits (value reduced to one byte), the letters
        ``a b f n r t v``; any other escaped character stands for itself.

        Returns the decoded character. Raises an error when a backslash is
        the last character of the source.
        """
        if self.ch != '\\':
            c = self.ch
            self.nextChar()
            return c

        self.nextChar()  # skip backslash
        if self.ch == 0:
            self.error("Unterminated string")

        if '0' <= self.ch <= '7':
            count = 0
            value = 0
            while self.ch != 0 and '0' <= self.ch <= '7' and count < 3:
                value = 8 * value + ord(self.ch) - ord('0')
                count += 1
                self.nextChar()
            return chr(value)

        if self.ch == 'x' or self.ch == 'X':
            self.nextChar()  # skip 'x'
            value = 0
            while self.isHexDigit(self.ch):
                value = 16 * value + int(self.ch, 16)
                self.nextChar()
            return chr(value & 0xFF)  # keep the low byte only

        c = self._SIMPLE_ESCAPES.get(self.ch, self.ch)
        self.nextChar()
        return c

    def backStep(self):
        """Move the recorded token start one character to the left."""
        self.tokenByteOfs = self.tokenByteOfs - 1
        self.tokenColNum = self.tokenColNum - 1

    # -----------------------------------------------------------------------

    def _skipWhiteSpaceAndComments(self):
        """Skip blanks and comments in front of the next token.

        Returns True when a '/' that does not start a comment was consumed;
        the caller then has to produce the '/' separator token itself.
        """
        while self.ch != 0:
            while self.ch != 0 and self.ch <= ' ':
                self.nextChar()
            if self.ch != '/':
                return False
            self.nextChar()  # skip '/'
            if self.ch == '/':
                self.nextChar()  # skip second '/'
                while self.ch != 0 and self.ch != '\n' and self.ch != '\r':
                    self.nextChar()
            elif self.ch == '*':
                self.nextChar()  # skip '*'
                self.comment2('*', '/')
            else:
                return True
            # a comment was skipped: look for more white space / comments
        return False

    def _quotedLiteral(self, quote):
        """Read a literal delimited by *quote* into ``tokenText``.

        The look-ahead must be the opening quote. The literal may not span
        lines; a missing closing quote raises an error.
        """
        self.nextChar()  # skip opening quote
        while (self.ch != 0 and self.ch != '\n' and self.ch != '\r'
               and self.ch != quote):
            self.tokenText = self.tokenText + self.stringChar()
        if self.ch != quote:
            self.error("Unterminated string")
        self.nextChar()  # skip closing quote

    def nextToken(self):
        """Read the next token into ``token`` / ``tokenText``.

        The ``token*`` position fields are set to the start of the token.
        At end of source the token kind is ``eos`` with empty text.
        """
        self.token = self.eos
        self.tokenText = ""

        slash = self._skipWhiteSpaceAndComments()

        self.tokenByteOfs = self.charByteOfs
        self.tokenFileInx = self.charFileInx
        self.tokenLineNum = self.charLineNum
        self.tokenColNum = self.charColNum

        if slash:
            self.token = self.separator
            self.tokenText = '/'
            # the look-ahead is already one character past the '/'
            self.backStep()
        elif self.ch == 0:
            self.token = self.eos
        elif self.isLetter(self.ch):
            self.token = self.identifier
            while self.isLetter(self.ch):
                self.tokenText = self.tokenText + self.ch
                self.nextChar()
        elif self.isDigit(self.ch):
            self.token = self.number
            self.digits()
        elif self.ch == '\'':
            self.token = self.character
            self._quotedLiteral('\'')
        elif self.ch == '\"':
            self.token = self.string
            self._quotedLiteral('\"')
        else:
            self.token = self.separator
            self.tokenText = self.ch
            self.nextChar()

    # -----------------------------------------------------------------------

    def isEndOfSource(self):
        """Return True if the current token is end of source."""
        return self.token == self.eos

    def isIdentifier(self):
        """Return True if the current token is an identifier."""
        return self.token == self.identifier

    def isNumber(self):
        """Return True if the current token is a number."""
        return self.token == self.number

    def isSeparator(self, value):
        """Return True if the current token is the separator *value*."""
        return self.token == self.separator and self.tokenText == value

    def checkSeparator(self, value):
        """Consume the separator *value* or raise "<value> expected"."""
        if not self.isSeparator(value):
            self.error(value + " expected")
        self.nextToken()

    def isKeyword(self, value):
        """Return True if the current token is the identifier *value*."""
        return self.token == self.identifier and self.tokenText == value

    def checkKeyword(self, value):
        """Consume the keyword *value* or raise "<value> expected"."""
        if not self.isKeyword(value):
            self.error(value + " expected")
        self.nextToken()


# --------------------------------------------------------------------------

if __name__ == "__main__":
    lexer = Lexer()
    lexer.openFile("input.txt")
    while not lexer.isEndOfSource():
        print(lexer.tokenText)
        lexer.nextToken()

# kate: indent-width 4; show-tabs true; replace-tabs true; remove-trailing-spaces all