"""Lexical analyzer.

A program (written in Python) that reads the individual characters of a
source text written in the C language and produces a sequence of lexical
symbols (tokens).

The lexical symbols are identifiers, integers, decimal numbers, string
constants in single or double quotes, and separators.
Individual lexical symbols are read by the method ``nextToken``.
The kind of the symbol is stored in the variable ``token`` (identifier,
number, real_number, character_literal, string_literal, separator).
The concrete value is stored in the variable ``tokenText``.

Related sources:
http://gitlab.fjfi.cvut.cz/culikzde/view/-/blob/master/tutorial/mini-parser/mini_lexer.py
http://gitlab.fjfi.cvut.cz/culikzde/view/-/blob/master/tutorial/easy-grammar/easy_lexer.py

A lexer written in C:
https://gitlab.fjfi.cvut.cz/culikzde/c-parser/-/blob/master/lex.cc
"""

from __future__ import print_function

import os


class LexerException(Exception):
    """Raised by ``Lexer.error`` when the source text is malformed."""


class Lexer(object):
    """Character-by-character tokenizer for C-like source text.

    After ``openFile`` or ``openString`` the first token is already
    available in ``token`` / ``tokenText``; call ``nextToken`` to advance.
    ``token`` equals ``eos`` once the source is exhausted.
    """

    # token kinds stored in self.token
    eos = 0  # end of source
    identifier = 1
    number = 2  # integer
    real_number = 3  # float or double
    character_literal = 4  # string with single quotes
    string_literal = 5  # double quoted string
    separator = 6

    def __init__(self):
        super(Lexer, self).__init__()
        self.reset()

    def reset(self):
        """Drop any loaded source and return to the initial state."""
        self.fileName = ""
        self.source = ""
        self.sourceLen = 0

        # Position of the current character self.ch.
        # charColNum starts at 0 because nextChar increments it when a
        # character is read; this is the same value nextChar assigns after
        # a newline, so columns are numbered consistently on every line.
        self.charLineNum = 1
        self.charColNum = 0
        self.charByteOfs = 0  # index in source just past self.ch

        # Position of the first character of the current token
        # (tokenByteOfs follows the charByteOfs convention above).
        self.tokenLineNum = 1
        self.tokenColNum = 1
        self.tokenByteOfs = 0

        self.ch = '\0'  # NUL marks the end of the source
        self.token = self.eos
        self.tokenText = ""
        self.tokenValue = ""

    # -----------------------------------------------------------------------

    def openFile(self, fileName):
        """Load the file *fileName* and read its first token.

        Errors raised by ``open`` / ``read`` propagate to the caller.
        """
        self.reset()
        self.fileName = os.path.abspath(fileName)
        # 'with' guarantees the handle is closed even if read() fails
        with open(self.fileName, "r") as f:
            self.source = f.read()
        self._start()

    def openString(self, text, fileName=""):
        """Scan *text* directly; *fileName* is used only in messages."""
        self.reset()
        self.fileName = fileName
        self.source = text
        self._start()

    def _start(self):
        """Prime the lexer: read the first character and the first token."""
        self.sourceLen = len(self.source)
        self.nextChar()  # read first character
        self.nextToken()  # read first token

    def close(self):
        """Nothing to release; kept so callers can pair it with openFile."""
        pass

    # -----------------------------------------------------------------------

    def nextChar(self):
        """Advance to the next source character and store it in ``self.ch``.

        At the end of the source ``self.ch`` becomes NUL and the position
        counters are left unchanged.
        """
        if self.charByteOfs < self.sourceLen:
            self.ch = self.source[self.charByteOfs]
            self.charByteOfs = self.charByteOfs + 1
            if self.ch == '\n':
                self.charLineNum = self.charLineNum + 1
                self.charColNum = 0
            elif self.ch != '\r':
                # carriage return does not occupy a column
                self.charColNum = self.charColNum + 1
        else:
            self.ch = '\0'

    def _appendChar(self):
        """Append the current character to tokenText and advance."""
        self.tokenText = self.tokenText + self.ch
        self.nextChar()

    # -----------------------------------------------------------------------

    def backStep(self):
        """Move the recorded token position one character to the left."""
        self.tokenColNum = self.tokenColNum - 1
        self.tokenByteOfs = self.tokenByteOfs - 1

    def getPosition(self):
        """Return the token position formatted as ``file:line:column``."""
        return (self.fileName + ":" + str(self.tokenLineNum) +
                ":" + str(self.tokenColNum))

    def info(self, text):
        """Print an informational message prefixed with the position."""
        print(self.getPosition() + ": info: " + text)

    def warning(self, text):
        """Print a warning message prefixed with the position."""
        print(self.getPosition() + ": warning: " + text)

    def error(self, text):
        """Raise LexerException carrying the position and current text."""
        raise LexerException(self.getPosition() + ": error: " + text +
                             ", tokenText=" + self.tokenText)

    # -----------------------------------------------------------------------

    def isLetter(self, c):
        """Return True for ASCII letters and the underscore."""
        return 'A' <= c <= 'Z' or 'a' <= c <= 'z' or c == '_'

    def isDigit(self, c):
        """Return True for ASCII decimal digits."""
        return '0' <= c <= '9'

    def isLetterOrDigit(self, c):
        """Return True for characters allowed inside an identifier."""
        return self.isLetter(c) or self.isDigit(c)

    # -----------------------------------------------------------------------

    def comment_directive(self, text):
        """Hook receiving the text of an end-of-line comment.

        Called by ``comment_eol`` with everything after the two slashes.
        The default implementation ignores it; subclasses may override.
        """
        pass

    def comment_eol(self):
        """Skip the rest of the line; the two slashes are already consumed."""
        chars = []
        while self.ch not in ('\0', '\r', '\n'):
            chars.append(self.ch)
            self.nextChar()
        self.comment_directive("".join(chars))

    def comment_two_marks(self, mark1, mark2):
        """Skip a block comment terminated by *mark1* followed by *mark2*.

        The opening marks are already consumed.  Raises LexerException
        when the source ends before the terminator is found.
        """
        # start with a neutral 'previous' character so that the opening
        # mark1 cannot be reused as part of the terminator
        prev = ' '
        while self.ch != '\0' and not (prev == mark1 and self.ch == mark2):
            prev = self.ch
            self.nextChar()
        if self.ch == mark2:
            self.nextChar()  # skip mark2
        else:
            self.error("Unterminated comment")

    # -----------------------------------------------------------------------

    def checkDigit(self):
        """Raise LexerException unless the current character is a digit."""
        if not self.isDigit(self.ch):
            self.error("Digit expected")

    def digits(self):
        """Append a (possibly empty) run of digits to tokenText."""
        while self.isDigit(self.ch):
            self._appendChar()

    # -----------------------------------------------------------------------

    def numericToken(self):
        """Scan an integer or a real number starting at a digit.

        The token is ``number`` unless a fraction part or an exponent
        follows, in which case it becomes ``real_number``.  At least one
        digit is required after the decimal point and after the exponent
        sign; otherwise LexerException is raised.
        """
        self.token = self.number
        self.digits()

        if self.ch == '.':
            self.token = self.real_number
            self._appendChar()  # store and skip '.'
            self.checkDigit()
            self.digits()

        # the exponent may follow an integer part directly (e.g. 1e3)
        if self.ch == 'e' or self.ch == 'E':
            self.token = self.real_number
            self._appendChar()  # store and skip 'e'
            if self.ch == '+' or self.ch == '-':
                self._appendChar()  # store and skip the sign
            self.checkDigit()
            self.digits()

    # -----------------------------------------------------------------------

    def stringChar(self):
        """Consume and return one character of a quoted literal.

        A backslash is dropped and the character after it is returned
        unchanged (escape sequences are not decoded), so an escaped quote
        does not terminate the literal.
        """
        if self.ch == '\\':
            self.nextChar()  # skip backslash
        c = self.ch
        self.nextChar()
        return c

    def _quotedToken(self, quote, kind):
        """Scan a literal delimited by *quote* and tag it as *kind*."""
        self.token = kind
        self.nextChar()  # skip opening quote
        # a literal may not run past the end of the line
        while self.ch not in ('\0', '\r', '\n', quote):
            self.tokenText = self.tokenText + self.stringChar()
        if self.ch != quote:
            self.error("Unterminated string")
        self.nextChar()  # skip closing quote

    # -----------------------------------------------------------------------

    def _markTokenStart(self):
        """Record the position of the current character as token start."""
        self.tokenLineNum = self.charLineNum
        self.tokenColNum = self.charColNum
        self.tokenByteOfs = self.charByteOfs

    def _skipWhiteSpaceAndComments(self):
        """Skip blanks and comments in front of the next token.

        Returns True when a single '/' (a division operator rather than
        the start of a comment) was consumed; the token position then
        refers to that slash.  Otherwise returns False with the token
        position set to the current character.
        """
        while True:
            while self.ch != '\0' and self.ch <= ' ':
                self.nextChar()
            # mark before touching '/', so the position stays correct
            # even when the slash is followed by a newline or end of source
            self._markTokenStart()
            if self.ch != '/':
                return False
            self.nextChar()  # skip '/'
            if self.ch == '/':
                self.nextChar()  # skip '/'
                self.comment_eol()
            elif self.ch == '*':
                self.nextChar()  # skip '*'
                self.comment_two_marks('*', '/')
            else:
                return True  # produce '/' token

    def nextToken(self):
        """Read the next token into ``token`` and ``tokenText``.

        Also updates tokenLineNum / tokenColNum / tokenByteOfs.  Raises
        LexerException for unterminated comments or literals and for
        malformed numbers.
        """
        self.token = self.eos
        self.tokenText = ""

        if self._skipWhiteSpaceAndComments():
            self.token = self.separator
            self.tokenText = '/'
            self.processSeparator()
        elif self.ch == '\0':
            self.token = self.eos
        elif self.isLetter(self.ch):
            self.token = self.identifier
            while self.isLetterOrDigit(self.ch):
                self._appendChar()
        elif self.isDigit(self.ch):
            self.numericToken()
        elif self.ch == '\'':
            self._quotedToken('\'', self.character_literal)
        elif self.ch == '\"':
            self._quotedToken('\"', self.string_literal)
        else:
            self.token = self.separator
            self.tokenText = self.ch
            self.nextChar()
            self.processSeparator()

        if self.token == self.identifier:
            # convert identifier to keyword
            self.lookupKeyword()

    def lookupKeyword(self):
        """Hook called after every identifier; the default does nothing."""
        pass

    def processSeparator(self):
        """Extend a one-character separator to a two-character one.

        Only '<' and '>' followed by '=' are merged here.
        """
        if self.tokenText == "<" or self.tokenText == ">":
            if self.ch == "=":
                self._appendChar()


# Literature: Chapter 4.1 of
# http://inf.ethz.ch/personal/wirth/CompilerConstruction/CompilerConstruction1.pdf