====== Lexikální analyzátor ======

Program (v jazyce Python) čtoucí jednotlivé znaky zdrojového textu napsaného v jazyce C \\
a vytvářející posloupnost lexikálních symbolů.

Lexikální symboly jsou identifikátory, celá čísla, desetinná čísla, řetězcové konstanty v jednoduchých nebo dvojitých uvozovkách a oddělovače.

Jednotlivé lexikální symboly jsou načítány funkcí **nextToken**. \\
Druh symbolu je uložen do proměnné **token** (//identifier, number, real_number, character_literal, string_literal, separator//). \\
Konkrétní hodnota je uložena do proměnné **tokenText**.

http://gitlab.fjfi.cvut.cz/culikzde/simple-view/-/blob/master/tutorial/mini-parser/mini_lexer.py \\
http://gitlab.fjfi.cvut.cz/culikzde/simple-view/-/blob/master/tutorial/easy-grammar/easy_lexer.py

Lexer v jazyce C: https://gitlab.fjfi.cvut.cz/culikzde/c-parser/-/blob/master/lex.cc


from __future__ import print_function
import os

class LexerException (Exception) :
   """Error raised by Lexer.error when the source text cannot be tokenized."""

class Lexer (object) :
   """Character-by-character lexical analyzer for C-like source text.

   The source is held in memory as one string. self.ch is a one-character
   lookahead and '\\0' marks the end of the source. Every call to nextToken
   stores the kind of the next lexical symbol in self.token (one of the
   class constants below) and its text in self.tokenText.

   Subclasses may override lookupKeyword, processSeparator and
   comment_directive to customize the behavior.
   """

   # token kinds stored in self.token
   eos = 0 # end of source
   identifier = 1
   number = 2 # integer
   real_number = 3 # float or double
   character_literal = 4 # string with single quotes
   string_literal = 5 # double quoted string
   separator = 6

   def __init__ (self) :
       super (Lexer, self).__init__ ()
       self.reset ()

   def reset (self) :
       """Forget the current source and return to the initial state."""
       self.fileName = ""
       self.source = ""
       self.sourceLen = 0

       # position of the lookahead character self.ch
       self.charLineNum = 1
       # No character has been read yet. nextChar increments the column
       # when it reads a character, so starting from 0 gives the first
       # character column 1, the same as on every following line.
       self.charColNum = 0
       self.charByteOfs = 0

       # position recorded at the start of the current token
       self.tokenLineNum = 1
       self.tokenColNum = 1
       self.tokenByteOfs = 0

       self.ch = '\0'
       self.token = self.eos
       self.tokenText = ""
       self.tokenValue = ""

   # -----------------------------------------------------------------------

   def openFile (self, fileName) :
       """Read the whole file into memory and position on its first token.

       fileName is converted to an absolute path, which is then used in
       the position prefix of messages.
       """
       self.reset ()
       self.fileName = os.path.abspath (fileName)
       # 'with' closes the file even when reading fails
       with open (self.fileName, "r") as f :
          self.source = f.read ()
       self.sourceLen = len (self.source)
       self.nextChar () # read first character
       self.nextToken () # read first token

   def close (self) :
       """Nothing to release, the file is already closed by openFile."""
       pass

   # -----------------------------------------------------------------------

   def nextChar (self) :
       """Advance the lookahead self.ch by one character.

       At the end of the source self.ch becomes '\\0'. A line feed starts
       a new line, a carriage return does not advance the column.
       """
       if self.charByteOfs < self.sourceLen :
          self.ch = self.source [self.charByteOfs]
          self.charByteOfs = self.charByteOfs + 1
          if self.ch == '\n' :
             self.charLineNum = self.charLineNum + 1
             self.charColNum = 0
          elif self.ch != '\r' :
             self.charColNum = self.charColNum + 1
       else :
          self.ch = '\0'

   # -----------------------------------------------------------------------

   def backStep (self) :
       """Move the recorded token position one character back.

       Used by nextToken for the '/' separator, whose position is recorded
       only after the character following the slash has been read.
       """
       self.tokenColNum = self.tokenColNum - 1
       self.tokenByteOfs = self.tokenByteOfs - 1

   def getPosition (self) :
       """Return the token position formatted as 'fileName:line:column'."""
       return self.fileName + ":" + str (self.tokenLineNum) + ":" + str (self.tokenColNum)

   def info (self, text) :
       """Print an informational message prefixed with the token position."""
       print (self.getPosition () + ": info: " + text)

   def warning (self, text) :
       """Print a warning prefixed with the token position."""
       print (self.getPosition () + ": warning: " + text)

   def error (self, text) :
       """Raise LexerException with the token position, text and tokenText."""
       raise LexerException (self.getPosition () + ": error: " + text + ", tokenText=" + self.tokenText)

   # -----------------------------------------------------------------------

   def isLetter (self, c) :
       """Return True for an ASCII letter or an underscore."""
       return 'A' <= c <= 'Z' or 'a' <= c <= 'z' or c == '_'

   def isDigit (self, c) :
       """Return True for an ASCII decimal digit."""
       return '0' <= c <= '9'

   def isLetterOrDigit (self, c) :
       """Return True for a character allowed inside an identifier."""
       return self.isLetter (c) or self.isDigit (c)

   # -----------------------------------------------------------------------

   def comment_eol (self) :
       """Skip a '//' comment up to (not including) the end of line.

       The comment text after the two slashes is passed to
       comment_directive.
       """
       txt = ""

       while self.ch != '\0' and self.ch != '\r' and self.ch != '\n' :
          txt += self.ch
          self.nextChar ()

       self.comment_directive (txt)

   def comment_directive (self, txt) :
       """Hook called with the text of every '//' comment.

       The default implementation ignores the text. It exists so that a
       plain Lexer can skip line comments; subclasses may override it.
       """
       pass

   def comment_two_marks (self, mark1, mark2) :
       """Skip a comment terminated by the character pair mark1 mark2.

       Called after the opening marks have been skipped. Raises
       LexerException (through error) when the source ends first.
       """
       prev = ' '
       while self.ch != '\0' and not (prev == mark1 and self.ch == mark2) :
          prev = self.ch
          self.nextChar ()
       if self.ch == mark2 :
          self.nextChar () # skip mark2
       else :
          self.error ("Unterminated comment")

   # -----------------------------------------------------------------------

   def checkDigit (self) :
       """Report an error unless the lookahead character is a digit."""
       if not self.isDigit (self.ch) :
          self.error ("Digit expected")

   def digits (self) :
       """Append a (possibly empty) run of digits to tokenText."""
       while self.isDigit (self.ch) :
          self.tokenText = self.tokenText + self.ch
          self.nextChar ()

   # -----------------------------------------------------------------------

   def numericToken (self) :
       """Read an integer or a real number starting at the lookahead digit.

       The token is 'number' for plain digits and 'real_number' when a
       fraction ('.' followed by at least one digit) or an exponent
       ('e' or 'E', optional sign, at least one digit) is present.
       """
       self.token = self.number
       self.digits ()
       if self.ch == '.' :
          self.token = self.real_number
          self.tokenText = self.tokenText + self.ch # store '.'
          self.nextChar () # skip '.'
          self.checkDigit ()
          self.digits ()
       if self.ch == 'e' or self.ch == 'E' :
          self.token = self.real_number
          self.tokenText = self.tokenText + self.ch # store 'e'
          self.nextChar () # skip 'e'
          if self.ch == '+' or self.ch == '-' :
             self.tokenText = self.tokenText + self.ch # store '+' or '-'
             self.nextChar () # skip '+' or '-'
          self.checkDigit ()
          self.digits ()

   # -----------------------------------------------------------------------

   def stringChar (self) :
       """Consume and return one character of a quoted literal.

       A backslash is dropped and the character after it is returned
       verbatim; escape sequences such as \\n are not translated.
       """
       if self.ch == '\\' :
          self.nextChar () # skip backslash
       c = self.ch
       self.nextChar ()
       return c

   # -----------------------------------------------------------------------

   def nextToken (self) :
       """Read the next lexical symbol into self.token and self.tokenText.

       White space and both kinds of comments are skipped first. At the
       end of the source the token is 'eos' and tokenText is empty.
       Raises LexerException (through error) for an unterminated comment
       or literal and for a malformed number.
       """
       self.token = self.eos
       self.tokenText = ""

       slash = False
       whiteSpace = True

       while whiteSpace and self.ch != '\0' :
          # blanks and control characters
          while self.ch != '\0' and self.ch <= ' ' :
             self.nextChar ()

          whiteSpace = False
          if self.ch == '/' :
             self.nextChar () # skip '/'
             if self.ch == '/' :
                self.nextChar () # skip '/'
                self.comment_eol ()
                whiteSpace = True # check again for white space
             elif self.ch == '*' :
                self.nextChar () # skip '*'
                self.comment_two_marks ('*', '/')
                whiteSpace = True # check again for white space
             else :
                slash = True # produce '/' token

       self.tokenLineNum = self.charLineNum
       self.tokenColNum = self.charColNum
       self.tokenByteOfs = self.charByteOfs

       if slash :
          # the '/' has already been consumed by the comment detection
          self.token = self.separator
          self.tokenText = '/'
          self.backStep ()
          self.processSeparator ()

       elif self.ch == '\0' :
          self.token = self.eos

       elif self.isLetter (self.ch) :
          self.token = self.identifier
          while self.isLetterOrDigit (self.ch) :
             self.tokenText = self.tokenText + self.ch
             self.nextChar ()

       elif self.isDigit (self.ch) :
          self.numericToken ()

       elif self.ch == '\'' :
          self.token = self.character_literal
          self.nextChar () # skip opening quote
          while self.ch != '\0' and self.ch != '\r' and self.ch != '\n' and self.ch != '\'' :
             self.tokenText = self.tokenText + self.stringChar ()
          if self.ch != '\'' :
             self.error ("Unterminated string")
          self.nextChar () # skip closing quote

       elif self.ch == '\"' :
          self.token = self.string_literal
          self.nextChar () # skip opening quote
          while self.ch != '\0' and self.ch != '\r' and self.ch != '\n' and self.ch != '\"' :
             self.tokenText = self.tokenText + self.stringChar ()
          if self.ch != '\"' :
             self.error ("Unterminated string")
          self.nextChar () # skip closing quote

       else :
          self.token = self.separator
          self.tokenText = self.ch
          self.nextChar ()
          self.processSeparator ()

       if self.token == self.identifier :
          # convert identifier to keyword
          self.lookupKeyword ()

   def lookupKeyword (self) :
       """Hook called after every identifier; does nothing by default."""
       pass

   def processSeparator (self) :
       """Extend the one-character separators '<' and '>' to '<=' and '>='."""
       if self.tokenText == "<" or self.tokenText == ">" :
          if self.ch == "=" :
             self.tokenText = self.tokenText + self.ch
             self.nextChar ()

Literatura: Kapitola 4.1 http://inf.ethz.ch/personal/wirth/CompilerConstruction/CompilerConstruction1.pdf