import re


class Lexer(object):
    """Split text into ``(name, lexeme)`` pairs using named regular expressions.

    ``definitions`` is a sequence of ``(name, pattern)`` pairs.  Each pattern
    is wrapped in a named group and all groups are joined with ``|`` into a
    single compound expression, so earlier definitions take precedence over
    later ones when several could match at the same position.
    """

    def __init__(self, definitions):
        """Compile the compound expression for ``definitions``.

        Args:
            definitions: sequence of ``(name, pattern)`` pairs; every name
                must be usable as a regular-expression group name.
        """
        self.definitions = definitions
        # One named group per definition: (?P<name>pattern)|(?P<name2>...)
        parts = ["(?P<%s>%s)" % (name, part) for name, part in definitions]
        self.regexpString = "|".join(parts)
        # MULTILINE lets ``^``/``$`` inside a definition anchor on each line.
        self.regexp = re.compile(self.regexpString, re.MULTILINE)

    def parse(self, text):
        """Yield a ``(name, lexeme)`` tuple for every match found in ``text``.

        Stretches of ``text`` that no definition matches are skipped.
        """
        for match in self.regexp.finditer(text):
            # The definition's own named group is the outermost group of its
            # alternative, so it is the last one to close and ``lastgroup``
            # names it directly; no need to probe every definition in turn.
            name = match.lastgroup
            # ``lastgroup`` is None only when there are no definitions at all
            # (the empty pattern still "matches" at every position).
            if name is not None:
                yield (name, match.group(name))


if __name__ == "__main__":
    import sys

    # ``cgi.escape`` was removed in Python 3.8 (and ``cgi`` itself in 3.13);
    # ``html.escape`` is the replacement.  Fall back to ``cgi`` on Python 2.
    try:
        from html import escape
    except ImportError:
        from cgi import escape

    PYTHON_KEYWORDS = [
        'and', 'del', 'from', 'not', 'while', 'as', 'elif', 'global', 'or',
        'with', 'assert', 'else', 'if', 'pass', 'yield', 'break', 'except',
        'import', 'print', 'class', 'exec', 'in', 'raise', 'continue',
        'finally', 'is', 'return', 'def', 'for', 'lambda', 'try',
    ]

    # define the various parts of the language the lexer will recognize
    definitions = [
        # all python keywords
        ("keyword", r"\b(%s)\b" % "|".join(PYTHON_KEYWORDS)),
        # both double-quote and single-quote delimited strings with support
        # for \ escaping
        ("string", r"[ru]?(\"([^\"\\]|(\\.))*\")|('([^\'\\]|(\\.))*')"),
        # python operators
        ("operators", r"[%s]+" % re.escape("<>=*/+-~!%&()|{}[],.?:")),
        # function, method and class names (not very accurate)
        ("function", r"(?<=def )([A-Za-z_][A-Za-z0-9_]*)\s*(?=[(:])"),
        ("class", r"(?<=class )([A-Za-z_][A-Za-z0-9_]*)\s*(?=[(:])"),
        # legal identifiers
        ("ident", r"[A-Za-z_][A-Za-z0-9_]*"),
        # legal numbers (no support for hexadecimal and octal notations)
        ("number", r"[0-9.]+"),
        # comments and whitespace
        ("comment", r"#.*$"),
        ("whitespace", r"\s+"),
        # unrecognized patterns will be consumed by other
        ("other", r".+"),
    ]

    # read the source file to colorize (this script itself); ``with`` makes
    # sure the handle is closed instead of being left to the garbage collector
    with open(sys.argv[0], "r") as source:
        text = source.read()

    lexer = Lexer(definitions)

    # html templates for colorizing certain language parts
    # NOTE(review): every template is the identity "%s"; if these were meant
    # to wrap tokens in markup, that markup is not visible here -- confirm.
    templates = {
        "operators": "%s",
        "string": "%s",
        "keyword": "%s",
        "comment": "%s",
        "function": "%s",
        "class": "%s",
    }

    # run the lexer and create the html output; ``with`` closes the file even
    # if lexing or writing raises
    with open("pylexer.html", "w") as output:
        # Same bytes as the original ``print >>output, <literal>``: the
        # literal's line break plus the newline ``print`` appends.
        # NOTE(review): if a page header belongs here, restore it -- confirm.
        output.write("\n\n")
        for token_type, token_value in lexer.parse(text):
            template = templates.get(token_type, "%s")
            # quote=False matches the default behaviour of the old cgi.escape
            output.write(template % escape(token_value, quote=False))
        # NOTE(review): likewise for a page footer -- confirm.
        output.write("\n\n")