import re

class Lexer(object):
    """Regular-expression based tokenizer driven by named patterns.

    The lexer is configured with an ordered sequence of ``(name, pattern)``
    pairs.  All patterns are combined into one alternation of named groups,
    so at any position in the input the first definition (in the given
    order) that matches wins.
    """

    def __init__(self, definitions):
        """Compile the compound regular expression.

        ``definitions`` is an iterable of ``(name, pattern)`` pairs.  Each
        ``name`` must be usable as a regex group name, and each ``pattern``
        is a regular expression source string.  ``re.error`` propagates
        from ``re.compile`` if the combined expression is invalid.
        """
        self.definitions = definitions

        # Wrap every pattern in its own named group so that a match can be
        # traced back to the definition that produced it.
        self.regexpString = "|".join(
            "(?P<%s>%s)" % (name, part) for name, part in definitions
        )
        # MULTILINE lets ``^`` and ``$`` in the definitions anchor on
        # individual lines of the input instead of the whole text.
        self.regexp = re.compile(self.regexpString, re.MULTILINE)

    def parse(self, text):
        """Yield ``(name, lexeme)`` tuples for ``text`` in input order.

        ``name`` is the definition whose pattern matched and ``lexeme`` is
        the matched substring.  Stretches of input that match none of the
        definitions are skipped without being reported.
        """
        for match in self.regexp.finditer(text):
            # Exactly one top-level alternative takes part in a match, and
            # since its named group is the outermost one it is the last
            # group to close -- which is what ``lastgroup`` reports.
            name = match.lastgroup
            if name is not None:
                yield (name, match.group(name))
                    
if __name__ == "__main__":
    # Demo: tokenize this very script (sys.argv[0]) with the Lexer above and
    # write a colorized HTML rendering of it to "pylexer.html" in the current
    # working directory.
    import sys
    import cgi

    PYTHON_KEYWORDS = ['and', 'del', 'from', 'not', 'while', 'as',
                       'elif', 'global', 'or', 'with', 'assert',
                       'else', 'if', 'pass', 'yield', 'break',
                       'except', 'import', 'print', 'class', 'exec',
                       'in', 'raise', 'continue', 'finally', 'is',
                       'return', 'def', 'for', 'lambda', 'try']

    # Define the various parts of the language the lexer will recognize.
    # Order matters: at each position the first matching definition wins.
    definitions = [
        # all python keywords
        ("keyword"    , r"\b(%s)\b" % "|".join(PYTHON_KEYWORDS)),

        # both double-quote and single-quote delimited strings with support
        # for \ escaping; the optional r/u prefix applies to either quote style
        ("string"     , r"[ru]?(\"([^\"\\]|(\\.))*\"|'([^'\\]|(\\.))*')"),

        # python operators
        ("operators"  , r"[%s]+" % re.escape("<>=*/+-~!%&()|{}[],.?:")),

        # function, method and class names (not very accurate); these must be
        # listed before "ident", which would otherwise match the name first
        ("function"   ,   r"(?<=def )([A-Za-z_][A-Za-z0-9_]*)\s*(?=[(:])"),
        ("class"      , r"(?<=class )([A-Za-z_][A-Za-z0-9_]*)\s*(?=[(:])"),

        # legal identifiers
        ("ident"      , r"[A-Za-z_][A-Za-z0-9_]*"),

        # legal numbers (no support for hexadecimal and octal notations)
        ("number"     , r"[0-9.]+"),

        # comments and whitespace
        ("comment"    , r"#.*$"),
        ("whitespace" , r"\s+"),

        # unrecognized patterns will be consumed by other
        ("other"      , r".+")
    ]

    # read the source file to colorize (the running script itself)
    with open(sys.argv[0], "r") as source:
        text = source.read()
    lexer = Lexer(definitions)

    # html templates for colorizing certain language parts; token types
    # without an entry here are written out without any markup
    templates = {
        "operators"  : "<b><font size='3'>%s</font></b>",
        "string"     : "<span style='color:#808'>%s</span>",
        "keyword"    : "<b><span style='color:#007'>%s</span></b>",
        "comment"    : "<i><span style='color:#0a0'>%s</span></i>",
        "function"   : "<b><span style='color:#0aa'>%s</span></b>",
        "class"      : "<b><span style='color:#00e'>%s</span></b>",
    }

    # run the lexer and create the html output; the with-block closes the
    # file even if lexing or writing fails part-way through
    with open("pylexer.html", "w") as output:
        output.write("<html><body><pre>\n")
        for tokenType, tokenValue in lexer.parse(text):
            template = templates.get(tokenType, "%s")
            # escape the token text so markup characters in the source are
            # rendered literally instead of being interpreted as HTML
            output.write(template % cgi.escape(tokenValue))
        output.write("</pre></body></html>\n")