import re


class Lexer(object):
    """Minimal regular-expression based lexer.

    ``definitions`` is a sequence of ``(name, pattern)`` pairs.  Every
    pattern is wrapped in a named group and all groups are joined with
    ``|`` into one compound expression, so earlier definitions take
    priority over later ones when several could match at the same position.
    """

    def __init__(self, definitions):
        """Compile ``definitions`` into a single compound regular expression.

        The original pairs stay available as ``self.definitions``, the
        combined pattern text as ``self.regexpString`` and the compiled
        pattern (``re.MULTILINE``) as ``self.regexp``.
        """
        self.definitions = definitions
        # create a compound regular expression from the lexer definitions
        parts = ["(?P<%s>%s)" % (name, part) for name, part in definitions]
        self.regexpString = "|".join(parts)
        self.regexp = re.compile(self.regexpString, re.MULTILINE)

    def parse(self, text):
        """Yield ``(name, lexeme)`` tuples for each token found in ``text``.

        Text that no definition matches is skipped silently, because the
        scan is driven by ``finditer``.
        """
        for match in self.regexp.finditer(text):
            # Each definition is wrapped in its own named group, and that
            # outer group is the last one to close, so ``lastgroup`` names
            # the definition that matched without rescanning all of them.
            name = match.lastgroup
            # ``lastgroup`` is None only when no named group took part in
            # the match (e.g. an empty definitions list).
            if name is not None:
                yield (name, match.group(name))


if __name__ == "__main__":
    import sys

    try:
        # cgi.escape was removed in Python 3.8; html.escape replaces it.
        from html import escape as escape_html
    except ImportError:
        # Python 2 has no html module.
        from cgi import escape as escape_html

    PYTHON_KEYWORDS = ['and', 'del', 'from', 'not', 'while', 'as', 'elif',
                       'global', 'or', 'with', 'assert', 'else', 'if',
                       'pass', 'yield', 'break', 'except', 'import', 'print',
                       'class', 'exec', 'in', 'raise', 'continue', 'finally',
                       'is', 'return', 'def', 'for', 'lambda', 'try']

    # define the various parts of the language the lexer will recognize
    definitions = [
        # all python keywords
        ("keyword", r"\b(%s)\b" % "|".join(PYTHON_KEYWORDS)),
        # both double-quote and single-quote delimited strings with support
        # for \ escaping; the alternatives are grouped so the optional r/u
        # prefix applies to either quoting style
        ("string", r"[ru]?(\"([^\"\\]|(\\.))*\"|'([^\'\\]|(\\.))*')"),
        # python operators
        ("operators", r"[%s]+" % re.escape("<>=*/+-~!%&()|{}[],.?:")),
        # function, method and class names (not very accurate)
        ("function", r"(?<=def )([A-Za-z_][A-Za-z0-9_]*)\s*(?=[(:])"),
        ("class", r"(?<=class )([A-Za-z_][A-Za-z0-9_]*)\s*(?=[(:])"),
        # legal identifiers
        ("ident", r"[A-Za-z_][A-Za-z0-9_]*"),
        # legal numbers (no support for hexadecimal and octal notations)
        ("number", r"[0-9.]+"),
        # comments and whitespace
        ("comment", r"#.*$"),
        ("whitespace", r"\s+"),
        # unrecognized characters are consumed one at a time, so that a
        # single unknown character does not swallow the recognizable
        # tokens (comments, strings, ...) that follow it on the same line
        ("other", r"."),
    ]

    # read the source file to colorize (this script itself)
    with open(sys.argv[0], "r") as source:
        text = source.read()

    lexer = Lexer(definitions)

    # html templates for colorizing certain language parts
    templates = {
        "operators": "<b><font size='3'>%s</font></b>",
        "string": "<span style='color:#808'>%s</span>",
        "keyword": "<b><span style='color:#007'>%s</span></b>",
        "comment": "<i><span style='color:#0a0'>%s</span></i>",
        "function": "<b><span style='color:#0aa'>%s</span></b>",
        "class": "<b><span style='color:#00e'>%s</span></b>",
    }

    # run the lexer and create the html output
    with open("pylexer.html", "w") as output:
        output.write("<html><body><pre>\n")
        for tokenType, tokenValue in lexer.parse(text):
            # token types without a template are emitted as plain text
            template = templates.get(tokenType, "%s")
            # quote=False keeps the escaping identical to cgi.escape
            # (only &, < and > are replaced)
            output.write(template % escape_html(tokenValue, quote=False))
        output.write("</pre></body></html>\n")