"""Report long runs of identical lines shared between source files.

Every file under the current directory whose extension is listed in
``EXTENSIONS`` is read and reduced to a list of per-line MD5 digests.  Each
pair of files is then compared with ``difflib.SequenceMatcher`` and every
matching run of at least ``MIN_MATCH`` lines is printed to stdout as a
``path:line`` pair.  Progress information goes to stderr.
"""

import difflib
import hashlib
import os
import os.path
import sys

# File extensions (including the leading dot) that are scanned.
EXTENSIONS = [".cs"]

# Minimum number of consecutive identical lines that is reported.
MIN_MATCH = 75


class HashedFile(object):
    """A file reduced to one digest per line.

    Attributes set by the constructor:
        path: the path that was passed in.
        lineHashes: list with one ``hash(line)`` digest per line of the file,
            in file order.
    """

    def __init__(self, path):
        self.path = path
        # Read bytes rather than text: the digests then do not depend on any
        # text decoding, and ``with`` closes the handle even if hashing fails.
        with open(path, "rb") as f:
            self.lineHashes = [hash(line) for line in f]


class Block(object):
    """A matching run of lines found in two files.

    ``start1`` and ``start2`` are the line numbers at which the run starts in
    ``path1`` and ``path2``; ``data`` is an optional free-form payload.
    """

    def __init__(self, path1, start1, path2, start2, data=""):
        self.path1 = path1
        self.start1 = start1
        self.path2 = path2
        self.start2 = start2
        self.data = data

    def __repr__(self):
        return "%s(%r, %r, %r, %r)" % (
            type(self).__name__,
            self.path1,
            self.start1,
            self.path2,
            self.start2,
        )


def get_filenames(path, extensions=None):
    """Yield the path of every file below ``path`` with a wanted extension.

    ``extensions`` is a collection of extensions including the leading dot;
    when omitted, the module-level ``EXTENSIONS`` is used.  The comparison is
    an exact (case-sensitive) match on ``os.path.splitext``'s result.
    """
    if extensions is None:
        extensions = EXTENSIONS
    for root, _dirs, files in os.walk(path):
        for filename in files:
            _name, ext = os.path.splitext(filename)
            if ext in extensions:
                yield os.path.join(root, filename)


def hash(text):
    """Return the 16-byte MD5 digest of ``text``.

    ``text`` may be a byte string or a text string; text is encoded as UTF-8
    before hashing.  MD5 serves only as a compact line fingerprint here, not
    as a security measure.

    NOTE: the name shadows the builtin ``hash``; it is kept so that existing
    callers of this module continue to work.
    """
    if not isinstance(text, bytes):
        text = text.encode("utf-8")
    return hashlib.md5(text).digest()


def compare(hashedFile1, hashedFile2, min_match=None):
    """Yield a ``Block`` for every long run of lines shared by two files.

    Both arguments must provide ``path`` and ``lineHashes``.  A run is
    reported when it is at least ``min_match`` lines long; when ``min_match``
    is omitted, the module-level ``MIN_MATCH`` is used.  Start positions in
    the yielded blocks are 1-based line numbers.
    """
    if min_match is None:
        min_match = MIN_MATCH
    matcher = difflib.SequenceMatcher(
        None, hashedFile1.lineHashes, hashedFile2.lineHashes
    )
    for start1, start2, line_count in matcher.get_matching_blocks():
        # get_matching_blocks() always ends with a zero-length sentinel;
        # never report it, whatever threshold was requested.
        if line_count and line_count >= min_match:
            yield Block(
                hashedFile1.path, start1 + 1, hashedFile2.path, start2 + 1
            )


def main(root="."):
    """Hash every matching file below ``root`` and print shared blocks.

    Matches are written to stdout as two lines (``file 1: path:line`` and
    ``file 2: path:line``); progress is written to stderr.
    """
    # generate hashes for all lines in all files
    hashed_files = []
    for filename in get_filenames(root):
        sys.stderr.write(".")
        hashed_files.append(HashedFile(filename))
    sys.stderr.write("\n")

    # compare all files to each other using hashed lines
    for index, first in enumerate(hashed_files):
        sys.stderr.write("%s ...\n" % first.path)
        # Only compare against later files so each pair is visited once.
        for second in hashed_files[index + 1:]:
            for block in compare(first, second):
                sys.stdout.write(
                    "file 1: %s:%d\n" % (block.path1, block.start1)
                )
                sys.stdout.write(
                    "file 2: %s:%d\n" % (block.path2, block.start2)
                )


if __name__ == "__main__":
    main()