#!/usr/bin/python
"""Tally charset labels declared in ``text/html; charset=...`` blobs.

Reads NUL-separated blobs from ``stevef-charsets.txt``, extracts the charset
label from every blob that starts with a ``text/html; charset=`` declaration,
and writes one ``label: count`` line per distinct label, most frequent first,
to ``stevef-charsets-count.txt``.
"""

import re
from collections import Counter

INPUT_PATH = 'stevef-charsets.txt'
OUTPUT_PATH = 'stevef-charsets-count.txt'

# Bytes pattern: the blobs are handled as raw bytes so that content in any
# encoding can be processed without decode errors.  The optional quote before
# the label is skipped; the label itself ends at a quote, whitespace, '>' or
# ';'.
_CHARSET_RE = re.compile(
    br'text/html\s*;\s*charset\s*=\s*["\']?([^"\'\s>;]+)'
)


def count_charset_labels(blobs):
    """Return a Counter mapping lower-cased charset labels to occurrences.

    Args:
        blobs: iterable of ``bytes`` objects.  A blob is counted only when,
            after ASCII lower-casing, it *starts* with a
            ``text/html; charset=<label>`` declaration; other blobs are
            skipped.

    Returns:
        ``collections.Counter`` keyed by the label as ``bytes``.
    """
    labels = Counter()
    for blob in blobs:
        # bytes.lower() only touches ASCII letters, so non-ASCII bytes in a
        # label pass through unchanged.
        match = _CHARSET_RE.match(blob.lower())
        if match:
            labels[match.group(1)] += 1
    return labels


def main(input_path=INPUT_PATH, output_path=OUTPUT_PATH):
    """Count the labels found in *input_path* and write them to *output_path*.

    The output holds one ``label: count`` line per label, sorted by
    descending count.  Labels with equal counts keep the order in which they
    were first seen (the sort is stable).
    """
    with open(input_path, 'rb') as source:
        blobs = source.read().split(b'\x00')

    labels = count_charset_labels(blobs)

    with open(output_path, 'wb') as sink:
        # most_common() replaces the Python 2-only ``items().sort(...)``:
        # dict views have no sort() method on Python 3.
        for label, count in labels.most_common():
            sink.write(b'%s: %d\n' % (label, count))


if __name__ == '__main__':
    main()