#!/usr/bin/python
"""Count the charset labels declared in a dump of Content-Type values.

The input file holds records separated by the marker ``web200904`` followed
by a NUL byte. Every record that starts with a ``text/html; charset=...``
declaration contributes one occurrence of its (lowercased) charset label.
The report lists ``label: count`` lines, most frequent label first.

All processing is done on bytes: the records may contain bytes in any
encoding, so decoding them as text could fail or alter the labels.
"""
import re
from collections import Counter

INPUT_PATH = 'charsets.txt'
OUTPUT_PATH = 'charsets-count.txt'
RECORD_SEPARATOR = b'web200904\x00'

# Anchored at the start of a record (used with ``match``); the optional quote
# before the label is skipped and the label ends at a quote, whitespace,
# ``>`` or ``;``.
_CHARSET_RE = re.compile(
    br'text/html\s*;\s*charset\s*=\s*["\']?([^"\'\s>;]+)'
)


def count_charset_labels(blobs):
    """Return a Counter mapping each charset label to its number of records.

    Args:
        blobs: iterable of bytes records. Each record is lowercased (ASCII
            letters only, as ``bytes.lower`` does) before matching, so labels
            differing only in ASCII case are counted together.

    Returns:
        A ``collections.Counter`` keyed by the lowercased label bytes.
        Records that do not start with a ``text/html`` charset declaration
        are ignored.
    """
    counts = Counter()
    for blob in blobs:
        match = _CHARSET_RE.match(blob.lower())
        if match:
            counts[match.group(1)] += 1
    return counts


def main(input_path=INPUT_PATH, output_path=OUTPUT_PATH):
    """Read the record dump and write the charset frequency report.

    Args:
        input_path: file to read; defaults to ``charsets.txt``.
        output_path: file to (over)write; defaults to ``charsets-count.txt``.

    Raises:
        IOError/OSError: if either file cannot be opened, read or written.
    """
    with open(input_path, 'rb') as source:
        blobs = source.read().split(RECORD_SEPARATOR)

    counts = count_charset_labels(blobs)

    # most_common() orders by descending count using a stable sort, which is
    # what the original ``sort(key=count, reverse=True)`` did.
    with open(output_path, 'wb') as report:
        report.writelines(
            label + b': ' + str(count).encode('ascii') + b'\n'
            for label, count in counts.most_common()
        )


if __name__ == '__main__':
    main()