"""Categorize crawled pages by legacy Opera-only charset labels.

Reads 'labels-with-urls.txt' — records separated by '\\x00--\\n', each
record being a URL followed by NUL-separated page text — keeps only the
records containing non-ASCII bytes, and classifies every page whose FIRST
charset declaration is an Opera-only label by what its SECOND declaration
(if any) names:

  only  - the page has a single declaration
  same  - the second declaration is a standard label for the same encoding
  other - the second declaration is a standard label for another encoding
  else  - the second declaration is not a standard label at all

The report is written to 'labels-with-nonascii-categorized.txt'.
"""
import re

# Charset labels honoured by (old) Opera but absent from the Encoding
# Standard's label registry.
OPERA_LABELS = [
    'cp1250', 'cp1251', 'cp1252', 'cp1254', 'cswindows31j', 'euc-cn',
    'iso8859-15', 'iso8859-9', 'iso88591', 'iso_8859-1:1987',
    'ks_c_5601-1987', 'ms932', 'ms936', 'sjis', 'x-mac-turkish',
    'x-user-defined',
]

# Labels defined by the Encoding Standard.
STANDARD_LABELS = [
    "unicode-1-1-utf-8", "utf-8", "utf8", "cp864", "ibm864", "cp866",
    "ibm866", "csisolatin2", "iso-8859-2", "iso-ir-101", "iso8859-2",
    "iso_8859-2", "l2", "latin2", "csisolatin3", "iso-8859-3", "iso-ir-109",
    "iso_8859-3", "l3", "latin3", "csisolatin4", "iso-8859-4", "iso-ir-110",
    "iso_8859-4", "l4", "latin4", "csisolatincyrillic", "cyrillic",
    "iso-8859-5", "iso-ir-144", "iso_8859-5", "arabic", "csisolatinarabic",
    "ecma-114", "iso-8859-6", "iso-ir-127", "iso_8859-6", "csisolatingreek",
    "ecma-118", "elot_928", "greek", "greek8", "iso-8859-7", "iso-ir-126",
    "iso_8859-7", "csisolatinhebrew", "hebrew", "iso-8859-8",
    "iso-8859-8-i", "iso-ir-138", "iso_8859-8", "visual", "csisolatin6",
    "iso-8859-10", "iso-ir-157", "iso8859-10", "l6", "latin6",
    "iso-8859-13", "iso-8859-14", "iso8859-14", "iso-8859-15",
    "iso_8859-15", "iso-8859-16", "koi8-r", "koi8_r", "koi8-u",
    "csmacintosh", "mac", "macintosh", "x-mac-roman", "iso-8859-11",
    "tis-620", "windows-874", "windows-1250", "x-cp1250", "windows-1251",
    "x-cp1251", "ansi_x3.4-1968", "ascii", "csisolatin1", "iso-8859-1",
    "iso8859-1", "iso_8859-1", "l1", "latin1", "us-ascii", "windows-1252",
    "cp1253", "windows-1253", "csisolatin5", "iso-8859-9", "iso-ir-148",
    "l5", "latin5", "windows-1254", "cp1255", "windows-1255", "cp1256",
    "windows-1256", "windows-1257", "cp1258", "windows-1258",
    "x-mac-cyrillic", "x-mac-ukrainian", "chinese", "csgb2312",
    "csiso58gb231280", "gb2312", "gb_2312", "gb_2312-80", "gbk",
    "iso-ir-58", "x-gbk", "gb18030", "hz-gb-2312", "big5", "big5-hkscs",
    "cn-big5", "csbig5", "x-x-big5", "cseucjpkdfmtjapanese", "euc-jp",
    "x-euc-jp", "csiso2022jp", "iso-2022-jp", "csshiftjis", "ms_kanji",
    "shift-jis", "shift_jis", "windows-31j", "x-sjis", "csksc56011987",
    "csueckr", "euc-kr", "iso-ir-149", "korean", "ks_c_5601-1989",
    "ksc5601", "ksc_5601", "windows-949", "csiso2022kr", "iso-2022-kr",
    "utf-16", "utf-16le", "utf-16be",
]

# Opera-only label -> the standard labels naming the SAME encoding, so a
# second declaration in this list is a supported respelling, not a switch.
MAPPING = {
    'cp1250': ["windows-1250", "x-cp1250"],
    'cp1251': ["windows-1251", "x-cp1251"],
    'cp1252': ["ansi_x3.4-1968", "ascii", "csisolatin1", "iso-8859-1",
               "iso8859-1", "iso_8859-1", "l1", "latin1", "us-ascii",
               "windows-1252"],
    'cp1254': ["csisolatin5", "iso-8859-9", "iso-ir-148", "l5", "latin5",
               "windows-1254"],
    'cswindows31j': ["csshiftjis", "ms_kanji", "shift-jis", "shift_jis",
                     "windows-31j", "x-sjis"],
    'euc-cn': ["chinese", "csgb2312", "csiso58gb231280", "gb2312",
               "gb_2312", "gb_2312-80", "gbk", "iso-ir-58", "x-gbk"],
    'iso8859-15': ["iso-8859-15", "iso_8859-15"],
    'iso8859-9': ["csisolatin5", "iso-8859-9", "iso-ir-148", "l5", "latin5",
                  "windows-1254"],
    'iso88591': ["ansi_x3.4-1968", "ascii", "csisolatin1", "iso-8859-1",
                 "iso8859-1", "iso_8859-1", "l1", "latin1", "us-ascii",
                 "windows-1252"],
    'iso_8859-1:1987': ["ansi_x3.4-1968", "ascii", "csisolatin1",
                        "iso-8859-1", "iso8859-1", "iso_8859-1", "l1",
                        "latin1", "us-ascii", "windows-1252"],
    'ks_c_5601-1987': ["csksc56011987", "csueckr", "euc-kr", "iso-ir-149",
                       "korean", "ks_c_5601-1989", "ksc5601", "ksc_5601",
                       "windows-949"],
    'ms932': ["csshiftjis", "ms_kanji", "shift-jis", "shift_jis",
              "windows-31j", "x-sjis"],
    'ms936': ["chinese", "csgb2312", "csiso58gb231280", "gb2312", "gb_2312",
              "gb_2312-80", "gbk", "iso-ir-58", "x-gbk"],
    'sjis': ["csshiftjis", "ms_kanji", "shift-jis", "shift_jis",
             "windows-31j", "x-sjis"],
    'x-mac-turkish': ["csisolatin5", "iso-8859-9", "iso-ir-148", "l5",
                      "latin5", "windows-1254"],
    'x-user-defined': [],
}

# Compiled once, not per chunk.  The charset value is a capture group so
# findall() yields only the label, without the optional opening quote --
# the original split('=')[1] kept the quote and quoted labels never matched.
CHARSET_DECL_RE = re.compile(
    r'text/html\s*;\s*charset\s*=\s*["\']?([^\s"\'>]+)', re.I)
# Any byte outside the ASCII range (data was decoded latin-1, 1:1 per byte).
NON_ASCII_RE = re.compile(r'[\x80-\xff]')


def _extract_labels(chunk):
    """Return every charset label declared in *chunk*, lowercased, in order."""
    return [label.strip().lower() for label in CHARSET_DECL_RE.findall(chunk)]


def categorize(chunks):
    """Build {opera_label: {'only'|'same'|'other'|'else': [url, ...]}}.

    Only chunks containing non-ASCII characters whose first charset
    declaration is an Opera-only label contribute.  Each chunk's URL is
    the text before its first NUL byte.
    """
    conclusion = {label: {'only': [], 'same': [], 'other': [], 'else': []}
                  for label in OPERA_LABELS}
    for chunk in chunks:
        if not NON_ASCII_RE.search(chunk):
            continue
        labels = _extract_labels(chunk)
        # Guard: a non-ASCII chunk may carry no declaration at all; the
        # original code crashed here with IndexError on labels[0].
        if not labels or labels[0] not in OPERA_LABELS:
            continue
        url = chunk.split('\x00')[0]
        buckets = conclusion[labels[0]]
        if len(labels) == 1:
            buckets['only'].append(url)
        elif labels[1] in MAPPING[labels[0]]:
            buckets['same'].append(url)
        elif labels[1] in STANDARD_LABELS:
            buckets['other'].append(url)
        else:
            buckets['else'].append(url)
    return conclusion


def write_report(conclusion, path='labels-with-nonascii-categorized.txt'):
    """Write the per-label counts and URL lists in the original format."""
    # latin-1 round-trips the decoded bytes unchanged (see main()).
    with open(path, 'w', encoding='latin-1') as out:
        for label, buckets in conclusion.items():
            out.write(
                label + '\n'
                'pages with only one decl: '
                + str(len(buckets['only'])) + ' ' + ' '.join(buckets['only']) + '\n'
                + 'pages with later supported decl for same encoding: '
                + str(len(buckets['same'])) + ' ' + ' '.join(buckets['same']) + '\n'
                + 'pages with later supported decl for other encoding: '
                + str(len(buckets['other'])) + ' ' + ' '.join(buckets['other']) + '\n'
                + 'pages with later unsupported decl: '
                + str(len(buckets['else'])) + ' ' + ' '.join(buckets['else']) + '\n\n')


def main():
    with open('labels-with-urls.txt', 'rb') as f:
        # latin-1 maps every byte to the same code point, reproducing the
        # Python 2 byte-string semantics the original script relied on
        # (it opened 'rb' and then used str operations on the bytes).
        content = f.read().decode('latin-1')
    chunks = content.split('\x00--\n')
    write_report(categorize(chunks))


if __name__ == '__main__':
    main()