Viewing file: d.php (3.45 KB) -rw-r--r-- Select action/file-type: (+) | (+) | (+) | Code (+) | Session (+) | (+) | SDB (+) | (+) | (+) | (+) | (+) | (+) |
<?php ob_start("ob_gzhandler"); ?> <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> <style media="screen, projection"> @import "http://bits.wikimedia.org/skins-1.5/monobook/main.css"; </style> <body> <?php
// All page output is buffered through the ob_gzhandler callback installed at
// the very top of the page; the report code further down calls ob_flush() to
// push partial results out while it is still running.

// Project-local helpers. NOTE(review): is_ascii() and get_shortened_text() are
// called later in this script but not defined in it; presumably they come from
// this include - confirm.
require_once('encodingroutines.php');
// Raise the execution time limit to 1200 seconds (20 minutes).
set_time_limit(1200);
/**
 * Convert $s to UTF-8 from every candidate source encoding and group the
 * source encodings by the UTF-8 string they produce.
 *
 * Every encoding reported by mb_list_encodings() is tried, except for the
 * pseudo-encodings and the transfer / fixed-width encodings listed in the
 * exclusion table below. Encodings for which mb_check_encoding() rejects $s
 * are skipped, as are encodings whose conversion fails.
 *
 * @param string $s Byte string whose real encoding is unknown.
 * @return array Map of "UTF-8 conversion result" => list of encoding names
 *               that yield exactly that result. Empty if nothing matched.
 */
function gather_encodings($s)
{
    // Built once and used as a hash lookup, instead of re-creating and
    // linearly scanning the exclusion list for every single encoding.
    static $excluded = null;
    if ($excluded === null) {
        $excluded = array_flip(array(
            'pass', 'auto',
            'UUENCODE', 'BASE64', 'Quoted-Printable', 'UTF-7', 'UTF7-IMAP',
            'UCS-2', 'UCS-2BE', 'UCS-2LE',
            'UTF-16', 'UTF-16BE', 'UTF-16LE',
            'byte2be', 'byte2le', 'byte4be',
            'UCS-4', 'UCS-4BE', 'UTF-32', 'UTF-32BE',
            'byte4le', 'UCS-4LE', 'UTF-32LE',
            'ArmSCII-8',
        ));
    }

    $encodings = array();
    foreach (mb_list_encodings() as $encoding) {
        if (isset($excluded[$encoding])) {
            continue;
        }
        if (!mb_check_encoding($s, $encoding)) {
            continue;
        }
        $result = mb_convert_encoding($s, "UTF-8", $encoding);
        if ($result === false) {
            // A failed conversion must not be recorded: false used as an
            // array key would silently become key 0 and look like a result.
            continue;
        }
        $encodings[$result][] = $encoding;
    }
    return $encodings;
}
/**
 * Score a UTF-8 string by counting how often the patterns from patterns.txt
 * occur in it.
 *
 * patterns.txt (resolved relative to the current working directory) holds one
 * mbstring regular expression per line; empty lines are ignored. For each
 * pattern the number of matches in $s is added to the score, measured as the
 * number of pieces mb_split() produces minus one. Matching honours the
 * process-wide mbregex options currently in effect.
 *
 * @param string $s UTF-8 text to score.
 * @return int Total number of pattern matches; 0 if the pattern file cannot
 *             be read or nothing matches.
 */
function score_string($s)
{
    // The pattern file is parsed once and reused; this function is called for
    // every candidate conversion of every text item.
    static $patterns = null;

    mb_regex_encoding("UTF-8");

    if ($patterns === null) {
        $raw = file_get_contents("patterns.txt");
        if ($raw === false) {
            // Not cached, so a later call retries (and warns) again.
            return 0;
        }
        $loaded = array();
        // explode() replaces split(), which was removed in PHP 7.0.
        foreach (explode("\n", $raw) as $line) {
            // Tolerate CRLF files: a trailing "\r" would otherwise become
            // part of the pattern and keep it from matching.
            $line = rtrim($line, "\r");
            if (strlen($line) == 0) {
                continue;
            }
            $loaded[] = $line;
        }
        $patterns = $loaded;
    }

    $score = 0;
    foreach ($patterns as $pattern) {
        $pieces = mb_split($pattern, $s);
        if ($pieces === false) {
            // Invalid pattern: ignore it rather than calling count(false).
            continue;
        }
        $score += count($pieces) - 1;
    }
    return $score;
}
//$score = score_string('And you know I paid Kev £500 to not fight');; //echo "score:", $score; //return;
// Work list to examine, nested as schema => table => column => list of text
// values.
// NOTE(review): unserialize() can instantiate arbitrary classes from its
// input; this is only safe while column_issues_utf8.txt is a trusted,
// locally generated file.
$workitem = unserialize(file_get_contents("column_issues_utf8.txt"));
if (!is_array($workitem)) {
    // Missing or corrupt input file: report it instead of looping over false.
    echo "<p>could not load column_issues_utf8.txt</p>";
    $workitem = array();
}

// Prepend the case-insensitive flag to the current mbregex options, so the
// pattern matching in score_string() ignores case.
mb_regex_set_options('i'.mb_regex_set_options());

foreach ($workitem as $schemename => $scheme) {
    foreach ($scheme as $tablename => $table) {
        foreach ($table as $columnname => $column) {
            if (!is_array($column) || count($column) == 0) {
                continue; // nothing to process for this column
            }
            // Names come from data, so escape them before emitting HTML.
            echo "<h1>".htmlspecialchars($schemename."/".$tablename."/".$columnname)." - ".count($column)." items</h1>";

            $textcounter = 0; // values looked at in this column
            $itemcounter = 0; // values that could not be resolved automatically

            foreach ($column as $text) {
                $textcounter++;
                // Byte 0xA0 is replaced by a plain space before analysis.
                // NOTE(review): presumably aimed at Latin-1 NBSP; this is a
                // byte-level replace and also hits 0xA0 inside multi-byte
                // sequences - confirm that is intended.
                $text = str_replace(chr(160), chr(32), $text);
                if (is_ascii($text)) {
                    continue;
                }

                // Group every candidate conversion by its pattern score.
                $encodings = gather_encodings($text);
                $scores = array();
                foreach ($encodings as $result => $encodinglist) {
                    $score = score_string($result);
                    $scores[$score][] = array($encodinglist, $result);
                }

                ksort($scores);
                if (count($scores) == 2) {
                    // Exactly one non-zero score next to the zero bucket:
                    // accept it as definitive and do not report the item.
                    $k = array_keys($scores);
                    if ($k[0] === 0) {
                        continue;
                    }
                }
                if (count($scores) > 1) {
                    // Several distinct scores: drop the zero-score candidates.
                    unset($scores[0]);
                }
                krsort($scores); // best score first

                $itemcounter++;
                echo '<table border width="100%">';
                echo "<caption>".strlen($text)." bytes: ".htmlspecialchars($text);
                if (strlen($text) > 50) {
                    $text = get_shortened_text($text);
                    echo "<br>(distilled to: ".htmlspecialchars($text)." )";
                }
                echo '</caption>';
                echo '<tr><th>score</th><th>result</th><th>encoding(s)</th></tr>';
                foreach ($scores as $score => $encodinglist) {
                    foreach ($encodinglist as $encodingrow) {
                        // $encodingrow[0] is the list of encodings,
                        // $encodingrow[1] the UTF-8 text they all produce.
                        if (strlen($encodingrow[1]) < 50) {
                            $shown = $encodingrow[1];
                        } else {
                            // Long result: show the shortened text converted
                            // from the first encoding of the group instead.
                            $shown = mb_convert_encoding(get_shortened_text($text), "UTF-8", $encodingrow[0][0]);
                        }
                        echo "<tr>";
                        echo "<td>".($score ? $score : "")."</td>";
                        echo "<td>".htmlspecialchars($shown)."</td>";
                        // implode(separator, array): the reversed legacy
                        // argument order is no longer accepted by PHP 8.
                        echo "<td>".htmlspecialchars(implode(", ", $encodingrow[0]))."</td>";
                        echo "</tr>";
                    }
                }
                echo "</table>";
                echo "<h1>itemcounting: ".$itemcounter." unhandled of ".$textcounter." processed</h1>";
                ob_flush(); // push this item's table out while still running
            }

            echo "<h1>unresolved items: ".$itemcounter." (of ".count($column).")</h1>";
            echo "<hr>";
            ob_flush();
        }
    }
}
#print_r($workitem);
?> </body>
|