Viewing file: encodingroutines.php (6.37 KB) -rw-r--r-- Select action/file-type: (+) | (+) | (+) | Code (+) | Session (+) | (+) | SDB (+) | (+) | (+) | (+) | (+) | (+) |
<?php
/**
 * Tell whether a string consists solely of 7-bit ASCII bytes.
 *
 * @param string $s Byte string to inspect.
 * @return bool TRUE when no byte has a value above 127 (the empty string
 *              therefore counts as ASCII), FALSE otherwise.
 */
function is_ascii($s)
{
    // Hoisted so the length is not recomputed on every iteration.
    $length = strlen($s);

    for ($i = 0; $i < $length; $i++) {
        if (ord($s[$i]) > 127) {
            return FALSE;
        }
    }

    return TRUE;
}
/**
 * Structural UTF-8 check: every lead byte must be followed by the right
 * number of continuation bytes (0x80-0xBF).
 *
 * This is a shape check only. It does not reject overlong encodings,
 * UTF-16 surrogates, or lead bytes 0xF5-0xF7 (code points above U+10FFFF).
 *
 * @param string $str Byte string to inspect.
 * @return bool true when the byte sequence is well-formed in the sense above
 *              (the empty string is valid), false otherwise.
 */
function is_utf8($str)
{
    $len = strlen($str);

    for ($i = 0; $i < $len; $i++) {
        $c = ord($str[$i]);

        // Plain ASCII byte. The boundary is 127: byte 128 (0x80) is a
        // continuation byte and must not be accepted on its own.
        if ($c <= 127) {
            continue;
        }

        if ($c > 247) {
            return false;           // 0xF8-0xFF never start a sequence
        } elseif ($c > 239) {
            $bytes = 4;             // 0xF0-0xF7
        } elseif ($c > 223) {
            $bytes = 3;             // 0xE0-0xEF
        } elseif ($c > 191) {
            $bytes = 2;             // 0xC0-0xDF
        } else {
            return false;           // 0x80-0xBF: stray continuation byte
        }

        // The whole sequence must fit inside the string.
        if (($i + $bytes) > $len) {
            return false;
        }

        // Consume and validate the continuation bytes.
        while ($bytes > 1) {
            $i++;
            $b = ord($str[$i]);
            if ($b < 128 || $b > 191) {
                return false;
            }
            $bytes--;
        }
    }

    return true;
}
/**
 * Count every maximal run of consecutive non-ASCII bytes in a string.
 *
 * @param string $s Byte string to scan.
 * @return array Map of byte run => number of occurrences, in order of first
 *               appearance. Empty when the string has no byte above 127.
 */
function get_high_bit_sequences($s)
{
    $sequence_array = array();
    $current_sequence = "";   // non-empty exactly while inside a high-bit run
    $length = strlen($s);

    for ($i = 0; $i < $length; $i++) {
        $char = $s[$i];

        if (ord($char) >= 128) {
            // Non-ASCII byte: start or extend the current run.
            $current_sequence .= $char;
            continue;
        }

        // ASCII byte: closes a run if one is open.
        if ($current_sequence !== "") {
            // Initialise explicitly; incrementing a missing key raises an
            // "undefined index/array key" notice.
            if (isset($sequence_array[$current_sequence])) {
                $sequence_array[$current_sequence]++;
            } else {
                $sequence_array[$current_sequence] = 1;
            }
            $current_sequence = "";
        }
    }

    // A run that extends to the end of the string still has to be counted.
    if ($current_sequence !== "") {
        if (isset($sequence_array[$current_sequence])) {
            $sequence_array[$current_sequence]++;
        } else {
            $sequence_array[$current_sequence] = 1;
        }
    }

    return $sequence_array;
}
/**
 * Reduce a string to the fragments that surround its non-ASCII bytes.
 *
 * Every byte above 127 is marked. From each such byte the mark is extended
 * to the left and to the right until a space or an already-marked byte is
 * met. The byte directly adjacent to the high-bit byte is always marked,
 * even when it is a space. The marked stretches are then cut out and joined
 * with ' | '.
 *
 * @param string $s Byte string to shorten.
 * @return string The marked fragments separated by ' | ', or an empty string
 *                when $s contains no byte above 127.
 */
function get_shortened_text($s)
{
    $length = strlen($s);

    // Pass 1: mark the non-ASCII bytes themselves.
    $highlight = array();
    for ($i = 0; $i < $length; $i++) {
        $highlight[$i] = ord($s[$i]) > 127;
    }

    // Pass 2: grow each mark outwards over the surrounding word.
    for ($i = 0; $i < $length; $i++) {
        if (ord($s[$i]) <= 127) {
            continue;
        }

        for ($j = $i - 1; $j >= 0; $j--) {
            // A space only stops the walk once we are past the direct neighbour.
            if (($j < $i - 1 && ord($s[$j]) == 32) || $highlight[$j]) {
                break;
            }
            $highlight[$j] = TRUE;
        }

        for ($j = $i + 1; $j < $length; $j++) {
            if (($j > $i + 1 && ord($s[$j]) == 32) || $highlight[$j]) {
                break;
            }
            $highlight[$j] = TRUE;
        }
    }

    // Pass 3: cut out every marked stretch.
    $parts = array();
    $laststate = FALSE;
    $startpos = 0;
    for ($i = 0; $i < $length; $i++) {
        if ($highlight[$i] == $laststate) {
            continue;
        }
        if ($highlight[$i]) {
            $startpos = $i;
            $laststate = TRUE;
        } else {
            $parts[] = substr($s, $startpos, $i - $startpos);
            $laststate = FALSE;
        }
    }
    if ($laststate) {
        // The last stretch runs to the end of the string.
        $parts[] = substr($s, $startpos, $length - $startpos);
    }

    // Separator first: the legacy join($array, $glue) order was removed in PHP 8.
    return implode(' | ', $parts);
}
/**
 * Print a two-row dump of a string: the raw bytes on the first row and their
 * decimal byte values on the second, each in a 4-character-wide column.
 *
 * @param string $s Byte string to dump.
 * @return void Output is written directly with echo.
 */
function dump_string($s)
{
    $characters = "";
    $codes = "";
    $total = strlen($s);
    $position = 0;

    while ($position < $total) {
        $byte = $s[$position];
        $characters .= sprintf("%4s", $byte);
        $codes .= sprintf("%4d", ord($byte));
        $position++;
    }

    echo $characters . "\n";
    echo $codes . "\n";
}
/**
 * Interpret a byte string under every candidate mbstring encoding and group
 * the encodings by the UTF-8 text they produce.
 *
 * Encodings for which mb_check_encoding() rejects the input are ignored. If
 * no candidate reproduces the input unchanged, the input itself is added
 * under the pseudo-encoding "pass".
 *
 * Note: PHP turns integer-like string keys (e.g. "123") into integer keys,
 * so callers iterating the result may receive an int as key.
 *
 * @param string $s Byte string of unknown encoding.
 * @return array Map of UTF-8 conversion result => list of encoding names
 *               that yield that result.
 */
function gather_encodings($s)
{
    // Never tried: the pseudo-encodings, the transfer encodings, the UTF-7
    // variants and the UCS-2/UCS-4/UTF-16/UTF-32/byteN families.
    $skipped = array(
        'pass', 'auto',
        'UUENCODE', 'BASE64', 'Quoted-Printable',
        'UTF-7', 'UTF7-IMAP',
        'UCS-2', 'UCS-2BE', 'UCS-2LE',
        'UTF-16', 'UTF-16BE', 'UTF-16LE',
        'byte2be', 'byte2le', 'byte4be',
        'UCS-4', 'UCS-4BE',
        'UTF-32', 'UTF-32BE',
        'byte4le', 'UCS-4LE', 'UTF-32LE',
    );

    $encodings = array();
    foreach (mb_list_encodings() as $encoding) {
        if (in_array($encoding, $skipped, true)) {
            continue;
        }
        if (!mb_check_encoding($s, $encoding)) {
            continue;
        }

        $converted = mb_convert_encoding($s, "UTF-8", $encoding);
        if ($converted === false) {
            // Conversion failed; do not record a bogus result.
            continue;
        }
        $encodings[$converted][] = $encoding;
    }

    if (!array_key_exists($s, $encodings)) {
        $encodings[$s][] = "pass";
    }

    return $encodings;
}
// Prepend the 'i' (ignore-case) flag to the current default options of the
// multibyte regex functions, keeping whatever options were already set.
// Runs once, as a side effect of loading this file.
mb_regex_set_options('i'.mb_regex_set_options());
/**
 * Score a string by counting how often the patterns listed in patterns.txt
 * occur in it.
 *
 * patterns.txt is read from the current working directory on every call. It
 * holds one multibyte regular expression per line, and empty lines are
 * ignored. Each pattern contributes its number of matches, computed as
 * (pieces returned by mb_split) - 1.
 *
 * @param string $s UTF-8 text to score.
 * @return int Total number of pattern matches; 0 when patterns.txt cannot be
 *             read or contains no patterns.
 */
function score_string($s)
{
    mb_regex_encoding("UTF-8");

    $contents = file_get_contents("patterns.txt");
    if ($contents === false) {
        // No pattern list available: nothing can match.
        return 0;
    }

    $score = 0;
    // explode() replaces split(), which was removed in PHP 7.0.
    foreach (explode("\n", $contents) as $pattern) {
        if ($pattern === "") {
            continue;
        }

        $pieces = mb_split($pattern, $s);
        if ($pieces === false) {
            // The pattern could not be applied; it adds nothing to the score.
            continue;
        }
        $score += count($pieces) - 1;
    }

    return $score;
}
/**
 * HTML-escape a value for use in element content or a quoted attribute.
 *
 * ENT_SUBSTITUTE (where available) keeps htmlspecialchars() from returning
 * an empty string when the value is not valid in the active charset.
 *
 * @param mixed $value Value to escape; cast to string.
 * @return string Escaped text.
 */
function render_choices_escape($value)
{
    $flags = ENT_QUOTES;
    if (defined('ENT_SUBSTITUTE')) {
        $flags |= ENT_SUBSTITUTE;
    }
    return htmlspecialchars((string) $value, $flags);
}

/**
 * Turn a PHP value into a JavaScript literal that is safe inside an inline
 * HTML event-handler attribute.
 *
 * Numeric values (including numeric strings) are emitted as bare numbers;
 * everything else becomes a quoted JavaScript string.
 *
 * @param mixed $value Value to pass to a JavaScript function.
 * @return string HTML-escaped JavaScript literal ("null" if it cannot be encoded).
 */
function render_choices_js_arg($value)
{
    $literal = json_encode(
        $value,
        JSON_HEX_TAG | JSON_HEX_AMP | JSON_HEX_APOS | JSON_HEX_QUOT | JSON_NUMERIC_CHECK
    );
    if ($literal === false) {
        $literal = 'null';
    }
    return render_choices_escape($literal);
}

/**
 * Echo an HTML table listing the candidate decodings of $text, best score first.
 *
 * Candidates come from gather_encodings() and are scored with score_string()
 * on their get_shortened_text() form. When there are more than two distinct
 * scores, the zero-score candidates are dropped. Every row carries buttons
 * that call the JavaScript functions acceptresolution() and hideresolution().
 * A final row calls markresolution(). None of these functions is defined in
 * this file; the embedding page has to provide them.
 *
 * @param mixed       $id          Identifier of the text; used as the id of the
 *                                 wrapping div and as first argument of the
 *                                 JavaScript handlers.
 * @param string      $text        Raw text whose encoding is in doubt.
 * @param mixed|null  $itemcounter Unused.
 * @param mixed|null  $textcounter Unused.
 * @return void The markup is written directly with echo.
 */
function render_choices($id, $text, $itemcounter = NULL, $textcounter = NULL)
{
    // Group the candidates by score.
    $scores = array();
    foreach (gather_encodings($text) as $result => $encodinglist) {
        // Integer-like array keys come back as int; restore the string.
        $result = (string) $result;
        $scores[score_string(get_shortened_text($result))][] = array($encodinglist, $result);
    }

    // With three or more distinct scores the zero-score candidates are noise.
    // With exactly two, everything is kept.
    if (count($scores) > 2) {
        unset($scores[0]);
    }
    krsort($scores);

    $htmlId = render_choices_escape($id);
    $jsId = render_choices_js_arg($id);

    echo '<div id="' . $htmlId . '">';
    echo '<table border width="100%">';
    echo "<caption>";
    echo "id: " . $htmlId . "<br>";
    echo render_choices_escape($text);
    if (strlen($text) > 50) {
        // From here on only the problematic fragments of long texts are shown.
        $text = get_shortened_text($text);
        echo "<br>(problem parts: " . render_choices_escape($text) . " )";
    }
    echo '</caption>';
    echo '<tr><th>score</th><th>result</th><th></th><th>encoding(s)</th></tr>';

    // The closing "mark" button reuses the encoding of the last rendered row.
    $encoding = '';
    foreach ($scores as $score => $rows) {
        foreach ($rows as $row) {
            $encodingNames = $row[0];
            $converted = $row[1];
            $encoding = $encodingNames[0];
            $jsEncoding = render_choices_js_arg($encoding);

            echo '<tr id="' . render_choices_escape($encoding) . '">';
            echo "<td>" . ($score ? $score : "") . "</td>";
            echo '<td>';
            echo '<button type="button" title="accept this resolution" onclick="acceptresolution(' . $jsId . ',' . $jsEncoding . ');">✔</button>';
            echo '<button type="button" title="hide this resolution" onclick="hideresolution(' . $jsId . ',' . $jsEncoding . ');">✘</button>';
            echo '</td>';
            echo '<td>';
            // Long results are shown as the conversion of the (possibly shortened) text.
            echo render_choices_escape(
                strlen($converted) > 50
                    ? mb_convert_encoding($text, "UTF-8", $encoding)
                    : $converted
            );
            echo '</td>';
            // Separator first: the legacy join($array, $glue) order was removed in PHP 8.
            echo "<td>" . render_choices_escape(implode(", ", $encodingNames)) . "</td>";
            echo "</tr>";
        }
    }

    echo '<tr><td></td><td>';
    echo '<button type="button" title="mark this message for checkup" onclick="markresolution(' . $jsId . ',' . render_choices_js_arg($encoding) . ');">⁇</button>';
    echo '</td><td></td><td>';
    echo '</td></tr>';
    echo "</table>";
    echo "</div>";
}
?>
|