Convert HTML named entities to numeric entities

This code uses get_html_translation_table to build a lookup map from named HTML entities to their numeric equivalents. str_replace then applies that map to a string to convert the entities.

The first two functions (utf8_chr and utf8_ord) are based on http://stackoverflow.com/a/12848889

 /**
  * Encode a Unicode code point as a UTF-8 byte string.
  *
  * @param int $ord Code point, expected in the range 0..0x10FFFF.
  * @return string|false The 1-4 byte UTF-8 sequence, or FALSE when $ord is
  *                      negative or greater than 0x10FFFF.
  */
 function utf8_chr ($ord) {
     // A negative value would otherwise satisfy "$ord < 0x80" and be masked
     // into an unrelated 7-bit character, so reject it up front.
     if ($ord < 0 || $ord >= 0x110000) {
         return FALSE;
     }
     if ($ord < 0x80) {
         // 0xxxxxxx
         return pack('C*', $ord & 0x7F);
     }
     if ($ord < 0x0800) {
         // 110xxxxx 10xxxxxx
         return pack('C*', (($ord & 0x07C0) >> 6) | 0xC0, ($ord & 0x3F) | 0x80);
     }
     if ($ord < 0x010000) {
         // 1110xxxx 10xxxxxx 10xxxxxx
         return pack('C*', (($ord & 0xF000) >> 12) | 0xE0, (($ord & 0x0FC0) >> 6) | 0x80, ($ord & 0x3F) | 0x80);
     }
     // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     return pack('C*', (($ord & 0x1C0000) >> 18) | 0xF0, (($ord & 0x03F000) >> 12) | 0x80, (($ord & 0x0FC0) >> 6) | 0x80, ($ord & 0x3F) | 0x80);
 }

 /**
  * Decode a single UTF-8 encoded character into its Unicode code point.
  *
  * Accepts exactly the byte sequences utf8_chr() can produce: the lead byte
  * must match the sequence length, every following byte must be a
  * continuation byte, and the decoded value must need that many bytes
  * (overlong forms are rejected) and must not exceed 0x10FFFF.
  *
  * @param string $chr Byte string holding exactly one UTF-8 character.
  * @return int|false The code point, or FALSE when $chr is empty, longer than
  *                   four bytes, or not a well-formed single character.
  */
 function utf8_ord ($chr) {
     $bytes = array_values(unpack('C*', $chr));
     $length = count($bytes);

     if ($length === 1) {
         return $bytes[0] < 0x80 ? $bytes[0] : FALSE;
     }
     if ($length < 2 || $length > 4) {
         return FALSE;
     }

     // Every byte after the lead byte must look like 10xxxxxx.
     for ($i = 1; $i < $length; $i++) {
         if (($bytes[$i] & 0xC0) !== 0x80) {
             return FALSE;
         }
     }

     if ($length === 2) {
         if (($bytes[0] & 0xE0) !== 0xC0) {
             return FALSE;
         }
         $ord = (($bytes[0] & 0x1F) << 6) | ($bytes[1] & 0x3F);
         // Values below 0x80 fit in one byte; a two-byte form is overlong.
         return $ord >= 0x80 ? $ord : FALSE;
     }

     if ($length === 3) {
         if (($bytes[0] & 0xF0) !== 0xE0) {
             return FALSE;
         }
         $ord = (($bytes[0] & 0x0F) << 12) | (($bytes[1] & 0x3F) << 6) | ($bytes[2] & 0x3F);
         // Values below 0x0800 fit in two bytes; a three-byte form is overlong.
         return $ord >= 0x0800 ? $ord : FALSE;
     }

     if (($bytes[0] & 0xF8) !== 0xF0) {
         return FALSE;
     }
     $ord = (($bytes[0] & 0x07) << 18) | (($bytes[1] & 0x3F) << 12) | (($bytes[2] & 0x3F) << 6) | ($bytes[3] & 0x3F);
     // Reject overlong four-byte forms and anything past the last code point.
     return ($ord >= 0x010000 && $ord < 0x110000) ? $ord : FALSE;
 }

 /**
  * Build the lookup map from HTML 4.01 named entities to numeric entities.
  *
  * @return array Map of named entity (e.g. '&amp;') to its decimal numeric
  *               entity (e.g. '&#38;').
  */
 function html_named_to_numeric_map () {
     // ENT_HTML401 on its own has the same value as ENT_NOQUOTES, which leaves
     // &quot; out of the table; ENT_COMPAT adds it. The encoding is passed
     // explicitly so the table keys are UTF-8 regardless of default_charset,
     // which is what utf8_ord() expects.
     $table = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT | ENT_HTML401, 'UTF-8');

     $map = array();
     foreach ($table as $char => $entity) {
         $ord = utf8_ord($char);
         if ($ord === FALSE) {
             // Never emit a malformed '&#;' replacement.
             continue;
         }
         $map[$entity] = '&#' . $ord . ';';
     }
     return $map;
 }

 // Demo: convert the named entities in a sample string and print the result.
 $input = '&lt;title&gt;My Web Page&lt;/title&gt; this &amp; that&trade;';

 $map = html_named_to_numeric_map();
 $output = str_replace(array_keys($map), array_values($map), $input);

 echo "$output\n";

Comments

    Leave a comment