Convert HTML named entities to numeric entities

2014/10/24

This code uses get_html_translation_table to build a lookup map from named HTML entities (e.g. &amp;trade;) to their numeric equivalents (e.g. &amp;#8482;). str_replace is then applied to a string with that map to convert the entities it contains.

The first two functions (utf8_chr and utf8_ord) are taken from http://stackoverflow.com/a/12848889

/**
 * Encode a Unicode code point as a UTF-8 byte string.
 *
 * @param int $ord Unicode code point to encode.
 * @return string|false The 1-4 byte UTF-8 sequence for $ord, or FALSE when
 *                      $ord is negative, a surrogate (U+D800..U+DFFF), or
 *                      0x110000 and above.
 */
function utf8_chr ($ord) {
    // Negative values would otherwise pass the "< 0x80" test and be masked
    // into an unrelated ASCII byte; surrogates have no valid UTF-8 encoding.
    if ($ord < 0 || ($ord >= 0xD800 && $ord <= 0xDFFF)) {
        return FALSE;
    }

    // 1 byte: 0xxxxxxx
    if ($ord < 0x80) {
        return pack('C*', $ord & 0x7F);
    }

    // 2 bytes: 110xxxxx 10xxxxxx
    if ($ord < 0x0800) {
        return pack(
            'C*',
            (($ord & 0x07C0) >> 6) | 0xC0,
            ($ord & 0x3F) | 0x80
        );
    }

    // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
    if ($ord < 0x010000) {
        return pack(
            'C*',
            (($ord & 0xF000) >> 12) | 0xE0,
            (($ord & 0x0FC0) >> 6) | 0x80,
            ($ord & 0x3F) | 0x80
        );
    }

    // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    if ($ord < 0x110000) {
        return pack(
            'C*',
            (($ord & 0x1C0000) >> 18) | 0xF0,
            (($ord & 0x03F000) >> 12) | 0x80,
            (($ord & 0x0FC0) >> 6) | 0x80,
            ($ord & 0x3F) | 0x80
        );
    }

    // Beyond U+10FFFF: not a Unicode code point.
    return FALSE;
}

/**
 * Decode a single UTF-8 encoded character into its Unicode code point.
 *
 * @param string $chr Byte string expected to hold exactly one UTF-8 character.
 * @return int|false The decoded code point, or FALSE when $chr is not exactly
 *                   one well-formed UTF-8 sequence (wrong length, bad lead or
 *                   continuation byte, overlong form, encoded surrogate, or a
 *                   value above U+10FFFF).
 */
function utf8_ord ($chr) {
    $bytes = array_values(unpack('C*', $chr));
    $length = count($bytes);

    if ($length < 1 || $length > 4) {
        return FALSE;
    }

    // Every byte after the lead byte must be a continuation byte (10xxxxxx).
    for ($i = 1; $i < $length; $i++) {
        if (($bytes[$i] & 0xC0) !== 0x80) {
            return FALSE;
        }
    }

    switch ($length) {
        case 1:
            // 0xxxxxxx
            return $bytes[0] < 0x80 ? $bytes[0] : FALSE;

        case 2:
            // 110xxxxx 10xxxxxx
            if (($bytes[0] & 0xE0) !== 0xC0) {
                return FALSE;
            }
            $ord = (($bytes[0] & 0x1F) << 6) | ($bytes[1] & 0x3F);
            // Values below 0x80 fit in one byte, so this form is overlong.
            return $ord >= 0x80 ? $ord : FALSE;

        case 3:
            // 1110xxxx 10xxxxxx 10xxxxxx
            if (($bytes[0] & 0xF0) !== 0xE0) {
                return FALSE;
            }
            $ord = (($bytes[0] & 0x0F) << 12) | (($bytes[1] & 0x3F) << 6) | ($bytes[2] & 0x3F);
            // Reject overlong forms and encoded surrogates (U+D800..U+DFFF).
            if ($ord < 0x0800 || ($ord >= 0xD800 && $ord <= 0xDFFF)) {
                return FALSE;
            }
            return $ord;

        default:
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            if (($bytes[0] & 0xF8) !== 0xF0) {
                return FALSE;
            }
            $ord = (($bytes[0] & 0x07) << 18) | (($bytes[1] & 0x3F) << 12) | (($bytes[2] & 0x3F) << 6) | ($bytes[3] & 0x3F);
            // Reject overlong forms and values past the last code point.
            if ($ord < 0x010000 || $ord > 0x10FFFF) {
                return FALSE;
            }
            return $ord;
    }
}

// Sample input containing several named entities.
$input = '&lt;title&gt;My Web Page&lt;/title&gt; this &amp; that&trade;';

// get_html_translation_table() maps character => named entity, so flip it to
// get named entity => character, then turn each character into its numeric
// entity via utf8_ord().
// ENT_COMPAT is required for the double quote to be in the table; with
// ENT_HTML401 alone (value 0, same as ENT_NOQUOTES) &quot; would be left
// unconverted. The table is requested in UTF-8 because utf8_ord() decodes
// UTF-8 bytes.
$map = array_map(
    function($a) { return '&#' . utf8_ord($a) . ';'; },
    array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT | ENT_HTML401, 'UTF-8'))
);

// Replace every named entity found in the input with its numeric form.
$output = str_replace(array_keys($map), array_values($map), $input);

echo "$output\n";

Comments