Browse code

bug #2751 Fix multi-byte UTF-8 in escape('html_attr') (nicolas-grekas)

This PR was merged into the 1.x branch.

Discussion
----------

Fix multi-byte UTF-8 in escape('html_attr')

Same as #2750, but for the 1.x branch.

Commits
-------

8863b68c Fix multi-byte UTF-8 in escape('html_attr')

Fabien Potencier authored on 20/09/2018 14:23:43
Showing 2 changed files
... ...
@@ -995,6 +995,10 @@ function twig_escape_filter(Twig_Environment $env, $string, $strategy = 'html',
995 995
         }
996 996
     }
997 997
 
998
+    if ('' === $string) {
999
+        return '';
1000
+    }
1001
+
998 1002
     if (null === $charset) {
999 1003
         $charset = $env->getCharset();
1000 1004
     }
... ...
@@ -1046,7 +1050,7 @@ function twig_escape_filter(Twig_Environment $env, $string, $strategy = 'html',
1046 1050
                 $string = twig_convert_encoding($string, 'UTF-8', $charset);
1047 1051
             }
1048 1052
 
1049
-            if (0 == strlen($string) ? false : 1 !== preg_match('/^./su', $string)) {
1053
+            if (!preg_match('//u', $string)) {
1050 1054
                 throw new Twig_Error_Runtime('The string to escape is not a valid UTF-8 string.');
1051 1055
             }
1052 1056
 
... ...
@@ -1063,7 +1067,7 @@ function twig_escape_filter(Twig_Environment $env, $string, $strategy = 'html',
1063 1067
                 $string = twig_convert_encoding($string, 'UTF-8', $charset);
1064 1068
             }
1065 1069
 
1066
-            if (0 == strlen($string) ? false : 1 !== preg_match('/^./su', $string)) {
1070
+            if (!preg_match('//u', $string)) {
1067 1071
                 throw new Twig_Error_Runtime('The string to escape is not a valid UTF-8 string.');
1068 1072
             }
1069 1073
 
... ...
@@ -1080,7 +1084,7 @@ function twig_escape_filter(Twig_Environment $env, $string, $strategy = 'html',
1080 1084
                 $string = twig_convert_encoding($string, 'UTF-8', $charset);
1081 1085
             }
1082 1086
 
1083
-            if (0 == strlen($string) ? false : 1 !== preg_match('/^./su', $string)) {
1087
+            if (!preg_match('//u', $string)) {
1084 1088
                 throw new Twig_Error_Runtime('The string to escape is not a valid UTF-8 string.');
1085 1089
             }
1086 1090
 
... ...
@@ -1149,6 +1153,29 @@ if (function_exists('mb_convert_encoding')) {
1149 1153
     }
1150 1154
 }
1151 1155
 
1156
+if (function_exists('mb_ord')) {
1157
+    function twig_ord($string)
1158
+    {
1159
+        return mb_ord($string, 'UTF-8');
1160
+    }
1161
+} else {
1162
+    function twig_ord($string)
1163
+    {
1164
+        $code = ($string = unpack('C*', substr($string, 0, 4))) ? $string[1] : 0;
1165
+        if (0xF0 <= $code) {
1166
+            return (($code - 0xF0) << 18) + (($string[2] - 0x80) << 12) + (($string[3] - 0x80) << 6) + $string[4] - 0x80;
1167
+        }
1168
+        if (0xE0 <= $code) {
1169
+            return (($code - 0xE0) << 12) + (($string[2] - 0x80) << 6) + $string[3] - 0x80;
1170
+        }
1171
+        if (0xC0 <= $code) {
1172
+            return (($code - 0xC0) << 6) + $string[2] - 0x80;
1173
+        }
1174
+
1175
+        return $code;
1176
+    }
1177
+}
1178
+
1152 1179
 function _twig_escape_js_callback($matches)
1153 1180
 {
1154 1181
     $char = $matches[0];
... ...
@@ -1187,20 +1214,7 @@ function _twig_escape_css_callback($matches)
1187 1214
 {
1188 1215
     $char = $matches[0];
1189 1216
 
1190
-    // \xHH
1191
-    if (!isset($char[1])) {
1192
-        $hex = ltrim(strtoupper(bin2hex($char)), '0');
1193
-        if (0 === strlen($hex)) {
1194
-            $hex = '0';
1195
-        }
1196
-
1197
-        return '\\'.$hex.' ';
1198
-    }
1199
-
1200
-    // \uHHHH
1201
-    $char = twig_convert_encoding($char, 'UTF-16BE', 'UTF-8');
1202
-
1203
-    return '\\'.ltrim(strtoupper(bin2hex($char)), '0').' ';
1217
+    return sprintf('\\%X ', 1 === strlen($char) ? ord($char) : twig_ord($char));
1204 1218
 }
1205 1219
 
1206 1220
 /**
... ...
@@ -1211,19 +1225,6 @@ function _twig_escape_css_callback($matches)
1211 1225
  */
1212 1226
 function _twig_escape_html_attr_callback($matches)
1213 1227
 {
1214
-    /*
1215
-     * While HTML supports far more named entities, the lowest common denominator
1216
-     * has become HTML5's XML Serialisation which is restricted to the those named
1217
-     * entities that XML supports. Using HTML entities would result in this error:
1218
-     *     XML Parsing Error: undefined entity
1219
-     */
1220
-    static $entityMap = array(
1221
-        34 => 'quot', /* quotation mark */
1222
-        38 => 'amp',  /* ampersand */
1223
-        60 => 'lt',   /* less-than sign */
1224
-        62 => 'gt',   /* greater-than sign */
1225
-    );
1226
-
1227 1228
     $chr = $matches[0];
1228 1229
     $ord = ord($chr);
1229 1230
 
... ...
@@ -1240,22 +1241,31 @@ function _twig_escape_html_attr_callback($matches)
1240 1241
      * replace it with while grabbing the hex value of the character.
1241 1242
      */
1242 1243
     if (1 == strlen($chr)) {
1243
-        $hex = strtoupper(substr('00'.bin2hex($chr), -2));
1244
-    } else {
1245
-        $chr = twig_convert_encoding($chr, 'UTF-16BE', 'UTF-8');
1246
-        $hex = strtoupper(substr('0000'.bin2hex($chr), -4));
1247
-    }
1244
+        /*
1245
+         * While HTML supports far more named entities, the lowest common denominator
1246
+         * has become HTML5's XML Serialisation which is restricted to the those named
1247
+         * entities that XML supports. Using HTML entities would result in this error:
1248
+         *     XML Parsing Error: undefined entity
1249
+         */
1250
+        static $entityMap = array(
1251
+            34 => '&quot;', /* quotation mark */
1252
+            38 => '&amp;',  /* ampersand */
1253
+            60 => '&lt;',   /* less-than sign */
1254
+            62 => '&gt;',   /* greater-than sign */
1255
+        );
1256
+
1257
+        if (isset($entityMap[$ord])) {
1258
+            return $entityMap[$ord];
1259
+        }
1248 1260
 
1249
-    $int = hexdec($hex);
1250
-    if (array_key_exists($int, $entityMap)) {
1251
-        return sprintf('&%s;', $entityMap[$int]);
1261
+        return sprintf('&#x%02X;', $ord);
1252 1262
     }
1253 1263
 
1254 1264
     /*
1255 1265
      * Per OWASP recommendations, we'll use hex entities for any other
1256 1266
      * characters where a named entity does not exist.
1257 1267
      */
1258
-    return sprintf('&#x%s;', $hex);
1268
+    return sprintf('&#x%04X;', twig_ord($chr));
1259 1269
 }
1260 1270
 
1261 1271
 // add multibyte extensions if possible
... ...
@@ -23,6 +23,7 @@ class Twig_Test_EscapingTest extends \PHPUnit\Framework\TestCase
23 23
         '\'' => '&#x27;',
24 24
         /* Characters beyond ASCII value 255 to unicode escape */
25 25
         'Ā' => '&#x0100;',
26
+        '😀' => '&#x1F600;',
26 27
         /* Immune chars excluded */
27 28
         ',' => ',',
28 29
         '.' => '.',