Experimental support for data-* attributes in HTML #2194

This commit is contained in:
Kijin Sung 2023-10-05 22:18:25 +09:00
parent a643ff60eb
commit 221602ceb8
2 changed files with 115 additions and 10 deletions

View file

@ -38,6 +38,18 @@ class HTMLFilter
'web-share' => true,
);
/**
* List of tags where data-* attributes are allowed.
*/
protected static $_data_allowed = array(
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div', 'p',
'a', 'span', 'img', 'picture', 'b', 'i', 'strong', 'em', 'u', 's', 'sub', 'sup',
'header', 'footer', 'nav', 'main', 'section', 'article', 'aside', 'details', 'summary',
'ul', 'ol', 'li', 'mark', 'wbr', 'figure', 'figcaption', 'caption',
'table', 'thead', 'tbody', 'tr', 'th', 'td', 'ins', 'del',
'iframe', 'video', 'audio', 'source', 'track', 'blockquote', 'code',
);
/**
* Prepend a pre-processing filter.
*
@ -216,13 +228,6 @@ class HTMLFilter
$config->set('Cache.SerializerPath', \RX_BASEDIR . 'files/cache/htmlpurifier');
Storage::createDirectory(\RX_BASEDIR . 'files/cache/htmlpurifier');
// Modify the HTML definition to support editor components and widgets.
$def = $config->getHTMLDefinition(true);
$def->addAttribute('img', 'editor_component', 'Text');
$def->addAttribute('div', 'editor_component', 'Text');
$def->addAttribute('img', 'rx_encoded_properties', 'Text');
$def->addAttribute('div', 'rx_encoded_properties', 'Text');
// Support HTML5 and CSS3.
self::_supportHTML5($config);
self::_supportCSS3($config);
@ -320,13 +325,24 @@ class HTMLFilter
$def->addAttribute('details', 'open', 'Bool');
$def->addAttribute('i', 'aria-hidden', 'Text');
$def->addAttribute('img', 'srcset', 'Text');
$def->addAttribute('img', 'data-file-srl', 'Number');
$def->addAttribute('iframe', 'allow', 'Text');
$def->addAttribute('iframe', 'allowfullscreen', 'Bool');
$def->addAttribute('iframe', 'referrerpolicy', 'Enum#no-referrer,no-referrer-when-downgrade,origin,origin-when-cross-origin,same-origin,strict-origin,strict-origin-when-cross-origin,unsafe-url');
// Support contenteditable="false" (#1710)
$def->addAttribute('div', 'contenteditable', 'Enum#false');
// Support editor components and widgets.
$def->addAttribute('img', 'editor_component', 'Text');
$def->addAttribute('div', 'editor_component', 'Text');
$def->addAttribute('img', 'rx_encoded_properties', 'Text');
$def->addAttribute('div', 'rx_encoded_properties', 'Text');
// Support encoded data-* attributes for some tags.
foreach (self::$_data_allowed as $tag)
{
$def->addAttribute($tag, 'rx_encoded_datas', 'Text');
}
}
/**
@ -497,6 +513,9 @@ class HTMLFilter
{
$content = self::_encodeWidgetsAndEditorComponents($content, $allow_editor_components, $allow_widgets);
}
// Encode data-* attributes.
$content = self::_encodeDataAttributes($content);
return $content;
}
@ -565,6 +584,9 @@ class HTMLFilter
// Restore widget and editor component properties.
$content = self::_decodeWidgetsAndEditorComponents($content, $allow_editor_components, $allow_widgets);
// Restore data-* attributes.
$content = self::_decodeDataAttributes($content);
return $content;
}
@ -605,7 +627,7 @@ class HTMLFilter
{
return $attr[0];
}
$attrval = utf8_normalize_spaces(utf8_clean(html_entity_decode($attr[2])));
$attrval = trim(utf8_normalize_spaces(utf8_clean(html_entity_decode($attr[2]))));
if (preg_match('/^javascript:/i', preg_replace('/\s+/', '', $attrval)))
{
return '';
@ -660,7 +682,69 @@ class HTMLFilter
}
foreach ($decoded_properties as $key => $val)
{
$attrs[] = $key . '="' . htmlspecialchars($val) . '"';
$attrs[] = $key . '="' . htmlspecialchars($val, ENT_QUOTES, 'UTF-8') . '"';
}
return str_replace($match[3], ' ' . implode(' ', $attrs), $match[0]);
}, $content);
}
/**
* Encode data-* attributes so that they will survive being passed through HTMLPurifier.
*
* @param string $content
* @return string
*/
protected static function _encodeDataAttributes(string $content): string
{
$tags = implode('|', self::$_data_allowed);
return preg_replace_callback('!<(' . $tags . ')\s([^>]+)>!i', function($match) {
$attrs = array();
$html = preg_replace_callback('!\s(data-[a-zA-Z0-9_-]+)="([^"]*)"!', function($attr) use(&$attrs) {
$attrkey = strtolower($attr[1]);
$attrval = trim(utf8_normalize_spaces(utf8_clean(html_entity_decode($attr[2]))));
if (preg_match('/^javascript:/i', preg_replace('/\s+/', '', $attrval)))
{
return '';
}
if (preg_match('/-srl$/i', $attrkey) && !ctype_digit($attrval))
{
return '';
}
$attrs[$attrkey] = $attrval;
return '';
}, $match[0]);
$encoded_datas = base64_encode(json_encode($attrs));
$encoded_datas = $encoded_datas . ':' . Security::createSignature($encoded_datas);
return substr($html, 0, -1) . ' rx_encoded_datas="' . $encoded_datas . '">';
}, $content);
}
/**
* Decode data-* attributes after processing.
*
* @param string $content
* @param bool $allow_editor_components (optional)
* @param bool $allow_widgets (optional)
* @return string
*/
protected static function _decodeDataAttributes(string $content): string
{
$tags = implode('|', self::$_data_allowed);
return preg_replace_callback('!<(' . $tags . ')([^>]*)(\srx_encoded_datas="([^"]+)")!i', function($match) {
$attrs = array();
list($encoded_datas, $signature) = explode(':', $match[4]);
if (!Security::verifySignature($encoded_datas, $signature))
{
return str_replace($match[3], '', $match[0]);
}
$encoded_datas = json_decode(base64_decode($encoded_datas));
if (!$encoded_datas)
{
return str_replace($match[3], '', $match[0]);
}
foreach ($encoded_datas as $key => $val)
{
$attrs[] = $key . '="' . htmlspecialchars($val, ENT_QUOTES, 'UTF-8') . '"';
}
return str_replace($match[3], ' ' . implode(' ', $attrs), $match[0]);
}, $content);

View file

@ -257,9 +257,30 @@ class HTMLFilterTest extends \Codeception\TestCase\Test
$target = '<p><img src="foo.jpg" alt="foobar" data-file-srl="1234" /></p>';
$this->assertEquals($target, Rhymix\Framework\Filters\HTMLFilter::clean($source));
$source = '<p><img src="foo.jpg" alt="foobar" data-file-srl="NaN" /></p>';
$target = '<p><img src="foo.jpg" alt="foobar" /></p>';
$this->assertEquals($target, Rhymix\Framework\Filters\HTMLFilter::clean($source));
$source = '<p><img src="foo.jpg" alt="foobar" data-file-srl="javascript:xss()" /></p>';
$target = '<p><img src="foo.jpg" alt="foobar" /></p>';
$this->assertEquals($target, Rhymix\Framework\Filters\HTMLFilter::clean($source));
// Other data-* attribute
$source = '<div data-foo="foobar" data-bar="bazz" style="width:100%;">Hello World</div>';
$target = '<div style="width:100%;" data-foo="foobar" data-bar="bazz">Hello World</div>';
$this->assertEquals($target, Rhymix\Framework\Filters\HTMLFilter::clean($source));
$source = '<a href="#" data-not-properly-encoded="Rhymix\'s Future">Hello World</a>';
$target = '<a href="#" data-not-properly-encoded="Rhymix&#039;s Future">Hello World</a>';
$this->assertEquals($target, Rhymix\Framework\Filters\HTMLFilter::clean($source));
$source = '<article nonsense="#" data-json="{&quot;foo&quot;:[&quot;bar&quot;,777]}"><p>Hello World<p></article>';
$target = '<article data-json="{&quot;foo&quot;:[&quot;bar&quot;,777]}"><p>Hello World</p><p></p></article>';
$this->assertEquals($target, Rhymix\Framework\Filters\HTMLFilter::clean($source));
$source = '<p data-dangerous=" javascript: xss() ">Hello World</p>';
$target = '<p>Hello World</p>';
$this->assertEquals($target, Rhymix\Framework\Filters\HTMLFilter::clean($source));
}
public function testHTMLFilterFixMediaUrls()