class-html-processing-helper.php
<?php
/**
* This file is part of the WooCommerce Email Editor package
*
* @package Automattic\WooCommerce\EmailEditor
*/
declare( strict_types = 1 );
namespace Automattic\WooCommerce\EmailEditor\Integrations\Utils;
/**
* Helper class for HTML processing and manipulation.
*/
class Html_Processing_Helper {
	/**
	 * Tags that sanitize_caption_html() keeps. Any other tag is stripped and only its text content survives.
	 */
	private const CAPTION_ALLOWED_TAGS = array( 'strong', 'em', 'a', 'mark', 'kbd', 's', 'sub', 'sup', 'span', 'br' );

	/**
	 * CSS properties that sanitize_image_html() keeps in the inline style of an image.
	 */
	private const IMAGE_SAFE_CSS_PROPERTIES = array( 'width', 'height', 'max-width', 'max-height', 'display', 'margin', 'padding', 'border', 'border-radius' );

	/**
	 * Clean CSS classes by removing background and border related classes.
	 *
	 * Removes the generic `has-background` class, classes of the form `has-*border*`
	 * and classes of the form `*-border-*`. Only complete, whitespace-delimited class
	 * names are removed, so a class that merely contains one of these names
	 * (for example `has-background-dim`) is left intact instead of being cut into fragments.
	 *
	 * @param string $classes CSS classes to clean.
	 * @return string Cleaned CSS classes, single-space separated and trimmed.
	 */
	public static function clean_css_classes( string $classes ): string {
		// Limit input length to prevent DoS attacks.
		if ( strlen( $classes ) > 1000 ) {
			$classes = substr( $classes, 0, 1000 );
		}

		// `(?<!\S)` / `(?!\S)` anchor each pattern on whitespace or the string edges.
		// A plain `\b` would also match next to a hyphen and remove part of a longer class name.
		$removable_class_patterns = array(
			// Generic background class; specific colour classes such as has-red-background-color are kept.
			'/(?<!\S)has-background(?!\S)/',
			// Border classes.
			'/(?<!\S)has-[a-z-]*border[a-z-]*(?!\S)/',
			'/(?<!\S)[a-z-]+-border-[a-z-]+(?!\S)/',
		);
		foreach ( $removable_class_patterns as $pattern ) {
			$classes = self::replace_or_empty( $pattern, '', $classes );
		}

		// Clean up multiple spaces.
		return trim( self::replace_or_empty( '/\s+/', ' ', $classes ) );
	}

	/**
	 * Sanitize CSS value to prevent injection attacks.
	 *
	 * Angle brackets are removed. If the remaining value contains any of the
	 * blocked constructs listed below, the whole value is rejected.
	 *
	 * @param string $value CSS value to sanitize.
	 * @return string Sanitized, trimmed CSS value or empty string if invalid.
	 */
	public static function sanitize_css_value( string $value ): string {
		// Remove dangerous script injection characters (angle brackets) but preserve quotes for CSS strings.
		$value = self::replace_or_empty( '/[<>]/', '', $value );

		// Remove dangerous CSS functions and expressions.
		$dangerous_patterns = array(
			'/expression\s*\(/i',
			'/url\s*\(\s*javascript\s*:/i',
			'/url\s*\(\s*data\s*:/i',
			'/url\s*\(\s*vbscript\s*:/i',
			'/import\s*\(/i',
			'/behavior\s*:/i',
			'/binding\s*:/i',
			'/filter\s*:/i',
			'/progid\s*:/i',
		);
		foreach ( $dangerous_patterns as $pattern ) {
			if ( preg_match( $pattern, $value ) ) {
				return '';
			}
		}

		return trim( $value );
	}

	/**
	 * Sanitize dimension value to ensure it's a valid CSS dimension.
	 *
	 * Supports numeric values (converted to px) and standard CSS units.
	 * Only non-negative decimal numbers pass the final check, so values such as
	 * `-5` or `1e3` yield an empty string.
	 *
	 * @param mixed $value The dimension value to sanitize.
	 * @return string Sanitized dimension value or empty string if invalid.
	 */
	public static function sanitize_dimension_value( $value ): string {
		if ( ! is_string( $value ) && ! is_numeric( $value ) ) {
			return '';
		}

		$value = (string) $value;

		// If it's just a number, assume pixels.
		if ( is_numeric( $value ) ) {
			$value .= 'px';
		}

		// Use existing CSS value sanitization for security.
		$sanitized_value = self::sanitize_css_value( $value );

		// Additional validation for dimension-specific units. An empty string cannot match.
		if ( 1 === preg_match( '/^(\d+(?:\.\d+)?)(px|em|rem|%|vh|vw|ex|ch|in|cm|mm|pt|pc)$/', $sanitized_value ) ) {
			return $sanitized_value;
		}

		return '';
	}

	/**
	 * Sanitize color value to ensure it's a valid color format.
	 *
	 * Supports hex colors, rgb/rgba, hsl/hsla, named colors, and CSS variables.
	 * Hex and named colors are returned lower-cased; the other formats are returned as given (trimmed).
	 *
	 * @param string $color The color value to sanitize.
	 * @return string Sanitized color value or `#000000` if the value is not recognised.
	 */
	public static function sanitize_color( string $color ): string {
		// Remove any surrounding whitespace.
		$color = trim( $color );

		// Check if it's a valid hex color (#fff, #ffffff, #ffffffff).
		if ( preg_match( '/^#([0-9a-fA-F]{3}|[0-9a-fA-F]{6}|[0-9a-fA-F]{8})$/', $color ) ) {
			return strtolower( $color );
		}

		// Check for rgb/rgba colors.
		if ( preg_match( '/^rgba?\(\s*(25[0-5]|2[0-4]\d|1\d{2}|\d{1,2})\s*,\s*(25[0-5]|2[0-4]\d|1\d{2}|\d{1,2})\s*,\s*(25[0-5]|2[0-4]\d|1\d{2}|\d{1,2})\s*(?:,\s*(?:1(?:\.0+)?|0(?:\.\d+)?|\.\d+)\s*)?\)$/', $color ) ) {
			return $color;
		}

		// Check for hsl/hsla colors.
		if ( preg_match( '/^hsla?\(\s*(360|3[0-5]\d|[12]\d{2}|\d{1,2})\s*,\s*(100|[1-9]?\d)%\s*,\s*(100|[1-9]?\d)%\s*(?:,\s*(?:1(?:\.0+)?|0(?:\.\d+)?|\.\d+)\s*)?\)$/', $color ) ) {
			return $color;
		}

		// Check for named colors and other valid CSS color values.
		// We use a permissive approach: accept any plain identifier that does not start with a
		// blocked keyword and let the CSS engine handle the actual validation.
		if ( preg_match( '/^[a-zA-Z][a-zA-Z0-9-]*$/', $color ) && ! preg_match( '/^(expression|javascript|vbscript|data|import|behavior|binding|filter|progid)/i', $color ) ) {
			return strtolower( $color );
		}

		// Check if it's a CSS variable (var(--variable-name)).
		if ( preg_match( '/^var\(--[a-zA-Z0-9\-_]+\)$/', $color ) ) {
			return $color;
		}

		// If not a valid color format, return a safe default.
		return '#000000';
	}

	/**
	 * Normalize rel attribute by lowercasing, deduplicating tokens, and ensuring required tokens.
	 *
	 * Tokens outside the allowlist (noopener, noreferrer, nofollow, external) are discarded.
	 *
	 * @param string|null $rel_value               Current rel attribute value.
	 * @param bool        $require_security_tokens Whether to require noopener and noreferrer tokens.
	 * @return string Normalized rel attribute value, or empty string when no valid token remains.
	 */
	private static function normalize_rel_attribute( ?string $rel_value, bool $require_security_tokens = false ): string {
		$allowed_tokens = array( 'noopener', 'noreferrer', 'nofollow', 'external' );

		// Required tokens always come first in the result.
		$tokens = $require_security_tokens ? array( 'noopener', 'noreferrer' ) : array();

		if ( null !== $rel_value ) {
			$existing_tokens = preg_split( '/\s+/', trim( $rel_value ) );
			if ( false !== $existing_tokens ) {
				// Normalize existing tokens: lowercase, remove empty, filter allowed.
				$normalized_existing = array_filter(
					array_map( 'strtolower', $existing_tokens ),
					static function ( $token ) use ( $allowed_tokens ) {
						return ! empty( $token ) && in_array( $token, $allowed_tokens, true );
					}
				);

				// Merge with required tokens, removing duplicates.
				$tokens = array_unique( array_merge( $tokens, $normalized_existing ) );
			}
		}

		return implode( ' ', $tokens );
	}

	/**
	 * Validate and sanitize specific caption attributes for security.
	 *
	 * The attribute is rewritten to a sanitized value or removed from the current tag of
	 * `$html`. Anything that is not `href`, `target`, `rel`, `style`, `class` or a
	 * `data-*` attribute is removed (default deny). When `target` is `_blank`, the
	 * tag's `rel` attribute is also set so that it contains `noopener noreferrer`.
	 *
	 * @param \WP_HTML_Tag_Processor $html      HTML tag processor positioned on the tag to sanitize.
	 * @param string                 $attr_name Attribute name to validate.
	 */
	public static function validate_caption_attribute( \WP_HTML_Tag_Processor $html, string $attr_name ): void {
		$attr_value = $html->get_attribute( $attr_name );
		if ( null === $attr_value ) {
			return;
		}

		// Block all event handler attributes (on*) - Critical security fix.
		if ( str_starts_with( $attr_name, 'on' ) ) {
			$html->remove_attribute( $attr_name );
			return;
		}

		switch ( $attr_name ) {
			case 'href':
				// Only allow http, https, mailto, and tel protocols.
				if ( ! preg_match( '/^(https?:\/\/|mailto:|tel:)/i', (string) $attr_value ) ) {
					$html->remove_attribute( $attr_name );
					break;
				}

				// Sanitize and normalize the URL using WordPress's esc_url_raw.
				$sanitized_url = esc_url_raw( (string) $attr_value );
				if ( empty( $sanitized_url ) ) {
					// If esc_url_raw returns empty, the URL was invalid - remove the attribute.
					$html->remove_attribute( $attr_name );
				} else {
					$html->set_attribute( $attr_name, $sanitized_url );
				}
				break;

			case 'target':
				// Allow only common safe targets.
				$target_value = strtolower( (string) $attr_value );
				if ( ! in_array( $target_value, array( '_blank', '_self' ), true ) ) {
					$html->remove_attribute( $attr_name );
				} elseif ( '_blank' === $target_value ) {
					// When target is "_blank", ensure rel attribute has noopener and noreferrer.
					$current_rel = $html->get_attribute( 'rel' );
					$html->set_attribute(
						'rel',
						self::normalize_rel_attribute( is_string( $current_rel ) ? $current_rel : null, true )
					);
				}
				break;

			case 'rel':
				// Normalize rel attribute: lowercase, deduplicate, preserve safe tokens.
				$normalized_rel = self::normalize_rel_attribute( is_string( $attr_value ) ? $attr_value : null, false );
				if ( '' === $normalized_rel ) {
					$html->remove_attribute( $attr_name );
				} else {
					$html->set_attribute( $attr_name, $normalized_rel );
				}
				break;

			case 'style':
				// Only allow safe CSS properties for typography and basic styling.
				$sanitized_style = self::filter_style_declarations( (string) $attr_value, self::get_safe_css_properties() );
				if ( '' === $sanitized_style ) {
					$html->remove_attribute( $attr_name );
				} else {
					$html->set_attribute( $attr_name, $sanitized_style );
				}
				break;

			case 'class':
				// Only allow alphanumeric characters, whitespace, hyphens, and underscores.
				if ( ! preg_match( '/^[a-zA-Z0-9\s\-_]+$/', (string) $attr_value ) ) {
					$html->remove_attribute( $attr_name );
				}
				break;

			default:
				// data-* attributes (including data-type and data-id) may only hold
				// alphanumeric characters, hyphens, and underscores.
				if ( str_starts_with( $attr_name, 'data-' ) ) {
					if ( ! preg_match( '/^[a-zA-Z0-9\-_]+$/', (string) $attr_value ) ) {
						$html->remove_attribute( $attr_name );
					}
					break;
				}

				// Default deny policy: Remove any attribute not explicitly allowed.
				$html->remove_attribute( $attr_name );
				break;
		}
	}

	/**
	 * Get list of safe CSS properties for typography and basic styling.
	 *
	 * @return array Array of safe CSS property names.
	 */
	public static function get_safe_css_properties(): array {
		return array(
			'color',
			'background-color',
			'font-family',
			'font-size',
			'font-weight',
			'font-style',
			'text-decoration',
			'text-align',
			'line-height',
			'letter-spacing',
			'text-transform',
		);
	}

	/**
	 * Get list of safe CSS properties for caption typography (excludes background properties).
	 *
	 * @return array Array of safe CSS property names for captions.
	 */
	public static function get_caption_css_properties(): array {
		return array(
			'font-family',
			'font-size',
			'font-weight',
			'font-style',
			'text-decoration',
			'line-height',
			'letter-spacing',
			'text-transform',
		);
	}

	/**
	 * Validate HTML container attributes for security before content extraction.
	 * This method checks if a container element (like figcaption, span) has safe attributes.
	 *
	 * Only the first tag of `$container_html` is inspected. The container is reported as
	 * unsafe when validate_caption_attribute() would remove any of its attributes.
	 *
	 * @param string $container_html Full container HTML (e.g., <figcaption class="...">content</figcaption>).
	 * @return bool True if container attributes are safe, false otherwise (including when no tag is found).
	 */
	public static function validate_container_attributes( string $container_html ): bool {
		$html = new \WP_HTML_Tag_Processor( $container_html );
		if ( ! $html->next_tag() ) {
			return false;
		}

		$attributes = $html->get_attribute_names_with_prefix( '' );
		if ( ! is_array( $attributes ) ) {
			return true;
		}

		foreach ( $attributes as $attr_name ) {
			$attr_value = $html->get_attribute( $attr_name );
			if ( null === $attr_value ) {
				continue;
			}

			// Block event handlers immediately.
			if ( str_starts_with( $attr_name, 'on' ) ) {
				return false;
			}

			// Run the caption rules against a throw-away element carrying only this attribute,
			// so that edits made during validation never touch the container being inspected.
			$escaped_value = htmlspecialchars( (string) $attr_value, ENT_QUOTES, 'UTF-8' );
			$temp_html     = new \WP_HTML_Tag_Processor( '<span ' . $attr_name . '="' . $escaped_value . '">test</span>' );
			if ( ! $temp_html->next_tag() ) {
				continue;
			}

			$original_value = $temp_html->get_attribute( $attr_name );
			self::validate_caption_attribute( $temp_html, $attr_name );

			// If attribute was removed during validation, container is unsafe.
			if ( null !== $original_value && null === $temp_html->get_attribute( $attr_name ) ) {
				return false;
			}
		}

		return true;
	}

	/**
	 * Sanitize caption HTML to allow only specific tags and attributes.
	 *
	 * Processing order:
	 * 1. Paired script/style/iframe/object/embed/form/input/button elements are removed with their content.
	 * 2. Every tag outside the allowlist is stripped, keeping its text content.
	 * 3. Attributes of the remaining tags are validated with validate_caption_attribute().
	 *
	 * Stripping happens before attribute validation so that markup which was nested inside
	 * a removed element is validated as well. Markup that ends in the middle of a tag or
	 * comment yields an empty string.
	 *
	 * @param string $caption_html Raw caption HTML.
	 * @return string Sanitized caption HTML.
	 */
	public static function sanitize_caption_html( string $caption_html ): string {
		// If no HTML tags, return as-is.
		if ( false === strpos( $caption_html, '<' ) ) {
			return $caption_html;
		}

		// Remove dangerous content: script, style, and other executable elements.
		$caption_html = self::replace_or_empty( '/<(script|style|iframe|object|embed|form|input|button)\b[^>]*>.*?<\/\1>/is', '', $caption_html );

		// Remove every tag that is not on the allowlist, keeping only its content.
		$caption_html = self::strip_disallowed_tags( $caption_html, self::CAPTION_ALLOWED_TAGS );

		$html = new \WP_HTML_Tag_Processor( $caption_html );
		while ( $html->next_tag() ) {
			// get_tag() reports tag names in upper case; the allowlist is lower case.
			$is_allowed = in_array( strtolower( (string) $html->get_tag() ), self::CAPTION_ALLOWED_TAGS, true );

			$attributes = $html->get_attribute_names_with_prefix( '' );
			if ( ! is_array( $attributes ) ) {
				continue;
			}

			foreach ( $attributes as $attr_name ) {
				if ( $is_allowed ) {
					// Validate and sanitize each attribute individually.
					self::validate_caption_attribute( $html, $attr_name );
				} else {
					// Defensive: a tag that survived stripping unexpectedly keeps no attributes.
					$html->remove_attribute( $attr_name );
				}
			}
		}

		// Truncated markup was not visited above and could absorb whatever follows the
		// caption in the final document, so it is rejected. The method only exists on
		// newer WordPress versions, hence the guard.
		if ( method_exists( $html, 'paused_at_incomplete_token' ) && $html->paused_at_incomplete_token() ) {
			return '';
		}

		return $html->get_updated_html();
	}

	/**
	 * Sanitize image HTML while preserving necessary attributes for email rendering.
	 *
	 * The first `<img ...>` found in the input is rebuilt from its sanitized `src`, `alt`,
	 * `width`, `height`, `class` and `style` attributes; all other attributes are dropped.
	 *
	 * @param string $image_html Raw image HTML.
	 * @return string Rebuilt `<img>` tag, an empty string when the image has no usable `src`,
	 *                or the unchanged input when it contains no `<` or no `<img>` tag.
	 */
	public static function sanitize_image_html( string $image_html ): string {
		// If no HTML tags, return as-is.
		if ( false === strpos( $image_html, '<' ) ) {
			return $image_html;
		}

		// Extract img tag using regex for reliable processing.
		// NOTE(review): markup that contains tags but no <img> is returned without any
		// sanitization. Kept for backward compatibility - confirm that callers never pass
		// untrusted markup that may lack an image.
		if ( ! preg_match( '/<img[^>]*>/i', $image_html, $matches ) ) {
			return $image_html;
		}

		$html = new \WP_HTML_Tag_Processor( $matches[0] );
		if ( ! $html->next_tag() ) {
			return '';
		}

		$attributes = $html->get_attribute_names_with_prefix( '' );
		if ( ! is_array( $attributes ) ) {
			return '';
		}

		$sanitized_attributes = array();
		$has_src              = false;

		foreach ( $attributes as $attr_name ) {
			// A valueless attribute is reported as boolean true; treat it as an empty
			// string rather than letting the cast turn it into "1".
			$raw_value  = $html->get_attribute( $attr_name );
			$attr_value = is_string( $raw_value ) ? $raw_value : '';

			switch ( $attr_name ) {
				case 'src':
					// Sanitize image source URL.
					$sanitized_src = esc_url( $attr_value );
					if ( ! empty( $sanitized_src ) ) {
						$sanitized_attributes[] = $attr_name . '="' . $sanitized_src . '"';
						$has_src                = true;
					}
					break;

				case 'alt':
				case 'width':
				case 'height':
					// Sanitize text attributes.
					$sanitized_attributes[] = $attr_name . '="' . esc_attr( $attr_value ) . '"';
					break;

				case 'class':
					// Clean CSS classes.
					$cleaned_classes = self::clean_css_classes( $attr_value );
					if ( '' !== $cleaned_classes ) {
						$sanitized_attributes[] = $attr_name . '="' . esc_attr( $cleaned_classes ) . '"';
					}
					break;

				case 'style':
					// Sanitize inline styles - only allow safe properties for email rendering.
					$sanitized_styles = self::sanitize_image_styles( $attr_value );
					if ( '' !== $sanitized_styles ) {
						$sanitized_attributes[] = $attr_name . '="' . esc_attr( $sanitized_styles ) . '"';
					}
					break;
			}
		}

		// If no valid src attribute, return empty string.
		if ( ! $has_src ) {
			return '';
		}

		// Rebuild the img tag with sanitized attributes.
		return '<img ' . implode( ' ', $sanitized_attributes ) . '>';
	}

	/**
	 * Sanitize inline styles for image elements - only allow safe properties for email rendering.
	 *
	 * @param string $style_value Raw style value.
	 * @return string Sanitized style value.
	 */
	private static function sanitize_image_styles( string $style_value ): string {
		return self::filter_style_declarations( $style_value, self::IMAGE_SAFE_CSS_PROPERTIES );
	}

	/**
	 * Keep only the declarations of an inline style whose property is allowed and whose value is safe.
	 *
	 * The style is split on `;` and each declaration on its first `:`. Property names are
	 * compared lower-cased, and values are passed through sanitize_css_value().
	 *
	 * @param string $style_value        Raw inline style.
	 * @param array  $allowed_properties Lower-case CSS property names to keep.
	 * @return string Declarations formatted as `property: value` and joined with `; `.
	 */
	private static function filter_style_declarations( string $style_value, array $allowed_properties ): string {
		$kept_declarations = array();

		foreach ( explode( ';', $style_value ) as $declaration ) {
			$parts = explode( ':', $declaration, 2 );
			if ( 2 !== count( $parts ) ) {
				continue;
			}

			$property = strtolower( trim( $parts[0] ) );
			if ( ! in_array( $property, $allowed_properties, true ) ) {
				continue;
			}

			// Compare against '' explicitly: empty() would also discard the valid value "0"
			// (e.g. "margin: 0" or "border: 0").
			$sanitized_value = self::sanitize_css_value( $parts[1] );
			if ( '' !== $sanitized_value ) {
				$kept_declarations[] = $property . ': ' . $sanitized_value;
			}
		}

		return implode( '; ', $kept_declarations );
	}

	/**
	 * Remove every opening, closing or self-closing tag whose name is not allowed, keeping the text between tags.
	 *
	 * Tags are removed one by one rather than in open/close pairs, so unpaired and nested
	 * disallowed tags are removed too. Removal is repeated until the markup stops changing,
	 * because deleting a tag can join the surrounding text into a new tag
	 * (`<<div>script>` becomes `<script>`). A trailing tag that is never closed by `>`
	 * is dropped as well.
	 *
	 * @param string $html         Markup to filter.
	 * @param array  $allowed_tags Lower-case tag names to keep.
	 * @return string Filtered markup, or an empty string if the markup cannot be brought to a stable state.
	 */
	private static function strip_disallowed_tags( string $html, array $allowed_tags ): string {
		// Group 1 is the tag name: it ends at HTML whitespace, "/" or ">".
		$tag_pattern  = '/<\/?([a-zA-Z][^\t\n\f\r \/>]*)[^>]*>/';
		$keep_allowed = static function ( array $tag_match ) use ( $allowed_tags ): string {
			return in_array( strtolower( $tag_match[1] ), $allowed_tags, true ) ? $tag_match[0] : '';
		};

		$passes_left = 10;
		do {
			$before = $html;
			$html   = preg_replace_callback( $tag_pattern, $keep_allowed, $before );
			if ( null === $html ) {
				return '';
			}
			--$passes_left;
		} while ( $html !== $before && $passes_left > 0 );

		// Still changing after the pass limit: deliberately obfuscated nesting, fail closed.
		if ( $html !== $before ) {
			return '';
		}

		// Drop a trailing tag that is never closed.
		return self::replace_or_empty( '/<\/?[a-zA-Z][^>]*\z/', '', $html );
	}

	/**
	 * Run preg_replace() and fail closed: a regex engine error yields an empty string.
	 *
	 * @param string $pattern     Regular expression.
	 * @param string $replacement Replacement string.
	 * @param string $subject     Input string.
	 * @return string Result of the replacement, or empty string when preg_replace() fails.
	 */
	private static function replace_or_empty( string $pattern, string $replacement, string $subject ): string {
		$result = preg_replace( $pattern, $replacement, $subject );

		return is_string( $result ) ? $result : '';
	}
}