<?php
declare(strict_types=1);
namespace App\Service;
use DOMDocument;
use DOMElement;
use DOMNode;
use DOMXPath;
/**
 * Sanitizes HTML taken from e-mails (typically Word/Outlook output) so that it
 * can be embedded into another page.
 *
 * Pipeline of {@see sanitize()}:
 *  1. decode HTML entities and cut the input down to the content of <body>,
 *  2. regex based clean-up of Word artefacts plus a coarse strip_tags() pass
 *     ({@see normalizeHtml()}),
 *  3. DOM round-trip that repairs the structure and enforces the tag,
 *     attribute and URL whitelists ({@see cleanTicketHtml()}).
 */
final class MailHtmlSanitizer
{
    /** Tags that may stay in the output; any other element is unwrapped (its children are kept). */
    private const ALLOWED_TAGS = [
        'p', 'div', 'br', 'span', 'b', 'strong', 'i', 'em', 'u', 'a',
        'ul', 'ol', 'li',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'img',
        'table', 'thead', 'tbody', 'tfoot', 'tr', 'td', 'th',
        'blockquote', 'pre', 'code',
    ];

    /**
     * Elements that are removed together with their content instead of being
     * unwrapped: their text is code/CSS or embedded content, not user-visible text.
     */
    private const DROPPED_TAGS = [
        'script', 'style', 'iframe', 'object', 'embed', 'noscript', 'template',
    ];

    /** Allowed attributes per tag; tags not listed here keep no attributes at all. */
    private const ALLOWED_ATTRS = [
        'a' => ['href', 'title', 'name'],
        'img' => ['src', 'alt', 'title', 'width', 'height'],
        'td' => ['colspan', 'rowspan'],
        'th' => ['colspan', 'rowspan'],
    ];

    /** MIME types accepted for <img src="data:..."> (deliberately no SVG). */
    private const ALLOWED_DATA_IMAGE_MIME = [
        'image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/avif',
    ];

    /** Id of the temporary wrapper element used while parsing a fragment. */
    private const WRAPPER_ID = '__wrapper__';

    /**
     * Regex/strip_tags based pre-cleaning of Word/Outlook HTML.
     *
     * Removes Mso classes, double-quoted style attributes, <o:p> blocks, mso-*
     * declarations and comments, drops empty spans, resets <div> attributes,
     * strips every tag outside a small fixed list, collapses runs of three or
     * more <br> and normalizes whitespace.
     *
     * @param string $html Raw HTML fragment.
     *
     * @return string Pre-cleaned HTML. Attributes are NOT whitelisted here;
     *                that happens in {@see cleanTicketHtml()}.
     */
    public function normalizeHtml(string $html): string
    {
        // Applied strictly in this order; each pattern sees the result of the previous one.
        $replacements = [
            // 1) Microsoft Word artefacts
            '/class="Mso[^"]*"/i' => '',
            '/style="[^"]*"/i' => '',
            '/<o:p>.*?<\/o:p>/i' => '',
            '/<o:p><\/o:p>/i' => '',
            '/mso-[a-zA-Z0-9\-]+:[^;"]*;?/i' => '',
            '/<!--.*?-->/s' => '',
            // 2) Empty spans
            '/<span[^>]*>\s*<\/span>/i' => '',
            // 3) Word specific wrappers: drop all <div> attributes
            '/<div[^>]*>/i' => '<div>',
        ];
        foreach ($replacements as $pattern => $replacement) {
            $html = $this->replacePattern($pattern, $replacement, $html);
        }

        // 4) Coarse tag whitelist (including <img>)
        $html = strip_tags($html, '<p><ul><ol><li><br><strong><em><b><i><u><a><div><img>');

        // 5) Collapse three or more consecutive <br> into two
        $html = $this->replacePattern('/(<br\s*\/?>\s*){3,}/i', '<br><br>', $html);

        // 6) Normalize whitespace
        $html = trim($this->replacePattern('/\s+/', ' ', $html));

        // 7) Cosmetic: remove a single blank between two tags.
        // NOTE(review): both search entries look identical here; the second one may
        // originally have contained a non-breaking space - confirm against the repository.
        // NOTE(review): this also removes the blank between adjacent inline elements
        // ("<b>a</b> <i>b</i>") - confirm that merging those words is intended.
        return str_replace(['> <', '> <'], '><', $html);
    }

    /**
     * Sanitizes a complete mail body.
     *
     * @param string|null $html HTML of the mail, or null when there is none.
     *
     * @return string "--" for null, the unchanged input when it is blank,
     *                otherwise the sanitized HTML fragment.
     */
    public function sanitize(?string $html = null): string
    {
        if ($html === null) {
            return "--";
        }
        if (trim($html) === '') {
            return $html;
        }

        // NOTE(review): decoding before parsing turns escaped text such as "&lt;b&gt;"
        // into real markup. Kept because callers may pass entity-encoded HTML - confirm.
        // The decoded markup is still fully sanitized below.
        $html = html_entity_decode($html);

        $html = $this->extractBodyContent($html);
        $html = $this->normalizeHtml($html);

        return $this->cleanTicketHtml($html);
    }

    /**
     * Repairs and sanitizes an HTML fragment via DOMDocument.
     *
     * - wraps the fragment in a temporary <div> so stray closing tags cannot
     *   escape into the surrounding page,
     * - lets DOMDocument repair the structure (nested <p>, mismatched <div>, ...),
     * - removes empty Word paragraphs (p.MsoNormal without content),
     * - enforces the tag, attribute and URL whitelists,
     * - returns only the inner content of the wrapper.
     *
     * @param string $html UTF-8 HTML fragment.
     *
     * @return string Sanitized fragment; non-ASCII characters may be returned as entities.
     */
    public function cleanTicketHtml(string $html): string
    {
        // DOMDocument::loadHTML() does not assume UTF-8 without a charset hint, so every
        // non-ASCII character is passed as a numeric entity.
        $encoded = mb_encode_numericentity($html, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');

        $dom = new DOMDocument();

        // Broken markup is expected: collect parser errors instead of emitting warnings,
        // and restore the caller's libxml setting afterwards.
        $previousSetting = libxml_use_internal_errors(true);
        try {
            $dom->loadHTML(
                '<div id="' . self::WRAPPER_ID . '">' . $encoded . '</div>',
                LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD
            );
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previousSetting);
        }

        // 1. Remove empty p.MsoNormal. Must run before walk(), which strips class attributes.
        $xpath = new DOMXPath($dom);
        foreach ($xpath->query('//p[@class="MsoNormal"]') ?: [] as $paragraph) {
            if ($paragraph instanceof DOMElement && $this->isEmptyParagraph($paragraph)) {
                $paragraph->parentNode?->removeChild($paragraph);
            }
        }

        // 2. Locate the wrapper
        $root = $dom->getElementById(self::WRAPPER_ID) ?? $dom->documentElement;
        if ($root === null) {
            // Nothing parseable: fail closed and return the plain text, escaped.
            return htmlspecialchars(strip_tags($html), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
        }

        // 3. Enforce the whitelists on everything below the wrapper
        $this->walk($root);

        // 4. Serialize the wrapper's children only
        $clean = '';
        foreach ($root->childNodes as $child) {
            $clean .= (string) $dom->saveHTML($child);
        }

        return $clean;
    }

    /**
     * Returns the content between the opening <body ...> tag and the first
     * following "</body", or the unchanged input when there is no <body> tag.
     * A missing closing tag means "until the end of the input".
     *
     * All offsets are byte offsets and are only used with byte based functions.
     */
    private function extractBodyContent(string $html): string
    {
        if (preg_match('/<body\b[^>]*>/i', $html, $match, PREG_OFFSET_CAPTURE) !== 1) {
            return $html;
        }

        $contentStart = $match[0][1] + strlen($match[0][0]);
        $contentEnd = stripos($html, '</body', $contentStart);

        return $contentEnd === false
            ? substr($html, $contentStart)
            : substr($html, $contentStart, $contentEnd - $contentStart);
    }

    /**
     * preg_replace() wrapper that keeps the subject unchanged when PCRE fails
     * (e.g. backtrack limit on very large mails) instead of yielding null.
     */
    private function replacePattern(string $pattern, string $replacement, string $subject): string
    {
        return preg_replace($pattern, $replacement, $subject) ?? $subject;
    }

    /** True when the paragraph has only whitespace text and no element descendants (e.g. no <img>). */
    private function isEmptyParagraph(DOMElement $paragraph): bool
    {
        return trim($paragraph->textContent) === ''
            && $paragraph->getElementsByTagName('*')->length === 0;
    }

    /** Replaces an element by its own children (the element itself disappears). */
    private function unwrap(DOMElement $el): void
    {
        $parent = $el->parentNode;
        if ($parent === null) {
            return;
        }
        while ($el->firstChild !== null) {
            $parent->insertBefore($el->firstChild, $el);
        }
        $parent->removeChild($el);
    }

    /**
     * Recursively enforces the whitelists on all descendants of $node.
     *
     * - comments and DROPPED_TAGS elements are removed with their content,
     * - namespaced (o:p, v:shape, ...) and non-whitelisted elements are unwrapped,
     * - whitelisted elements keep only their allowed attributes,
     * - <img> with an unsafe src is removed, <a> with an unsafe href loses the href.
     */
    private function walk(DOMNode $node): void
    {
        // Snapshot first: childNodes is a live list and is modified below.
        foreach (iterator_to_array($node->childNodes, false) as $child) {
            if ($child instanceof \DOMComment) {
                $child->parentNode?->removeChild($child);
                continue;
            }
            if (!($child instanceof DOMElement)) {
                continue; // text nodes etc. have nothing to sanitize
            }

            $tag = strtolower($child->tagName);

            if (in_array($tag, self::DROPPED_TAGS, true)) {
                $child->parentNode?->removeChild($child);
                continue;
            }

            if (str_contains($tag, ':') || !in_array($tag, self::ALLOWED_TAGS, true)) {
                // Sanitize the subtree BEFORE hoisting it: the hoisted children are not
                // part of the snapshot above and would otherwise never be visited.
                $this->walk($child);
                $this->unwrap($child);
                continue;
            }

            $this->cleanAttributes($child);

            if ($tag === 'img' && !$this->isSafeImgSrc($child->getAttribute('src'))) {
                $child->parentNode?->removeChild($child); // unsafe image: drop it
                continue;
            }
            if ($tag === 'a') {
                $href = $child->getAttribute('href');
                if ($href !== '' && !$this->isSafeHref($href)) {
                    $child->removeAttribute('href'); // the link text stays
                }
            }

            $this->walk($child);
        }
    }

    /**
     * Removes every attribute that is not whitelisted for the element's tag.
     * Event handlers (on*), style and class are always removed; width, height,
     * colspan and rowspan must be plain numbers of at most five digits.
     */
    private function cleanAttributes(DOMElement $el): void
    {
        $allowed = self::ALLOWED_ATTRS[strtolower($el->tagName)] ?? [];

        // Snapshot: the attribute map is live as well.
        foreach (iterator_to_array($el->attributes, false) as $attr) {
            $name = strtolower($attr->name);

            $forbidden = str_starts_with($name, 'on')
                || $name === 'style'
                || $name === 'class'
                || !in_array($name, $allowed, true);

            $numericOnly = in_array($name, ['width', 'height', 'colspan', 'rowspan'], true);

            if ($forbidden || ($numericOnly && preg_match('/^\d{1,5}$/', trim($attr->value)) !== 1)) {
                $el->removeAttributeNode($attr);
            }
        }
    }

    /**
     * Accepts http, https, mailto and tel URLs as well as scheme-less (relative) URLs.
     */
    private function isSafeHref(string $href): bool
    {
        // URL parsers drop leading control characters/spaces and tab/newline anywhere in
        // the URL, so "java\tscript:" must be checked as "javascript:".
        $normalized = preg_replace('/^[\x00-\x20]+|[\t\n\r]+/', '', $href);
        if ($normalized === null) {
            return false;
        }

        if (preg_match('#^([a-z][a-z0-9+\-.]*):#i', $normalized, $m) === 1) {
            return in_array(strtolower($m[1]), ['http', 'https', 'mailto', 'tel'], true);
        }

        return true; // no scheme: relative URL
    }

    /**
     * Accepts http, https and cid URLs plus base64 data URLs whose MIME type is
     * listed in ALLOWED_DATA_IMAGE_MIME. Everything else (including an empty src) is unsafe.
     */
    private function isSafeImgSrc(string $src): bool
    {
        if (preg_match('#^\s*(https?|cid):#i', $src) === 1) {
            return true;
        }

        if (preg_match('#^\s*data:([^;]+);base64,#i', $src, $m) === 1) {
            return in_array(strtolower(trim($m[1])), self::ALLOWED_DATA_IMAGE_MIME, true);
        }

        return false;
    }
}