"""
HTML cleaning utilities for problem text.
Provides functions to clean HTML content while preserving:
- Inline MathML (fractions, superscripts, subscripts)
- Wiris math images (extracted from data-mathml attribute)
- Table structure (formatted with | separators)
- Image placeholders
"""
import html
import pandas as pd
from bs4 import BeautifulSoup, NavigableString
# Two-argument MathML layouts mapped to the text template that flattens them.
_TWO_PART_TEMPLATES = {
    "mfrac": "({}/{})",
    "msup": "{}^{}",
    "msub": "{}_{}",
}

# Operators that are emitted with one space on each side.
_SPACED_OPERATORS = ("÷", "×", "·", "+", "-", "=", "<", ">", "≤", "≥", "≠")

# Grouping tags whose text is simply the concatenation of their children.
_CONTAINER_TAGS = ("mrow", "math", "mpadded", "mstyle")


def parse_mathml_element(elem):
    """Convert one MathML node (and everything beneath it) to plain text.

    Rendering rules, by ``elem.name``:

    - ``None``: the node's own string form, stripped of surrounding
      whitespace.
    - ``mfrac`` / ``msup`` / ``msub``: the first two *named* children are
      converted recursively and combined as ``(a/b)``, ``a^b`` or ``a_b``.
      With fewer than two named children the element's stripped text is
      returned unchanged.
    - ``msqrt``: ``√(...)`` around the concatenated children.
    - ``mrow`` / ``math`` / ``mpadded`` / ``mstyle``: the concatenated
      children.
    - ``mo``: the operator text, padded with a space on each side when it
      is one of the listed arithmetic/relational operators.
    - anything else (including ``mn``, ``mi``, ``mtext``): the element's
      stripped text.

    Returns:
        The text rendering of ``elem`` as a string.
    """
    tag = elem.name

    if tag is None:
        return str(elem).strip()

    template = _TWO_PART_TEMPLATES.get(tag)
    if template is not None:
        # Only named children count; nameless nodes between them are ignored.
        tagged = [node for node in elem.children if node.name]
        if len(tagged) < 2:
            return elem.get_text(strip=True)
        first = parse_mathml_element(tagged[0])
        second = parse_mathml_element(tagged[1])
        return template.format(first, second)

    if tag == "msqrt":
        radicand = parse_mathml_element_children(elem)
        return f"√({radicand})"

    if tag in _CONTAINER_TAGS:
        return parse_mathml_element_children(elem)

    # Leaf-like tags and unknown tags all reduce to their stripped text;
    # only spaced operators get extra padding.
    text = elem.get_text(strip=True)
    if tag == "mo" and text in _SPACED_OPERATORS:
        return f" {text} "
    return text
def parse_mathml_element_children(elem):
    """Render every direct child of ``elem`` and concatenate the results.

    ``NavigableString`` children contribute their stripped text (and are
    dropped when that text is empty). Any other child with a truthy
    ``name`` is rendered through ``parse_mathml_element``. Pieces are
    joined with no separator.

    Returns:
        The concatenated text of all children, as a string.
    """
    pieces = []
    for node in elem.children:
        if not isinstance(node, NavigableString):
            # Non-string nodes are rendered only when they have a name.
            if node.name:
                pieces.append(parse_mathml_element(node))
            continue

        stripped = str(node).strip()
        if stripped:
            pieces.append(stripped)
    return "".join(pieces)
def clean_problem_body(text):
"""
Clean HTML problem body with full MathML handling.
Handles:
- Inline MathML (