# FoundationalASSIST/Code/clean_utils.py
# Uploaded by martinakaduc using huggingface_hub (commit 6256eb9, verified).
"""
HTML cleaning utilities for problem text.
Provides functions to clean HTML content while preserving:
- Inline MathML (fractions, superscripts, subscripts)
- Wiris math images (extracted from data-mathml attribute)
- Table structure (formatted with | separators)
- Image placeholders
"""
import html
import pandas as pd
from bs4 import BeautifulSoup, NavigableString
# Operators that are rendered with a space on each side.  U+2212 (the minus
# sign MathML normally uses) is listed next to the ASCII hyphen so both forms
# of "minus" are spaced the same way.
_SPACED_OPERATORS = frozenset("÷×·+-−=<>≤≥≠")

# Token elements: their text content is the whole value.
_TOKEN_TAGS = frozenset({"mn", "mi", "mtext"})

# Elements whose meaning depends on the position of their element children:
# tag -> (number of operands required, format template over those operands).
_POSITIONAL_FORMATS = {
    "mfrac": (2, "({0}/{1})"),
    "msup": (2, "{0}^{1}"),
    "msub": (2, "{0}_{1}"),
    "msubsup": (3, "{0}_{1}^{2}"),
    "mroot": (2, "√[{1}]({0})"),
}


def _element_children(elem):
    """Return the element (non-text) children of *elem*, in document order."""
    return [c for c in elem.children if not isinstance(c, str) and c.name]


def _parse_mfenced(elem):
    """Render an ``<mfenced>`` element as its children wrapped in fences.

    The ``open``/``close`` attributes default to parentheses and
    ``separators`` defaults to a comma.  ``separators`` is read as a sequence
    of characters (whitespace ignored): the i-th character separates operands
    i and i+1 and the last one repeats when the sequence runs out.  An empty
    ``separators`` attribute means no separator at all.
    """
    opening = elem.get("open", "(")
    closing = elem.get("close", ")")
    separators = "".join(elem.get("separators", ",").split())

    pieces = []
    for index, child in enumerate(_element_children(elem)):
        if index and separators:
            pieces.append(separators[min(index - 1, len(separators) - 1)] + " ")
        pieces.append(parse_mathml_element(child))
    inner = "".join(pieces)
    return f"{opening}{inner}{closing}"


def parse_mathml_element(elem):
    """Recursively convert a MathML node into a plain-text expression.

    Args:
        elem: A parsed node: either a text node (a ``str`` subclass, or any
            object whose ``name`` is ``None``) or an element exposing
            ``name``, ``children``, ``get_text()`` and ``get()``.

    Returns:
        The text rendering of the node:

        - text node                -> its stripped text
        - ``mn`` / ``mi`` / ``mtext`` -> stripped text content
        - ``mo``                   -> the operator, padded with spaces when it
          is one of the common binary/relational operators
        - ``mfrac``                -> ``(num/denom)``
        - ``msup`` / ``msub``      -> ``base^exp`` / ``base_sub``
        - ``msubsup``              -> ``base_sub^sup``
        - ``msqrt`` / ``mroot``    -> ``√(content)`` / ``√[index](base)``
        - ``mfenced``              -> children wrapped in the fence characters
        - anything else            -> the concatenated rendering of its
          children

    Unknown wrappers and positional elements with too few operands are
    rendered by recursing into their children rather than by flattening the
    text, so structure nested inside them (for example a fraction inside
    ``<menclose>``) is not collapsed into run-together digits.
    """
    if isinstance(elem, str) or elem.name is None:
        return str(elem).strip()

    tag = elem.name

    if tag in _TOKEN_TAGS:
        return elem.get_text(strip=True)

    if tag == "mo":
        operator = elem.get_text(strip=True)
        return f" {operator} " if operator in _SPACED_OPERATORS else operator

    layout = _POSITIONAL_FORMATS.get(tag)
    if layout is not None:
        arity, template = layout
        operands = _element_children(elem)
        if len(operands) >= arity:
            rendered = [parse_mathml_element(o) for o in operands[:arity]]
            return template.format(*rendered)
        # Malformed element: fall back to whatever content is present.
        return parse_mathml_element_children(elem)

    if tag == "msqrt":
        return f"√({parse_mathml_element_children(elem)})"

    if tag == "mfenced":
        return _parse_mfenced(elem)

    # mrow / math / mpadded / mstyle and every unrecognised wrapper are
    # transparent: only their children contribute to the output.
    return parse_mathml_element_children(elem)


def parse_mathml_element_children(elem):
    """Render every child of *elem* and concatenate the results.

    Text children contribute their stripped text (whitespace-only text is
    skipped); element children are rendered with ``parse_mathml_element``.
    No separator is inserted between the parts.

    Args:
        elem: An element exposing an iterable ``children`` attribute.

    Returns:
        The concatenated text of all children, or ``""`` when there are none.
    """
    parts = []
    for child in elem.children:
        # bs4 text nodes (NavigableString and its subclasses) are str
        # subclasses, so a plain str check identifies them.
        if isinstance(child, str):
            text = str(child).strip()
            if text:
                parts.append(text)
        elif child.name:
            parts.append(parse_mathml_element(child))
    return "".join(parts)
# Alt texts that carry no information and must not be used as a math label.
_PLACEHOLDER_ALTS = ("NO ALT", "NONE")


def _decode_wiris_mathml(data_mathml):
    """Turn the value of a Wiris ``data-mathml`` attribute into plain text.

    The attribute stores MathML with ``«`` / ``»`` / ``¨`` standing in for
    ``<`` / ``>`` / ``"`` and ``§`` standing in for ``&`` in character
    references.  Returns the stripped text, which may be empty.
    """
    markup = data_mathml.replace("«", "<").replace("»", ">").replace("¨", '"')
    msoup = BeautifulSoup(markup, "html.parser")
    math_elem = msoup.find("math")
    if math_elem is not None:
        rendered = parse_mathml_element(math_elem)
    else:
        rendered = msoup.get_text(separator="")

    # These two references are mapped to ASCII on purpose (hyphen instead of
    # U+2212, plain space instead of a no-break space); every other numeric
    # reference is decoded generically below.
    rendered = rendered.replace("§#8722;", "-").replace("§#160;", " ")
    rendered = rendered.replace("§#", "&#")
    return html.unescape(rendered).strip()


def _image_placeholder(img):
    """Return the bracketed text that replaces an ``<img>`` element."""
    alt = (img.get("alt") or "").strip()
    src = img.get("src") or ""
    data_mathml = img.get("data-mathml") or ""

    # An image is treated as a Wiris formula when its URL mentions wiris or
    # when it carries a data-mathml payload (the URL may be a data: URI that
    # does not mention wiris at all).
    if "wiris" in src.lower() or data_mathml:
        if alt and alt not in _PLACEHOLDER_ALTS:
            return f" [{alt}] "
        if data_mathml:
            rendered = _decode_wiris_mathml(data_mathml)
            return f" [{rendered}] " if rendered else " [math] "
        return " [math] "

    if alt:
        return f" [Image: {alt[:100]}] "
    return " [image] "


def _table_cell_text(cell):
    """Return a cell's text with fragments space-separated and collapsed.

    ``get_text(strip=True)`` would join the fragments of mixed content such
    as ``Hello <b>world</b>`` with no separator ("Helloworld"), so the
    fragments are joined with a space and the whitespace is normalised.
    """
    return " ".join(cell.get_text(separator=" ").split())


def clean_problem_body(text):
    """Convert an HTML problem body into a single line of plain text.

    Handles:
    - Inline MathML (<math>, <mfrac>, <msup>, etc.) → (4/3), x^2
    - Wiris math images (alt text or data-mathml attribute) → [15÷12]
    - Tables → [Table: Col1 | Col2 ...]
    - Regular images → [Image: alt text] or [image]
    - HTML entities → decoded properly

    Args:
        text: The HTML source.  Missing values (``None``, NaN, ``pd.NA``) and
            the empty string are accepted; any other value is converted with
            ``str()`` before parsing.

    Returns:
        The cleaned text with every run of whitespace collapsed to a single
        space, or ``""`` for missing/empty input.
    """
    if pd.isna(text) or text == "":
        return ""

    soup = BeautifulSoup(str(text), "html.parser")

    # 1. Inline MathML: replace each <math> element by its text rendering.
    for math in soup.find_all("math"):
        math.replace_with(f" {parse_mathml_element(math)} ")

    # 2. Images (Wiris formulas and ordinary pictures).
    for img in soup.find_all("img"):
        img.replace_with(_image_placeholder(img))

    # 3. Tables: one " | "-separated line per row that has any cell text.
    for table in soup.find_all("table"):
        rows = []
        for tr in table.find_all("tr"):
            cells = [_table_cell_text(cell) for cell in tr.find_all(["td", "th"])]
            if any(cells):
                rows.append(" | ".join(cells))
        if rows:
            # NOTE: the whitespace normalisation at the end of this function
            # collapses these newlines, so rows end up separated by a space.
            joined_rows = "\n".join(rows)
            table.replace_with(f"\n[Table:\n{joined_rows}]\n")
        else:
            table.decompose()

    cleaned = soup.get_text(separator=" ")
    # A second unescape pass decodes entities that were double-escaped in the
    # source (e.g. "&amp;lt;").
    cleaned = html.unescape(cleaned)
    return " ".join(cleaned.split())