| """ |
| HTML cleaning utilities for problem text. |
| |
| Provides functions to clean HTML content while preserving: |
| - Inline MathML (fractions, superscripts, subscripts) |
| - Wiris math images (extracted from data-mathml attribute) |
| - Table structure (formatted with | separators) |
| - Image placeholders |
| """ |
|
|
| import html |
| import pandas as pd |
| from bs4 import BeautifulSoup, NavigableString |
|
|
|
|
# Operators rendered with a space on each side so expressions stay legible.
# Both ASCII hyphen-minus and U+2212 (the minus sign MathML editors usually
# emit) are listed so subtraction is spaced like the other binary operators.
_SPACED_OPERATORS = frozenset(
    {"÷", "×", "·", "+", "-", "−", "=", "<", ">", "≤", "≥", "≠"}
)

# Elements whose meaning depends on the position of their element children:
# tag name -> (number of operands required, output template).
_POSITIONAL_FORMATS = {
    "mfrac": (2, "({0}/{1})"),        # numerator, denominator
    "msup": (2, "{0}^{1}"),           # base, superscript
    "msub": (2, "{0}_{1}"),           # base, subscript
    "msubsup": (3, "{0}_{1}^{2}"),    # base, subscript, superscript
    "mroot": (2, "({0})^(1/{1})"),    # radicand, index
}

# Leaf elements whose text is emitted verbatim.
_TOKEN_ELEMENTS = frozenset({"mn", "mi", "mtext"})

# Alternative encodings attached via <semantics>; they duplicate the
# presentation markup and must not be appended to the rendered text.
_ANNOTATION_ELEMENTS = frozenset({"annotation", "annotation-xml"})


def _render_mfenced(elem):
    """Render an ``<mfenced>`` element with its fences and separators.

    Uses the ``open``, ``close`` and ``separators`` attributes when present
    and falls back to ``(``, ``)`` and ``,`` otherwise.
    """
    opening = elem.get("open", "(")
    closing = elem.get("close", ")")
    # Whitespace inside the separators attribute is not significant.
    separators = "".join(elem.get("separators", ",").split())

    pieces = []
    arguments = [child for child in elem.children if child.name]
    for position, argument in enumerate(arguments):
        if position and separators:
            # The last separator is reused once the list is exhausted.
            pieces.append(separators[min(position - 1, len(separators) - 1)])
        pieces.append(parse_mathml_element(argument))
    inner = "".join(pieces)
    return f"{opening}{inner}{closing}"


def parse_mathml_element(elem):
    """Recursively convert a MathML node into a plain-text expression.

    Args:
        elem: A parsed node. Text nodes (``name`` is ``None``) are returned
            stripped; element nodes are rendered according to their tag.

    Returns:
        The text rendering, e.g. ``(4/3)`` for a fraction, ``x^2`` for a
        superscript, ``√(…)`` for a square root. Elements without a
        dedicated rule are rendered by recursing into their children, so
        structure nested inside them is kept rather than flattened.
    """
    tag = elem.name
    if tag is None:
        return str(elem).strip()

    if tag in _POSITIONAL_FORMATS:
        arity, template = _POSITIONAL_FORMATS[tag]
        operands = [child for child in elem.children if child.name]
        if len(operands) >= arity:
            rendered = [parse_mathml_element(op) for op in operands[:arity]]
            return template.format(*rendered)
        # Malformed element (too few operands): fall back to its raw text.
        return elem.get_text(strip=True)

    if tag == "msqrt":
        return f"√({parse_mathml_element_children(elem)})"

    if tag == "mfenced":
        return _render_mfenced(elem)

    if tag == "mo":
        operator = elem.get_text(strip=True)
        return f" {operator} " if operator in _SPACED_OPERATORS else operator

    if tag in _TOKEN_ELEMENTS:
        return elem.get_text(strip=True)

    if tag in _ANNOTATION_ELEMENTS:
        return ""

    # Grouping containers (mrow, math, mpadded, mstyle) and any element
    # without a dedicated rule: render the children in document order.
    return parse_mathml_element_children(elem)
|
|
|
|
def parse_mathml_element_children(elem):
    """Render every child of a MathML element and concatenate the results.

    Text children contribute their stripped text; element children are
    rendered with ``parse_mathml_element``. Pieces are joined without any
    separator, and children that render to nothing are skipped.
    """

    def _render(node):
        # Text node: keep only its non-whitespace content.
        if isinstance(node, NavigableString):
            return str(node).strip()
        # Element node: delegate to the recursive element parser.
        if node.name:
            return parse_mathml_element(node)
        return ""

    rendered = (_render(node) for node in elem.children)
    return "".join(piece for piece in rendered if piece)
|
|
|
|
# alt values that carry no real description and are therefore ignored.
_PLACEHOLDER_ALTS = frozenset({"NO ALT", "NONE"})

# Stand-in characters used inside data-mathml attribute values, mapped back
# to the markup characters they replace.
_WIRIS_MARKUP = str.maketrans({"«": "<", "»": ">", "¨": '"'})

# "§#NNN;" character references that get a fixed plain-text replacement
# (minus sign becomes an ASCII hyphen, non-breaking space a normal space).
_WIRIS_CHAR_REFS = (
    ("§#247;", "÷"),
    ("§#215;", "×"),
    ("§#8722;", "-"),
    ("§#160;", " "),
    ("§#183;", "·"),
)

# Separator placed between table rows. Newlines cannot be used because the
# final whitespace normalisation collapses them, which would merge the last
# cell of one row with the first cell of the next.
_ROW_SEPARATOR = " || "


def _wiris_mathml_to_text(data_mathml):
    """Render the value of a ``data-mathml`` attribute as plain text."""
    fragment = BeautifulSoup(data_mathml.translate(_WIRIS_MARKUP), "html.parser")
    math_elem = fragment.find("math")
    if math_elem is not None:
        rendered = parse_mathml_element(math_elem)
    else:
        rendered = fragment.get_text(separator="")
    for encoded, plain in _WIRIS_CHAR_REFS:
        rendered = rendered.replace(encoded, plain)
    # Any remaining "§#NNN;" reference is decoded as a normal HTML entity.
    rendered = rendered.replace("§#", "&#")
    return html.unescape(rendered).strip()


def _image_placeholder(img):
    """Return the text placeholder that should replace an ``<img>`` tag."""
    alt = (img.get("alt") or "").strip()
    src = img.get("src") or ""
    data_mathml = img.get("data-mathml") or ""

    # A math image is recognised by its Wiris URL or by carrying MathML in
    # data-mathml, so formulas served from other URLs are not reduced to a
    # generic image placeholder.
    if data_mathml or "wiris" in src.lower():
        if alt and alt not in _PLACEHOLDER_ALTS:
            return f" [{alt}] "
        if data_mathml:
            rendered = _wiris_mathml_to_text(data_mathml)
            return f" [{rendered}] " if rendered else " [math] "
        return " [math] "

    if alt:
        return f" [Image: {alt[:100]}] "
    return " [image] "


def _table_rows(table):
    """Return one ``cell | cell`` string per non-empty row of ``table``."""
    rows = []
    for tr in table.find_all("tr"):
        # A space separator keeps adjacent text nodes inside one cell
        # (e.g. text followed by an inline element) from being glued together.
        cells = [cell.get_text(" ", strip=True) for cell in tr.find_all(["td", "th"])]
        if any(cells):
            rows.append(" | ".join(cells))
    return rows


def clean_problem_body(text):
    """
    Clean an HTML problem body into single-line plain text.

    Handles:
    - Inline MathML (<math>, <mfrac>, <msup>, etc.) → (4/3), x^2
    - Math images (Wiris URL or data-mathml attribute) → [15÷12]
    - Tables → [Table: a | b || c | d]  (cells split by |, rows by ||)
    - Regular images → [Image: alt text] or [image]
    - HTML entities → decoded

    Args:
        text: The raw HTML. Missing values (anything ``pd.isna`` reports as
            missing) and the empty string produce ``""``; other values are
            converted with ``str()`` before parsing.

    Returns:
        The cleaned text with all runs of whitespace collapsed to single
        spaces and no leading or trailing whitespace.
    """
    if pd.isna(text) or text == "":
        return ""
    soup = BeautifulSoup(str(text), "html.parser")

    # Inline MathML first, so table cells and surrounding text already
    # contain the rendered expressions when they are extracted below.
    for math_elem in soup.find_all("math"):
        math_elem.replace_with(f" {parse_mathml_element(math_elem)} ")

    for img in soup.find_all("img"):
        img.replace_with(_image_placeholder(img))

    for table in soup.find_all("table"):
        rows = _table_rows(table)
        if rows:
            table.replace_with(f" [Table: {_ROW_SEPARATOR.join(rows)}] ")
        else:
            table.decompose()

    cleaned = html.unescape(soup.get_text(separator=" "))
    # split()/join collapses every whitespace run and trims both ends.
    return " ".join(cleaned.split())
|
|