FoundationalASSIST / Code /clean_utils.py

Upload folder using huggingface_hub

6256eb9 verified 13 days ago

5.01 kB

	"""
	HTML cleaning utilities for problem text.

	Provides functions to clean HTML content while preserving:
	- Inline MathML (fractions, superscripts, subscripts)
	- Wiris math images (extracted from data-mathml attribute)
	- Table structure (formatted with \| separators)
	- Image placeholders
	"""

	import html
	import pandas as pd
	from bs4 import BeautifulSoup, NavigableString


	def parse_mathml_element(elem):
	    """Render one MathML node (and its subtree) as plain text.

	    Rendering rules, keyed on the node's tag name:
	    - text node (no tag name) → its string form, stripped
	    - mfrac → (numerator/denominator)
	    - msup → base^exponent
	    - msub → base_subscript
	    - msqrt → √(content)
	    - mo → operator text; a fixed set of arithmetic/relational operators
	      is padded with one space on each side
	    - mrow, math, mpadded, mstyle → concatenation of the rendered children
	    - anything else (including mn, mi, mtext) → stripped text content

	    mfrac/msup/msub with fewer than two element children fall back to
	    their stripped text content.
	    """
	    tag = elem.name

	    # Plain text nodes carry no tag name.
	    if tag is None:
	        return str(elem).strip()

	    # Two-operand constructs: only element children count as operands,
	    # and anything past the second operand is ignored.
	    if tag in ("mfrac", "msup", "msub"):
	        operands = [node for node in elem.children if node.name]
	        if len(operands) < 2:
	            return elem.get_text(strip=True)
	        first = parse_mathml_element(operands[0])
	        second = parse_mathml_element(operands[1])
	        if tag == "mfrac":
	            return f"({first}/{second})"
	        if tag == "msup":
	            return f"{first}^{second}"
	        return f"{first}_{second}"

	    if tag == "msqrt":
	        radicand = parse_mathml_element_children(elem)
	        return f"√({radicand})"

	    # Pure grouping containers contribute nothing of their own.
	    if tag in ("mrow", "math", "mpadded", "mstyle"):
	        return parse_mathml_element_children(elem)

	    # Every remaining tag renders as its stripped text; operators from
	    # the list below additionally get surrounding spaces.
	    plain = elem.get_text(strip=True)
	    spaced_operators = ("÷", "×", "·", "+", "-", "=", "<", ">", "≤", "≥", "≠")
	    if tag == "mo" and plain in spaced_operators:
	        return f" {plain} "
	    return plain


	def parse_mathml_element_children(elem):
	    """Render every direct child of a MathML element and concatenate the results.

	    Text nodes contribute their stripped string (whitespace-only nodes
	    therefore contribute nothing); element nodes are rendered through
	    parse_mathml_element. The pieces are joined with no separator.
	    """

	    def render(node):
	        # Check for text nodes first: they have no tag name to dispatch on.
	        if isinstance(node, NavigableString):
	            return str(node).strip()
	        if node.name:
	            return parse_mathml_element(node)
	        return ""

	    return "".join(render(node) for node in elem.children)


	def _wiris_mathml_to_text(data_mathml):
	    """Turn the value of a Wiris ``data-mathml`` attribute into plain text."""
	    # Map the «, » and ¨ stand-ins back to <, > and " so the value can be
	    # parsed as markup.
	    markup = data_mathml.replace("«", "<").replace("»", ">").replace("¨", '"')
	    msoup = BeautifulSoup(markup, "html.parser")
	    math_elem = msoup.find("math")
	    if math_elem is not None:
	        rendered = parse_mathml_element(math_elem)
	    else:
	        rendered = msoup.get_text(separator="")

	    # Explicit mappings run first because two of them deliberately differ
	    # from what html.unescape would produce: #8722 becomes an ASCII hyphen
	    # and #160 becomes a regular space.
	    for encoded, char in (
	        ("§#247;", "÷"),
	        ("§#215;", "×"),
	        ("§#8722;", "-"),
	        ("§#160;", " "),
	        ("§#183;", "·"),
	    ):
	        rendered = rendered.replace(encoded, char)

	    # Any other "§#NNN;" sequence is rewritten as a numeric character
	    # reference and decoded generically.
	    rendered = rendered.replace("§#", "&#")
	    return html.unescape(rendered).strip()


	def _image_placeholder(img):
	    """Return the bracketed text that stands in for an <img> element."""
	    # Strip before comparing so that padded placeholder values such as
	    # " NONE " are recognised as "no usable alt text" too.
	    alt = img.get("alt", "").strip()
	    src = img.get("src", "")
	    data_mathml = img.get("data-mathml", "")

	    # Case-insensitive match; this also covers "pluginwiris" sources.
	    if "wiris" in src.lower():
	        if alt and alt not in ("NO ALT", "NONE"):
	            return f" [{alt}] "
	        if data_mathml:
	            mtext = _wiris_mathml_to_text(data_mathml)
	            return f" [{mtext}] " if mtext else " [math] "
	        return " [math] "

	    if alt:
	        # Alt text of ordinary images is capped at 100 characters.
	        return f" [Image: {alt[:100]}] "
	    return " [image] "


	def _table_to_text(table):
	    """Flatten a <table> to a "[Table: ...]" block, or "" if it has no content."""
	    rows = []
	    for tr in table.find_all("tr"):
	        cells = [cell.get_text(strip=True) for cell in tr.find_all(["td", "th"])]
	        # Rows whose cells are all empty are dropped.
	        if any(cells):
	            # Raw string: the separator is a literal backslash followed by
	            # a pipe, written so that it is not an invalid escape sequence.
	            # NOTE(review): the backslash may be unintended - confirm
	            # whether a bare "|" separator was meant.
	            rows.append(r" \| ".join(cells))
	    if not rows:
	        return ""
	    body = "\n".join(rows)
	    return f"\n[Table:\n{body}]\n"


	def clean_problem_body(text):
	    r"""
	    Clean HTML problem body with full MathML handling.

	    Processing order:
	    1. Inline MathML (<math>, <mfrac>, <msup>, etc.) → (4/3), x^2
	    2. Images:
	       - source containing "wiris" (any case) → [alt text], else the
	         decoded data-mathml attribute, e.g. [15÷12], else [math]
	       - other images → [Image: alt text] (alt capped at 100 chars)
	         or [image]
	    3. Tables → [Table: Col1 \| Col2 ...]; tables with no non-empty
	       row are removed
	    4. Remaining markup is dropped, HTML entities are decoded and all
	       whitespace runs (including newlines) collapse to single spaces.

	    Args:
	        text: HTML string to clean. Values for which pandas.isna is
	            true and the empty string are accepted.

	    Returns:
	        The cleaned single-line text, or "" for missing/empty input.
	    """
	    if pd.isna(text) or text == "":
	        return ""
	    soup = BeautifulSoup(str(text), "html.parser")

	    # 1. Inline MathML. Runs before the table step so that math inside
	    #    table cells is already plain text when the cells are read.
	    for math in soup.find_all("math"):
	        math.replace_with(f" {parse_mathml_element(math)} ")

	    # 2. Images (Wiris formula images and ordinary images).
	    for img in soup.find_all("img"):
	        img.replace_with(_image_placeholder(img))

	    # 3. Tables. The newlines inside the placeholder are collapsed to
	    #    spaces by the whitespace normalisation below.
	    for table in soup.find_all("table"):
	        placeholder = _table_to_text(table)
	        if placeholder:
	            table.replace_with(placeholder)
	        else:
	            table.decompose()

	    flat = soup.get_text(separator=" ")
	    # Decode any entity-looking text that is still present after
	    # extraction (e.g. from double-encoded input).
	    flat = html.unescape(flat)
	    # split()/join collapses whitespace runs and trims both ends.
	    return " ".join(flat.split())