# ai-agent-app / scripts / crawl / cleaner.py
# MinhTai's picture
# deploy: 80c6864
# 6f6557f
import re
from bs4 import BeautifulSoup, Tag
# Lowercase keywords and word stems that mark a chunk as math-related.
# is_math_relevant() tests each entry as a substring of the lowercased chunk,
# so a stem such as "simplif" or "estimat" matches every word containing it.
_MATH_KEYWORDS = frozenset({
    # core math
    "equation", "formula", "theorem", "proof", "solve", "function",
    "derivative", "integral", "probability", "matrix", "angle", "triangle",
    "polynomial", "prime", "modulo", "combination", "permutation",
    # gap-fill topics
    "radical", "inequality", "absolute value", "simplif", "parabola",
    "linear function", "factoring", "remainder theorem", "nonlinear",
    # statistics
    "distribution", "variance", "deviation", "regression", "hypothesis",
    "correlation", "confidence", "sample", "statistic", "population",
    "mean", "median", "interval", "p-value", "chi-square", "t-test",
    "anova", "normal", "binomial", "poisson", "inference", "estimat",
    # Vietnamese (equation, theorem, probability)
    "phương trình", "định lý", "xác suất",
})
# Tags that html_to_chunks() removes, together with everything inside them,
# before extracting text.
# <table> is deliberately not listed: distribution tables, ANOVA tables and
# frequency tables carry real content.
_STRIP_TAGS = ["script", "style", "nav", "header", "footer", "aside"]
# AoPS-specific navigation boilerplate — only applied when source == "aops".
# html_to_chunks() discards any paragraph in which this pattern matches
# anywhere (case-insensitive search, not a full-paragraph match).
_AOPS_BOILERPLATE_RE = re.compile(
    r"(Retrieved from|This article|AoPS Wiki|Art of Problem Solving"
    r"|Category\s*:|Navigation menu|Contents\s*\[|Jump to\s*(navigation|search)"
    r"|See also|External links|References\s*\[)",
    re.IGNORECASE,
)
# Generic web navigation boilerplate — applied to all sources.
# html_to_chunks() discards any paragraph in which this pattern matches
# anywhere (case-insensitive search, not a full-paragraph match).
_GENERIC_BOILERPLATE_RE = re.compile(
    r"(Skip to (main )?content|Cookie (policy|notice)|Privacy policy"
    r"|Terms of (use|service)|All rights reserved|Breadcrumb"
    r"|Table of [Cc]ontents|Back to top|Share this page)",
    re.IGNORECASE,
)
# Asymptote diagram code blocks ([asy] ... [/asy]) embedded in AoPS pages.
# The non-greedy body plus DOTALL lets one match span several lines while
# stopping at the first closing tag; the tags are matched case-insensitively.
_ASYMPTOTE_RE = re.compile(r"\[asy\].*?\[/asy\]", re.DOTALL | re.IGNORECASE)
def _replace_latex_imgs(soup: BeautifulSoup) -> None:
    """Swap LaTeX-rendered images for their alt text, editing *soup* in place.

    Only ``<img>`` tags that have a ``src`` attribute containing the substring
    ``"latex"`` are touched. Such a tag is replaced by its whitespace-trimmed
    ``alt`` text (the LaTeX source) padded with one space on each side; when
    the ``alt`` text is missing or blank the tag is removed instead.
    """
    for image in soup.find_all("img", src=True):
        if "latex" not in image.get("src", ""):
            # Ordinary image: leave it in the tree untouched.
            continue

        latex_source = image.get("alt", "").strip()
        if not latex_source:
            # Nothing to recover, so drop the rendered formula entirely.
            image.decompose()
            continue

        # Surrounding spaces keep the formula from fusing with adjacent words.
        image.replace_with(f" {latex_source} ")
def is_math_relevant(chunk: str) -> bool:
    """Return True if *chunk* contains at least one entry of ``_MATH_KEYWORDS``.

    The chunk is lowercased first and every keyword is tested as a plain
    substring, so matching ignores case and is not limited to whole words.
    """
    haystack = chunk.lower()
    for keyword in _MATH_KEYWORDS:
        if keyword in haystack:
            return True
    return False
def html_to_chunks(html: str, chunk_size: int = 3000, source: str = "generic") -> list[str]:
    """Strip HTML, recover LaTeX from img alt text, filter boilerplate, return quality chunks.

    Args:
        html: Raw HTML markup of a single page.
        chunk_size: Maximum length, in characters, of a chunk built from
            several paragraphs. A single paragraph longer than this is kept
            whole as its own chunk; it is never split.
        source: Origin of the page. ``"aops"`` additionally removes
            ``[asy]...[/asy]`` blocks and paragraphs matching the
            AoPS-specific boilerplate pattern; any other value gets only the
            generic boilerplate filtering.

    Returns:
        Chunks in document order, each made of consecutive paragraphs joined
        by a blank line, restricted to the chunks for which
        ``is_math_relevant`` returns True.
    """
    paragraph_sep = "\n\n"
    min_paragraph_len = 80  # stripped paragraphs shorter than this are discarded

    soup = BeautifulSoup(html, "html.parser")
    # Must happen before text extraction: an <img> contributes no text of its
    # own, so the LaTeX held in its alt attribute would otherwise be lost.
    _replace_latex_imgs(soup)
    for tag in _STRIP_TAGS:
        for el in soup.find_all(tag):
            el.decompose()

    text = soup.get_text(separator="\n")
    is_aops = source == "aops"
    if is_aops:
        text = _ASYMPTOTE_RE.sub("", text)

    cleaned: list[str] = []
    for para in text.split(paragraph_sep):
        # Boilerplate is detected on the raw paragraph; one hit anywhere in it
        # drops the whole paragraph.
        if _GENERIC_BOILERPLATE_RE.search(para):
            continue
        if is_aops and _AOPS_BOILERPLATE_RE.search(para):
            continue
        stripped = para.strip()
        if len(stripped) >= min_paragraph_len:
            cleaned.append(stripped)

    chunks: list[str] = []
    current_parts: list[str] = []
    # Exact length of paragraph_sep.join(current_parts). Counting a separator
    # only *between* paragraphs lets a chunk grow to exactly chunk_size;
    # charging one per paragraph plus one more in the overflow test would flush
    # chunks up to one separator early.
    current_len = 0
    for para in cleaned:
        if not current_parts:
            current_parts = [para]
            current_len = len(para)
            continue
        projected_len = current_len + len(paragraph_sep) + len(para)
        if projected_len > chunk_size:
            # Adding this paragraph would overflow: close the chunk and start
            # a new one with the paragraph that did not fit.
            chunks.append(paragraph_sep.join(current_parts))
            current_parts = [para]
            current_len = len(para)
        else:
            current_parts.append(para)
            current_len = projected_len
    if current_parts:
        chunks.append(paragraph_sep.join(current_parts))

    return [c for c in chunks if is_math_relevant(c)]