Spaces:

MinhTai
/

ai-agent-app

Running

App Files Files Community

ai-agent-app / scripts / crawl / cleaner.py

MinhTai

deploy: 80c6864

6f6557f about 10 hours ago

raw

history blame contribute delete

3.64 kB

	import re
	from bs4 import BeautifulSoup, Tag

	# Lowercase substrings that mark a text chunk as math-related.
	# is_math_relevant() lowercases the chunk and tests each entry with ``in``,
	# so truncated stems such as "simplif" and "estimat" match several word forms.
	_MATH_KEYWORDS = frozenset(
	    (
	        # core math
	        "equation",
	        "formula",
	        "theorem",
	        "proof",
	        "solve",
	        "function",
	        "derivative",
	        "integral",
	        "probability",
	        "matrix",
	        "angle",
	        "triangle",
	        "polynomial",
	        "prime",
	        "modulo",
	        "combination",
	        "permutation",
	        # gap-fill topics
	        "radical",
	        "inequality",
	        "absolute value",
	        "simplif",
	        "parabola",
	        "linear function",
	        "factoring",
	        "remainder theorem",
	        "nonlinear",
	        # statistics
	        "distribution",
	        "variance",
	        "deviation",
	        "regression",
	        "hypothesis",
	        "correlation",
	        "confidence",
	        "sample",
	        "statistic",
	        "population",
	        "mean",
	        "median",
	        "interval",
	        "p-value",
	        "chi-square",
	        "t-test",
	        "anova",
	        "normal",
	        "binomial",
	        "poisson",
	        "inference",
	        "estimat",
	        # Vietnamese
	        "phương trình",
	        "định lý",
	        "xác suất",
	    )
	)

	# Elements removed wholesale before text extraction.
	# "table" is deliberately not listed: distribution tables, ANOVA tables and
	# frequency tables carry real content.
	_STRIP_TAGS = [
	    "script",
	    "style",
	    "nav",
	    "header",
	    "footer",
	    "aside",
	]

	# AoPS-specific navigation boilerplate — only applied when source == "aops".
	# html_to_chunks() discards any paragraph in which this pattern is found.
	# The alternatives must be separated by a bare "|": an escaped "\|" matches a
	# literal pipe character and would stop every phrase below from matching.
	# NOTE(review): "Category\s:" and "Contents\s\[" require exactly one whitespace
	# character; presumably that is the newline get_text(separator="\n") inserts
	# between adjacent text nodes — confirm before loosening to "\s*".
	_AOPS_BOILERPLATE_RE = re.compile(
	    r"(Retrieved from|This article|AoPS Wiki|Art of Problem Solving"
	    r"|Category\s:|Navigation menu|Contents\s\[|Jump to\s*(navigation|search)"
	    r"|See also|External links|References\s*\[)",
	    re.IGNORECASE,
	)

	# Generic web navigation boilerplate — applied to all sources.
	# Matching is case-insensitive and unanchored, so a phrase anywhere in a
	# paragraph is enough for html_to_chunks() to drop that paragraph.
	_GENERIC_BOILERPLATE_RE = re.compile(
	    r"(Skip to (main )?content|Cookie (policy|notice)|Privacy policy"
	    r"|Terms of (use|service)|All rights reserved|Breadcrumb"
	    r"|Table of [Cc]ontents|Back to top|Share this page)",
	    re.IGNORECASE,
	)

	# Asymptote diagram code blocks embedded in AoPS pages.
	# Non-greedy so each [asy]...[/asy] pair is removed separately; DOTALL lets a
	# block span several lines, IGNORECASE accepts [ASY]/[/ASY] as well.
	_ASYMPTOTE_RE = re.compile(r"\[asy\].*?\[/asy\]", re.DOTALL | re.IGNORECASE)


	def _replace_latex_imgs(soup: BeautifulSoup) -> None:
	    """Swap LaTeX-rendered ``<img>`` tags for their alt text, modifying *soup* in place.

	    Only ``<img>`` tags that have a ``src`` attribute containing the
	    substring ``"latex"`` (case-sensitive) are touched. For each of them:

	    * if the whitespace-stripped ``alt`` text is non-empty, the tag is
	      replaced by that text padded with one space on each side (the alt
	      text is expected to hold the LaTeX source);
	    * otherwise the tag is removed from the tree.

	    Args:
	        soup: Parsed document to modify.
	    """
	    for image in soup.find_all("img", src=True):
	        if "latex" not in image.get("src", ""):
	            continue  # ordinary image: leave it alone

	        latex_source = image.get("alt", "").strip()
	        if not latex_source:
	            # Nothing to recover, so drop the image entirely.
	            image.decompose()
	            continue

	        # Surrounding spaces keep the formula from fusing with adjacent words.
	        image.replace_with(f" {latex_source} ")


	def is_math_relevant(chunk: str) -> bool:
	    """Return True if *chunk* contains at least one ``_MATH_KEYWORDS`` entry.

	    The chunk is lowercased once and every keyword is looked up with a plain
	    substring test, so a keyword also matches when it occurs inside a longer
	    word.

	    Args:
	        chunk: Text to classify.

	    Returns:
	        True when any keyword occurs in the lowercased chunk, else False.
	    """
	    haystack = chunk.lower()
	    for keyword in _MATH_KEYWORDS:
	        if keyword in haystack:
	            return True
	    return False


	def html_to_chunks(html: str, chunk_size: int = 3000, source: str = "generic") -> list[str]:
	    """Turn an HTML page into plain-text chunks that look math-related.

	    Steps, in order:

	    1. Parse *html* and replace LaTeX images with their alt text.
	    2. Remove every element whose tag name is listed in ``_STRIP_TAGS``.
	    3. Extract the text with a newline between text nodes; for
	       ``source == "aops"`` also delete ``[asy]...[/asy]`` spans.
	    4. Split on blank lines (``"\\n\\n"``) and drop paragraphs that match the
	       generic boilerplate pattern, match the AoPS pattern (AoPS only), or
	       are shorter than 80 characters once stripped.
	    5. Pack the remaining paragraphs greedily into chunks joined by blank
	       lines, then keep only chunks accepted by ``is_math_relevant``.

	    Args:
	        html: Raw HTML markup.
	        chunk_size: Character budget per chunk. Each paragraph is charged its
	            length plus 2 for the separator. A paragraph that alone exceeds
	            the budget is still emitted as its own chunk, so chunks can be
	            longer than this value.
	        source: Origin of the page; ``"aops"`` enables the AoPS-specific
	            cleaning steps.

	    Returns:
	        The math-relevant chunks in document order (possibly empty).
	    """
	    from_aops = source == "aops"

	    soup = BeautifulSoup(html, "html.parser")
	    # Must run before text extraction, otherwise the formulas are lost with
	    # their <img> tags.
	    _replace_latex_imgs(soup)
	    for tag_name in _STRIP_TAGS:
	        for element in soup.find_all(tag_name):
	            element.decompose()

	    page_text = soup.get_text(separator="\n")
	    if from_aops:
	        page_text = _ASYMPTOTE_RE.sub("", page_text)

	    # Paragraph filter: boilerplate is tested on the raw paragraph, length on
	    # the stripped one.
	    kept: list[str] = []
	    for raw in page_text.split("\n\n"):
	        is_boilerplate = _GENERIC_BOILERPLATE_RE.search(raw) is not None or (
	            from_aops and _AOPS_BOILERPLATE_RE.search(raw) is not None
	        )
	        body = raw.strip()
	        if not is_boilerplate and len(body) >= 80:
	            kept.append(body)

	    # Greedy packing: flush the pending group as soon as the next paragraph
	    # would push the running total over the budget.
	    chunks: list[str] = []
	    pending: list[str] = []
	    pending_len = 0
	    for body in kept:
	        cost = len(body) + 2  # paragraph plus its "\n\n" separator
	        if pending and pending_len + cost > chunk_size:
	            chunks.append("\n\n".join(pending))
	            pending, pending_len = [], 0
	        pending.append(body)
	        pending_len += cost

	    if pending:
	        chunks.append("\n\n".join(pending))

	    return list(filter(is_math_relevant, chunks))