Spaces:

MinhTai
/

ai-agent-app

Running

App Files Files Community

ai-agent-app/scripts/crawl/sources/aops.py

MinhTai

deploy: 80c6864

6f6557f about 10 hours ago

raw

history blame contribute delete

4.74 kB

	import logging
	import httpx
	from crawl.http_utils import fetch_with_retry, safe_json

	# Module-level logger, named after this module.
	logger = logging.getLogger(__name__)

	# API endpoint of the AoPS wiki, and the prefix that page titles are
	# appended to when building a page URL.
	_API_URL = "https://artofproblemsolving.com/wiki/api.php"
	_BASE_URL = "https://artofproblemsolving.com/wiki/index.php/"

	# Pages that are navigation/index stubs — not worth ingesting.
	# Category members whose title starts with any of these prefixes are dropped.
	_SKIP_TITLE_PREFIXES = (
	    "Category:",
	    "Talk:",
	    "User:",
	    "AoPS Wiki:",
	    "Help:",
	    "File:",
	)

	# Category members whose title matches one of these exactly are dropped too.
	_SKIP_TITLES = frozenset(
	    (
	        "Combinatorics",
	        "Combinatorics/Introduction",
	        "Combinatorics/Intermediate",
	        "Combinatorics/Olympiad",
	        "Number theory",
	        "Geometry",
	        "Statistical mechanics",
	        "Discrete mathematics",
	        "Discrete quantity",
	    )
	)


	async def _fetch_category_titles(
	    category: str,
	    seen_pageids: set[int],
	) -> list[tuple[str, str]]:
	    """Return (canonical_url, title) for the pages listed in an AoPS category.

	    Category members are requested 100 at a time and the listing is followed
	    through the API's continuation data until no further batch is offered.
	    A member is left out when its page id is already in ``seen_pageids``,
	    when its title starts with one of ``_SKIP_TITLE_PREFIXES``, or when its
	    title is in ``_SKIP_TITLES``.

	    Args:
	        category: Category name without the ``Category:`` prefix (the prefix
	            is added here).
	        seen_pageids: Page ids collected so far. Mutated in place: the id of
	            every page returned by this call is added to it.

	    Returns:
	        A list of ``(canonical_url, title)`` tuples, where the URL is
	        ``_BASE_URL`` plus the title with spaces replaced by underscores.
	        If a request fails with an httpx status/request error, a warning is
	        logged and the pages gathered up to that point are returned.
	    """
	    titles: list[tuple[str, str]] = []
	    params: dict = {
	        "action": "query",
	        "list": "categorymembers",
	        "cmtitle": f"Category:{category}",
	        "cmlimit": "100",
	        "cmtype": "page",
	        "format": "json",
	    }
	    while True:
	        # Only the network round-trip can raise the httpx errors handled here;
	        # keep the processing of the payload outside the try.
	        try:
	            resp = await fetch_with_retry(_API_URL, params=params)
	            data = safe_json(resp)
	        except (httpx.HTTPStatusError, httpx.RequestError) as exc:
	            logger.warning("AoPS category fetch failed for %r: %s", category, exc)
	            break

	        for item in data.get("query", {}).get("categorymembers", []):
	            title = item["title"]
	            pageid = item["pageid"]
	            if pageid in seen_pageids:
	                continue
	            # str.startswith accepts a tuple, so one call checks every prefix.
	            if title.startswith(_SKIP_TITLE_PREFIXES) or title in _SKIP_TITLES:
	                continue
	            seen_pageids.add(pageid)
	            titles.append((_BASE_URL + title.replace(" ", "_"), title))

	        # Handle continuation: stop when the API offers no further batch,
	        # otherwise merge *all* returned continue values into the next
	        # request (not just cmcontinue), as the MediaWiki API prescribes.
	        continuation = data.get("continue") or {}
	        if not continuation.get("cmcontinue"):
	            break
	        params = {**params, **continuation}
	    return titles


	async def fetch_aops(
	queries: list[str],
	seen: set[str],
	results_per_query: int = 3,
	category: str \| None = None,
	) -> tuple[list[tuple[str, str]], int]:
	"""Returns ([(page_url, html), ...], skipped_count).

	Discovers pages via keyword search (queries) and optionally via category listing.
	De-duplicates by pageid; skips URLs already in seen.
	"""
	seen_pageids: set[int] = set()
	titles_to_fetch: list[tuple[str, str]] = []
	skipped = 0

	# --- keyword search ---
	for query in queries:
	try:
	resp = await fetch_with_retry(
	_API_URL,
	params={
	"action": "query",
	"list": "search",
	"srsearch": query,
	"srlimit": results_per_query,
	"format": "json",
	},
	)
	data = safe_json(resp)
	for item in data.get("query", {}).get("search", []):
	pageid = item["pageid"]
	title = item["title"]
	if pageid in seen_pageids:
	continue
	seen_pageids.add(pageid)
	canonical_url = _BASE_URL + title.replace(" ", "_")
	if canonical_url in seen:
	skipped += 1
	else:
	titles_to_fetch.append((canonical_url, title))
	except (httpx.HTTPStatusError, httpx.RequestError) as exc:
	logger.warning("AoPS search failed for %r: %s", query, exc)

	# --- category listing (supplementary) ---
	if category:
	for canonical_url, title in await _fetch_category_titles(category, seen_pageids):
	if canonical_url in seen:
	skipped += 1
	else:
	titles_to_fetch.append((canonical_url, title))

	# --- fetch HTML for each discovered page ---
	results: list[tuple[str, str]] = []
	for canonical_url, title in titles_to_fetch:
	try:
	resp = await fetch_with_retry(
	_API_URL,
	params={
	"action": "parse",
	"page": title,
	"prop": "text",
	"format": "json",
	"redirects": "1",
	},
	)
	html = safe_json(resp).get("parse", {}).get("text", {}).get("*", "")
	if html:
	results.append((canonical_url, html))
	except (httpx.HTTPStatusError, httpx.RequestError) as exc:
	logger.warning("AoPS fetch failed for %r: %s", title, exc)

	return results, skipped