MinhTai's picture
deploy: 80c6864
6f6557f
import logging
import httpx
from crawl.http_utils import fetch_with_retry, safe_json
# Logger named after this module.
logger = logging.getLogger(__name__)

# MediaWiki API endpoint of the AoPS wiki, and the prefix that canonical page
# URLs are built from (title appended with spaces replaced by underscores).
_API_URL = "https://artofproblemsolving.com/wiki/api.php"
_BASE_URL = "https://artofproblemsolving.com/wiki/index.php/"

# Title prefixes of pages that are never ingested. Kept as a tuple so it can
# be handed directly to ``str.startswith``.
_SKIP_TITLE_PREFIXES = (
    "Category:",
    "Talk:",
    "User:",
    "AoPS Wiki:",
    "Help:",
    "File:",
)

# Exact titles of navigation/index stub pages that are not worth ingesting.
_SKIP_TITLES = frozenset(
    (
        "Combinatorics",
        "Combinatorics/Introduction",
        "Combinatorics/Intermediate",
        "Combinatorics/Olympiad",
        "Number theory",
        "Geometry",
        "Statistical mechanics",
        "Discrete mathematics",
        "Discrete quantity",
    )
)
async def _fetch_category_titles(category: str, seen_pageids: set[int]) -> list[tuple[str, str]]:
    """List the not-yet-seen, non-stub pages of one AoPS wiki category.

    Walks the ``categorymembers`` listing for ``Category:<category>`` one
    response at a time, following ``cmcontinue`` tokens until the API stops
    returning one.

    A member is dropped when its pageid is already in ``seen_pageids``, when
    its title starts with one of ``_SKIP_TITLE_PREFIXES``, or when its title
    is in ``_SKIP_TITLES``. The pageid of every accepted member is added to
    ``seen_pageids``, i.e. the caller's set is mutated.

    If a request raises ``httpx.HTTPStatusError`` or ``httpx.RequestError``,
    a warning is logged and whatever was gathered so far is returned.

    Returns:
        ``(canonical_url, title)`` pairs, in the order the API listed them.
    """
    request: dict = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": f"Category:{category}",
        "cmlimit": "100",
        "cmtype": "page",
        "format": "json",
    }
    found: list[tuple[str, str]] = []

    while True:
        try:
            payload = safe_json(await fetch_with_retry(_API_URL, params=request))

            for member in payload.get("query", {}).get("categorymembers", []):
                title = member["title"]
                pageid = member["pageid"]
                rejected = (
                    pageid in seen_pageids
                    or title.startswith(_SKIP_TITLE_PREFIXES)
                    or title in _SKIP_TITLES
                )
                if rejected:
                    continue
                seen_pageids.add(pageid)
                found.append((_BASE_URL + title.replace(" ", "_"), title))

            # No continuation token means this was the last page of results.
            token = payload.get("continue", {}).get("cmcontinue")
            if not token:
                return found
            # Build a fresh dict rather than mutating the one already sent.
            request = request | {"cmcontinue": token}
        except (httpx.HTTPStatusError, httpx.RequestError) as exc:
            logger.warning("AoPS category fetch failed for %r: %s", category, exc)
            return found
async def fetch_aops(
    queries: list[str],
    seen: set[str],
    results_per_query: int = 3,
    category: str | None = None,
) -> tuple[list[tuple[str, str]], int]:
    """Discover AoPS wiki pages and download their rendered HTML.

    Pages are discovered by sending each entry of ``queries`` to the wiki
    search API (at most ``results_per_query`` hits each) and, when
    ``category`` is truthy, by also listing that category's members. A page
    is queued at most once per call, tracked by pageid. A discovered page
    whose canonical URL is already in ``seen`` is counted as skipped instead
    of being downloaded; ``seen`` is only read here, never modified.

    Requests that raise ``httpx.HTTPStatusError`` or ``httpx.RequestError``
    are logged as warnings, and the affected query or page is dropped.

    Returns:
        ``(pages, skipped)``: ``pages`` is a list of ``(canonical_url, html)``
        pairs for every queued page whose parse response contained non-empty
        HTML; ``skipped`` is the number of discovered pages whose URL was
        already in ``seen``.
    """
    queued_ids: set[int] = set()
    pending: list[tuple[str, str]] = []
    skipped = 0

    # Phase 1: keyword search.
    # NOTE(review): unlike the category listing, search hits are not checked
    # against _SKIP_TITLE_PREFIXES / _SKIP_TITLES — confirm that is intended.
    for term in queries:
        search_params = {
            "action": "query",
            "list": "search",
            "srsearch": term,
            "srlimit": results_per_query,
            "format": "json",
        }
        try:
            payload = safe_json(await fetch_with_retry(_API_URL, params=search_params))
            for hit in payload.get("query", {}).get("search", []):
                pageid, title = hit["pageid"], hit["title"]
                if pageid in queued_ids:
                    continue
                queued_ids.add(pageid)
                page_url = _BASE_URL + title.replace(" ", "_")
                if page_url in seen:
                    skipped += 1
                    continue
                pending.append((page_url, title))
        except (httpx.HTTPStatusError, httpx.RequestError) as exc:
            logger.warning("AoPS search failed for %r: %s", term, exc)

    # Phase 2: supplementary category listing. The helper shares queued_ids,
    # so pages already found by search are not returned again.
    if category:
        listed = await _fetch_category_titles(category, queued_ids)
        fresh = [entry for entry in listed if entry[0] not in seen]
        skipped += len(listed) - len(fresh)
        pending.extend(fresh)

    # Phase 3: download the rendered HTML of every queued page.
    pages: list[tuple[str, str]] = []
    for page_url, title in pending:
        parse_params = {
            "action": "parse",
            "page": title,
            "prop": "text",
            "format": "json",
            "redirects": "1",
        }
        try:
            reply = await fetch_with_retry(_API_URL, params=parse_params)
            html = safe_json(reply).get("parse", {}).get("text", {}).get("*", "")
        except (httpx.HTTPStatusError, httpx.RequestError) as exc:
            logger.warning("AoPS fetch failed for %r: %s", title, exc)
        else:
            # Responses without HTML are dropped without a log entry.
            if html:
                pages.append((page_url, html))

    return pages, skipped