import logging

import httpx

from crawl.http_utils import fetch_with_retry, safe_json

logger = logging.getLogger(__name__)

_API_URL = "https://artofproblemsolving.com/wiki/api.php"
_BASE_URL = "https://artofproblemsolving.com/wiki/index.php/"

# Pages that are navigation/index stubs — not worth ingesting.
_SKIP_TITLE_PREFIXES = (
    "Category:",
    "Talk:",
    "User:",
    "AoPS Wiki:",
    "Help:",
    "File:",
)
_SKIP_TITLES = frozenset({
    "Combinatorics",
    "Combinatorics/Introduction",
    "Combinatorics/Intermediate",
    "Combinatorics/Olympiad",
    "Number theory",
    "Geometry",
    "Statistical mechanics",
    "Discrete mathematics",
    "Discrete quantity",
})


def _canonical_url(title: str) -> str:
    """Return the wiki page URL for *title*, with spaces turned into underscores."""
    return _BASE_URL + title.replace(" ", "_")


async def _fetch_category_titles(category: str, seen_pageids: set[int]) -> list[tuple[str, str]]:
    """Return (canonical_url, title) for all pages in an AoPS category.

    Pages through the ``categorymembers`` list of the wiki API, 100 entries
    per request, following the API's continuation token until it is absent.

    Args:
        category: Category name without the ``Category:`` prefix.
        seen_pageids: Page ids already discovered by the caller. Entries whose
            id is in this set are skipped; ids of accepted pages are added to
            it, so the set is mutated in place.

    Returns:
        One ``(canonical_url, title)`` pair per accepted page. Titles matching
        ``_SKIP_TITLE_PREFIXES`` or listed in ``_SKIP_TITLES`` are left out.
        If a request fails with an httpx status/request error, a warning is
        logged and the pages collected so far are returned.
    """
    titles: list[tuple[str, str]] = []
    params: dict = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": f"Category:{category}",
        "cmlimit": "100",
        "cmtype": "page",
        "format": "json",
    }
    while True:
        # Only the network call and response decoding sit inside the try, so
        # the handler cannot mask an unrelated failure in the loop below.
        try:
            resp = await fetch_with_retry(_API_URL, params=params)
            data = safe_json(resp)
        except (httpx.HTTPStatusError, httpx.RequestError) as exc:
            logger.warning("AoPS category fetch failed for %r: %s", category, exc)
            break

        for item in data.get("query", {}).get("categorymembers", []):
            title = item["title"]
            pageid = item["pageid"]
            if pageid in seen_pageids:
                continue
            # str.startswith accepts a tuple, so one call checks every prefix.
            if title.startswith(_SKIP_TITLE_PREFIXES) or title in _SKIP_TITLES:
                continue
            seen_pageids.add(pageid)
            titles.append((_canonical_url(title), title))

        # Handle continuation: stop once the API no longer hands out a
        # cmcontinue token.
        continuation = data.get("continue", {})
        if not continuation.get("cmcontinue"):
            break
        # The MediaWiki continuation protocol asks clients to merge the whole
        # `continue` object into the next request, not just `cmcontinue`.
        params = {**params, **continuation}
    return titles


async def fetch_aops(
    queries: list[str],
    seen: set[str],
    results_per_query: int = 3,
    category: str | None = None,
) -> tuple[list[tuple[str, str]], int]:
    """Returns ([(page_url, html), ...], skipped_count).

    Discovers pages via keyword search (queries) and optionally via category
    listing. De-duplicates by pageid; skips URLs already in seen.

    Args:
        queries: Search strings, each sent to the wiki's ``list=search`` API.
        seen: Canonical page URLs that must not be fetched again. This set is
            only read here, never modified.
        results_per_query: Value passed as ``srlimit`` for every search.
        category: Optional category name (without the ``Category:`` prefix)
            whose member pages are appended after the search hits.

    Returns:
        A pair ``(results, skipped)``. ``results`` holds ``(canonical_url,
        html)`` for every page whose rendered HTML was retrieved and is
        non-empty. ``skipped`` counts discovered pages whose URL was already
        in ``seen``.

    Requests that fail with an httpx status/request error are logged as
    warnings and skipped; they do not abort the remaining work.
    """
    seen_pageids: set[int] = set()
    discovered: list[tuple[str, str]] = []

    # --- keyword search ---
    # NOTE(review): search hits are not filtered through _SKIP_TITLES /
    # _SKIP_TITLE_PREFIXES the way category members are — confirm intended.
    for query in queries:
        try:
            resp = await fetch_with_retry(
                _API_URL,
                params={
                    "action": "query",
                    "list": "search",
                    "srsearch": query,
                    "srlimit": results_per_query,
                    "format": "json",
                },
            )
            data = safe_json(resp)
        except (httpx.HTTPStatusError, httpx.RequestError) as exc:
            logger.warning("AoPS search failed for %r: %s", query, exc)
            continue

        for item in data.get("query", {}).get("search", []):
            pageid = item["pageid"]
            title = item["title"]
            if pageid in seen_pageids:
                continue
            seen_pageids.add(pageid)
            discovered.append((_canonical_url(title), title))

    # --- category listing (supplementary) ---
    # seen_pageids is shared, so pages already found by search are not
    # returned a second time by the category walk.
    if category:
        discovered.extend(await _fetch_category_titles(category, seen_pageids))

    # --- drop pages the caller has already ingested ---
    titles_to_fetch: list[tuple[str, str]] = []
    skipped = 0
    for canonical_url, title in discovered:
        if canonical_url in seen:
            skipped += 1
        else:
            titles_to_fetch.append((canonical_url, title))

    # --- fetch HTML for each discovered page ---
    results: list[tuple[str, str]] = []
    for canonical_url, title in titles_to_fetch:
        try:
            resp = await fetch_with_retry(
                _API_URL,
                params={
                    "action": "parse",
                    "page": title,
                    "prop": "text",
                    "format": "json",
                    "redirects": "1",
                },
            )
            payload = safe_json(resp)
        except (httpx.HTTPStatusError, httpx.RequestError) as exc:
            logger.warning("AoPS fetch failed for %r: %s", title, exc)
            continue

        # The rendered HTML is read from parse -> text -> "*"; any missing
        # level yields "" and the page is dropped.
        html = payload.get("parse", {}).get("text", {}).get("*", "")
        if html:
            results.append((canonical_url, html))
        else:
            logger.debug("AoPS page %r returned no HTML; skipping", title)
    return results, skipped