Spaces:
Running
Running
| import logging | |
| import httpx | |
| from crawl.http_utils import fetch_with_retry, safe_json | |
# Module-level logger, named after the importing module.
logger = logging.getLogger(__name__)

# MediaWiki API endpoint of the AoPS wiki.
_API_URL = "https://artofproblemsolving.com/wiki/api.php"
# Prefix a page title (spaces replaced by underscores) is appended to in order
# to form its canonical URL.
_BASE_URL = "https://artofproblemsolving.com/wiki/index.php/"

# Pages that are navigation/index stubs — not worth ingesting.
# Titles starting with any of these namespace prefixes are skipped.
_SKIP_TITLE_PREFIXES = (
    "Category:",
    "Talk:",
    "User:",
    "AoPS Wiki:",
    "Help:",
    "File:",
)
# Exact titles that are skipped.
_SKIP_TITLES = frozenset(
    (
        "Combinatorics",
        "Combinatorics/Introduction",
        "Combinatorics/Intermediate",
        "Combinatorics/Olympiad",
        "Number theory",
        "Geometry",
        "Statistical mechanics",
        "Discrete mathematics",
        "Discrete quantity",
    )
)
async def _fetch_category_titles(category: str, seen_pageids: set[int]) -> list[tuple[str, str]]:
    """Return ``(canonical_url, title)`` pairs for the pages in an AoPS category.

    Members are listed through the ``categorymembers`` query of the wiki API,
    100 per request, following continuation tokens until the listing ends.
    A page is dropped when its pageid is already in ``seen_pageids``, when its
    title starts with one of ``_SKIP_TITLE_PREFIXES``, or when its title is in
    ``_SKIP_TITLES``.

    Args:
        category: Category name without the ``Category:`` prefix.
        seen_pageids: Page ids already discovered by the caller. Mutated in
            place: the id of every returned page is added to it.

    Returns:
        The pairs collected. If a request fails with an httpx status or
        request error, the failure is logged and the pairs gathered before
        the failure are returned.
    """
    titles: list[tuple[str, str]] = []
    params: dict = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": f"Category:{category}",
        "cmlimit": "100",
        "cmtype": "page",
        "format": "json",
    }
    while True:
        # Only the network call and decoding sit in the ``try``: they are the
        # only statements expected to raise the httpx errors handled here.
        try:
            resp = await fetch_with_retry(_API_URL, params=params)
            data = safe_json(resp)
        except (httpx.HTTPStatusError, httpx.RequestError) as exc:
            logger.warning("AoPS category fetch failed for %r: %s", category, exc)
            break

        for item in data.get("query", {}).get("categorymembers", []):
            title = item["title"]
            pageid = item["pageid"]
            if pageid in seen_pageids:
                continue
            # str.startswith accepts a tuple of prefixes, so one call covers
            # every skipped namespace.
            if title.startswith(_SKIP_TITLE_PREFIXES) or title in _SKIP_TITLES:
                continue
            seen_pageids.add(pageid)
            titles.append((_BASE_URL + title.replace(" ", "_"), title))

        # Handle continuation. The API's continuation protocol is to merge
        # every value of the returned "continue" object into the next request,
        # not just the module-specific "cmcontinue" token.
        continuation = data.get("continue", {})
        if not continuation.get("cmcontinue"):
            break
        params = {**params, **continuation}
    return titles
async def fetch_aops(
    queries: list[str],
    seen: set[str],
    results_per_query: int = 3,
    category: str | None = None,
) -> tuple[list[tuple[str, str]], int]:
    """Collect rendered HTML for AoPS wiki pages found by search and category.

    Pages are discovered in two phases: one keyword search per entry of
    ``queries``, then, if ``category`` is given, the members of that category.
    Each page is considered once (de-duplicated by pageid). A page whose
    canonical URL is already in ``seen`` is counted as skipped instead of
    being fetched. ``seen`` itself is not modified.

    Args:
        queries: Search strings passed to the wiki search API.
        seen: Canonical page URLs that must not be fetched again.
        results_per_query: Maximum number of search hits requested per query.
        category: Optional category name whose member pages are added.

    Returns:
        ``(results, skipped)`` where ``results`` is a list of
        ``(page_url, html)`` pairs for pages that returned non-empty HTML and
        ``skipped`` is the number of discovered pages already in ``seen``.
        Requests failing with an httpx status/request error are logged and
        contribute nothing.
    """
    known_ids: set[int] = set()
    pending: list[tuple[str, str]] = []
    skipped = 0

    # Phase 1: keyword search, one API request per query.
    for query in queries:
        search_params = {
            "action": "query",
            "list": "search",
            "srsearch": query,
            "srlimit": results_per_query,
            "format": "json",
        }
        try:
            payload = safe_json(await fetch_with_retry(_API_URL, params=search_params))
            hits = payload.get("query", {}).get("search", [])
            for hit in hits:
                page_id = hit["pageid"]
                page_title = hit["title"]
                if page_id in known_ids:
                    continue
                known_ids.add(page_id)
                page_url = _BASE_URL + page_title.replace(" ", "_")
                if page_url in seen:
                    skipped += 1
                    continue
                pending.append((page_url, page_title))
        except (httpx.HTTPStatusError, httpx.RequestError) as exc:
            logger.warning("AoPS search failed for %r: %s", query, exc)

    # Phase 2: supplementary category listing; shares the pageid set so pages
    # already found by search are not listed twice.
    if category:
        listed = await _fetch_category_titles(category, known_ids)
        fresh = [(page_url, page_title) for page_url, page_title in listed if page_url not in seen]
        skipped += len(listed) - len(fresh)
        pending.extend(fresh)

    # Phase 3: fetch the rendered HTML of every page still pending.
    results: list[tuple[str, str]] = []
    for page_url, page_title in pending:
        parse_params = {
            "action": "parse",
            "page": page_title,
            "prop": "text",
            "format": "json",
            "redirects": "1",
        }
        try:
            resp = await fetch_with_retry(_API_URL, params=parse_params)
            parsed = safe_json(resp).get("parse", {})
            html = parsed.get("text", {}).get("*", "")
        except (httpx.HTTPStatusError, httpx.RequestError) as exc:
            logger.warning("AoPS fetch failed for %r: %s", page_title, exc)
            continue
        # Pages that come back without HTML are left out of the results.
        if html:
            results.append((page_url, html))
    return results, skipped