Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """Gap-driven web crawler: AoPS Wiki + Paul's Notes → math_wiki DB.""" | |
| import argparse | |
| import asyncio | |
| import sys | |
| import os | |
# Ensure both the scripts/ dir (for crawl.*) and backend/ (for app.*) are on the path.
# The location is resolved with abspath() first: on interpreters older than
# Python 3.9 a script's __file__ can be relative (e.g. "crawl.py"), which would
# make _SCRIPTS_DIR empty and _BACKEND_DIR depend on the current working directory.
_SCRIPTS_DIR = os.path.dirname(os.path.abspath(__file__))
_BACKEND_DIR = os.path.join(os.path.dirname(_SCRIPTS_DIR), "backend")
for _p in (_SCRIPTS_DIR, _BACKEND_DIR):
    # Prepend (rather than append) so these directories win over same-named
    # installed packages; skip entries that are already present.
    if _p not in sys.path:
        sys.path.insert(0, _p)
| from crawl.topic_map import AOPS_QUERIES | |
| from crawl.runner import crawl_and_ingest, fetch_gap_topics | |
def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line interface for the crawler.

    Exactly one topic selector (``--gap-driven`` or ``--topics``) is required;
    every other option is optional and has a default.  The ``--database-url``
    default is read from the ``DATABASE_URL`` environment variable at the time
    this function is called.

    Returns:
        A configured ``argparse.ArgumentParser``.
    """
    cli = argparse.ArgumentParser(
        description="Crawl AoPS Wiki and Paul's Notes, ingest into math_wiki DB."
    )

    # Topic selection: the two modes cannot be combined, and one must be given.
    topic_mode = cli.add_mutually_exclusive_group(required=True)
    topic_mode.add_argument(
        "--gap-driven",
        action="store_true",
        help="Pull topics from /math-gaps on the running backend.",
    )
    topic_mode.add_argument(
        "--topics",
        metavar="TOPICS",
        help='Comma-separated topic labels, or "all".',
    )

    # Remaining options, registered in this order so --help output is unchanged.
    general_options = (
        (
            "--sources",
            {
                "default": "aops,pauls,generic",
                "help": 'Comma-separated sources: "aops", "pauls", "generic" (default: aops,pauls,generic).',
            },
        ),
        (
            "--limit",
            {
                "type": int,
                "default": 5,
                "help": "Max topics when --gap-driven (default: 5).",
            },
        ),
        (
            "--api-base",
            {
                "default": "http://localhost:8000",
                "help": "Backend URL for gap fetch (default: http://localhost:8000).",
            },
        ),
        (
            "--database-url",
            {
                "default": os.environ.get("DATABASE_URL", ""),
                "help": "asyncpg-compatible Postgres URL (falls back to DATABASE_URL env var).",
            },
        ),
        (
            "--dry-run",
            {
                "action": "store_true",
                "help": "Fetch and chunk, but skip concept_ingest and progress writes.",
            },
        ),
        (
            "--reset-progress",
            {
                "action": "store_true",
                "help": "Clear crawl_progress.json before starting.",
            },
        ),
        (
            "--reset",
            {
                "action": "store_true",
                "help": "Alias for --reset-progress: clear crawl_progress.json before starting (for local dev).",
            },
        ),
    )
    for flag, spec in general_options:
        cli.add_argument(flag, **spec)

    return cli
def _redact_database_url(url: str) -> str:
    """Return *url* reduced to ``scheme://host[:port]/path`` for safe logging.

    The user name, password and query string are dropped so that credentials
    never reach stdout.  Input that does not parse as a URL with both a scheme
    and a host yields the placeholder ``"<redacted>"``.
    """
    from urllib.parse import urlsplit

    try:
        parts = urlsplit(url)
        host = parts.hostname
        port = parts.port  # accessing .port raises ValueError on a malformed port
    except ValueError:
        return "<redacted>"
    if not parts.scheme or not host:
        return "<redacted>"
    if ":" in host:
        # .hostname strips the brackets from IPv6 literals; restore them.
        host = f"[{host}]"
    location = host if port is None else f"{host}:{port}"
    return f"{parts.scheme}://{location}{parts.path}"


async def _main(args: argparse.Namespace) -> int:
    """Run one crawl as described by the parsed command-line arguments.

    Resolves the topic list (from the backend when ``--gap-driven`` is set,
    otherwise from ``--topics``), optionally opens a Postgres pool, optionally
    clears crawl progress, then calls ``crawl_and_ingest`` and prints a summary.

    Args:
        args: Namespace produced by the parser from ``_build_parser``.

    Returns:
        ``0`` after a completed crawl, ``1`` when ``--topics`` names a topic
        that is not a key of ``AOPS_QUERIES``.  Exceptions raised while
        crawling propagate to the caller after the pool has been closed.
    """
    # Third-party / backend imports are kept function-scoped, as in the
    # original, so argument parsing does not depend on them.
    import asyncpg
    import pgvector.asyncpg
    from app.dependencies import get_ai_client

    prefix = "[DRY RUN] " if args.dry_run else ""
    sources = [s.strip() for s in args.sources.split(",")]

    if args.gap_driven:
        print(f"{prefix}Fetching gap topics from {args.api_base} …")
        topics = await fetch_gap_topics(args.api_base, args.limit)
    else:
        raw = args.topics
        if raw == "all":
            topics = list(AOPS_QUERIES.keys())
        else:
            topics = [t.strip() for t in raw.split(",")]
            # Reject typos up front instead of silently crawling nothing.
            unknown = [t for t in topics if t not in AOPS_QUERIES]
            if unknown:
                print(f"Unknown topics: {unknown}. Valid: {list(AOPS_QUERIES)}", file=sys.stderr)
                return 1

    print(f"{prefix}Topics: {topics}")
    print(f"{prefix}Sources: {sources}")

    pool = None
    try:
        if args.database_url and not args.dry_run:

            async def _init_conn(conn):
                # Register the pgvector codec on every new pooled connection.
                await pgvector.asyncpg.register_vector(conn)

            pool = await asyncpg.create_pool(args.database_url, init=_init_conn)
            # Log only scheme/host/port/path: the raw URL usually embeds the password.
            print(f"Connected to Postgres ({_redact_database_url(args.database_url)})")
        elif not args.dry_run:
            print("Warning: no DATABASE_URL — wiki units will NOT be saved to DB.", file=sys.stderr)

        # --reset is an alias for --reset-progress.  When only the alias was
        # given, clear progress here; when --reset-progress was given, the
        # flag is forwarded to crawl_and_ingest below instead.
        if args.reset and not args.reset_progress:
            from crawl.progress import reset as _reset_progress

            _reset_progress()

        # Both the reset above and client construction happen inside the
        # try-block so that a failure in either still closes the pool.
        client = get_ai_client()
        stats = await crawl_and_ingest(
            client,
            topics=topics,
            sources=sources,
            dry_run=args.dry_run,
            reset_progress=args.reset_progress,
            pool=pool,
        )
    finally:
        if pool is not None:
            await pool.close()

    print(
        f"\n{prefix}Done.\n"
        f" Topics crawled : {stats['topics']}\n"
        f" Pages fetched : {stats['pages_fetched']}\n"
        f" Chunks sent : {stats['chunks_sent']}\n"
        f" Wiki units added: {stats['wiki_units_added']}\n"
        f" Pages skipped : {stats['skipped_seen']} (already seen)\n"
        f" Errors : {stats['errors']}"
    )
    return 0
def main() -> None:
    """Parse the command line, run the async crawl, and exit with its status.

    The integer returned by ``_main`` becomes the process exit code.
    """
    cli_args = _build_parser().parse_args()
    exit_code = asyncio.run(_main(cli_args))
    sys.exit(exit_code)


# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()