ai-agent-app / scripts /crawl_wiki.py
MinhTai's picture
deploy: 51e906f
5bae9e6
#!/usr/bin/env python3
"""Gap-driven web crawler: AoPS Wiki + Paul's Notes → math_wiki DB."""
import argparse
import asyncio
import sys
import os
# Ensure both the scripts/ dir (for crawl.*) and backend/ (for app.*) are on the path.
# __file__ is made absolute first: when it is relative (e.g. the script is launched
# as `python crawl_wiki.py` on Python < 3.9), dirname() returns "" and the backend
# path would silently resolve against the current working directory instead.
_SCRIPTS_DIR = os.path.dirname(os.path.abspath(__file__))
_BACKEND_DIR = os.path.join(os.path.dirname(_SCRIPTS_DIR), "backend")
for _p in (_SCRIPTS_DIR, _BACKEND_DIR):
    if _p not in sys.path:
        # Prepend so these directories win over same-named installed packages.
        sys.path.insert(0, _p)
from crawl.topic_map import AOPS_QUERIES
from crawl.runner import crawl_and_ingest, fetch_gap_topics
def _build_parser() -> argparse.ArgumentParser:
    """Assemble the command-line interface for the crawler.

    Exactly one topic selector is required (``--gap-driven`` or ``--topics``);
    every other option is optional and carries a default.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    cli = argparse.ArgumentParser(
        description="Crawl AoPS Wiki and Paul's Notes, ingest into math_wiki DB."
    )

    # The two ways of choosing topics cannot be combined, and one must be given.
    topic_source = cli.add_mutually_exclusive_group(required=True)
    topic_source.add_argument(
        "--gap-driven",
        action="store_true",
        help="Pull topics from /math-gaps on the running backend.",
    )
    topic_source.add_argument(
        "--topics",
        metavar="TOPICS",
        help='Comma-separated topic labels, or "all".',
    )

    # Options that take a value, in the order they appear in --help.
    value_options = (
        (
            "--sources",
            {
                "default": "aops,pauls,generic",
                "help": 'Comma-separated sources: "aops", "pauls", "generic" (default: aops,pauls,generic).',
            },
        ),
        (
            "--limit",
            {
                "type": int,
                "default": 5,
                "help": "Max topics when --gap-driven (default: 5).",
            },
        ),
        (
            "--api-base",
            {
                "default": "http://localhost:8000",
                "help": "Backend URL for gap fetch (default: http://localhost:8000).",
            },
        ),
        (
            "--database-url",
            {
                # The environment is read once, when the parser is built.
                "default": os.environ.get("DATABASE_URL", ""),
                "help": "asyncpg-compatible Postgres URL (falls back to DATABASE_URL env var).",
            },
        ),
    )
    for flag, spec in value_options:
        cli.add_argument(flag, **spec)

    # Plain on/off switches; each one defaults to False.
    switches = (
        ("--dry-run", "Fetch and chunk, but skip concept_ingest and progress writes."),
        ("--reset-progress", "Clear crawl_progress.json before starting."),
        (
            "--reset",
            "Alias for --reset-progress: clear crawl_progress.json before starting (for local dev).",
        ),
    )
    for flag, description in switches:
        cli.add_argument(flag, action="store_true", help=description)

    return cli
def _redact_dsn(dsn: str) -> str:
    """Return *dsn* as ``scheme://host[:port]/path`` with credentials and query removed.

    Anything that cannot be parsed confidently collapses to ``"<redacted>"`` so a
    password is never echoed to the console.
    """
    from urllib.parse import urlsplit

    try:
        parts = urlsplit(dsn)
        host = parts.hostname or ""
        port = parts.port  # raises ValueError on non-numeric / multi-host ports
    except ValueError:
        return "<redacted>"
    # An "@" left in the path means an unescaped password confused the split.
    if not parts.scheme or "@" in parts.path:
        return "<redacted>"
    if ":" in host:
        # urlsplit strips the brackets from IPv6 literals; put them back.
        host = f"[{host}]"
    location = f"{host}:{port}" if port is not None else host
    return f"{parts.scheme}://{location}{parts.path}"


async def _main(args: argparse.Namespace) -> int:
    """Resolve topics and sources from *args*, run the crawl, and print a summary.

    Args:
        args: Namespace produced by ``_build_parser()``.

    Returns:
        0 after a completed crawl; 1 when ``--sources`` or ``--topics`` is empty
        or ``--topics`` names a topic that is not a key of ``AOPS_QUERIES``.
    """
    from app.dependencies import get_ai_client

    prefix = "[DRY RUN] " if args.dry_run else ""

    # Drop empty entries so stray or trailing commas do not yield a "" source.
    sources = [s.strip() for s in args.sources.split(",") if s.strip()]
    if not sources:
        print("No sources given; pass e.g. --sources aops,pauls.", file=sys.stderr)
        return 1

    if args.gap_driven:
        print(f"{prefix}Fetching gap topics from {args.api_base} …")
        topics = await fetch_gap_topics(args.api_base, args.limit)
    else:
        raw = args.topics.strip()
        if raw == "all":
            topics = list(AOPS_QUERIES)
        else:
            topics = [t.strip() for t in raw.split(",") if t.strip()]
            if not topics:
                print(f"No topics given. Valid: {list(AOPS_QUERIES)}", file=sys.stderr)
                return 1
            unknown = [t for t in topics if t not in AOPS_QUERIES]
            if unknown:
                print(f"Unknown topics: {unknown}. Valid: {list(AOPS_QUERIES)}", file=sys.stderr)
                return 1

    print(f"{prefix}Topics: {topics}")
    print(f"{prefix}Sources: {sources}")

    pool = None
    # Everything after pool creation runs inside try/finally so the pool is
    # closed even if the progress reset or AI-client construction fails.
    try:
        if args.dry_run:
            pass  # a dry run never opens a database connection
        elif args.database_url:
            # Imported lazily: dry runs and DB-less runs do not need the drivers.
            import asyncpg
            import pgvector.asyncpg

            async def _init_conn(conn):
                await pgvector.asyncpg.register_vector(conn)

            pool = await asyncpg.create_pool(args.database_url, init=_init_conn)
            # Never print the raw DSN: it usually embeds the user and password.
            print(f"Connected to Postgres ({_redact_dsn(args.database_url)})")
        else:
            print("Warning: no DATABASE_URL — wiki units will NOT be saved to DB.", file=sys.stderr)

        # --reset is handled here; --reset-progress is forwarded to
        # crawl_and_ingest below, so the reset is not requested twice.
        if args.reset and not args.reset_progress:
            from crawl.progress import reset as _reset_progress

            _reset_progress()

        client = get_ai_client()
        stats = await crawl_and_ingest(
            client,
            topics=topics,
            sources=sources,
            dry_run=args.dry_run,
            reset_progress=args.reset_progress,
            pool=pool,
        )
    finally:
        if pool is not None:
            await pool.close()

    print(
        f"\n{prefix}Done.\n"
        f"  Topics crawled  : {stats['topics']}\n"
        f"  Pages fetched   : {stats['pages_fetched']}\n"
        f"  Chunks sent     : {stats['chunks_sent']}\n"
        f"  Wiki units added: {stats['wiki_units_added']}\n"
        f"  Pages skipped   : {stats['skipped_seen']} (already seen)\n"
        f"  Errors          : {stats['errors']}"
    )
    return 0
def main() -> None:
    """Console entry point: parse the command line, run the crawl, exit with its status."""
    namespace = _build_parser().parse_args()
    exit_code = asyncio.run(_main(namespace))
    sys.exit(exit_code)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()