Spaces:

MinhTai
/

ai-agent-app

Running

App Files Files Community

ai-agent-app / scripts /crawl_wiki.py

MinhTai

deploy: 51e906f

5bae9e6 about 3 hours ago

raw

history blame contribute delete

4.62 kB

	#!/usr/bin/env python3
	"""Web crawler CLI (gap-driven or explicit topics): AoPS Wiki, Paul's Notes and generic sources → math_wiki DB."""
	import argparse
	import asyncio
	import sys
	import os

	# Make the ``crawl`` package (which lives in scripts/) and the ``app`` package
	# (which lives in ../backend/) importable no matter where the script is
	# launched from.  ``__file__`` can be a relative path when the file is run as a
	# script (Python < 3.9), in which case dirname() would yield "" and the backend
	# path would silently resolve against the current working directory — so the
	# path is made absolute first.
	_SCRIPTS_DIR = os.path.dirname(os.path.abspath(__file__))
	_BACKEND_DIR = os.path.join(os.path.dirname(_SCRIPTS_DIR), "backend")
	for _p in (_SCRIPTS_DIR, _BACKEND_DIR):
	    if _p not in sys.path:
	        sys.path.insert(0, _p)

	from crawl.topic_map import AOPS_QUERIES
	from crawl.runner import crawl_and_ingest, fetch_gap_topics


	def _build_parser() -> argparse.ArgumentParser:
	    """Create the argument parser for the crawler command line.

	    Exactly one topic selector is mandatory: ``--gap-driven`` or ``--topics``
	    (they are mutually exclusive).  The remaining options are optional and
	    carry defaults; ``--database-url`` defaults to the ``DATABASE_URL``
	    environment variable read at the time this function is called.

	    Note that ``--reset`` is registered as its own flag (destination
	    ``reset``), separate from ``--reset-progress``.

	    Returns:
	        The configured ``argparse.ArgumentParser``.
	    """
	    cli = argparse.ArgumentParser(
	        description="Crawl AoPS Wiki and Paul's Notes, ingest into math_wiki DB."
	    )

	    # Topic selection: one of the two is required, never both.
	    topic_selector = cli.add_mutually_exclusive_group(required=True)
	    topic_selector.add_argument(
	        "--gap-driven",
	        action="store_true",
	        help="Pull topics from /math-gaps on the running backend.",
	    )
	    topic_selector.add_argument(
	        "--topics",
	        metavar="TOPICS",
	        help='Comma-separated topic labels, or "all".',
	    )

	    # Options that take a value, registered in the order they appear in --help.
	    valued_options = (
	        (
	            "--sources",
	            {
	                "default": "aops,pauls,generic",
	                "help": 'Comma-separated sources: "aops", "pauls", "generic" (default: aops,pauls,generic).',
	            },
	        ),
	        (
	            "--limit",
	            {
	                "type": int,
	                "default": 5,
	                "help": "Max topics when --gap-driven (default: 5).",
	            },
	        ),
	        (
	            "--api-base",
	            {
	                "default": "http://localhost:8000",
	                "help": "Backend URL for gap fetch (default: http://localhost:8000).",
	            },
	        ),
	        (
	            "--database-url",
	            {
	                "default": os.environ.get("DATABASE_URL", ""),
	                "help": "asyncpg-compatible Postgres URL (falls back to DATABASE_URL env var).",
	            },
	        ),
	    )
	    for flag, settings in valued_options:
	        cli.add_argument(flag, **settings)

	    # Plain on/off switches.
	    switches = (
	        ("--dry-run", "Fetch and chunk, but skip concept_ingest and progress writes."),
	        ("--reset-progress", "Clear crawl_progress.json before starting."),
	        (
	            "--reset",
	            "Alias for --reset-progress: clear crawl_progress.json before starting (for local dev).",
	        ),
	    )
	    for flag, description in switches:
	        cli.add_argument(flag, action="store_true", help=description)

	    return cli


	def _split_csv(raw: str) -> "list[str]":
	    """Split a comma-separated option value into stripped, non-empty items."""
	    return [item.strip() for item in raw.split(",") if item.strip()]


	def _redact_dsn(url: str) -> str:
	    """Return only host, port and path of a database URL, never credentials."""
	    from urllib.parse import urlsplit

	    try:
	        parts = urlsplit(url)
	        port = f":{parts.port}" if parts.port else ""
	    except ValueError:
	        # Unparseable URL or port (e.g. a multi-host DSN): reveal nothing.
	        return "<redacted>"
	    return f"{parts.hostname or '?'}{port}{parts.path}"


	async def _main(args: argparse.Namespace) -> int:
	    """Resolve the topic list, run the crawl and print a summary.

	    Topics come from ``fetch_gap_topics`` when ``args.gap_driven`` is set,
	    otherwise from ``args.topics`` (``"all"`` expands to every key of
	    ``AOPS_QUERIES``).  A Postgres pool is opened only when a database URL is
	    configured and this is not a dry run; it is always closed again, even if
	    a later step fails.

	    Args:
	        args: Parsed command-line namespace as produced by ``_build_parser``.

	    Returns:
	        ``0`` after a completed crawl, ``1`` when the requested topics are
	        empty or not present in ``AOPS_QUERIES``.
	    """
	    prefix = "[DRY RUN] " if args.dry_run else ""
	    # Empty entries (e.g. from a trailing comma) are dropped.
	    sources = _split_csv(args.sources)

	    if args.gap_driven:
	        print(f"{prefix}Fetching gap topics from {args.api_base} …")
	        topics = await fetch_gap_topics(args.api_base, args.limit)
	    elif args.topics == "all":
	        topics = list(AOPS_QUERIES)
	    else:
	        topics = _split_csv(args.topics)
	        if not topics:
	            print(f"No topics given. Valid: {list(AOPS_QUERIES)}", file=sys.stderr)
	            return 1

	    # NOTE(review): this check is applied to gap-driven topics as well,
	    # assuming it sits at function level in the original — confirm.
	    unknown = [t for t in topics if t not in AOPS_QUERIES]
	    if unknown:
	        print(f"Unknown topics: {unknown}. Valid: {list(AOPS_QUERIES)}", file=sys.stderr)
	        return 1

	    # Imported only after argument validation so that bad input is reported
	    # without requiring the backend package to be importable.
	    from app.dependencies import get_ai_client

	    print(f"{prefix}Topics: {topics}")
	    print(f"{prefix}Sources: {sources}")

	    pool = None
	    if args.database_url and not args.dry_run:
	        # The database drivers are needed only when we actually connect, so a
	        # dry run works without them installed.
	        import asyncpg
	        import pgvector.asyncpg

	        async def _init_conn(conn):
	            await pgvector.asyncpg.register_vector(conn)

	        pool = await asyncpg.create_pool(args.database_url, init=_init_conn)
	        # Never echo the raw URL: it usually embeds the user name and password.
	        print(f"Connected to Postgres ({_redact_dsn(args.database_url)})")
	    elif not args.dry_run:
	        print("Warning: no DATABASE_URL — wiki units will NOT be saved to DB.", file=sys.stderr)

	    # Everything after the pool exists runs inside try/finally so the pool is
	    # released even if the progress reset or client construction fails.
	    try:
	        if args.reset and not args.reset_progress:
	            # --reset alone clears progress here; --reset-progress is instead
	            # forwarded to crawl_and_ingest below.
	            from crawl.progress import reset as _reset_progress
	            _reset_progress()

	        client = get_ai_client()
	        stats = await crawl_and_ingest(
	            client,
	            topics=topics,
	            sources=sources,
	            dry_run=args.dry_run,
	            reset_progress=args.reset_progress,
	            pool=pool,
	        )
	    finally:
	        if pool is not None:
	            await pool.close()

	    print(
	        f"\n{prefix}Done.\n"
	        f" Topics crawled : {stats['topics']}\n"
	        f" Pages fetched : {stats['pages_fetched']}\n"
	        f" Chunks sent : {stats['chunks_sent']}\n"
	        f" Wiki units added: {stats['wiki_units_added']}\n"
	        f" Pages skipped : {stats['skipped_seen']} (already seen)\n"
	        f" Errors : {stats['errors']}"
	    )
	    return 0


	def main() -> None:
	    """Parse the command line, run the async crawl, and exit with its status.

	    The integer returned by ``_main`` is passed to ``sys.exit`` and so becomes
	    the process exit code.
	    """
	    cli_args = _build_parser().parse_args()
	    exit_code = asyncio.run(_main(cli_args))
	    sys.exit(exit_code)


	# Run the CLI only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()