Spaces:

MinhTai
/

ai-agent-app

Running

File size: 4,621 Bytes

06620da

#!/usr/bin/env python3
"""Gap-driven web crawler: AoPS Wiki + Paul's Notes → math_wiki DB."""
import argparse
import asyncio
import sys
import os

# Ensure both the scripts/ dir (for crawl.*) and backend/ (for app.*) are on the path.
# __file__ is made absolute first: if it is a bare relative filename,
# dirname(__file__) is "" and the derived backend/ path would be resolved
# against the current working directory instead of this file's parent directory.
_SCRIPTS_DIR = os.path.dirname(os.path.abspath(__file__))
_BACKEND_DIR = os.path.join(os.path.dirname(_SCRIPTS_DIR), "backend")
for _p in (_SCRIPTS_DIR, _BACKEND_DIR):
    # Prepend so these directories win over same-named installed packages.
    if _p not in sys.path:
        sys.path.insert(0, _p)

from crawl.topic_map import AOPS_QUERIES
from crawl.runner import crawl_and_ingest, fetch_gap_topics


def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the crawler.

    Exactly one topic selector (``--gap-driven`` or ``--topics``) is required.
    All other options are optional; ``--database-url`` defaults to the
    ``DATABASE_URL`` environment variable as read when this function runs.

    Returns:
        The fully configured ``argparse.ArgumentParser``.
    """
    cli = argparse.ArgumentParser(
        description="Crawl AoPS Wiki and Paul's Notes, ingest into math_wiki DB."
    )

    # The two ways of choosing topics cannot be combined, and one is mandatory.
    topic_selector = cli.add_mutually_exclusive_group(required=True)
    topic_selector.add_argument(
        "--gap-driven",
        action="store_true",
        help="Pull topics from /math-gaps on the running backend.",
    )
    topic_selector.add_argument(
        "--topics",
        metavar="TOPICS",
        help='Comma-separated topic labels, or "all".',
    )

    # Options that carry a value, registered in the order shown in --help.
    valued_options = (
        (
            "--sources",
            {
                "default": "aops,pauls,generic",
                "help": 'Comma-separated sources: "aops", "pauls", "generic" (default: aops,pauls,generic).',
            },
        ),
        (
            "--limit",
            {
                "type": int,
                "default": 5,
                "help": "Max topics when --gap-driven (default: 5).",
            },
        ),
        (
            "--api-base",
            {
                "default": "http://localhost:8000",
                "help": "Backend URL for gap fetch (default: http://localhost:8000).",
            },
        ),
        (
            "--database-url",
            {
                "default": os.environ.get("DATABASE_URL", ""),
                "help": "asyncpg-compatible Postgres URL (falls back to DATABASE_URL env var).",
            },
        ),
    )
    for flag, options in valued_options:
        cli.add_argument(flag, **options)

    # Plain on/off switches; each one defaults to False.
    switches = (
        ("--dry-run", "Fetch and chunk, but skip concept_ingest and progress writes."),
        ("--reset-progress", "Clear crawl_progress.json before starting."),
        (
            "--reset",
            "Alias for --reset-progress: clear crawl_progress.json before starting (for local dev).",
        ),
    )
    for flag, description in switches:
        cli.add_argument(flag, action="store_true", help=description)

    return cli


def _split_csv(raw: str) -> list:
    """Split a comma-separated CLI value into stripped, non-empty items."""
    stripped = (item.strip() for item in raw.split(","))
    return [item for item in stripped if item]


def _redact_dsn(dsn: str) -> str:
    """Reduce a database URL to scheme, host, port and path so it is safe to print."""
    from urllib.parse import urlsplit  # only needed for this log line

    try:
        parts = urlsplit(dsn)
        host = parts.hostname or ""
        port = f":{parts.port}" if parts.port is not None else ""
    except ValueError:
        # Malformed netloc or port: reveal nothing rather than echo the raw URL.
        return "<redacted>"
    # Userinfo and the query string are dropped: both can carry credentials.
    return f"{parts.scheme}://{host}{port}{parts.path}"


async def _main(args: argparse.Namespace) -> int:
    """Resolve topics and sources from *args*, run the crawl, and print a summary.

    Args:
        args: Parsed command-line namespace. Reads ``dry_run``, ``sources``,
            ``gap_driven``, ``api_base``, ``limit``, ``topics``,
            ``database_url``, ``reset`` and ``reset_progress``.

    Returns:
        ``0`` after the crawl finished and the summary was printed; ``1`` when
        no topics or sources were given, or a topic is not a key of
        ``AOPS_QUERIES``.
    """
    import asyncpg
    import pgvector.asyncpg
    from app.dependencies import get_ai_client

    prefix = "[DRY RUN] " if args.dry_run else ""

    # Blank items (e.g. from a trailing comma) are dropped instead of being
    # passed on as an empty-string source.
    sources = _split_csv(args.sources)
    if not sources:
        print("No sources given.", file=sys.stderr)
        return 1

    if args.gap_driven:
        print(f"{prefix}Fetching gap topics from {args.api_base} …")
        topics = await fetch_gap_topics(args.api_base, args.limit)
    elif args.topics == "all":
        topics = list(AOPS_QUERIES)
    else:
        topics = _split_csv(args.topics)
        if not topics:
            print("No topics given.", file=sys.stderr)
            return 1

    # The same validation applies to gap-derived topics and explicit ones.
    unknown = [t for t in topics if t not in AOPS_QUERIES]
    if unknown:
        print(f"Unknown topics: {unknown}. Valid: {list(AOPS_QUERIES)}", file=sys.stderr)
        return 1

    print(f"{prefix}Topics: {topics}")
    print(f"{prefix}Sources: {sources}")

    pool = None
    try:
        # Everything after pool creation stays inside this try so the pool is
        # closed even if the progress reset or AI-client construction raises.
        if args.database_url and not args.dry_run:
            async def _init_conn(conn):
                # Runs for each new pooled connection (asyncpg ``init`` hook).
                await pgvector.asyncpg.register_vector(conn)

            pool = await asyncpg.create_pool(args.database_url, init=_init_conn)
            # Never echo the raw URL: it may embed the username and password.
            print(f"Connected to Postgres ({_redact_dsn(args.database_url)})")
        elif not args.dry_run:
            print("Warning: no DATABASE_URL — wiki units will NOT be saved to DB.", file=sys.stderr)

        # --reset is handled here; --reset-progress is forwarded to
        # crawl_and_ingest below, so only one of the two paths runs.
        if args.reset and not args.reset_progress:
            from crawl.progress import reset as _reset_progress
            _reset_progress()

        client = get_ai_client()
        stats = await crawl_and_ingest(
            client,
            topics=topics,
            sources=sources,
            dry_run=args.dry_run,
            reset_progress=args.reset_progress,
            pool=pool,
        )
    finally:
        if pool is not None:
            await pool.close()

    print(
        f"\n{prefix}Done.\n"
        f"  Topics crawled  : {stats['topics']}\n"
        f"  Pages fetched   : {stats['pages_fetched']}\n"
        f"  Chunks sent     : {stats['chunks_sent']}\n"
        f"  Wiki units added: {stats['wiki_units_added']}\n"
        f"  Pages skipped   : {stats['skipped_seen']} (already seen)\n"
        f"  Errors          : {stats['errors']}"
    )
    return 0


def main() -> None:
    """Parse the command line, run the async crawl, and exit with its status code."""
    cli_args = _build_parser().parse_args()
    exit_code = asyncio.run(_main(cli_args))
    sys.exit(exit_code)


# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()