File size: 4,621 Bytes
06620da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/usr/bin/env python3
"""Gap-driven web crawler: AoPS Wiki + Paul's Notes → math_wiki DB."""
import argparse
import asyncio
import sys
import os

# Ensure both the scripts/ dir (for crawl.*) and backend/ (for app.*) are on the path.
# The script location is made absolute first: when __file__ is a bare relative
# name (e.g. the script is launched from its own directory on older Pythons),
# dirname() would be "" and the backend path would resolve against the CWD.
_SCRIPTS_DIR = os.path.dirname(os.path.abspath(__file__))
_BACKEND_DIR = os.path.join(os.path.dirname(_SCRIPTS_DIR), "backend")
for _p in (_SCRIPTS_DIR, _BACKEND_DIR):
    if _p not in sys.path:
        sys.path.insert(0, _p)

from crawl.topic_map import AOPS_QUERIES
from crawl.runner import crawl_and_ingest, fetch_gap_topics


def _build_parser() -> argparse.ArgumentParser:
    """Construct the command-line interface for the crawler.

    Exactly one topic selector (``--gap-driven`` or ``--topics``) must be
    supplied; every other option carries a default.

    Returns:
        A configured ``argparse.ArgumentParser``.
    """
    cli = argparse.ArgumentParser(
        description="Crawl AoPS Wiki and Paul's Notes, ingest into math_wiki DB."
    )

    # Topic selection: the two modes exclude each other and one is mandatory.
    selector = cli.add_mutually_exclusive_group(required=True)
    selector.add_argument(
        "--gap-driven",
        action="store_true",
        help="Pull topics from /math-gaps on the running backend.",
    )
    selector.add_argument(
        "--topics",
        metavar="TOPICS",
        help='Comma-separated topic labels, or "all".',
    )

    # Options that take a value.
    cli.add_argument(
        "--sources",
        default="aops,pauls,generic",
        help='Comma-separated sources: "aops", "pauls", "generic" (default: aops,pauls,generic).',
    )
    cli.add_argument(
        "--limit",
        type=int,
        default=5,
        help="Max topics when --gap-driven (default: 5).",
    )
    cli.add_argument(
        "--api-base",
        default="http://localhost:8000",
        help="Backend URL for gap fetch (default: http://localhost:8000).",
    )
    env_database_url = os.environ.get("DATABASE_URL", "")
    cli.add_argument(
        "--database-url",
        default=env_database_url,
        help="asyncpg-compatible Postgres URL (falls back to DATABASE_URL env var).",
    )

    # Plain on/off switches, registered in display order.
    switches = (
        ("--dry-run", "Fetch and chunk, but skip concept_ingest and progress writes."),
        ("--reset-progress", "Clear crawl_progress.json before starting."),
        (
            "--reset",
            "Alias for --reset-progress: clear crawl_progress.json before starting (for local dev).",
        ),
    )
    for flag, description in switches:
        cli.add_argument(flag, action="store_true", help=description)

    return cli


def _redact_credentials(database_url: str) -> str:
    """Return a log-safe form of *database_url*.

    Everything up to the last ``@`` (the ``user:password`` part) is replaced
    by ``***`` and any query string is dropped. Input that is not in
    ``scheme://...`` form is not echoed at all.
    """
    scheme, sep, rest = database_url.partition("://")
    if not sep:
        return "<redacted>"
    # Split on the LAST "@" so a password that itself contains "@" is fully hidden.
    _userinfo, at, location = rest.rpartition("@")
    location = location.split("?", 1)[0]
    return f"{scheme}://{'***@' if at else ''}{location}"


async def _main(args: argparse.Namespace) -> int:
    """Resolve topics, optionally open a Postgres pool, and run the crawl.

    Args:
        args: Parsed command-line options as produced by ``_build_parser()``.

    Returns:
        ``0`` after the crawl finishes and its summary is printed, or ``1``
        when any requested topic is not a key of ``AOPS_QUERIES``.
    """
    # Third-party and backend imports are resolved only when a crawl is started.
    import asyncpg
    import pgvector.asyncpg
    from app.dependencies import get_ai_client

    prefix = "[DRY RUN] " if args.dry_run else ""
    # Drop empty entries so a trailing comma does not yield a "" source.
    sources = [s.strip() for s in args.sources.split(",") if s.strip()]

    if args.gap_driven:
        print(f"{prefix}Fetching gap topics from {args.api_base} …")
        topics = await fetch_gap_topics(args.api_base, args.limit)
    else:
        raw = args.topics
        if raw == "all":
            topics = list(AOPS_QUERIES.keys())
        else:
            topics = [t.strip() for t in raw.split(",")]

    unknown = [t for t in topics if t not in AOPS_QUERIES]
    if unknown:
        print(f"Unknown topics: {unknown}. Valid: {list(AOPS_QUERIES)}", file=sys.stderr)
        return 1

    print(f"{prefix}Topics: {topics}")
    print(f"{prefix}Sources: {sources}")

    pool = None
    # The try starts before the pool is created so that a failure in any later
    # step (progress reset, AI client construction, the crawl) still closes it.
    try:
        if args.database_url and not args.dry_run:
            async def _init_conn(conn):
                # Register the pgvector codec on every new pool connection.
                await pgvector.asyncpg.register_vector(conn)

            pool = await asyncpg.create_pool(args.database_url, init=_init_conn)
            # Never echo the raw DSN: it usually embeds user:password.
            print(f"Connected to Postgres ({_redact_credentials(args.database_url)[:40]}…)")
        elif not args.dry_run:
            print("Warning: no DATABASE_URL — wiki units will NOT be saved to DB.", file=sys.stderr)

        # --reset alone clears progress here; --reset-progress is forwarded to
        # crawl_and_ingest below, so the reset is not requested twice.
        if args.reset and not args.reset_progress:
            from crawl.progress import reset as _reset_progress
            _reset_progress()

        client = get_ai_client()
        stats = await crawl_and_ingest(
            client,
            topics=topics,
            sources=sources,
            dry_run=args.dry_run,
            reset_progress=args.reset_progress,
            pool=pool,
        )
    finally:
        if pool is not None:
            await pool.close()

    print(
        f"\n{prefix}Done.\n"
        f"  Topics crawled  : {stats['topics']}\n"
        f"  Pages fetched   : {stats['pages_fetched']}\n"
        f"  Chunks sent     : {stats['chunks_sent']}\n"
        f"  Wiki units added: {stats['wiki_units_added']}\n"
        f"  Pages skipped   : {stats['skipped_seen']} (already seen)\n"
        f"  Errors          : {stats['errors']}"
    )
    return 0


def main() -> None:
    """Parse the command line, run the async crawl, and exit with its status code."""
    cli_args = _build_parser().parse_args()
    exit_code = asyncio.run(_main(cli_args))
    sys.exit(exit_code)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()