Spaces:
Running
Running
File size: 4,621 Bytes
06620da | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 | #!/usr/bin/env python3
"""Gap-driven web crawler: AoPS Wiki + Paul's Notes → math_wiki DB."""
import argparse
import asyncio
import sys
import os
# Ensure both the scripts/ dir (for crawl.*) and backend/ (for app.*) are on the path.
# abspath() matters: before Python 3.9 a script's __file__ may be relative, in which
# case dirname() can collapse to "" and "backend" would be resolved against the
# current working directory instead of the directory next to scripts/.
_SCRIPTS_DIR = os.path.dirname(os.path.abspath(__file__))
_BACKEND_DIR = os.path.join(os.path.dirname(_SCRIPTS_DIR), "backend")
for _p in (_SCRIPTS_DIR, _BACKEND_DIR):
    # Prepend only when missing so repeated imports do not grow sys.path.
    if _p not in sys.path:
        sys.path.insert(0, _p)
from crawl.topic_map import AOPS_QUERIES
from crawl.runner import crawl_and_ingest, fetch_gap_topics
def _build_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the crawler command line.

    Exactly one of ``--gap-driven`` or ``--topics`` must be supplied (they
    form a required, mutually exclusive group). All remaining options are
    optional. ``--reset`` is registered as its own boolean flag (stored as
    ``reset``) even though its help text describes it as an alias for
    ``--reset-progress``.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    cli = argparse.ArgumentParser(
        description="Crawl AoPS Wiki and Paul's Notes, ingest into math_wiki DB."
    )

    # Where the topic list comes from: one of these two is mandatory.
    topic_source = cli.add_mutually_exclusive_group(required=True)
    topic_source.add_argument(
        "--gap-driven",
        action="store_true",
        help="Pull topics from /math-gaps on the running backend.",
    )
    topic_source.add_argument(
        "--topics",
        metavar="TOPICS",
        help='Comma-separated topic labels, or "all".',
    )

    # Options that carry a value, declared in the order they appear in --help.
    valued_options = (
        (
            "--sources",
            {
                "default": "aops,pauls,generic",
                "help": 'Comma-separated sources: "aops", "pauls", "generic" (default: aops,pauls,generic).',
            },
        ),
        (
            "--limit",
            {
                "type": int,
                "default": 5,
                "help": "Max topics when --gap-driven (default: 5).",
            },
        ),
        (
            "--api-base",
            {
                "default": "http://localhost:8000",
                "help": "Backend URL for gap fetch (default: http://localhost:8000).",
            },
        ),
        (
            "--database-url",
            {
                # The environment is read once, when the parser is built.
                "default": os.environ.get("DATABASE_URL", ""),
                "help": "asyncpg-compatible Postgres URL (falls back to DATABASE_URL env var).",
            },
        ),
    )
    for flag, spec in valued_options:
        cli.add_argument(flag, **spec)

    # Plain on/off switches; all default to False.
    switches = (
        ("--dry-run", "Fetch and chunk, but skip concept_ingest and progress writes."),
        ("--reset-progress", "Clear crawl_progress.json before starting."),
        (
            "--reset",
            "Alias for --reset-progress: clear crawl_progress.json before starting (for local dev).",
        ),
    )
    for flag, help_text in switches:
        cli.add_argument(flag, action="store_true", help=help_text)

    return cli
def _describe_dsn(dsn: str) -> str:
    """Summarise *dsn* as ``host[:port]/database`` with credentials and query removed."""
    from urllib.parse import urlsplit

    try:
        parts = urlsplit(dsn)
        host = parts.hostname or "<default host>"
        port = parts.port
    except ValueError:
        # Malformed netloc/port: never fall back to echoing the raw string,
        # since it may contain a password.
        return "<unparseable URL>"
    location = host if port is None else f"{host}:{port}"
    return f"{location}{parts.path}"


async def _main(args: argparse.Namespace) -> int:
    """Resolve topics, optionally connect to Postgres, run the crawl, and print a summary.

    Args:
        args: Parsed command-line namespace as produced by ``_build_parser()``.

    Returns:
        ``0`` after the crawl finishes and the summary is printed, or ``1``
        when ``--topics`` names a topic that is not a key of ``AOPS_QUERIES``.
    """
    # Imported lazily so that building the parser / showing --help does not
    # require the database driver or the backend package.
    import asyncpg
    import pgvector.asyncpg
    from app.dependencies import get_ai_client

    prefix = "[DRY RUN] " if args.dry_run else ""
    sources = [s.strip() for s in args.sources.split(",")]

    if args.gap_driven:
        print(f"{prefix}Fetching gap topics from {args.api_base} …")
        topics = await fetch_gap_topics(args.api_base, args.limit)
    else:
        raw = args.topics
        if raw == "all":
            topics = list(AOPS_QUERIES.keys())
        else:
            topics = [t.strip() for t in raw.split(",")]
            unknown = [t for t in topics if t not in AOPS_QUERIES]
            if unknown:
                print(f"Unknown topics: {unknown}. Valid: {list(AOPS_QUERIES)}", file=sys.stderr)
                return 1

    print(f"{prefix}Topics: {topics}")
    print(f"{prefix}Sources: {sources}")

    pool = None
    try:
        # Everything after pool creation lives inside the try so the pool is
        # closed even if progress reset or AI-client construction fails.
        if args.database_url and not args.dry_run:
            async def _init_conn(conn):
                # Register the pgvector codec on every new pooled connection.
                await pgvector.asyncpg.register_vector(conn)

            pool = await asyncpg.create_pool(args.database_url, init=_init_conn)
            # Log only host/port/database; the raw URL may embed a password.
            print(f"Connected to Postgres ({_describe_dsn(args.database_url)})")
        elif not args.dry_run:
            print("Warning: no DATABASE_URL — wiki units will NOT be saved to DB.", file=sys.stderr)

        # --reset-progress is forwarded to crawl_and_ingest below; --reset on
        # its own is handled here so the reset is not requested twice.
        if args.reset and not args.reset_progress:
            from crawl.progress import reset as _reset_progress
            _reset_progress()

        client = get_ai_client()
        stats = await crawl_and_ingest(
            client,
            topics=topics,
            sources=sources,
            dry_run=args.dry_run,
            reset_progress=args.reset_progress,
            pool=pool,
        )
    finally:
        if pool is not None:
            await pool.close()

    print(
        f"\n{prefix}Done.\n"
        f" Topics crawled : {stats['topics']}\n"
        f" Pages fetched : {stats['pages_fetched']}\n"
        f" Chunks sent : {stats['chunks_sent']}\n"
        f" Wiki units added: {stats['wiki_units_added']}\n"
        f" Pages skipped : {stats['skipped_seen']} (already seen)\n"
        f" Errors : {stats['errors']}"
    )
    return 0
def main() -> None:
    """Parse the command line, run the async crawl, and exit with its status code."""
    cli_args = _build_parser().parse_args()
    exit_code = asyncio.run(_main(cli_args))
    sys.exit(exit_code)


if __name__ == "__main__":
    main()
|