File size: 3,780 Bytes
e63c592
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import argparse
import itertools
import sys
from typing import List, Optional, Sequence

import httpx


# Search queries used to seed the backend. main() iterates this list in order
# and submits every entry as the "query" field to both the arXiv and the
# OpenAlex ingest endpoints.
TOPICS: List[str] = [
    "retrieval augmented generation",
    "vector databases",
    "semantic search",
    "information retrieval",
    "large language models",
    "transformer architectures",
    "question answering",
    "document similarity",
    "embedding models",
    "knowledge graphs",
    "few-shot learning",
    "self supervised learning",
    "contrastive learning",
    "neural search",
    "dense passage retrieval",
    "sparse retrieval",
    "multi modal retrieval",
    "open domain question answering",
    "context windows",
    "memory in llms",
    "hallucination mitigation",
    "prompt engineering",
    "evaluation of rag",
    "document chunking",
    "vector compression",
    "approximate nearest neighbors",
    "Pinecone vector database",
    "OpenAlex scholarly graph",
    "arXiv preprint search",
    "retrieval pipelines",
]


def _positive_int(raw: str) -> int:
    """Convert a command-line string to an integer that is at least 1.

    Used as an argparse ``type=`` callable, so failures are reported through
    ``argparse.ArgumentTypeError`` and surface as a normal usage error.
    """
    try:
        value = int(raw)
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"expected an integer, got {raw!r}"
        ) from None
    if value < 1:
        raise argparse.ArgumentTypeError(f"must be at least 1, got {value}")
    return value


def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parse the command-line options of the seeding script.

    Args:
        argv: Argument strings to parse. ``None`` (the default) lets argparse
            fall back to ``sys.argv[1:]``, so existing ``parse_args()`` calls
            keep working; passing a list allows programmatic use and testing.

    Returns:
        A namespace with the attributes ``base_url``, ``namespace``,
        ``mailto`` and ``max_docs``.

    Raises:
        SystemExit: Raised by argparse for ``--help`` and for invalid input,
            e.g. a missing ``--mailto`` or a ``--max-docs`` value below 1.
    """
    parser = argparse.ArgumentParser(
        description="Seed the RAG backend with documents from arXiv and OpenAlex."
    )
    parser.add_argument(
        "--base-url",
        type=str,
        default="http://localhost:8000",
        help="Base URL of the backend (default: http://localhost:8000)",
    )
    parser.add_argument(
        "--namespace",
        type=str,
        default="dev",
        help="Target Pinecone namespace (default: dev)",
    )
    parser.add_argument(
        "--mailto",
        type=str,
        required=True,
        help="Contact email for OpenAlex API (required)",
    )
    parser.add_argument(
        "--max-docs",
        # Reject zero/negative counts up front instead of forwarding a
        # meaningless document limit to the backend.
        type=_positive_int,
        default=20,
        help="Max documents per topic per source (capped at 20)",
    )
    return parser.parse_args(argv)


def _post_ingest(client: "httpx.Client", label: str, url: str, payload: dict) -> bool:
    """POST *payload* as JSON to *url* and log the outcome to stderr.

    Args:
        client: Open HTTP client used to send the request.
        label: Source name used as the prefix of the log line.
        url: Full ingest endpoint URL.
        payload: JSON-serialisable request body.

    Returns:
        True when the request succeeded and the response body was decoded,
        False when any step failed (the error is logged, not raised).
    """
    try:
        response = client.post(url, json=payload)
        response.raise_for_status()
        body = response.json()
    except Exception as exc:  # noqa: BLE001
        # Deliberately broad: seeding is best-effort, so one failing request
        # must not abort the remaining topics.
        print(f"  {label} error: {exc}", file=sys.stderr)
        return False
    print(f"  {label}: {body}", file=sys.stderr)
    return True


def main() -> int:
    """Seed the backend by ingesting every topic from arXiv and OpenAlex.

    For each entry in ``TOPICS`` one request is sent to ``/ingest/arxiv`` and
    one to ``/ingest/openalex`` under the configured base URL. Progress and
    errors are written to stderr; a failed request does not stop the run.

    Returns:
        Process exit code: 0 when every request succeeded, 1 when at least
        one request failed.
    """
    args = parse_args()
    # The per-request document count is never allowed to exceed 20.
    max_docs = min(args.max_docs, 20)

    print(
        f"Seeding backend at {args.base_url} into namespace='{args.namespace}' "
        f"with up to {max_docs} docs per topic per source.",
        file=sys.stderr,
    )

    # Strip trailing slashes once so the joined endpoint paths never contain "//".
    base_url = args.base_url.rstrip("/")
    arxiv_url = f"{base_url}/ingest/arxiv"
    openalex_url = f"{base_url}/ingest/openalex"

    failures = 0
    with httpx.Client(timeout=30.0) as client:
        for idx, topic in enumerate(TOPICS, start=1):
            print(f"[{idx}/{len(TOPICS)}] Topic: {topic}", file=sys.stderr)

            # Fields shared by both ingest endpoints.
            common = {
                "query": topic,
                "max_docs": max_docs,
                "namespace": args.namespace,
            }
            arxiv_payload = {**common, "category": "papers"}
            openalex_payload = {**common, "mailto": args.mailto}

            if not _post_ingest(client, "arXiv", arxiv_url, arxiv_payload):
                failures += 1
            if not _post_ingest(client, "OpenAlex", openalex_url, openalex_payload):
                failures += 1

    print("Seeding complete.", file=sys.stderr)
    if failures:
        # Surface partial or total failure through the exit status so that
        # callers (shell scripts, CI) do not mistake it for success.
        print(f"{failures} ingest request(s) failed.", file=sys.stderr)
        return 1
    return 0


# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    sys.exit(main())