# Page-viewer residue captured with this file (not part of the script):
#   Spaces: Sleeping
#   File size: 3,780 Bytes
#   e63c592
import argparse
import itertools
import sys
from typing import List, Optional

import httpx
# Search queries used for seeding. main() iterates this list in order and
# posts each entry as the "query" field to both ingest endpoints.
TOPICS: List[str] = [
    "retrieval augmented generation",
    "vector databases",
    "semantic search",
    "information retrieval",
    "large language models",
    "transformer architectures",
    "question answering",
    "document similarity",
    "embedding models",
    "knowledge graphs",
    "few-shot learning",
    "self supervised learning",
    "contrastive learning",
    "neural search",
    "dense passage retrieval",
    "sparse retrieval",
    "multi modal retrieval",
    "open domain question answering",
    "context windows",
    "memory in llms",
    "hallucination mitigation",
    "prompt engineering",
    "evaluation of rag",
    "document chunking",
    "vector compression",
    "approximate nearest neighbors",
    "Pinecone vector database",
    "OpenAlex scholarly graph",
    "arXiv preprint search",
    "retrieval pipelines",
]
def _positive_int(value: str) -> int:
    """argparse ``type`` callback: parse *value* as an integer >= 1.

    Raises:
        argparse.ArgumentTypeError: if *value* is not an integer or is < 1.
            argparse turns this into a usage error (exit status 2).
    """
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"invalid int value: {value!r}"
        ) from None
    if number < 1:
        raise argparse.ArgumentTypeError(
            f"must be a positive integer, got {number}"
        )
    return number


def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse the command-line options of the seeding script.

    Args:
        argv: Argument strings to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is what a no-argument
            call has always done.

    Returns:
        A namespace with ``base_url``, ``namespace``, ``mailto`` and
        ``max_docs`` attributes.

    Raises:
        SystemExit: raised by argparse when ``--mailto`` is missing or an
            option value is invalid (including ``--max-docs`` < 1).
    """
    parser = argparse.ArgumentParser(
        description="Seed the RAG backend with documents from arXiv and OpenAlex."
    )
    parser.add_argument(
        "--base-url",
        type=str,
        default="http://localhost:8000",
        help="Base URL of the backend (default: http://localhost:8000)",
    )
    parser.add_argument(
        "--namespace",
        type=str,
        default="dev",
        help="Target Pinecone namespace (default: dev)",
    )
    parser.add_argument(
        "--mailto",
        type=str,
        required=True,
        help="Contact email for OpenAlex API (required)",
    )
    parser.add_argument(
        "--max-docs",
        # Reject 0 and negatives here: the caller only caps the upper bound,
        # so a non-positive value would otherwise be sent on every request.
        type=_positive_int,
        default=20,
        help="Max documents per topic per source (capped at 20)",
    )
    return parser.parse_args(argv)
def _post_ingest(client: httpx.Client, label: str, url: str, payload: dict) -> bool:
    """POST one ingest request and report the outcome on stderr.

    Args:
        client: Open httpx client used to send the request.
        label: Source name shown in the progress output.
        url: Full URL of the ingest endpoint.
        payload: JSON body to send.

    Returns:
        True if the request succeeded and its JSON body was printed,
        False if anything went wrong (the error is printed, not raised).
    """
    try:
        response = client.post(url, json=payload)
        response.raise_for_status()
        print(f" {label}: {response.json()}", file=sys.stderr)
    except Exception as exc:  # noqa: BLE001
        # Best effort by design: one failed request must not abort the run.
        print(f" {label} error: {exc}", file=sys.stderr)
        return False
    return True


def main() -> int:
    """Post every topic in TOPICS to the arXiv and OpenAlex ingest endpoints.

    Progress and errors are written to stderr. A failed request is reported
    and counted, and the loop carries on with the next request.

    Returns:
        0 if every request succeeded, 1 if at least one request failed, so
        that the process exit status reflects the outcome of the run.
    """
    args = parse_args()
    max_docs = min(args.max_docs, 20)  # enforce the documented cap of 20
    print(
        f"Seeding backend at {args.base_url} into namespace='{args.namespace}' "
        f"with up to {max_docs} docs per topic per source.",
        file=sys.stderr,
    )

    # Strip trailing slashes once so the joined URLs never contain "//".
    base_url = args.base_url.rstrip("/")
    arxiv_url = f"{base_url}/ingest/arxiv"
    openalex_url = f"{base_url}/ingest/openalex"

    failures = 0
    with httpx.Client(timeout=30.0) as client:
        for idx, topic in enumerate(TOPICS, start=1):
            print(f"[{idx}/{len(TOPICS)}] Topic: {topic}", file=sys.stderr)
            # Fields shared by both ingest payloads.
            common = {
                "query": topic,
                "max_docs": max_docs,
                "namespace": args.namespace,
            }
            requests = (
                ("arXiv", arxiv_url, {**common, "category": "papers"}),
                ("OpenAlex", openalex_url, {**common, "mailto": args.mailto}),
            )
            for label, url, payload in requests:
                if not _post_ingest(client, label, url, payload):
                    failures += 1

    if failures:
        print(
            f"Seeding finished with {failures} failed request(s).",
            file=sys.stderr,
        )
        return 1
    print("Seeding complete.", file=sys.stderr)
    return 0
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())