# Page-header residue kept as a comment: uploader "BrejBala",
# message "Deploy backend Docker app", revision e63c592 (presumably a commit id).
import argparse
import itertools
import sys
from typing import List, Optional, Sequence

import httpx
# Search queries used for seeding. main() walks this list in order and posts
# each entry to both the arXiv and the OpenAlex ingest endpoints.
TOPICS: List[str] = [
    "retrieval augmented generation",
    "vector databases",
    "semantic search",
    "information retrieval",
    "large language models",
    "transformer architectures",
    "question answering",
    "document similarity",
    "embedding models",
    "knowledge graphs",
    "few-shot learning",
    "self supervised learning",
    "contrastive learning",
    "neural search",
    "dense passage retrieval",
    "sparse retrieval",
    "multi modal retrieval",
    "open domain question answering",
    "context windows",
    "memory in llms",
    "hallucination mitigation",
    "prompt engineering",
    "evaluation of rag",
    "document chunking",
    "vector compression",
    "approximate nearest neighbors",
    "Pinecone vector database",
    "OpenAlex scholarly graph",
    "arXiv preprint search",
    "retrieval pipelines",
]
def _positive_int(raw: str) -> int:
    """Convert *raw* to an int and require it to be at least 1.

    Meant to be used as an argparse ``type=`` callable: problems are raised as
    ``argparse.ArgumentTypeError`` so argparse reports them as a usage error.
    """
    try:
        value = int(raw)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {raw!r}") from None
    if value < 1:
        raise argparse.ArgumentTypeError(
            f"must be a positive integer, got {value}"
        )
    return value


def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parse the command-line options of the seeding script.

    Args:
        argv: Argument strings to parse. ``None`` (the default) lets argparse
            read ``sys.argv[1:]``, which is what a no-argument call relies on.

    Returns:
        Namespace with ``base_url``, ``namespace``, ``mailto`` and ``max_docs``.

    Raises:
        SystemExit: Raised by argparse on invalid usage, e.g. when ``--mailto``
            is missing or ``--max-docs`` is not a positive integer.
    """
    parser = argparse.ArgumentParser(
        description="Seed the RAG backend with documents from arXiv and OpenAlex."
    )
    parser.add_argument(
        "--base-url",
        type=str,
        default="http://localhost:8000",
        help="Base URL of the backend (default: http://localhost:8000)",
    )
    parser.add_argument(
        "--namespace",
        type=str,
        default="dev",
        help="Target Pinecone namespace (default: dev)",
    )
    parser.add_argument(
        "--mailto",
        type=str,
        required=True,
        help="Contact email for OpenAlex API (required)",
    )
    parser.add_argument(
        "--max-docs",
        # Reject 0 and negative counts up front instead of forwarding them to
        # the backend; the upper cap of 20 is applied by the caller.
        type=_positive_int,
        default=20,
        help="Max documents per topic per source (capped at 20)",
    )
    return parser.parse_args(argv)
def _post_ingest(client: "httpx.Client", label: str, url: str, payload: dict) -> bool:
    """POST one ingest request and report the outcome on stderr.

    Returns True when the request succeeded and its JSON body was printed.
    Any failure is printed and reported as False rather than raised, so a
    single bad request does not abort the remaining topics.
    """
    try:
        resp = client.post(url, json=payload)
        resp.raise_for_status()
        print(f" {label}: {resp.json()}", file=sys.stderr)
    except Exception as exc:  # noqa: BLE001 - best-effort seeding, keep going
        print(f" {label} error: {exc}", file=sys.stderr)
        return False
    return True


def main() -> int:
    """Seed the backend with every topic in TOPICS from arXiv and OpenAlex.

    Each topic is posted to ``/ingest/arxiv`` and ``/ingest/openalex`` under
    the configured base URL. Failures are logged and skipped so that all
    topics are attempted.

    Returns:
        0 when every ingest request succeeded, 1 when at least one failed.
    """
    args = parse_args()
    max_docs = min(args.max_docs, 20)

    print(
        f"Seeding backend at {args.base_url} into namespace='{args.namespace}' "
        f"with up to {max_docs} docs per topic per source.",
        file=sys.stderr,
    )

    # Strip trailing slashes once so the joined endpoint URLs have no "//".
    base_url = args.base_url.rstrip("/")
    arxiv_url = f"{base_url}/ingest/arxiv"
    openalex_url = f"{base_url}/ingest/openalex"

    failures = 0
    with httpx.Client(timeout=30.0) as client:
        for idx, topic in enumerate(TOPICS, start=1):
            print(f"[{idx}/{len(TOPICS)}] Topic: {topic}", file=sys.stderr)

            arxiv_ok = _post_ingest(
                client,
                "arXiv",
                arxiv_url,
                {
                    "query": topic,
                    "max_docs": max_docs,
                    "namespace": args.namespace,
                    "category": "papers",
                },
            )
            openalex_ok = _post_ingest(
                client,
                "OpenAlex",
                openalex_url,
                {
                    "query": topic,
                    "max_docs": max_docs,
                    "namespace": args.namespace,
                    "mailto": args.mailto,
                },
            )
            failures += (not arxiv_ok) + (not openalex_ok)

    print("Seeding complete.", file=sys.stderr)
    if failures:
        # Surface partial or total failure through the exit status so shell
        # scripts and CI jobs can tell a failed seeding run from a clean one.
        print(f"{failures} ingest request(s) failed.", file=sys.stderr)
        return 1
    return 0
# Script entry point: run the seeder and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())