#!/usr/bin/env python3
"""
Ingest GAIA questions into ChromaDB for RAG retrieval.

Each document stores the question text, a "Possible answer" hint (the record's
ground-truth "Final answer", stored verbatim under that label), and task
metadata (tools used, number of steps).  The possible answer gives the agent a
useful starting point while the tool description makes clear it must verify
via web/wiki/arxiv search.

Usage:
    python ingest.py
    python ingest.py --questions path/to/file.jsonl
"""

import argparse
import json
from pathlib import Path

from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings

QUESTIONS_FILE = Path("gaia_questions.jsonl")
CHROMA_DIR = "./chroma_db"
COLLECTION_NAME = "gaia_questions"
EMBED_MODEL = "sentence-transformers/all-mpnet-base-v2"


def load_questions(path: Path) -> list[dict]:
    """Read a JSON Lines file and return one parsed object per non-blank line.

    Args:
        path: Location of the ``.jsonl`` file.

    Returns:
        The parsed records, in file order.  Blank / whitespace-only lines are
        skipped.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8: without it the platform locale decides the encoding and
    # non-ASCII question text can be mis-decoded (or fail to decode).
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def _blank_if_none(value):
    """Return ``""`` for ``None``; pass any other value through unchanged."""
    return "" if value is None else value


def make_document(q: dict) -> Document:
    """Build a LangChain Document from a GAIA question record.

    Stored fields:
      - page_content : question text + possible answer hint
                       (used for similarity search)
      - metadata     : task context useful for retrieval

    Args:
        q: One parsed question record.  ``"Question"``, ``"task_id"`` and
            ``"Level"`` are required; ``"Final answer"``, ``"file_name"`` and
            ``"Annotator Metadata"`` are optional.

    Returns:
        A ``Document`` whose text is the question, followed by a
        ``Possible answer: ...`` paragraph when the record has a non-empty
        final answer.

    Raises:
        KeyError: If a required key is missing from ``q``.
    """
    # A JSON `null` for these keys comes back as None rather than the .get()
    # default, so normalise explicitly instead of relying on the default.
    meta = q.get("Annotator Metadata") or {}
    possible_answer = _blank_if_none(q.get("Final answer"))

    page_content = q["Question"]
    if possible_answer:
        page_content += f"\n\nPossible answer: {possible_answer}"

    # None values are blanked before storage: they would otherwise be stored
    # as None (or as the literal string "None" for the counts below).
    # NOTE(review): some Chroma versions reject None metadata values outright
    # - confirm against the pinned version.
    return Document(
        page_content=page_content,
        metadata={
            "task_id": q["task_id"],
            "level": q["Level"],
            "file_name": _blank_if_none(q.get("file_name")),
            "tools": _blank_if_none(meta.get("Tools")),
            "n_steps": str(_blank_if_none(meta.get("Number of steps"))),
            "n_tools": str(_blank_if_none(meta.get("Number of tools"))),
            "possible_answer": possible_answer,
        },
    )


def main(questions_file: Path = QUESTIONS_FILE) -> None:
    """Rebuild the ChromaDB collection from a questions file.

    Loads the records, converts them to documents, removes whatever the
    collection currently holds, and inserts the new documents keyed by
    ``task_id``.

    Args:
        questions_file: Path to the ``.jsonl`` questions file.

    Raises:
        KeyError: If a record lacks a required key; raised before the
            existing collection contents are touched.
    """
    print(f"Loading questions from {questions_file} ...")
    questions = load_questions(questions_file)
    print(f"Loaded {len(questions)} questions.")

    # Build everything up front so a malformed record fails *before* the
    # existing collection is wiped, instead of leaving it empty.
    docs = [make_document(q) for q in questions]
    ids = [q["task_id"] for q in questions]

    print(f"Initialising embeddings ({EMBED_MODEL}) ...")
    embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)

    print(f"Connecting to ChromaDB at {CHROMA_DIR!r} ...")
    vector_store = Chroma(
        collection_name=COLLECTION_NAME,
        embedding_function=embeddings,
        persist_directory=CHROMA_DIR,
    )

    # Wipe and re-ingest so re-runs are idempotent
    existing_ids = vector_store.get()["ids"]
    if existing_ids:
        print(f"Removing {len(existing_ids)} existing documents ...")
        vector_store.delete(existing_ids)

    if docs:
        print(f"Ingesting {len(docs)} documents ...")
        vector_store.add_documents(documents=docs, ids=ids)
    else:
        # Skip the insert for an empty file rather than handing the store an
        # empty batch.
        print("No questions to ingest.")

    # NOTE(review): relies on the private `_collection` attribute of the
    # LangChain wrapper - may break across langchain_chroma releases.
    count = vector_store._collection.count()
    print(f"Done. '{COLLECTION_NAME}' now contains {count} documents.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Ingest GAIA questions into ChromaDB.")
    parser.add_argument(
        "--questions",
        type=Path,
        default=QUESTIONS_FILE,
        help=f"Path to the .jsonl questions file (default: {QUESTIONS_FILE})",
    )
    args = parser.parse_args()
    main(args.questions)