# Gaia-agent / ingest.py
# NZ
# Improve agent: fix wiki_search tables, auto-ingest ChromaDB, fix web_search silent failure
# 90c0590
#!/usr/bin/env python3
"""
Ingest GAIA questions into ChromaDB for RAG retrieval.

Each document stores the question text, a "Possible answer" hint (the
ground-truth "Final answer" field, relabelled), and task metadata (tools
used, number of steps, number of tools).

The possible answer gives the agent a useful starting point while the tool
description makes clear it must verify via web/wiki/arxiv search.

Usage:
    python ingest.py
    python ingest.py --questions path/to/file.jsonl
"""
import argparse
import json
from pathlib import Path
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
QUESTIONS_FILE = Path("gaia_questions.jsonl")
CHROMA_DIR = "./chroma_db"
COLLECTION_NAME = "gaia_questions"
EMBED_MODEL = "sentence-transformers/all-mpnet-base-v2"
def load_questions(path: Path) -> list[dict]:
    """Read a JSON Lines file and return one parsed record per non-blank line.

    Args:
        path: Location of the ``.jsonl`` file to read.

    Returns:
        The parsed records in file order. Empty and whitespace-only lines
        are skipped; an empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly instead of relying on the platform default encoding,
    # which is not UTF-8 everywhere. "utf-8-sig" reads plain UTF-8 unchanged
    # and also drops a leading byte-order mark, which json.loads would
    # otherwise reject on the first line.
    with open(path, encoding="utf-8-sig") as f:
        return [json.loads(line) for line in f if line.strip()]
def make_document(q: dict) -> Document:
    """Turn one GAIA question record into a LangChain Document.

    The document text is the question itself, followed by a
    "Possible answer" line when the record carries a non-empty
    "Final answer". The metadata holds the task id, level, attached file
    name, the annotator's tool list, step/tool counts (as strings) and the
    same possible answer.

    Args:
        q: A question record; must contain "Question", "task_id" and "Level".

    Returns:
        The Document built from the record.
    """
    annotator_info = q.get("Annotator Metadata", {})
    hint = q.get("Final answer", "")

    text = q["Question"]
    if hint:
        # Append the hint so it is part of the text the document stores.
        text = text + f"\n\nPossible answer: {hint}"

    details = {
        "task_id": q["task_id"],
        "level": q["Level"],
        "file_name": q.get("file_name", ""),
        "tools": annotator_info.get("Tools", ""),
        # Counts are stored as strings; a missing count becomes "".
        "n_steps": str(annotator_info.get("Number of steps", "")),
        "n_tools": str(annotator_info.get("Number of tools", "")),
        "possible_answer": hint,
    }
    return Document(page_content=text, metadata=details)
def main(questions_file: Path = QUESTIONS_FILE) -> None:
    """Rebuild the ChromaDB collection from a GAIA questions file.

    Loads the records, opens the persisted collection, deletes every ID it
    currently holds, then adds one document per question keyed by its
    ``task_id``. Progress messages are printed to stdout.

    Args:
        questions_file: Path to the ``.jsonl`` file of question records.
    """
    print(f"Loading questions from {questions_file} ...")
    records = load_questions(questions_file)
    print(f"Loaded {len(records)} questions.")

    print(f"Initialising embeddings ({EMBED_MODEL}) ...")
    embedder = HuggingFaceEmbeddings(model_name=EMBED_MODEL)

    print(f"Connecting to ChromaDB at {CHROMA_DIR!r} ...")
    store = Chroma(
        collection_name=COLLECTION_NAME,
        embedding_function=embedder,
        persist_directory=CHROMA_DIR,
    )

    # Clear whatever is already stored before adding, so running the script
    # again replaces the contents instead of stacking on top of them.
    stale_ids = store.get()["ids"]
    if stale_ids:
        print(f"Removing {len(stale_ids)} existing documents ...")
        store.delete(stale_ids)

    documents = list(map(make_document, records))
    # Each document is stored under its question's task_id.
    document_ids = [record["task_id"] for record in records]

    print(f"Ingesting {len(documents)} documents ...")
    store.add_documents(documents=documents, ids=document_ids)

    total = store._collection.count()
    print(f"Done. '{COLLECTION_NAME}' now contains {total} documents.")
if __name__ == "__main__":
    # Script entry point: read the optional --questions path and run the
    # ingestion with it.
    cli = argparse.ArgumentParser(description="Ingest GAIA questions into ChromaDB.")
    cli.add_argument(
        "--questions",
        default=QUESTIONS_FILE,
        type=Path,
        help=f"Path to the .jsonl questions file (default: {QUESTIONS_FILE})",
    )
    main(cli.parse_args().questions)