Gaia-agent

Sleeping

Improve agent: fix wiki_search tables, auto-ingest ChromaDB, fix web_search silent failure

90c0590 3 months ago

3.37 kB

	#!/usr/bin/env python3
	"""
	Ingest GAIA questions into ChromaDB for RAG retrieval.

	Each document stores the question text, a "Possible answer" hint (rephrased
	from the ground-truth), and task metadata (tools used, number of steps).
	The possible answer gives the agent a useful starting point while the tool
	description makes clear it must verify via web/wiki/arxiv search.

	Usage:
	    python ingest.py
	    python ingest.py --questions path/to/file.jsonl
	"""
	import argparse
	import json
	from pathlib import Path

	from langchain_chroma import Chroma
	from langchain_core.documents import Document
	from langchain_huggingface import HuggingFaceEmbeddings

	# Default input file; used as the default for main() and for the --questions CLI option.
	QUESTIONS_FILE = Path("gaia_questions.jsonl")
	# Passed to Chroma as persist_directory.
	CHROMA_DIR = "./chroma_db"
	# Passed to Chroma as collection_name; every ingested question goes into this collection.
	COLLECTION_NAME = "gaia_questions"
	# Model name handed to HuggingFaceEmbeddings to embed the documents.
	EMBED_MODEL = "sentence-transformers/all-mpnet-base-v2"


	def load_questions(path: Path) -> list[dict]:
	    """Read a JSON Lines file and return one parsed object per non-blank line.

	    Args:
	        path: Location of the ``.jsonl`` file; each non-blank line must be a
	            complete JSON value.

	    Returns:
	        The parsed records, in file order. Blank and whitespace-only lines
	        are skipped.

	    Raises:
	        OSError: If the file cannot be opened.
	        json.JSONDecodeError: If a non-blank line is not valid JSON.
	    """
	    # Decode as UTF-8 explicitly instead of relying on the platform default,
	    # so non-ASCII question text loads the same way on every machine.
	    with open(path, encoding="utf-8") as f:
	        return [json.loads(line) for line in f if line.strip()]


	def make_document(q: dict) -> Document:
	    """Build a LangChain Document from a GAIA question record.

	    Stored fields:
	    - page_content : question text + possible answer hint (used for similarity search)
	    - metadata : task context useful for retrieval

	    Args:
	        q: One parsed question record. ``"Question"``, ``"task_id"`` and
	            ``"Level"`` are required; ``"Final answer"``, ``"file_name"`` and
	            ``"Annotator Metadata"`` are optional and may be missing or null.

	    Returns:
	        A Document whose page_content is the question, followed by a
	        ``Possible answer: ...`` paragraph when an answer is present.

	    Raises:
	        KeyError: If a required key is absent from ``q``.
	    """

	    def _as_text(value) -> str:
	        # Absent/null counts become "" rather than the literal string "None".
	        return "" if value is None else str(value)

	    # A JSON null arrives as None, which would break the .get() calls below
	    # and put None into the metadata. Normalise missing and null the same
	    # way. NOTE(review): Chroma is expected to accept only str/int/float/bool
	    # metadata values -- confirm against the installed version.
	    meta = q.get("Annotator Metadata") or {}
	    possible_answer = q.get("Final answer")
	    if possible_answer is None:
	        possible_answer = ""

	    page_content = q["Question"]
	    if possible_answer:
	        page_content += f"\n\nPossible answer: {possible_answer}"

	    return Document(
	        page_content=page_content,
	        metadata={
	            "task_id": q["task_id"],
	            "level": q["Level"],
	            "file_name": q.get("file_name") or "",
	            "tools": meta.get("Tools") or "",
	            "n_steps": _as_text(meta.get("Number of steps")),
	            "n_tools": _as_text(meta.get("Number of tools")),
	            "possible_answer": possible_answer,
	        },
	    )


	def main(questions_file: Path = QUESTIONS_FILE) -> None:
	    """Rebuild the ChromaDB collection from a GAIA questions file.

	    Loads the records, converts them to Documents, removes whatever the
	    collection currently holds, and ingests the new Documents using each
	    record's ``task_id`` as the document id. Progress is printed to stdout.

	    Args:
	        questions_file: Path to the ``.jsonl`` questions file.

	    Raises:
	        KeyError: If a record lacks a required key. This is raised before
	            the existing collection is modified.
	    """
	    print(f"Loading questions from {questions_file} ...")
	    questions = load_questions(questions_file)
	    print(f"Loaded {len(questions)} questions.")

	    # Build the documents before touching the store: a malformed record then
	    # aborts the run while the previously ingested collection is still intact,
	    # instead of after it has already been wiped.
	    docs = [make_document(q) for q in questions]
	    ids = [q["task_id"] for q in questions]

	    print(f"Initialising embeddings ({EMBED_MODEL}) ...")
	    embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)

	    print(f"Connecting to ChromaDB at {CHROMA_DIR!r} ...")
	    vector_store = Chroma(
	        collection_name=COLLECTION_NAME,
	        embedding_function=embeddings,
	        persist_directory=CHROMA_DIR,
	    )

	    # Wipe and re-ingest so re-runs are idempotent
	    existing_ids = vector_store.get()["ids"]
	    if existing_ids:
	        print(f"Removing {len(existing_ids)} existing documents ...")
	        vector_store.delete(existing_ids)

	    print(f"Ingesting {len(docs)} documents ...")
	    # Nothing to embed for an empty input file, so skip the call.
	    # NOTE(review): the store may reject an empty batch -- confirm.
	    if docs:
	        vector_store.add_documents(documents=docs, ids=ids)

	    # NOTE(review): _collection is a private attribute of the Chroma wrapper;
	    # it is kept as-is here but may change between library versions.
	    count = vector_store._collection.count()
	    print(f"Done. '{COLLECTION_NAME}' now contains {count} documents.")


	if __name__ == "__main__":
	    # Command-line entry point: the only option is the questions file path,
	    # which falls back to QUESTIONS_FILE when it is not given.
	    cli = argparse.ArgumentParser(description="Ingest GAIA questions into ChromaDB.")
	    questions_help = f"Path to the .jsonl questions file (default: {QUESTIONS_FILE})"
	    cli.add_argument("--questions", type=Path, default=QUESTIONS_FILE, help=questions_help)
	    main(cli.parse_args().questions)