Spaces:
Sleeping
Sleeping
NZ
Improve agent: fix wiki_search tables, auto-ingest ChromaDB, fix web_search silent failure
90c0590 | #!/usr/bin/env python3 | |
| """ | |
| Ingest GAIA questions into ChromaDB for RAG retrieval. | |
| Each document stores the question text, a "Possible answer" hint (rephrased | |
| from the ground-truth), and task metadata (tools used, number of steps). | |
| The possible answer gives the agent a useful starting point while the tool | |
| description makes clear it must verify via web/wiki/arxiv search. | |
| Usage: | |
| python ingest.py | |
| python ingest.py --questions path/to/file.jsonl | |
| """ | |
| import argparse | |
| import json | |
| from pathlib import Path | |
| from langchain_chroma import Chroma | |
| from langchain_core.documents import Document | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| QUESTIONS_FILE = Path("gaia_questions.jsonl") | |
| CHROMA_DIR = "./chroma_db" | |
| COLLECTION_NAME = "gaia_questions" | |
| EMBED_MODEL = "sentence-transformers/all-mpnet-base-v2" | |
| def load_questions(path: Path) -> list[dict]: | |
| with open(path) as f: | |
| return [json.loads(line) for line in f if line.strip()] | |
| def make_document(q: dict) -> Document: | |
| """Build a LangChain Document from a GAIA question record. | |
| Stored fields: | |
| - page_content : question text + possible answer hint (used for similarity search) | |
| - metadata : task context useful for retrieval | |
| """ | |
| meta = q.get("Annotator Metadata", {}) | |
| possible_answer = q.get("Final answer", "") | |
| page_content = q["Question"] | |
| if possible_answer: | |
| page_content += f"\n\nPossible answer: {possible_answer}" | |
| return Document( | |
| page_content=page_content, | |
| metadata={ | |
| "task_id": q["task_id"], | |
| "level": q["Level"], | |
| "file_name": q.get("file_name", ""), | |
| "tools": meta.get("Tools", ""), | |
| "n_steps": str(meta.get("Number of steps", "")), | |
| "n_tools": str(meta.get("Number of tools", "")), | |
| "possible_answer": possible_answer, | |
| }, | |
| ) | |
| def main(questions_file: Path = QUESTIONS_FILE) -> None: | |
| print(f"Loading questions from {questions_file} ...") | |
| questions = load_questions(questions_file) | |
| print(f"Loaded {len(questions)} questions.") | |
| print(f"Initialising embeddings ({EMBED_MODEL}) ...") | |
| embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL) | |
| print(f"Connecting to ChromaDB at {CHROMA_DIR!r} ...") | |
| vector_store = Chroma( | |
| collection_name=COLLECTION_NAME, | |
| embedding_function=embeddings, | |
| persist_directory=CHROMA_DIR, | |
| ) | |
| # Wipe and re-ingest so re-runs are idempotent | |
| existing_ids = vector_store.get()["ids"] | |
| if existing_ids: | |
| print(f"Removing {len(existing_ids)} existing documents ...") | |
| vector_store.delete(existing_ids) | |
| docs = [make_document(q) for q in questions] | |
| ids = [q["task_id"] for q in questions] | |
| print(f"Ingesting {len(docs)} documents ...") | |
| vector_store.add_documents(documents=docs, ids=ids) | |
| count = vector_store._collection.count() | |
| print(f"Done. '{COLLECTION_NAME}' now contains {count} documents.") | |
| if __name__ == "__main__": | |
| parser = argparse.ArgumentParser(description="Ingest GAIA questions into ChromaDB.") | |
| parser.add_argument( | |
| "--questions", | |
| type=Path, | |
| default=QUESTIONS_FILE, | |
| help=f"Path to the .jsonl questions file (default: {QUESTIONS_FILE})", | |
| ) | |
| args = parser.parse_args() | |
| main(args.questions) | |