| """Build RAG index from GAIA validation dataset with Annotator Metadata.""" |
|
|
import json
import shutil
from pathlib import Path

from datasets import load_dataset
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
|
|
| CHROMA_PATH = "./chroma_gaia_db" |
|
|
|
|
def _parse_annotator_metadata(raw):
    """Return the "Annotator Metadata" field as a dict.

    The GAIA dataset sometimes stores this field as a JSON-encoded string
    rather than a dict; decode it when needed. Malformed JSON and missing
    (None) values both fall back to an empty dict so callers can always
    call ``.get`` on the result.
    """
    if isinstance(raw, str):
        try:
            return json.loads(raw)
        except json.JSONDecodeError:
            return {}
    return raw or {}


def _build_documents(ds):
    """Convert GAIA examples into LangChain ``Document`` objects.

    The page content embeds the question, final answer, solution steps, and
    tools so semantic search can match on any of them; the key fields are
    duplicated into ``metadata`` for exact lookup after retrieval.
    """
    documents = []
    for item in ds:
        question = item.get("Question", "")
        answer = item.get("Final answer", "")
        metadata_raw = _parse_annotator_metadata(item.get("Annotator Metadata", {}))
        steps = metadata_raw.get("Steps", "")
        tools = metadata_raw.get("Tools", "")

        content = f"""Question: {question}

Final Answer: {answer}

Steps to solve:
{steps}

Tools needed: {tools}"""

        documents.append(
            Document(
                page_content=content,
                metadata={
                    "task_id": item.get("task_id", ""),
                    "question": question,
                    "answer": answer,
                    # Chroma metadata values must be scalars; stringify to be safe.
                    "level": str(item.get("Level", "")),
                    "num_steps": str(metadata_raw.get("Number of steps", "")),
                    "tools": tools,
                },
            )
        )
    return documents


def build_index():
    """Load the GAIA validation set and index it into a Chroma vector store.

    Rebuilds the index from scratch: any existing store at ``CHROMA_PATH``
    is deleted first so stale documents never linger. Ends with a quick
    retrieval smoke test printed to stdout.
    """
    print("Loading GAIA dataset...")
    ds = load_dataset("gaia-benchmark/GAIA", "2023_all", split="validation")
    print(f"Found {len(ds)} examples")

    documents = _build_documents(ds)
    print(f"Created {len(documents)} documents")

    print("Initializing embeddings...")
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2"
    )

    # Clear any previous index so we never mix old and new documents.
    chroma_path = Path(CHROMA_PATH)
    if chroma_path.exists():
        shutil.rmtree(chroma_path)
        print("Cleared existing index")

    print("Building vector store...")
    vectorstore = Chroma.from_documents(
        documents=documents,
        embedding=embeddings,
        persist_directory=CHROMA_PATH,
    )
    print(f"Indexed {len(documents)} documents to {CHROMA_PATH}")

    # Sanity-check that retrieval returns plausible results.
    print("\nTesting retrieval...")
    test_query = (
        "How many studio albums did Mercedes Sosa release between 2000 and 2009?"
    )
    results = vectorstore.similarity_search(test_query, k=2)
    print(f"Query: {test_query}")
    for i, doc in enumerate(results):
        print(f"\n--- Result {i+1} ---")
        print(f"Question: {doc.metadata.get('question', '')[:100]}...")
        print(f"Answer: {doc.metadata.get('answer', '')}")
|
|
|
# Allow running this module directly as a one-shot indexing script.
if __name__ == "__main__":
    build_index()
|
|