# fa_agents/build_rag_index.py
# Author: j14i — commit e04e3db ("Got 45%")
"""Build RAG index from GAIA validation dataset with Annotator Metadata."""
import json
import shutil
from pathlib import Path

from datasets import load_dataset
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
# On-disk directory where the persisted Chroma vector store is written.
CHROMA_PATH = "./chroma_gaia_db"
def build_index():
    """Load the GAIA validation split and index it into a Chroma store.

    Each example's question, final answer, and annotator reasoning (solution
    steps and tools) are embedded as one document; scalar fields are also
    copied into the document metadata so they survive retrieval.

    Side effects:
        Deletes any existing index at ``CHROMA_PATH`` and rebuilds it,
        then runs a small retrieval smoke test against the new store.
    """
    print("Loading GAIA dataset...")
    # NOTE(review): gated dataset — presumably requires HF credentials; confirm.
    ds = load_dataset("gaia-benchmark/GAIA", "2023_all", split="validation")
    print(f"Found {len(ds)} examples")

    documents = [_item_to_document(item) for item in ds]
    print(f"Created {len(documents)} documents")

    print("Initializing embeddings...")
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2"
    )

    # Rebuild from scratch so stale documents never linger in the index.
    chroma_path = Path(CHROMA_PATH)
    if chroma_path.exists():
        shutil.rmtree(chroma_path)
        print("Cleared existing index")

    print("Building vector store...")
    vectorstore = Chroma.from_documents(
        documents=documents,
        embedding=embeddings,
        persist_directory=CHROMA_PATH,
    )
    print(f"Indexed {len(documents)} documents to {CHROMA_PATH}")

    # Smoke-test retrieval with a question known to be in the validation set.
    print("\nTesting retrieval...")
    test_query = (
        "How many studio albums did Mercedes Sosa release between 2000 and 2009?"
    )
    results = vectorstore.similarity_search(test_query, k=2)
    print(f"Query: {test_query}")
    for i, doc in enumerate(results):
        print(f"\n--- Result {i+1} ---")
        print(f"Question: {doc.metadata.get('question', '')[:100]}...")
        print(f"Answer: {doc.metadata.get('answer', '')}")


def _item_to_document(item):
    """Convert one GAIA example (a mapping) into a langchain ``Document``."""
    question = item.get("Question", "")
    answer = item.get("Final answer", "")
    level = item.get("Level", "")
    task_id = item.get("task_id", "")

    # "Annotator Metadata" may be a dict, a JSON string, or explicitly None;
    # the original crashed with AttributeError on None / non-dict JSON.
    metadata = _parse_annotator_metadata(item.get("Annotator Metadata", {}))
    steps = metadata.get("Steps", "")
    tools = metadata.get("Tools", "")
    num_steps = metadata.get("Number of steps", "")

    # Embed question + answer + reasoning together so retrieval can match
    # on any of them; scalars are duplicated into metadata for display.
    content = f"""Question: {question}
Final Answer: {answer}
Steps to solve:
{steps}
Tools needed: {tools}"""
    return Document(
        page_content=content,
        metadata={
            "task_id": task_id,
            "question": question,
            "answer": answer,
            "level": str(level),
            "num_steps": str(num_steps),
            "tools": tools,
        },
    )


def _parse_annotator_metadata(raw):
    """Return annotator metadata as a dict; tolerate JSON strings and None."""
    if isinstance(raw, dict):
        return raw
    if isinstance(raw, str):
        try:
            parsed = json.loads(raw)
        except json.JSONDecodeError:
            return {}
        # A JSON string may decode to a non-dict (e.g. a list); reject it.
        return parsed if isinstance(parsed, dict) else {}
    # None or any other unexpected type.
    return {}
# Script entry point: rebuild the GAIA RAG index end-to-end.
if __name__ == "__main__":
    build_index()