# Source: GAIA_Agent_DeepResearch / setup_chromadb.py
# Author: humblebanana (commit 176a845)
"""Setup ChromaDB with metadata from metadata.jsonl"""
import json
import os
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.schema import Document
def setup_vector_store(
    metadata_path="metadata.jsonl",
    max_records=50,
    collection_name="gaia_questions",
    persist_directory="./chroma_db",
):
    """Load question/answer records from a JSONL file and populate ChromaDB.

    Each line of the JSONL file is expected to be a JSON object; the keys
    "task_id", "Question" and "Final answer" are read (missing keys fall
    back to empty strings). A quick similarity-search smoke test is run
    after indexing so setup failures surface immediately.

    Args:
        metadata_path: Path to the metadata JSONL file.
        max_records: Maximum number of records to ingest for a quick
            setup; pass None to ingest every line.
        collection_name: Name of the Chroma collection to create.
        persist_directory: Directory where ChromaDB persists the index.

    Returns:
        The populated Chroma vector store.
    """
    print("Loading embeddings model...")
    # Set HuggingFace mirror for China (optional)
    # os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
    # Use a smaller, faster embedding model that's less likely to timeout.
    embeddings = HuggingFaceEmbeddings(
        model_name="all-MiniLM-L6-v2",  # Smaller model, faster download
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': True},
    )

    print(f"Loading {metadata_path}...")
    documents = []
    with open(metadata_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if max_records is not None and i >= max_records:
                break
            line = line.strip()
            if not line:
                # Skip blank lines (e.g. a trailing newline) instead of
                # crashing in json.loads.
                continue
            record = json.loads(line)
            # Combine question and answer as the document content.
            content = (
                f"Question: {record.get('Question', '')}\n"
                f"Final Answer: {record.get('Final answer', '')}"
            )
            documents.append(
                Document(
                    page_content=content,
                    metadata={
                        "task_id": record.get("task_id", ""),
                        "question": record.get("Question", ""),
                    },
                )
            )

    print(f"Creating ChromaDB with {len(documents)} documents...")
    vector_store = Chroma.from_documents(
        documents=documents,
        embedding=embeddings,
        collection_name=collection_name,
        persist_directory=persist_directory,
    )
    print("βœ… ChromaDB setup complete!")
    print(f"Stored {len(documents)} question-answer pairs")

    # Smoke-test retrieval on a throwaway query.
    print("\nπŸ” Testing retrieval...")
    test_query = "What is the capital of France?"
    results = vector_store.similarity_search(test_query, k=1)
    if results:
        print(f"Query: {test_query}")
        print(f"Similar result: {results[0].page_content[:200]}...")

    return vector_store
if __name__ == "__main__":
setup_vector_store()