"""Set up a ChromaDB vector store from the records in metadata.jsonl."""
import json
import os
from itertools import islice
from typing import List, Optional

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.schema import Document


def _load_documents(metadata_path: str, limit: Optional[int]) -> List[Document]:
    """Read up to *limit* JSONL records from *metadata_path* as Documents.

    Each non-blank line is parsed as one JSON object. The document text
    combines the record's ``Question`` and ``Final answer`` fields; the
    ``task_id`` and ``Question`` fields are copied into the metadata.
    Missing fields fall back to an empty string.
    """
    with open(metadata_path, "r", encoding="utf-8") as f:
        # Blank lines (e.g. a trailing newline at end of file) are not valid
        # JSON and would make json.loads raise, so drop them before applying
        # the limit. islice(..., None) reads every remaining line.
        non_blank_lines = (line for line in f if line.strip())
        records = [json.loads(line) for line in islice(non_blank_lines, limit)]

    documents = []
    for record in records:
        question = record.get("Question", "")
        # Combine question and answer as the document content.
        content = (
            f"Question: {question}\n"
            f"Final Answer: {record.get('Final answer', '')}"
        )
        documents.append(
            Document(
                page_content=content,
                metadata={
                    "task_id": record.get("task_id", ""),
                    "question": question,
                },
            )
        )
    return documents


def setup_vector_store(
    metadata_path: str = "metadata.jsonl",
    *,
    limit: Optional[int] = 50,
    collection_name: str = "gaia_questions",
    persist_directory: str = "./chroma_db",
    model_name: str = "all-MiniLM-L6-v2",
) -> Chroma:
    """Load a JSONL metadata file and populate a ChromaDB collection.

    Progress messages are printed to stdout, and a single test query is run
    against the new store before it is returned.

    Args:
        metadata_path: Path of the JSONL file to read (UTF-8, one JSON
            object per line; blank lines are ignored).
        limit: Maximum number of records to load. The default of 50 keeps
            setup quick; pass ``None`` to load every record.
        collection_name: Collection name passed to ``Chroma.from_documents``.
        persist_directory: Persist directory passed to
            ``Chroma.from_documents``.
        model_name: Embedding model name passed to ``HuggingFaceEmbeddings``.

    Returns:
        The ``Chroma`` vector store built from the loaded documents.

    Raises:
        OSError: If *metadata_path* cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    print("Loading embeddings model...")

    # Set HuggingFace mirror for China (optional)
    # os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

    # The default is a small model, chosen so that fetching it is faster and
    # less likely to time out.
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': True},
    )

    print(f"Loading {metadata_path}...")
    documents = _load_documents(metadata_path, limit)

    print(f"Creating ChromaDB with {len(documents)} documents...")
    vector_store = Chroma.from_documents(
        documents=documents,
        embedding=embeddings,
        collection_name=collection_name,
        persist_directory=persist_directory,
    )

    print("✅ ChromaDB setup complete!")
    print(f"Stored {len(documents)} question-answer pairs")

    # Test retrieval: fetch the single closest document for a sample query.
    print("\n🔍 Testing retrieval...")
    test_query = "What is the capital of France?"
    results = vector_store.similarity_search(test_query, k=1)
    if results:
        print(f"Query: {test_query}")
        print(f"Similar result: {results[0].page_content[:200]}...")

    return vector_store


if __name__ == "__main__":
    setup_vector_store()