Spaces:
Sleeping
Sleeping
| """Setup ChromaDB with metadata from metadata.jsonl""" | |
| import json | |
| import os | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| from langchain_community.vectorstores import Chroma | |
| from langchain.schema import Document | |
def setup_vector_store(metadata_path="metadata.jsonl", limit=50,
                       persist_directory="./chroma_db"):
    """Load question/answer records from a JSONL file and index them in ChromaDB.

    Args:
        metadata_path: Path to the JSONL metadata file. Each line is a JSON
            object expected to carry "Question", "Final answer", and "task_id"
            keys (missing keys default to "").
        limit: Maximum number of lines to index; keeps the initial setup fast.
        persist_directory: Directory where ChromaDB persists the collection.

    Returns:
        The populated Chroma vector store.
    """
    print("Loading embeddings model...")
    # Set HuggingFace mirror for China (optional)
    # os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
    # Use a smaller, faster embedding model that's less likely to timeout
    embeddings = HuggingFaceEmbeddings(
        model_name="all-MiniLM-L6-v2",  # Smaller model, faster download
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': True}
    )
    print(f"Loading {metadata_path}...")
    documents = []
    with open(metadata_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i >= limit:  # Cap the number of records for a quick setup
                break
            if not line.strip():  # Skip blank lines — json.loads("") would raise
                continue
            record = json.loads(line)
            # Combine question and answer as the document content
            content = f"Question: {record.get('Question', '')}\nFinal Answer: {record.get('Final answer', '')}"
            doc = Document(
                page_content=content,
                metadata={
                    "task_id": record.get("task_id", ""),
                    "question": record.get("Question", "")
                }
            )
            documents.append(doc)
    print(f"Creating ChromaDB with {len(documents)} documents...")
    vector_store = Chroma.from_documents(
        documents=documents,
        embedding=embeddings,
        collection_name="gaia_questions",
        persist_directory=persist_directory
    )
    print("β ChromaDB setup complete!")
    print(f"Stored {len(documents)} question-answer pairs")
    # Smoke-test retrieval against the freshly built store
    print("\nπ Testing retrieval...")
    test_query = "What is the capital of France?"
    results = vector_store.similarity_search(test_query, k=1)
    if results:
        print(f"Query: {test_query}")
        print(f"Similar result: {results[0].page_content[:200]}...")
    return vector_store
| if __name__ == "__main__": | |
| setup_vector_store() | |