File size: 2,207 Bytes
176a845
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
"""Setup ChromaDB with metadata from metadata.jsonl"""
import json
import os
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.schema import Document

def setup_vector_store(max_docs: int = 50):
    """Index question/answer records from ``metadata.jsonl`` into ChromaDB.

    Each JSONL record's question and final answer are combined into a single
    document, embedded with a small sentence-transformer model (CPU), and
    stored in a persistent Chroma collection. Finishes with a quick
    similarity-search smoke test.

    Args:
        max_docs: Maximum number of records to index (default 50, which
            keeps the initial setup fast).

    Returns:
        The populated ``Chroma`` vector store.

    Raises:
        FileNotFoundError: If ``metadata.jsonl`` is missing.
        json.JSONDecodeError: If a non-empty line is not valid JSON.
    """
    print("Loading embeddings model...")

    # Set HuggingFace mirror for China (optional)
    # os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

    # Use a smaller, faster embedding model that's less likely to timeout
    embeddings = HuggingFaceEmbeddings(
        model_name="all-MiniLM-L6-v2",  # Smaller model, faster download
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': True}
    )

    print("Loading metadata.jsonl...")
    documents = []
    with open("metadata.jsonl", "r", encoding="utf-8") as f:
        for line in f:
            if len(documents) >= max_docs:  # cap for quick setup
                break
            line = line.strip()
            if not line:
                # Skip blank lines (e.g. trailing newline) instead of
                # letting json.loads raise on an empty string.
                continue
            record = json.loads(line)
            # Combine question and answer as the document content
            content = f"Question: {record.get('Question', '')}\nFinal Answer: {record.get('Final answer', '')}"
            doc = Document(
                page_content=content,
                metadata={
                    "task_id": record.get("task_id", ""),
                    "question": record.get("Question", "")
                }
            )
            documents.append(doc)

    print(f"Creating ChromaDB with {len(documents)} documents...")
    vector_store = Chroma.from_documents(
        documents=documents,
        embedding=embeddings,
        collection_name="gaia_questions",
        persist_directory="./chroma_db"
    )

    print("✅ ChromaDB setup complete!")
    print(f"Stored {len(documents)} question-answer pairs")

    # Test retrieval
    print("\n🔍 Testing retrieval...")
    test_query = "What is the capital of France?"
    results = vector_store.similarity_search(test_query, k=1)
    if results:
        print(f"Query: {test_query}")
        print(f"Similar result: {results[0].page_content[:200]}...")

    return vector_store

def _main() -> None:
    """Script entry point: build and smoke-test the vector store."""
    setup_vector_store()


if __name__ == "__main__":
    _main()