File size: 3,401 Bytes
8e453ef
7e9a861
dff2a93
 
 
 
8e453ef
7e9a861
8e453ef
 
 
 
 
 
 
7e9a861
 
 
 
 
457ebe6
7e9a861
457ebe6
0ec83d3
7e9a861
 
 
 
 
 
 
 
8e453ef
 
 
 
7e9a861
8e453ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# vector_store_builder.py
import os

# Disable telemetry before importing chromadb
os.environ.setdefault("POSTHOG_DISABLED", "true")
os.environ.setdefault("CHROMA_TELEMETRY_DISABLED", "true")
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions
from legal_aid_context import ALL_CONTEXT_DOCS
import uuid

def build_vector_store():
    """Build ChromaDB vector store with legal context"""
    
    # Initialize ChromaDB client with telemetry disabled and writable paths
    chroma_path = os.getenv('CHROMA_DB_PATH', './legal_vector_db')
    os.makedirs(chroma_path, exist_ok=True)

    default_cache_root = os.getenv('CACHE_ROOT', './cache')
    os.environ.setdefault('HOME', os.path.abspath('.'))
    os.makedirs(default_cache_root, exist_ok=True)
    os.makedirs(os.path.join(os.environ['HOME'], '.cache'), exist_ok=True)
    os.makedirs(os.path.join(os.environ['HOME'], '.cache', 'chroma'), exist_ok=True)
    os.environ.setdefault('HF_HOME', os.path.join(default_cache_root, 'hf'))
    os.environ.setdefault('TRANSFORMERS_CACHE', os.path.join(default_cache_root, 'transformers'))
    os.environ.setdefault('SENTENCE_TRANSFORMERS_HOME', os.path.join(default_cache_root, 'sentence-transformers'))
    os.environ.setdefault('XDG_CACHE_HOME', default_cache_root)
    for env_key in ['HF_HOME', 'TRANSFORMERS_CACHE', 'SENTENCE_TRANSFORMERS_HOME', 'XDG_CACHE_HOME']:
        os.makedirs(os.environ[env_key], exist_ok=True)

    client = chromadb.PersistentClient(path=chroma_path, settings=Settings(anonymized_telemetry=False))
    
    # Create collection with default embedding function
    embedding_function = embedding_functions.DefaultEmbeddingFunction()
    
    collection = client.get_or_create_collection(name="legal_context", embedding_function=embedding_function)
    
    # Prepare documents for vector store
    documents = []
    metadatas = []
    ids = []
    
    for doc in ALL_CONTEXT_DOCS:
        # Split long documents into chunks if needed
        content = doc['content']
        if len(content) > 500:  # Simple chunking
            chunks = [content[i:i+500] for i in range(0, len(content), 400)]  # 100 char overlap
            for i, chunk in enumerate(chunks):
                documents.append(f"{doc['title']}\n\n{chunk}")
                metadatas.append({
                    'title': doc['title'],
                    'source': doc['source'],
                    'process': doc['process'],
                    'chunk': i,
                    'doc_id': doc['id']
                })
                ids.append(f"{doc['id']}_chunk_{i}")
        else:
            documents.append(f"{doc['title']}\n\n{content}")
            metadatas.append({
                'title': doc['title'],
                'source': doc['source'], 
                'process': doc['process'],
                'doc_id': doc['id']
            })
            ids.append(doc['id'])
    
    # Add documents to collection
    collection.add(
        documents=documents,
        metadatas=metadatas,
        ids=ids
    )
    
    print(f"Vector store built with {len(documents)} documents")
    return collection

if __name__ == "__main__":
    # Build the store, then sanity-check retrieval with a sample question.
    collection = build_vector_store()

    sample_question = "How do I apply for legal aid?"
    results = collection.query(query_texts=[sample_question], n_results=3)
    print("Test query results:", results)