# vector_store_builder.py
import os
# Disable telemetry before importing chromadb
os.environ.setdefault("POSTHOG_DISABLED", "true")
os.environ.setdefault("CHROMA_TELEMETRY_DISABLED", "true")
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions
from legal_aid_context import ALL_CONTEXT_DOCS
import uuid
def _prepare_runtime_dirs():
    """Point HOME and model-cache env vars at writable directories.

    Needed in sandboxed deployments (e.g. HF Spaces) where the real HOME
    is read-only; only sets variables that are not already configured.
    """
    cache_root = os.getenv('CACHE_ROOT', './cache')
    os.environ.setdefault('HOME', os.path.abspath('.'))
    os.makedirs(cache_root, exist_ok=True)
    # exist_ok makedirs creates parents, so one call covers ~/.cache too
    os.makedirs(os.path.join(os.environ['HOME'], '.cache', 'chroma'), exist_ok=True)
    os.environ.setdefault('HF_HOME', os.path.join(cache_root, 'hf'))
    os.environ.setdefault('TRANSFORMERS_CACHE', os.path.join(cache_root, 'transformers'))
    os.environ.setdefault('SENTENCE_TRANSFORMERS_HOME', os.path.join(cache_root, 'sentence-transformers'))
    os.environ.setdefault('XDG_CACHE_HOME', cache_root)
    for env_key in ('HF_HOME', 'TRANSFORMERS_CACHE', 'SENTENCE_TRANSFORMERS_HOME', 'XDG_CACHE_HOME'):
        os.makedirs(os.environ[env_key], exist_ok=True)

def _chunk_text(content, chunk_size, chunk_overlap):
    """Split *content* into chunks of at most ``chunk_size`` characters.

    Consecutive chunks overlap by ``chunk_overlap`` characters so context
    is not lost at chunk boundaries.
    """
    step = chunk_size - chunk_overlap
    return [
        content[i:i + chunk_size]
        for i in range(0, len(content), step)
        # A chunk starting at i is fully contained in its predecessor when
        # no more than chunk_overlap characters remain; emitting it would
        # store a duplicate embedding, so skip it.
        if i == 0 or len(content) - i > chunk_overlap
    ]

def build_vector_store(chunk_size=500, chunk_overlap=100):
    """Build a persistent ChromaDB vector store from ALL_CONTEXT_DOCS.

    Documents longer than ``chunk_size`` characters are split into
    overlapping chunks; each stored entry is prefixed with the document
    title so retrieval results carry their source heading.

    Args:
        chunk_size: Maximum characters per stored chunk (default 500,
            matching the original hard-coded behavior).
        chunk_overlap: Characters shared between consecutive chunks
            (default 100). Must be smaller than ``chunk_size``.

    Returns:
        The populated ChromaDB collection named ``legal_context``.
    """
    # Make sure every path ChromaDB / the embedder touches is writable.
    _prepare_runtime_dirs()
    chroma_path = os.getenv('CHROMA_DB_PATH', './legal_vector_db')
    os.makedirs(chroma_path, exist_ok=True)
    client = chromadb.PersistentClient(
        path=chroma_path,
        settings=Settings(anonymized_telemetry=False),
    )
    embedding_function = embedding_functions.DefaultEmbeddingFunction()
    collection = client.get_or_create_collection(
        name="legal_context", embedding_function=embedding_function
    )

    # Assemble the corpus: parallel lists of texts, metadata, and ids.
    documents = []
    metadatas = []
    ids = []
    for doc in ALL_CONTEXT_DOCS:
        content = doc['content']
        base_meta = {
            'title': doc['title'],
            'source': doc['source'],
            'process': doc['process'],
            'doc_id': doc['id'],
        }
        if len(content) > chunk_size:
            for i, chunk in enumerate(_chunk_text(content, chunk_size, chunk_overlap)):
                documents.append(f"{doc['title']}\n\n{chunk}")
                metadatas.append({**base_meta, 'chunk': i})
                ids.append(f"{doc['id']}_chunk_{i}")
        else:
            documents.append(f"{doc['title']}\n\n{content}")
            metadatas.append(dict(base_meta))
            ids.append(doc['id'])

    collection.add(
        documents=documents,
        metadatas=metadatas,
        ids=ids
    )
    print(f"Vector store built with {len(documents)} documents")
    return collection
if __name__ == "__main__":
    # Build the store, then sanity-check retrieval with a sample question.
    legal_collection = build_vector_store()
    sample_results = legal_collection.query(
        query_texts=["How do I apply for legal aid?"],
        n_results=3,
    )
    print("Test query results:", sample_results)