# vector_store_builder.py
"""Build a persistent ChromaDB vector store from the legal-aid context documents."""

import os

# Disable telemetry before importing chromadb — the library reads these at import time.
os.environ.setdefault("POSTHOG_DISABLED", "true")
os.environ.setdefault("CHROMA_TELEMETRY_DISABLED", "true")

import uuid

import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions

from legal_aid_context import ALL_CONTEXT_DOCS


def _prepare_cache_dirs():
    """Point every cache-hungry dependency at writable directories and create them.

    Needed on sandboxed hosts (e.g. read-only $HOME) where HF / sentence-transformers /
    chroma would otherwise fail trying to create their default cache locations.
    """
    default_cache_root = os.getenv('CACHE_ROOT', './cache')
    os.environ.setdefault('HOME', os.path.abspath('.'))
    os.makedirs(default_cache_root, exist_ok=True)
    os.makedirs(os.path.join(os.environ['HOME'], '.cache'), exist_ok=True)
    os.makedirs(os.path.join(os.environ['HOME'], '.cache', 'chroma'), exist_ok=True)
    os.environ.setdefault('HF_HOME', os.path.join(default_cache_root, 'hf'))
    os.environ.setdefault('TRANSFORMERS_CACHE', os.path.join(default_cache_root, 'transformers'))
    os.environ.setdefault('SENTENCE_TRANSFORMERS_HOME',
                          os.path.join(default_cache_root, 'sentence-transformers'))
    os.environ.setdefault('XDG_CACHE_HOME', default_cache_root)
    for env_key in ('HF_HOME', 'TRANSFORMERS_CACHE',
                    'SENTENCE_TRANSFORMERS_HOME', 'XDG_CACHE_HOME'):
        os.makedirs(os.environ[env_key], exist_ok=True)


def _chunk_text(text, chunk_size, overlap):
    """Split *text* into chunks of at most *chunk_size* chars with *overlap* chars shared.

    Stops as soon as a chunk reaches the end of the text, so no trailing chunk that is
    entirely contained in the previous one is emitted (the original stepped blindly and
    could produce e.g. a 1-char duplicate tail for len(text) == 801).
    """
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(text), step):
        chunks.append(text[start:start + chunk_size])
        if start + chunk_size >= len(text):
            break  # remaining text already covered
    return chunks


def build_vector_store(chunk_size=500, chunk_overlap=100):
    """Build (or refresh) the ChromaDB vector store with the legal context docs.

    Args:
        chunk_size: max characters per indexed chunk (documents longer than this
            are split; defaults preserve the original hard-coded behavior).
        chunk_overlap: characters shared between consecutive chunks.

    Returns:
        The populated chroma collection named "legal_context".
    """
    # Initialize ChromaDB client with telemetry disabled and writable paths.
    chroma_path = os.getenv('CHROMA_DB_PATH', './legal_vector_db')
    os.makedirs(chroma_path, exist_ok=True)
    _prepare_cache_dirs()

    client = chromadb.PersistentClient(path=chroma_path,
                                       settings=Settings(anonymized_telemetry=False))

    # Create collection with the default embedding function.
    embedding_function = embedding_functions.DefaultEmbeddingFunction()
    collection = client.get_or_create_collection(name="legal_context",
                                                 embedding_function=embedding_function)

    # Prepare documents for the vector store.
    documents = []
    metadatas = []
    ids = []
    for doc in ALL_CONTEXT_DOCS:
        content = doc['content']
        if len(content) > chunk_size:
            # Long document: index each overlapping chunk separately.
            for i, chunk in enumerate(_chunk_text(content, chunk_size, chunk_overlap)):
                documents.append(f"{doc['title']}\n\n{chunk}")
                metadatas.append({
                    'title': doc['title'],
                    'source': doc['source'],
                    'process': doc['process'],
                    'chunk': i,
                    'doc_id': doc['id'],
                })
                ids.append(f"{doc['id']}_chunk_{i}")
        else:
            documents.append(f"{doc['title']}\n\n{content}")
            metadatas.append({
                'title': doc['title'],
                'source': doc['source'],
                'process': doc['process'],
                'doc_id': doc['id'],
            })
            ids.append(doc['id'])

    # upsert (not add): the collection is fetched with get_or_create_collection, so a
    # rebuild against an existing persistent store would otherwise fail on duplicate IDs.
    collection.upsert(
        documents=documents,
        metadatas=metadatas,
        ids=ids,
    )

    print(f"Vector store built with {len(documents)} documents")
    return collection


if __name__ == "__main__":
    collection = build_vector_store()
    # Smoke-test query against the freshly built store.
    results = collection.query(
        query_texts=["How do I apply for legal aid?"],
        n_results=3,
    )
    print("Test query results:", results)