Spaces:
Running
Running
| # vector_store_builder.py | |
| import os | |
| # Disable telemetry before importing chromadb | |
| os.environ.setdefault("POSTHOG_DISABLED", "true") | |
| os.environ.setdefault("CHROMA_TELEMETRY_DISABLED", "true") | |
| import chromadb | |
| from chromadb.config import Settings | |
| from chromadb.utils import embedding_functions | |
| from legal_aid_context import ALL_CONTEXT_DOCS | |
| import uuid | |
def _prepare_cache_dirs():
    """Create the cache directories used while building the store.

    ``HOME`` and the cache-related variables are only given a value when they
    are not already set (``setdefault``); whatever value ends up in the
    environment is the directory that gets created.
    """
    cache_root = os.getenv('CACHE_ROOT', './cache')
    os.environ.setdefault('HOME', os.path.abspath('.'))
    os.makedirs(cache_root, exist_ok=True)
    # Creates both $HOME/.cache and $HOME/.cache/chroma.
    os.makedirs(os.path.join(os.environ['HOME'], '.cache', 'chroma'), exist_ok=True)

    cache_defaults = {
        'HF_HOME': os.path.join(cache_root, 'hf'),
        'TRANSFORMERS_CACHE': os.path.join(cache_root, 'transformers'),
        'SENTENCE_TRANSFORMERS_HOME': os.path.join(cache_root, 'sentence-transformers'),
        'XDG_CACHE_HOME': cache_root,
    }
    for env_key, default_path in cache_defaults.items():
        os.environ.setdefault(env_key, default_path)
        os.makedirs(os.environ[env_key], exist_ok=True)


def _chunk_text(text, size=500, overlap=100):
    """Split *text* into windows of at most *size* characters.

    Consecutive windows share *overlap* characters. Text that fits in a single
    window is returned unchanged as a one-element list.

    Raises:
        ValueError: if *size* is not greater than *overlap* (the window would
            never advance).
    """
    if size <= overlap:
        raise ValueError("size must be greater than overlap")
    if len(text) <= size:
        return [text]
    step = size - overlap
    # Stop issuing windows once the previous one has reached the end of the
    # text: a window that starts inside the final overlap would only repeat
    # characters the previous window already contains.
    return [text[start:start + size] for start in range(0, len(text) - overlap, step)]


def _build_records(docs):
    """Convert context docs into parallel ``(documents, metadatas, ids)`` lists.

    Each doc is a mapping with the keys ``id``, ``title``, ``source``,
    ``process`` and ``content``. Every stored text is prefixed with the doc
    title. A doc that fits in one chunk keeps its own id and gets no ``chunk``
    metadata key; a longer doc yields one record per chunk with the id
    ``<id>_chunk_<n>`` and ``chunk`` set to ``n``.
    """
    documents, metadatas, ids = [], [], []
    for doc in docs:
        title = doc['title']
        doc_id = doc['id']
        base_meta = {
            'title': title,
            'source': doc['source'],
            'process': doc['process'],
            'doc_id': doc_id,
        }
        pieces = _chunk_text(doc['content'])

        if len(pieces) == 1:
            documents.append(f"{title}\n\n{pieces[0]}")
            metadatas.append(base_meta)
            ids.append(doc_id)
            continue

        for index, piece in enumerate(pieces):
            documents.append(f"{title}\n\n{piece}")
            metadatas.append({**base_meta, 'chunk': index})
            ids.append(f"{doc_id}_chunk_{index}")
    return documents, metadatas, ids


def build_vector_store():
    """Build the ChromaDB vector store holding the legal context documents.

    The store is persisted under ``CHROMA_DB_PATH`` (default
    ``./legal_vector_db``) in the ``legal_context`` collection, using Chroma's
    default embedding function. Cache directories are rooted at ``CACHE_ROOT``
    (default ``./cache``).

    Records are written with ``upsert`` so the function can be re-run against
    an existing store: records with the same id are overwritten with the
    current text instead of being left at their old content. Records whose ids
    are no longer produced are not removed.

    Returns:
        The ``legal_context`` collection.
    """
    chroma_path = os.getenv('CHROMA_DB_PATH', './legal_vector_db')
    os.makedirs(chroma_path, exist_ok=True)
    _prepare_cache_dirs()

    client = chromadb.PersistentClient(
        path=chroma_path,
        settings=Settings(anonymized_telemetry=False),
    )
    collection = client.get_or_create_collection(
        name="legal_context",
        embedding_function=embedding_functions.DefaultEmbeddingFunction(),
    )

    documents, metadatas, ids = _build_records(ALL_CONTEXT_DOCS)
    # Chroma rejects empty batches, so skip the write when there is nothing
    # to index.
    if documents:
        collection.upsert(documents=documents, metadatas=metadatas, ids=ids)

    print(f"Vector store built with {len(documents)} documents")
    return collection
if __name__ == "__main__":
    # Build the store, then run a single sample query as a smoke test.
    collection = build_vector_store()
    sample_questions = ["How do I apply for legal aid?"]
    results = collection.query(query_texts=sample_questions, n_results=3)
    print("Test query results:", results)