# Legal-assistant / vector_store_builder.py
# Repository page metadata captured with this file:
#   Rivalcoder | Add Files | 8e453ef | raw | history blame | 2.19 kB
# vector_store_builder.py
import chromadb
from chromadb.utils import embedding_functions
from legal_aid_context import ALL_CONTEXT_DOCS
import uuid
def _chunk_text(text, chunk_size, overlap):
    """Split *text* into windows of at most *chunk_size* characters.

    Consecutive windows share *overlap* characters. Iteration stops as soon
    as a window reaches the end of *text*, so no trailing window is emitted
    that is entirely contained in the one before it.

    Callers must ensure ``chunk_size > 0`` and ``0 <= overlap < chunk_size``.
    """
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(text), step):
        chunks.append(text[start:start + chunk_size])
        if start + chunk_size >= len(text):
            # This window already covers the tail; a further one would
            # only repeat text that is fully present in this chunk.
            break
    return chunks


def build_vector_store(
    *,
    db_path: str = "./legal_vector_db",
    collection_name: str = "legal_context",
    chunk_size: int = 500,
    chunk_overlap: int = 100,
):
    """Build (or refresh) the ChromaDB vector store holding the legal context.

    Every entry of ``ALL_CONTEXT_DOCS`` is read through its ``'id'``,
    ``'title'``, ``'source'``, ``'process'`` and ``'content'`` keys. Content
    no longer than *chunk_size* characters is stored as a single record whose
    ID is the document ID; longer content is split into overlapping chunks
    stored under ``"<id>_chunk_<n>"`` with the chunk index in the metadata.
    Each stored text is prefixed with the document title.

    Args:
        db_path: Directory of the persistent ChromaDB database.
        collection_name: Name of the collection to create or reuse.
        chunk_size: Maximum number of content characters per chunk.
        chunk_overlap: Number of characters shared by neighbouring chunks.

    Returns:
        The ChromaDB collection that the records were written to.

    Raises:
        ValueError: If *chunk_size* is not positive or *chunk_overlap* is
            not in ``[0, chunk_size)``.
    """
    # Validate before touching the database so bad arguments leave no trace.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError("chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size")

    # Initialize the persistent client and the collection, using Chroma's
    # default embedding function.
    client = chromadb.PersistentClient(path=db_path)
    embedding_function = embedding_functions.DefaultEmbeddingFunction()
    collection = client.get_or_create_collection(
        name=collection_name,
        embedding_function=embedding_function,
    )

    # Parallel lists, as expected by the collection write API.
    documents = []
    metadatas = []
    ids = []

    for doc in ALL_CONTEXT_DOCS:
        content = doc['content']
        base_metadata = {
            'title': doc['title'],
            'source': doc['source'],
            'process': doc['process'],
            'doc_id': doc['id'],
        }

        if len(content) <= chunk_size:
            # Short document: one record, keyed by the bare document ID.
            documents.append(f"{doc['title']}\n\n{content}")
            metadatas.append(base_metadata)
            ids.append(doc['id'])
            continue

        # Long document: one record per overlapping chunk.
        for index, chunk in enumerate(_chunk_text(content, chunk_size, chunk_overlap)):
            documents.append(f"{doc['title']}\n\n{chunk}")
            metadatas.append({**base_metadata, 'chunk': index})
            ids.append(f"{doc['id']}_chunk_{index}")

    if documents:
        # The collection is persistent and reused across runs, so the same
        # IDs are submitted every time this function is called. upsert()
        # overwrites existing records instead of leaving stale text in place
        # (or failing on duplicate IDs, depending on the Chroma version).
        # NOTE(review): chunks left over from a previous, longer version of a
        # document are not deleted here.
        collection.upsert(
            documents=documents,
            metadatas=metadatas,
            ids=ids,
        )
    # With nothing to store the write is skipped entirely; Chroma is
    # expected to reject empty ID lists -- TODO confirm for the pinned version.

    print(f"Vector store built with {len(documents)} documents")
    return collection
if __name__ == "__main__":
    # Script entry point: build the store, then run one sample lookup
    # against it and print the raw query response as a smoke test.
    store = build_vector_store()

    sample_questions = ["How do I apply for legal aid?"]
    top_matches = store.query(query_texts=sample_questions, n_results=3)

    print("Test query results:", top_matches)