Babu Pallam
Add ChromaDB vector store integration
56fe00f
Raw
History Blame Contribute Delete
5.01 kB
# ============================================================
# FILE: src/vector_store.py
# ============================================================
# PURPOSE:
# Store and search document chunks using ChromaDB.
#
# ChromaDB is excellent for local demos and prototypes.
#
# For larger production systems, you may consider:
# - Qdrant
# - Weaviate
# - Milvus
# - Pinecone
# - PostgreSQL with pgvector
# - OpenSearch vector search
#
# But the concepts remain the same:
# chunk -> embed -> store -> retrieve
# ============================================================
import logging
from pathlib import Path
from typing import Any, Dict, List

import chromadb
from chromadb.config import Settings

from src.chunker import Chunk
class ChromaVectorStore:
    """
    Thin wrapper around ChromaDB.

    This makes the rest of the app independent from Chroma-specific code:
    callers pass in chunks plus embeddings and receive plain dictionaries
    back from queries.
    """

    # Description written into the collection metadata on creation.
    _COLLECTION_DESCRIPTION = "KnowFlow AI document knowledge base"

    _logger = logging.getLogger(__name__)

    def __init__(
        self,
        persist_directory: Path,
        collection_name: str,
        embedding_model_name: str,
    ) -> None:
        """
        Create a persistent ChromaDB client and open the collection.

        The persistent directory:
        - stores the vector database on disk
        - allows reuse after app restart

        Args:
            persist_directory: Directory holding the database files. It is
                created (including parents) when missing. The value is
                passed through ``Path()``, so a plain string also works.
            collection_name: Name of the collection to get or create.
            embedding_model_name: Recorded in the collection metadata under
                the ``embedding_model`` key.
        """
        self.persist_directory = Path(persist_directory)
        self.collection_name = collection_name
        self.embedding_model_name = embedding_model_name

        self.persist_directory.mkdir(parents=True, exist_ok=True)

        self.client = chromadb.PersistentClient(
            path=str(self.persist_directory),
            settings=Settings(anonymized_telemetry=False),
        )
        self.collection = self._open_collection()

    def _open_collection(self) -> Any:
        """Get or create the collection with this store's standard metadata."""
        return self.client.get_or_create_collection(
            name=self.collection_name,
            metadata={
                "description": self._COLLECTION_DESCRIPTION,
                "embedding_model": self.embedding_model_name,
            },
        )

    def reset_collection(self) -> None:
        """
        Delete and recreate the collection.

        Good for demos and development.

        Production alternative:
        - upsert changed documents only
        - delete old chunks for changed files
        - maintain document versions

        Deletion is best-effort: a failure (for example because the
        collection does not exist yet) is logged at debug level and the
        collection is then obtained via get-or-create anyway.
        """
        try:
            self.client.delete_collection(self.collection_name)
        except Exception as exc:
            # Deliberately broad: the concrete exception type raised for a
            # missing collection is not pinned down here, and a failed
            # delete must not prevent (re)opening the collection.
            self._logger.debug(
                "Could not delete collection %r before reset: %s",
                self.collection_name,
                exc,
            )
        self.collection = self._open_collection()

    def count(self) -> int:
        """
        Return the number of vectors stored.
        """
        return self.collection.count()

    def add_chunks(
        self,
        chunks: List["Chunk"],
        embeddings: List[List[float]],
    ) -> None:
        """
        Add chunks and their embeddings into ChromaDB.

        Metadata is important because it allows the final answer to show:
        - source file
        - chunk number
        - character count

        Args:
            chunks: Chunks to store. Each one must expose ``id``, ``text``,
                ``source``, ``chunk_index`` and ``character_count``.
            embeddings: One embedding vector per chunk, in the same order.

        Raises:
            ValueError: If the number of embeddings differs from the number
                of chunks.
        """
        if not chunks:
            return

        if len(chunks) != len(embeddings):
            # Fail early with a clear message instead of handing the store
            # misaligned ids and vectors.
            raise ValueError(
                f"Got {len(chunks)} chunks but {len(embeddings)} embeddings; "
                "exactly one embedding per chunk is required."
            )

        self.collection.add(
            ids=[chunk.id for chunk in chunks],
            documents=[chunk.text for chunk in chunks],
            metadatas=[
                {
                    "source": chunk.source,
                    "chunk_index": chunk.chunk_index,
                    "character_count": chunk.character_count,
                }
                for chunk in chunks
            ],
            embeddings=embeddings,
        )

    @staticmethod
    def _first_row(results: Dict[str, Any], key: str) -> List[Any]:
        """Return the first (only) query's row for ``key``, or ``[]`` if absent."""
        rows = results.get(key) or [[]]
        return rows[0] or []

    def query(
        self,
        query_embedding: List[float],
        top_k: int,
    ) -> List[Dict[str, Any]]:
        """
        Query the vector database using a query embedding.

        Args:
            query_embedding: Embedding vector of the user query.
            top_k: Maximum number of chunks to retrieve. A value of zero or
                less returns an empty list without querying the store.

        Returns:
            A list of retrieved chunks, one dictionary each, with the keys
            ``rank`` (1-based, in the order the store returned them),
            ``text``, ``source``, ``chunk_index``, ``character_count`` and
            ``distance``.

        Distance:
            Lower usually means more similar.
        """
        if top_k <= 0:
            return []

        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=top_k,
            include=[
                "documents",
                "metadatas",
                "distances",
            ],
        )

        # Only one query embedding is sent, so only the first row matters.
        documents = self._first_row(results, "documents")
        metadatas = self._first_row(results, "metadatas")
        distances = self._first_row(results, "distances")

        retrieved = []
        for rank, (text, metadata, distance) in enumerate(
            zip(documents, metadatas, distances),
            start=1,
        ):
            # An entry stored without metadata falls back to the defaults.
            metadata = metadata or {}
            retrieved.append(
                {
                    "rank": rank,
                    "text": text,
                    "source": metadata.get("source", "unknown"),
                    "chunk_index": metadata.get("chunk_index", -1),
                    "character_count": metadata.get("character_count", 0),
                    "distance": float(distance),
                }
            )
        return retrieved