Spaces:

babupallam
/

knowflow-ai-rag-document-chatbot

Sleeping

knowflow-ai-rag-document-chatbot / src /vector_store.py

Babu Pallam

Add ChromaDB vector store integration

56fe00f about 1 month ago

5.01 kB

	# ============================================================
	# FILE: src/vector_store.py
	# ============================================================
	# PURPOSE:
	# Store and search document chunks using ChromaDB.
	#
	# ChromaDB is excellent for local demos and prototypes.
	#
	# For larger production systems, you may consider:
	# - Qdrant
	# - Weaviate
	# - Milvus
	# - Pinecone
	# - PostgreSQL with pgvector
	# - OpenSearch vector search
	#
	# But the concepts remain the same:
	# chunk -> embed -> store -> retrieve
	# ============================================================

	import logging
	from pathlib import Path
	from typing import Any, Dict, List

	import chromadb
	from chromadb.config import Settings

	from src.chunker import Chunk


	class ChromaVectorStore:
	    """
	    Thin wrapper around ChromaDB.

	    This makes the rest of the app independent from Chroma-specific code:
	    callers pass in chunks and embeddings and get plain dictionaries back.
	    """

	    # Used to report best-effort operations that fail without crashing.
	    _logger = logging.getLogger(__name__)

	    def __init__(
	        self,
	        persist_directory: Path,
	        collection_name: str,
	        embedding_model_name: str,
	    ) -> None:
	        """
	        Create a persistent ChromaDB client and open the collection.

	        Args:
	            persist_directory: Directory where the vector database is stored
	                on disk so it can be reused after an app restart. It is
	                created if missing. A plain string path is accepted too.
	            collection_name: Name of the collection to get or create.
	            embedding_model_name: Name of the embedding model; recorded in
	                the collection metadata.
	        """
	        # Coerce so a string path works as well as a Path object.
	        self.persist_directory = Path(persist_directory)
	        self.collection_name = collection_name
	        self.embedding_model_name = embedding_model_name

	        self.persist_directory.mkdir(parents=True, exist_ok=True)

	        self.client = chromadb.PersistentClient(
	            path=str(self.persist_directory),
	            settings=Settings(anonymized_telemetry=False),
	        )

	        self.collection = self._open_collection()

	    def _open_collection(self) -> Any:
	        """Get or create the collection with this store's standard metadata."""
	        return self.client.get_or_create_collection(
	            name=self.collection_name,
	            metadata={
	                "description": "KnowFlow AI document knowledge base",
	                "embedding_model": self.embedding_model_name,
	            },
	        )

	    @staticmethod
	    def _first_batch(results: Dict[str, Any], key: str) -> List[Any]:
	        """Return the result list for the first (only) query, or [] if absent."""
	        batches = results.get(key) or [[]]
	        return batches[0] or []

	    def reset_collection(self) -> None:
	        """
	        Delete and recreate the collection.

	        Good for demos and development.

	        Production alternative:
	        - upsert changed documents only
	        - delete old chunks for changed files
	        - maintain document versions
	        """
	        try:
	            self.client.delete_collection(self.collection_name)
	        except Exception as exc:
	            # Deliberately best-effort: a failed delete must not crash the
	            # app, but it is reported because the collection opened below
	            # may then still contain the old vectors.
	            self._logger.warning(
	                "Could not delete collection %r before recreating it: %s",
	                self.collection_name,
	                exc,
	            )

	        self.collection = self._open_collection()

	    def count(self) -> int:
	        """
	        Return the number of vectors stored.
	        """
	        return self.collection.count()

	    def add_chunks(
	        self,
	        chunks: List["Chunk"],
	        embeddings: List[List[float]],
	    ) -> None:
	        """
	        Add chunks and their embeddings into ChromaDB.

	        Metadata is important because it allows the final answer to show:
	        - source file
	        - chunk number
	        - character count

	        Args:
	            chunks: Chunks to store. Each must expose ``id``, ``text``,
	                ``source``, ``chunk_index`` and ``character_count``.
	            embeddings: One embedding vector per chunk, in the same order.

	        Raises:
	            ValueError: If ``chunks`` and ``embeddings`` differ in length.
	        """
	        if not chunks:
	            return

	        # Fail early with a clear message instead of storing misaligned data.
	        if len(chunks) != len(embeddings):
	            raise ValueError(
	                "chunks and embeddings must have the same length "
	                f"(got {len(chunks)} chunks and {len(embeddings)} embeddings)"
	            )

	        self.collection.add(
	            ids=[chunk.id for chunk in chunks],
	            documents=[chunk.text for chunk in chunks],
	            metadatas=[
	                {
	                    "source": chunk.source,
	                    "chunk_index": chunk.chunk_index,
	                    "character_count": chunk.character_count,
	                }
	                for chunk in chunks
	            ],
	            embeddings=embeddings,
	        )

	    def query(
	        self,
	        query_embedding: List[float],
	        top_k: int,
	    ) -> List[Dict[str, Any]]:
	        """
	        Query the vector database using a query embedding.

	        Args:
	            query_embedding: Embedding vector of the user question.
	            top_k: Maximum number of chunks to retrieve.

	        Returns:
	            A list of retrieved chunks, best match first. Each item is a
	            dict with the keys ``rank`` (starting at 1), ``text``,
	            ``source``, ``chunk_index``, ``character_count`` and
	            ``distance``. The list is empty when ``top_k`` is below 1 or
	            the collection holds no vectors.

	        Distance:
	            Lower usually means more similar.
	        """
	        if top_k < 1:
	            return []

	        # Never ask for more results than there are stored vectors.
	        n_results = min(top_k, self.collection.count())
	        if n_results < 1:
	            return []

	        results = self.collection.query(
	            query_embeddings=[query_embedding],
	            n_results=n_results,
	            include=[
	                "documents",
	                "metadatas",
	                "distances",
	            ],
	        )

	        documents_list = self._first_batch(results, "documents")
	        metadatas_list = self._first_batch(results, "metadatas")
	        distances_list = self._first_batch(results, "distances")

	        retrieved = []
	        for rank, (document_text, metadata, distance) in enumerate(
	            zip(documents_list, metadatas_list, distances_list),
	            start=1,
	        ):
	            # A record stored without metadata comes back as None.
	            metadata = metadata or {}
	            retrieved.append(
	                {
	                    "rank": rank,
	                    "text": document_text,
	                    "source": metadata.get("source", "unknown"),
	                    "chunk_index": metadata.get("chunk_index", -1),
	                    "character_count": metadata.get("character_count", 0),
	                    "distance": float(distance),
	                }
	            )

	        return retrieved