# Spaces: babupallam / knowflow-ai-rag-document-chatbot | Sleeping | File size: 5,014 Bytes | 56fe00f

# ============================================================
# FILE: src/vector_store.py
# ============================================================
# PURPOSE:
# Store and search document chunks using ChromaDB.
#
# ChromaDB is excellent for local demos and prototypes.
#
# For larger production systems, you may consider:
# - Qdrant
# - Weaviate
# - Milvus
# - Pinecone
# - PostgreSQL with pgvector
# - OpenSearch vector search
#
# But the concepts remain the same:
# chunk -> embed -> store -> retrieve
# ============================================================

import logging
from pathlib import Path
from typing import Any, Dict, List

import chromadb
from chromadb.config import Settings

from src.chunker import Chunk


class ChromaVectorStore:
    """
    Thin wrapper around ChromaDB.

    This makes the rest of the app independent from Chroma-specific code:
    callers hand in chunks plus embeddings and get back plain dictionaries.
    """

    # Human-readable description stored in the collection metadata.
    _COLLECTION_DESCRIPTION = "KnowFlow AI document knowledge base"

    def __init__(
        self,
        persist_directory: Path,
        collection_name: str,
        embedding_model_name: str,
    ) -> None:
        """
        Create a persistent ChromaDB client and open the collection.

        persistent directory:
        - stores the vector database on disk
        - allows reuse after app restart
        - is created (including parents) if it does not exist yet

        collection_name:
        - name of the collection to open or create

        embedding_model_name:
        - recorded in the collection metadata only; this class never
          computes embeddings itself
        """

        self.persist_directory = persist_directory
        self.collection_name = collection_name
        self.embedding_model_name = embedding_model_name

        self.persist_directory.mkdir(parents=True, exist_ok=True)

        self.client = chromadb.PersistentClient(
            path=str(self.persist_directory),
            settings=Settings(anonymized_telemetry=False),
        )

        self.collection = self._get_or_create_collection()

    def _get_or_create_collection(self) -> Any:
        """
        Open the configured collection, creating it if necessary.

        Shared by __init__ and reset_collection so both always pass the
        same name and metadata.
        """
        return self.client.get_or_create_collection(
            name=self.collection_name,
            metadata={
                "description": self._COLLECTION_DESCRIPTION,
                "embedding_model": self.embedding_model_name,
            },
        )

    @staticmethod
    def _first_result(results: Dict[str, Any], key: str) -> List[Any]:
        """
        Return the values stored under `key` for the first (only) query.

        A missing key, a None value, or an empty outer list all yield [].
        """
        per_query = results.get(key) or [[]]
        return per_query[0] or []

    def reset_collection(self) -> None:
        """
        Delete and recreate the collection.

        Good for demos and development.

        Deletion is best-effort: if it fails, the failure is logged as a
        warning (with traceback) and the collection is still re-opened, so
        this method does not raise because of the delete step.

        Production alternative:
        - upsert changed documents only
        - delete old chunks for changed files
        - maintain document versions
        """

        try:
            self.client.delete_collection(self.collection_name)
        except Exception:
            # Keep the original best-effort behaviour, but leave a trace:
            # if the delete failed, the collection re-opened below may
            # still contain the old vectors.
            logging.getLogger(__name__).warning(
                "Could not delete collection %r before recreating it",
                self.collection_name,
                exc_info=True,
            )

        self.collection = self._get_or_create_collection()

    def count(self) -> int:
        """
        Return the number of vectors stored.
        """
        return self.collection.count()

    def add_chunks(
        self,
        chunks: List["Chunk"],
        embeddings: List[List[float]],
    ) -> None:
        """
        Add chunks and their embeddings into ChromaDB.

        chunks and embeddings are matched by position, so they must have
        the same length. An empty `chunks` list is a no-op.

        Metadata is important because it allows the final answer to show:
        - source file
        - chunk number
        - character count

        Raises:
        ValueError if the number of embeddings differs from the number of
        chunks.
        """

        if not chunks:
            return

        # Fail early with a clear message instead of sending misaligned
        # data to the database.
        if len(chunks) != len(embeddings):
            raise ValueError(
                f"Got {len(chunks)} chunks but {len(embeddings)} embeddings; "
                "they must have the same length."
            )

        ids = [chunk.id for chunk in chunks]
        documents = [chunk.text for chunk in chunks]

        metadatas = [
            {
                "source": chunk.source,
                "chunk_index": chunk.chunk_index,
                "character_count": chunk.character_count,
            }
            for chunk in chunks
        ]

        self.collection.add(
            ids=ids,
            documents=documents,
            metadatas=metadatas,
            embeddings=embeddings,
        )

    def query(
        self,
        query_embedding: List[float],
        top_k: int,
    ) -> List[Dict[str, Any]]:
        """
        Query the vector database using a query embedding.

        Returns:
        A list of retrieved chunks, best match first. Each item is a dict
        with the keys: rank (starting at 1), text, source, chunk_index,
        character_count and distance. An empty collection yields [].

        At most `top_k` items are returned; the request is capped at the
        number of stored vectors.

        Distance:
        Lower usually means more similar.
        """

        stored = self.collection.count()
        if stored == 0:
            # Nothing to search; skip the database query entirely.
            return []

        results = self.collection.query(
            query_embeddings=[query_embedding],
            # Never ask for more neighbours than there are vectors.
            n_results=min(top_k, stored),
            include=[
                "documents",
                "metadatas",
                "distances",
            ],
        )

        documents_list = self._first_result(results, "documents")
        metadatas_list = self._first_result(results, "metadatas")
        distances_list = self._first_result(results, "distances")

        retrieved = []

        for rank, (document_text, metadata, distance) in enumerate(
            zip(documents_list, metadatas_list, distances_list),
            start=1,
        ):
            # An entry stored without metadata comes back as None.
            metadata = metadata or {}
            retrieved.append(
                {
                    "rank": rank,
                    "text": document_text,
                    "source": metadata.get("source", "unknown"),
                    "chunk_index": metadata.get("chunk_index", -1),
                    "character_count": metadata.get("character_count", 0),
                    "distance": float(distance),
                }
            )

        return retrieved