"""
vector_store.py
---------------
Creates, persists, and loads a FAISS vector index.

Key behaviours
--------------
* build()         — embeds documents and stores FAISS index
* load()          — loads existing FAISS index from disk
* search()        — returns top-K (Document, normalized score)
* add_documents() — appends chunks to a ready index and re-saves it
"""

import logging
from pathlib import Path
from typing import List, Tuple

from langchain.schema import Document
from langchain_community.vectorstores import FAISS

from app.config import VECTOR_DB_PATH
from components.embedder import HuggingFaceEmbedder

logger = logging.getLogger(__name__)


class VectorStore:
    """
    FAISS vector store wrapper.

    Keeps one in-memory FAISS store (``None`` until ``build()`` or a
    successful ``load()``) and saves it under ``index_path`` after every
    mutation.
    """

    def __init__(
        self,
        embedder: HuggingFaceEmbedder | None = None,
        index_path: str = VECTOR_DB_PATH,
    ) -> None:
        """
        Args:
            embedder: Embedding model handed to FAISS; a default
                ``HuggingFaceEmbedder()`` is created when omitted.
            index_path: Directory the index is saved to and loaded from.
        """
        self.embedder = embedder or HuggingFaceEmbedder()
        self.index_path = index_path
        self._store: FAISS | None = None

    # ─────────────────────────────────────────────
    # Build index
    # ─────────────────────────────────────────────
    def build(self, documents: List[Document]) -> None:
        """
        Embed ``documents`` into a fresh FAISS index and save it to disk.

        Any previously held in-memory store is replaced.

        Raises:
            ValueError: If ``documents`` is empty.
        """
        if not documents:
            raise ValueError("Cannot build vector store from empty documents.")

        logger.info("Building FAISS index from %d chunks …", len(documents))

        self._store = FAISS.from_documents(documents, self.embedder)
        self._persist()

        logger.info("FAISS index saved to '%s'.", self.index_path)

    # ─────────────────────────────────────────────
    # Load index
    # ─────────────────────────────────────────────
    def load(self) -> bool:
        """
        Load a previously saved index from ``index_path``.

        Returns:
            True when an index was loaded; False when no usable index is
            present on disk (the in-memory store is left untouched).
        """
        index_dir = Path(self.index_path)

        if not (index_dir / "index.faiss").exists():
            logger.info("No existing FAISS index found at '%s'.", self.index_path)
            return False

        # FAISS.load_local() also reads the pickled docstore next to the
        # index file. Treat a half-written directory as "no index" so the
        # caller can rebuild instead of hitting an error mid-load.
        if not (index_dir / "index.pkl").exists():
            logger.warning(
                "Incomplete FAISS index at '%s' (index.pkl is missing); ignoring it.",
                self.index_path,
            )
            return False

        logger.info("Loading FAISS index from '%s' …", self.index_path)

        # NOTE(security): allow_dangerous_deserialization=True lets LangChain
        # unpickle index.pkl, which can execute arbitrary code. Only point
        # index_path at directories this application wrote itself.
        self._store = FAISS.load_local(
            self.index_path,
            self.embedder,
            allow_dangerous_deserialization=True,
        )

        logger.info("FAISS index loaded (%d vectors).", self._store.index.ntotal)
        return True

    # ─────────────────────────────────────────────
    # Search
    # ─────────────────────────────────────────────
    def search(
        self,
        query: str,
        k: int = 4,
    ) -> List[Tuple[Document, float]]:
        """
        Return the ``k`` stored chunks closest to ``query``.

        Args:
            query: Text to embed and look up.
            k: Maximum number of results requested from the index.

        Returns:
            List of (Document, relevance_score 0–1), in the order the
            store returned them.

        Raises:
            RuntimeError: If neither build() nor load() has succeeded yet.
        """
        self._require_store()

        # Raw scores straight from the store.
        # NOTE(review): the conversion below assumes these are distances
        # (lower = closer), as with LangChain's default Euclidean strategy —
        # confirm if the index is ever created with another strategy.
        hits = self._store.similarity_search_with_score(query, k=k)

        return [(doc, self._distance_to_score(distance)) for doc, distance in hits]

    @staticmethod
    def _distance_to_score(distance: float) -> float:
        """Map a distance onto a similarity score via 1 / (1 + distance)."""
        value = float(distance)
        # A negative distance would push the score above 1, and exactly -1
        # would divide by zero; clamp so the result stays within (0, 1].
        if value < 0.0:
            value = 0.0
        return 1.0 / (1.0 + value)

    # ─────────────────────────────────────────────
    # Add documents
    # ─────────────────────────────────────────────
    def add_documents(self, documents: List[Document]) -> None:
        """
        Embed ``documents``, append them to the current index, and re-save.

        An empty list is a no-op: nothing is embedded and the index on disk
        is not rewritten.

        Raises:
            RuntimeError: If neither build() nor load() has succeeded yet.
        """
        self._require_store()

        if not documents:
            logger.info("No chunks to add; index left unchanged.")
            return

        self._store.add_documents(documents)
        self._persist()
        logger.info("Added %d chunks to index.", len(documents))

    # ─────────────────────────────────────────────
    # Status
    # ─────────────────────────────────────────────
    @property
    def is_ready(self) -> bool:
        """True once an index is held in memory."""
        return self._store is not None

    # ─────────────────────────────────────────────
    # Internal helpers
    # ─────────────────────────────────────────────
    def _persist(self) -> None:
        """Create ``index_path`` if needed and write the current store to it."""
        # Fail with the explicit "not ready" error rather than an
        # AttributeError on None if this is ever reached without a store.
        self._require_store()
        Path(self.index_path).mkdir(parents=True, exist_ok=True)
        self._store.save_local(self.index_path)

    def _require_store(self) -> None:
        """Raise RuntimeError unless an index is held in memory."""
        if self._store is None:
            raise RuntimeError(
                "Vector store not ready. Call build() or load() first."
            )