rag-chatbot / components /vector_store.py
Mobiworks's picture
Sync from GitHub via hub-sync
c302758 verified
Raw
History Blame Contribute Delete
5.03 kB
"""
vector_store.py
---------------
Creates, persists, and loads a FAISS vector index.
Key behaviours
--------------
* build() — embeds documents and stores FAISS index
* load() — loads existing FAISS index from disk
* search() — returns top-K (Document, normalized score)
"""
import logging
from pathlib import Path
from typing import List, Tuple
from langchain.schema import Document
from langchain_community.vectorstores import FAISS
from app.config import VECTOR_DB_PATH
from components.embedder import HuggingFaceEmbedder
# Module-level logger, named after this module (``__name__``).
logger = logging.getLogger(__name__)
class VectorStore:
    """
    Thin wrapper around a LangChain ``FAISS`` vector store.

    The wrapper owns three things: the embedder used to vectorise text, the
    directory the index is persisted to, and the in-memory ``FAISS`` store
    (``None`` until ``build()`` or ``load()`` succeeds).

    Typical usage::

        store = VectorStore()
        if not store.load():
            store.build(documents)
        hits = store.search("some question", k=4)
    """

    # Files that make up a persisted index. NOTE(review): these are the names
    # LangChain's ``FAISS.save_local`` / ``load_local`` use with the default
    # ``index_name="index"`` — confirm if a custom index name is introduced.
    _INDEX_FILES = ("index.faiss", "index.pkl")

    def __init__(
        self,
        embedder: HuggingFaceEmbedder | None = None,
        index_path: str = VECTOR_DB_PATH,
    ) -> None:
        """
        Args:
            embedder: Embedding backend; a default ``HuggingFaceEmbedder`` is
                created when omitted.
            index_path: Directory the FAISS index is saved to / loaded from.
        """
        # Compare against None explicitly so that a caller-supplied embedder
        # is never discarded merely because it happens to evaluate as falsy.
        self.embedder = embedder if embedder is not None else HuggingFaceEmbedder()
        self.index_path = index_path
        self._store: FAISS | None = None

    # ─────────────────────────────────────────────
    # Build index
    # ─────────────────────────────────────────────
    def build(self, documents: List[Document]) -> None:
        """
        Embed ``documents``, create a fresh FAISS index and persist it.

        Any previously held in-memory store is replaced.

        Raises:
            ValueError: If ``documents`` is empty.
        """
        if not documents:
            raise ValueError("Cannot build vector store from empty documents.")

        logger.info("Building FAISS index from %d chunks …", len(documents))
        self._store = FAISS.from_documents(documents, self.embedder)
        self._persist()
        logger.info("FAISS index saved to '%s'.", self.index_path)

    # ─────────────────────────────────────────────
    # Load index
    # ─────────────────────────────────────────────
    def load(self) -> bool:
        """
        Load a previously persisted index from ``index_path``.

        Returns:
            ``True`` when the index was loaded, ``False`` when no complete
            index exists on disk (the in-memory store is left untouched).
        """
        index_dir = Path(self.index_path)

        if not (index_dir / "index.faiss").exists():
            logger.info("No existing FAISS index found at '%s'.", self.index_path)
            return False

        # The loader needs every file of the pair; a partially written index
        # is reported as "not available" instead of crashing the caller.
        missing = [name for name in self._INDEX_FILES if not (index_dir / name).exists()]
        if missing:
            logger.warning(
                "Incomplete FAISS index at '%s' (missing: %s).",
                self.index_path,
                ", ".join(missing),
            )
            return False

        logger.info("Loading FAISS index from '%s' …", self.index_path)
        # SECURITY: this flag lets the loader unpickle the docstore file.
        # Unpickling can execute arbitrary code, so ``index_path`` must only
        # ever point at index files this application wrote itself.
        self._store = FAISS.load_local(
            self.index_path,
            self.embedder,
            allow_dangerous_deserialization=True,
        )
        logger.info("FAISS index loaded (%d vectors).", self._store.index.ntotal)
        return True

    # ─────────────────────────────────────────────
    # Search
    # ─────────────────────────────────────────────
    def search(
        self,
        query: str,
        k: int = 4,
    ) -> List[Tuple[Document, float]]:
        """
        Return the ``k`` best matches for ``query``.

        Returns:
            List of ``(Document, relevance_score)`` pairs in the order the
            store returned them, with every score in the range 0–1
            (higher means more similar).

        Raises:
            RuntimeError: If neither ``build()`` nor ``load()`` has succeeded.
            ValueError: If ``k`` is smaller than 1.
        """
        self._require_store()
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}.")

        # Raw (Document, distance) pairs straight from the store.
        hits = self._store.similarity_search_with_score(query, k=k)

        # Map distance → similarity with 1 / (1 + d). The distance is clamped
        # at zero first so that tiny negative values (float round-off) or a
        # metric that can go negative never yield a score above 1 or a
        # division by zero.
        # NOTE(review): the mapping presumes "smaller distance = closer",
        # i.e. an L2-style metric — confirm if the distance strategy changes.
        return [
            (doc, 1.0 / (1.0 + max(0.0, float(distance))))
            for doc, distance in hits
        ]

    # ─────────────────────────────────────────────
    # Add documents
    # ─────────────────────────────────────────────
    def add_documents(self, documents: List[Document]) -> None:
        """
        Embed ``documents``, append them to the existing index and persist it.

        An empty ``documents`` list is a no-op: nothing is embedded and the
        index on disk is not rewritten.

        Raises:
            RuntimeError: If neither ``build()`` nor ``load()`` has succeeded.
        """
        self._require_store()
        if not documents:
            # Avoid handing an empty batch to the embedder / FAISS and avoid
            # a pointless rewrite of the index files.
            logger.info("No chunks to add; index left unchanged.")
            return

        self._store.add_documents(documents)
        self._persist()
        logger.info("Added %d chunks to index.", len(documents))

    # ─────────────────────────────────────────────
    # Status
    # ─────────────────────────────────────────────
    @property
    def is_ready(self) -> bool:
        """``True`` once an index has been built or loaded."""
        return self._store is not None

    # ─────────────────────────────────────────────
    # Internal helpers
    # ─────────────────────────────────────────────
    def _persist(self) -> None:
        """Write the in-memory index to ``index_path``, creating the directory if needed."""
        Path(self.index_path).mkdir(parents=True, exist_ok=True)
        self._store.save_local(self.index_path)

    def _require_store(self) -> None:
        """Raise ``RuntimeError`` unless an index has been built or loaded."""
        if self._store is None:
            raise RuntimeError(
                "Vector store not ready. Call build() or load() first."
            )