""" vector_store.py ─────────────── Handles text chunking, embedding, and FAISS vector store creation/querying. Responsibilities: - Split raw Documents into overlapping chunks - Embed chunks using a local HuggingFace sentence-transformer - Build and expose a FAISS index for similarity search - Provide a clean retrieve() function used by the RAG pipeline """ import logging from langchain_core.documents import Document from langchain_text_splitters import RecursiveCharacterTextSplitter from langchain_community.embeddings import HuggingFaceEmbeddings from langchain_community.vectorstores import FAISS from config import cfg logger = logging.getLogger(__name__) # ── Public API ──────────────────────────────────────────────────────────────── def build_index(documents: list[Document]) -> FAISS: """ Chunk → embed → index the supplied documents. Parameters ---------- documents : list[Document] Raw documents returned by data_loader.load_documents(). Returns ------- FAISS A ready-to-query FAISS vector store. """ chunks = _chunk_documents(documents) embeddings = _load_embeddings() index = _create_faiss_index(chunks, embeddings) return index def retrieve(index: FAISS, query: str, k: int | None = None) -> list[Document]: """ Retrieve the top-k most relevant chunks for a given query. Parameters ---------- index : FAISS The FAISS vector store built by build_index(). query : str The user's natural-language question. k : int, optional Number of results to return. Defaults to cfg.top_k. Returns ------- list[Document] Retrieved chunks, most relevant first. """ k = k or cfg.top_k results = index.similarity_search(query, k=k) logger.debug("Retrieved %d chunks for query: '%s'", len(results), query[:80]) return results # ── Internal helpers ────────────────────────────────────────────────────────── def _chunk_documents(documents: list[Document]) -> list[Document]: """Split documents into smaller overlapping chunks.""" splitter = RecursiveCharacterTextSplitter( chunk_size=cfg.chunk_size, chunk_overlap=cfg.chunk_overlap, separators=["\n\n", "\n", ". ", " ", ""], ) chunks = splitter.split_documents(documents) logger.info( "Chunking: %d raw docs → %d chunks (size=%d, overlap=%d)", len(documents), len(chunks), cfg.chunk_size, cfg.chunk_overlap, ) return chunks def _load_embeddings() -> HuggingFaceEmbeddings: """Load the local sentence-transformer embedding model (cached after first call).""" logger.info("Loading embedding model: %s", cfg.embed_model) return HuggingFaceEmbeddings( model_name=cfg.embed_model, model_kwargs={"device": "cpu"}, encode_kwargs={"normalize_embeddings": True}, ) def _create_faiss_index(chunks: list[Document], embeddings: HuggingFaceEmbeddings) -> FAISS: """Embed all chunks and build the FAISS index.""" logger.info("Building FAISS index over %d chunks …", len(chunks)) index = FAISS.from_documents(chunks, embeddings) logger.info("FAISS index built ✓ (vectors: %d)", index.index.ntotal) return index