"""
vector_store.py
───────────────
Handles text chunking, embedding, and FAISS vector store creation/querying.
Responsibilities:
- Split raw Documents into overlapping chunks
- Embed chunks using a local HuggingFace sentence-transformer
- Build and expose a FAISS index for similarity search
- Provide a clean retrieve() function used by the RAG pipeline
"""
import logging
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from config import cfg
logger = logging.getLogger(__name__)
# ── Public API ────────────────────────────────────────────────────────────────
def build_index(documents: list[Document]) -> FAISS:
    """
    Chunk -> embed -> index the supplied documents.

    Pipeline: split the raw documents into overlapping chunks, load the
    local embedding model, then build a FAISS index over the chunk vectors.

    Parameters
    ----------
    documents : list[Document]
        Raw documents returned by data_loader.load_documents().

    Returns
    -------
    FAISS
        A ready-to-query FAISS vector store.
    """
    chunks = _chunk_documents(documents)
    embeddings = _load_embeddings()
    return _create_faiss_index(chunks, embeddings)
def retrieve(index: FAISS, query: str, k: int | None = None) -> list[Document]:
    """
    Retrieve the top-k most relevant chunks for a given query.

    Parameters
    ----------
    index : FAISS
        The FAISS vector store built by build_index().
    query : str
        The user's natural-language question.
    k : int, optional
        Number of results to return. Defaults to cfg.top_k.

    Returns
    -------
    list[Document]
        Retrieved chunks, most relevant first.
    """
    # Explicit None test: `k or cfg.top_k` would silently replace a
    # caller-supplied k=0 with the default, contradicting the signature.
    if k is None:
        k = cfg.top_k
    results = index.similarity_search(query, k=k)
    logger.debug("Retrieved %d chunks for query: '%s'", len(results), query[:80])
    return results
# ── Internal helpers ──────────────────────────────────────────────────────────
def _chunk_documents(documents: list[Document]) -> list[Document]:
    """Split documents into smaller overlapping chunks.

    Chunk size and overlap come from the global config; separators are
    tried in order from coarsest (paragraph break) to finest (character).
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=cfg.chunk_size,
        chunk_overlap=cfg.chunk_overlap,
        separators=["\n\n", "\n", ". ", " ", ""],
    )
    chunks = splitter.split_documents(documents)
    # Arrow was mojibake in the original log format string; restored as ASCII.
    logger.info(
        "Chunking: %d raw docs -> %d chunks (size=%d, overlap=%d)",
        len(documents), len(chunks), cfg.chunk_size, cfg.chunk_overlap,
    )
    return chunks
def _load_embeddings() -> HuggingFaceEmbeddings:
    """Instantiate the configured sentence-transformer embedding model.

    The model runs on CPU and produces normalized embedding vectors
    (normalize_embeddings=True).
    """
    model_name = cfg.embed_model
    logger.info("Loading embedding model: %s", model_name)
    model_kwargs = {"device": "cpu"}
    encode_kwargs = {"normalize_embeddings": True}
    return HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs,
    )
def _create_faiss_index(chunks: list[Document], embeddings: HuggingFaceEmbeddings) -> FAISS:
    """Embed all chunks and build the FAISS index.

    Parameters
    ----------
    chunks : list[Document]
        Pre-split document chunks to embed.
    embeddings : HuggingFaceEmbeddings
        Embedding model used to vectorize each chunk.

    Returns
    -------
    FAISS
        The populated vector store.
    """
    # Both log messages below contained mojibake ("β¦", "β") in the
    # original; replaced with plain ASCII to keep logs encoding-safe.
    logger.info("Building FAISS index over %d chunks ...", len(chunks))
    index = FAISS.from_documents(chunks, embeddings)
    # index.index is the underlying faiss.Index; ntotal is the stored vector count.
    logger.info("FAISS index built (vectors: %d)", index.index.ntotal)
    return index