# DemoChatBot / vector_store.py
# Provenance: Hugging Face Space "DemoChatBot" by OnlyTheTruth03,
# "Initial Commit" 721ca73 (verified).
"""
vector_store.py
───────────────
Handles text chunking, embedding, and FAISS vector store creation/querying.
Responsibilities:
- Split raw Documents into overlapping chunks
- Embed chunks using a local HuggingFace sentence-transformer
- Build and expose a FAISS index for similarity search
- Provide a clean retrieve() function used by the RAG pipeline
"""
import functools
import logging

from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

from config import cfg
logger = logging.getLogger(__name__)
# ── Public API ────────────────────────────────────────────────────────────────
def build_index(documents: list[Document]) -> FAISS:
    """
    Build a queryable FAISS vector store from raw documents.

    Pipeline: split the documents into overlapping chunks, embed each
    chunk with the local sentence-transformer, then index the vectors.

    Parameters
    ----------
    documents : list[Document]
        Raw documents returned by data_loader.load_documents().

    Returns
    -------
    FAISS
        A ready-to-query FAISS vector store.
    """
    pieces = _chunk_documents(documents)
    embedder = _load_embeddings()
    return _create_faiss_index(pieces, embedder)
def retrieve(index: FAISS, query: str, k: int | None = None) -> list[Document]:
    """
    Retrieve the top-k most relevant chunks for a given query.

    Parameters
    ----------
    index : FAISS
        The FAISS vector store built by build_index().
    query : str
        The user's natural-language question.
    k : int, optional
        Number of results to return. Defaults to cfg.top_k when None.

    Returns
    -------
    list[Document]
        Retrieved chunks, most relevant first.
    """
    # Explicit None check: the previous `k or cfg.top_k` treated an
    # intentional k=0 (caller asking for zero results) as "use default".
    if k is None:
        k = cfg.top_k
    results = index.similarity_search(query, k=k)
    # Truncate the query in the log line to keep records bounded.
    logger.debug("Retrieved %d chunks for query: '%s'", len(results), query[:80])
    return results
# ── Internal helpers ──────────────────────────────────────────────────────────
def _chunk_documents(documents: list[Document]) -> list[Document]:
    """Break each raw document into overlapping pieces sized by the config."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=cfg.chunk_size,
        chunk_overlap=cfg.chunk_overlap,
        # Prefer splitting at paragraph, then line, then sentence boundaries
        # before falling back to word- and character-level splits.
        separators=["\n\n", "\n", ". ", " ", ""],
    )
    pieces = text_splitter.split_documents(documents)
    logger.info(
        "Chunking: %d raw docs β†’ %d chunks (size=%d, overlap=%d)",
        len(documents),
        len(pieces),
        cfg.chunk_size,
        cfg.chunk_overlap,
    )
    return pieces
@functools.lru_cache(maxsize=1)
def _load_embeddings() -> HuggingFaceEmbeddings:
    """Load the local sentence-transformer embedding model.

    The result is memoized with lru_cache so repeated build_index() calls
    reuse one model instance instead of reloading weights each time — the
    original docstring promised caching that the code did not implement.

    Returns
    -------
    HuggingFaceEmbeddings
        CPU-bound embedder producing L2-normalized vectors.
    """
    logger.info("Loading embedding model: %s", cfg.embed_model)
    return HuggingFaceEmbeddings(
        model_name=cfg.embed_model,
        model_kwargs={"device": "cpu"},
        # Normalized embeddings make inner-product and cosine ranking agree.
        encode_kwargs={"normalize_embeddings": True},
    )
def _create_faiss_index(chunks: list[Document], embeddings: HuggingFaceEmbeddings) -> FAISS:
    """Embed every chunk and assemble the FAISS similarity index."""
    logger.info("Building FAISS index over %d chunks …", len(chunks))
    store = FAISS.from_documents(chunks, embeddings)
    logger.info("FAISS index built βœ“ (vectors: %d)", store.index.ntotal)
    return store