Spaces:

Abdul2000
/

Ragbase_Studio

Sleeping

App Files Files Community

Ragbase_Studio / src /vector_database.py

Abdul2000

Rename vector_database.py to src/vector_database.py

52b4257 verified 19 days ago

Raw

History Blame Contribute Delete

4.64 kB

	"""
	vector_database.py
	------------------
	Manages a FAISS vector store — saving, loading, and updating it.

	What is FAISS?
	FAISS (Facebook AI Similarity Search) is a library that lets us store
	embeddings and quickly find the most similar ones to a query embedding.
	Think of it as a very fast search index for vectors.

	How we use it here:
	1. The first time documents are uploaded, we create a new FAISS index.
	2. On subsequent uploads, we load the existing index and ADD new vectors.
	3. When a user asks a question, we use FAISS to find the top-k most
	relevant chunks and pass them to the LLM as context.

	Files written to disk (inside VECTOR_STORE_DIR):
	- index.faiss : the vector data
	- index.pkl : pickled document store (chunk text plus metadata such as source file names and page numbers) and the index-to-document ID mapping
	"""

	import os
	from langchain_community.vectorstores import FAISS
	from langchain_core.documents import Document
	from langchain_huggingface import HuggingFaceEmbeddings

	# ── configuration ─────────────────────────────────────────────────────────────

	# Folder that receives the FAISS index files: a "vector_store" directory one
	# level above the directory containing this module.
	# Point this somewhere else if you want the index stored in another place.
	VECTOR_STORE_DIR = os.path.join(
	    os.path.dirname(__file__),
	    "..",
	    "vector_store",
	)
	# Base file name; FAISS saves two files: index.faiss + index.pkl
	INDEX_NAME = "index"


	# ── public API ────────────────────────────────────────────────────────────────

	def save_vector_store(vector_store: FAISS, directory: str = VECTOR_STORE_DIR) -> None:
	    """
	    Write the FAISS index to disk so it can be reloaded after a restart.

	    Parameters
	    ----------
	    vector_store : FAISS – the store to persist
	    directory : str – target folder; created if it does not exist yet

	    The files on disk are named after INDEX_NAME.
	    """
	    # Ensure the target folder exists before FAISS writes into it.
	    os.makedirs(directory, exist_ok=True)

	    vector_store.save_local(directory, index_name=INDEX_NAME)

	    confirmation = f" OK: Vector store saved to '{directory}'"
	    print(confirmation)


	def load_vector_store(
	    embeddings: HuggingFaceEmbeddings,
	    directory: str = VECTOR_STORE_DIR,
	) -> FAISS | None:
	    """
	    Load a previously saved FAISS index from disk.

	    Parameters
	    ----------
	    embeddings : HuggingFaceEmbeddings – embedding model passed to FAISS.load_local
	    directory : str – folder that holds the saved index files

	    Returns
	    -------
	    FAISS | None – the loaded store, or None when '<INDEX_NAME>.faiss' is not
	    present in `directory` (first run, nothing saved yet)
	    """
	    index_file = os.path.join(directory, f"{INDEX_NAME}.faiss")
	    if not os.path.exists(index_file):
	        print(" No existing vector store found - will create a new one.")
	        return None

	    print(f" Loading existing vector store from '{directory}'...")
	    # SECURITY: allow_dangerous_deserialization=True lets LangChain unpickle
	    # index.pkl, and unpickling can execute arbitrary code. `directory` must
	    # therefore only ever contain files written by this application, never
	    # files supplied by a user or downloaded from an untrusted source.
	    vector_store = FAISS.load_local(
	        directory,
	        embeddings,
	        index_name=INDEX_NAME,
	        allow_dangerous_deserialization=True,  # required by LangChain for local files
	    )
	    print(" OK: Vector store loaded.")
	    return vector_store


	def add_documents_to_store(
	    documents: list[Document],
	    embeddings: HuggingFaceEmbeddings,
	    directory: str = VECTOR_STORE_DIR,
	) -> FAISS:
	    """
	    Add new documents to the vector store and save the result to disk.

	    If a store already exists on disk, the new documents are merged into it.
	    If no store exists, a fresh one is created.
	    If `documents` is empty, an existing store is returned unchanged (nothing
	    is re-saved).

	    Parameters
	    ----------
	    documents : list[Document] – chunked documents to index
	    embeddings : HuggingFaceEmbeddings – the embedding model
	    directory : str – where to load the index from and save it to

	    Returns
	    -------
	    FAISS – the updated (or newly created) vector store

	    Raises
	    ------
	    ValueError – if `documents` is empty and no store exists on disk yet
	    """
	    existing_store = load_vector_store(embeddings, directory)

	    if not documents:
	        # An empty batch would make FAISS.from_documents fail with an opaque
	        # error (it needs at least one embedding to size the index), so the
	        # case is handled explicitly here.
	        if existing_store is None:
	            raise ValueError(
	                "Cannot create a vector store: no documents were provided "
	                "and no existing store was found."
	            )
	        print(" No new documents to index - existing vector store left unchanged.")
	        return existing_store

	    if existing_store is None:
	        # First time — create a brand new FAISS index from the documents
	        print(" Creating new FAISS index...")
	        updated_store = FAISS.from_documents(documents, embeddings)
	    else:
	        # Build an index for the new batch, then fold it into the existing one
	        print(" Merging new documents into existing FAISS index...")
	        existing_store.merge_from(FAISS.from_documents(documents, embeddings))
	        updated_store = existing_store

	    save_vector_store(updated_store, directory)
	    print(f" OK: {len(documents)} document chunk(s) indexed.")
	    return updated_store


	def get_retriever(
	    vector_store: FAISS,
	    k: int = 4,
	):
	    """
	    Wrap the vector store in a similarity-search retriever.

	    Parameters
	    ----------
	    vector_store : FAISS – the indexed documents
	    k : int – number of chunks to retrieve per query (default 4)

	    Returns
	    -------
	    VectorStoreRetriever – the LangChain retriever object produced by
	    `vector_store.as_retriever`
	    """
	    search_options = {"k": k}
	    retriever = vector_store.as_retriever(
	        search_type="similarity",
	        search_kwargs=search_options,
	    )
	    return retriever