Ragbase_Studio / src /vector_database.py
Abdul2000's picture
Rename vector_database.py to src/vector_database.py
52b4257 verified
Raw
History Blame Contribute Delete
4.64 kB
"""
vector_database.py
------------------
Manages a FAISS vector store — saving, loading, and updating it.
What is FAISS?
FAISS (Facebook AI Similarity Search) is a library that lets us store
embeddings and quickly find the most similar ones to a query embedding.
Think of it as a very fast search index for vectors.
How we use it here:
1. The first time documents are uploaded, we create a new FAISS index.
2. On subsequent uploads, we load the existing index and ADD new vectors.
3. When a user asks a question, we use FAISS to find the top-k most
relevant chunks and pass them to the LLM as context.
Files written to disk (inside VECTOR_STORE_DIR):
- index.faiss : the vector data
- index.pkl : metadata (source file names, page numbers, etc.)
"""
import os
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
# ── configuration ─────────────────────────────────────────────────────────────
# Where the FAISS index files are saved.
# You can change this path if you prefer a different location.
VECTOR_STORE_DIR = os.path.join(os.path.dirname(__file__), "..", "vector_store")
INDEX_NAME = "index" # FAISS saves two files: index.faiss + index.pkl
# ── public API ────────────────────────────────────────────────────────────────
def save_vector_store(vector_store: FAISS, directory: str = VECTOR_STORE_DIR) -> None:
    """
    Write the FAISS vector store to disk so it can be reloaded later.

    The target folder is created when it does not exist yet; the store is
    then saved there under INDEX_NAME and a confirmation line is printed.

    Parameters
    ----------
    vector_store : FAISS – the store to persist
    directory    : str   – folder to save into (defaults to VECTOR_STORE_DIR)
    """
    # exist_ok=True keeps repeated saves into the same folder from failing.
    os.makedirs(directory, exist_ok=True)

    vector_store.save_local(directory, index_name=INDEX_NAME)

    confirmation = f" OK: Vector store saved to '{directory}'"
    print(confirmation)
def load_vector_store(
    embeddings: HuggingFaceEmbeddings,
    directory: str = VECTOR_STORE_DIR,
) -> FAISS | None:
    """
    Restore a FAISS vector store that was previously saved in ``directory``.

    Only the presence of ``<INDEX_NAME>.faiss`` is checked before loading.

    Parameters
    ----------
    embeddings : HuggingFaceEmbeddings – embedding model passed to FAISS.load_local
    directory  : str                   – folder to look in (defaults to VECTOR_STORE_DIR)

    Returns
    -------
    FAISS | None – the loaded store, or None when no index file is found
                   (for example on the very first run)
    """
    faiss_path = os.path.join(directory, INDEX_NAME + ".faiss")

    if os.path.exists(faiss_path):
        print(f" Loading existing vector store from '{directory}'...")
        # NOTE(review): allow_dangerous_deserialization=True opts in to
        # LangChain deserializing the saved metadata file (presumably via
        # pickle — confirm against the LangChain docs). Only point
        # `directory` at index files this application wrote itself.
        loaded_store = FAISS.load_local(
            directory,
            embeddings,
            index_name=INDEX_NAME,
            allow_dangerous_deserialization=True,
        )
        print(" OK: Vector store loaded.")
        return loaded_store

    print(" No existing vector store found - will create a new one.")
    return None
def add_documents_to_store(
    documents: list[Document],
    embeddings: HuggingFaceEmbeddings,
    directory: str = VECTOR_STORE_DIR,
) -> FAISS:
    """
    Add new documents to the vector store and persist the result.

    If a store already exists on disk, the new documents are merged into it.
    If no store exists, a fresh one is created from the documents.
    If ``documents`` is empty, an existing store is returned unchanged
    (nothing is re-saved).

    Parameters
    ----------
    documents  : list[Document]        – chunked documents to index
    embeddings : HuggingFaceEmbeddings – the embedding model
    directory  : str                   – where to load/save the index

    Returns
    -------
    FAISS – the updated (or newly created) vector store

    Raises
    ------
    ValueError – if ``documents`` is empty and there is no existing store
                 to fall back on
    """
    existing_store = load_vector_store(embeddings, directory)

    # Guard against an empty batch: FAISS.from_documents has no embedding
    # to derive the index dimension from and fails with an opaque error.
    if not documents:
        if existing_store is None:
            raise ValueError(
                "Cannot create a vector store: no documents were provided "
                "and no existing index was found."
            )
        print(" No new documents to index - existing vector store left unchanged.")
        return existing_store

    if existing_store is None:
        # First time — create a brand new FAISS index from the documents
        print(" Creating new FAISS index...")
        updated_store = FAISS.from_documents(documents, embeddings)
    else:
        # Embed the new documents into a temporary index, then fold that
        # index into the one loaded from disk.
        print(" Merging new documents into existing FAISS index...")
        existing_store.merge_from(FAISS.from_documents(documents, embeddings))
        updated_store = existing_store

    save_vector_store(updated_store, directory)
    print(f" OK: {len(documents)} document chunk(s) indexed.")
    return updated_store
def get_retriever(
    vector_store: FAISS,
    k: int = 4,
):
    """
    Build a similarity-search retriever on top of the vector store.

    Parameters
    ----------
    vector_store : FAISS – the indexed documents
    k            : int   – number of chunks to retrieve per query (default 4)

    Returns
    -------
    The object returned by ``vector_store.as_retriever`` (a LangChain
    retriever), configured for "similarity" search with the given ``k``.
    """
    search_options = {"k": k}
    retriever = vector_store.as_retriever(
        search_type="similarity",
        search_kwargs=search_options,
    )
    return retriever