""" vector_database.py ------------------ Manages a FAISS vector store — saving, loading, and updating it. What is FAISS? FAISS (Facebook AI Similarity Search) is a library that lets us store embeddings and quickly find the most similar ones to a query embedding. Think of it as a very fast search index for vectors. How we use it here: 1. The first time documents are uploaded, we create a new FAISS index. 2. On subsequent uploads, we load the existing index and ADD new vectors. 3. When a user asks a question, we use FAISS to find the top-k most relevant chunks and pass them to the LLM as context. Files written to disk (inside VECTOR_STORE_DIR): - index.faiss : the vector data - index.pkl : metadata (source file names, page numbers, etc.) """ import os from langchain_community.vectorstores import FAISS from langchain_core.documents import Document from langchain_huggingface import HuggingFaceEmbeddings # ── configuration ───────────────────────────────────────────────────────────── # Where the FAISS index files are saved. # You can change this path if you prefer a different location. VECTOR_STORE_DIR = os.path.join(os.path.dirname(__file__), "..", "vector_store") INDEX_NAME = "index" # FAISS saves two files: index.faiss + index.pkl # ── public API ──────────────────────────────────────────────────────────────── def save_vector_store(vector_store: FAISS, directory: str = VECTOR_STORE_DIR) -> None: """Persist the FAISS index to disk so it survives restarts.""" os.makedirs(directory, exist_ok=True) vector_store.save_local(directory, index_name=INDEX_NAME) print(f" OK: Vector store saved to '{directory}'") def load_vector_store( embeddings: HuggingFaceEmbeddings, directory: str = VECTOR_STORE_DIR, ) -> FAISS | None: """ Load a previously saved FAISS index from disk. Returns None if no index exists yet (first run). """ index_file = os.path.join(directory, f"{INDEX_NAME}.faiss") if not os.path.exists(index_file): print(" No existing vector store found - will create a new one.") return None print(f" Loading existing vector store from '{directory}'...") vector_store = FAISS.load_local( directory, embeddings, index_name=INDEX_NAME, allow_dangerous_deserialization=True, # required by LangChain for local files ) print(" OK: Vector store loaded.") return vector_store def add_documents_to_store( documents: list[Document], embeddings: HuggingFaceEmbeddings, directory: str = VECTOR_STORE_DIR, ) -> FAISS: """ Add new documents to the vector store. If a store already exists on disk, the new documents are merged into it. If no store exists, a fresh one is created. Parameters ---------- documents : list[Document] – chunked documents to index embeddings : HuggingFaceEmbeddings – the embedding model directory : str – where to save the index Returns ------- FAISS – the updated (or newly created) vector store """ existing_store = load_vector_store(embeddings, directory) if existing_store is None: # First time — create a brand new FAISS index from the documents print(" Creating new FAISS index...") new_store = FAISS.from_documents(documents, embeddings) else: # Add new documents to the existing index print(" Merging new documents into existing FAISS index...") new_store = FAISS.from_documents(documents, embeddings) existing_store.merge_from(new_store) new_store = existing_store save_vector_store(new_store, directory) print(f" OK: {len(documents)} document chunk(s) indexed.") return new_store def get_retriever( vector_store: FAISS, k: int = 4, ): """ Create a retriever from the vector store. Parameters ---------- vector_store : FAISS – the indexed documents k : int – number of chunks to retrieve per query Returns ------- VectorStoreRetriever – a LangChain retriever object """ return vector_store.as_retriever( search_type="similarity", search_kwargs={"k": k}, )