Spaces:
Sleeping
Sleeping
| """ | |
| vector_database.py | |
| ------------------ | |
| Manages a FAISS vector store — saving, loading, and updating it. | |
| What is FAISS? | |
| FAISS (Facebook AI Similarity Search) is a library that lets us store | |
| embeddings and quickly find the most similar ones to a query embedding. | |
| Think of it as a very fast search index for vectors. | |
| How we use it here: | |
| 1. The first time documents are uploaded, we create a new FAISS index. | |
| 2. On subsequent uploads, we load the existing index and ADD new vectors. | |
| 3. When a user asks a question, we use FAISS to find the top-k most | |
| relevant chunks and pass them to the LLM as context. | |
| Files written to disk (inside VECTOR_STORE_DIR): | |
| - index.faiss : the vector data | |
| - index.pkl : metadata (source file names, page numbers, etc.) | |
| """ | |
| import os | |
| from langchain_community.vectorstores import FAISS | |
| from langchain_core.documents import Document | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| # ── configuration ──────────────────────────────────────────────────────────── | |
# Directory that holds the persisted FAISS index files: a "vector_store"
# folder located one level above this module's own directory.
# Point this somewhere else if you prefer a different location.
VECTOR_STORE_DIR = os.path.join(
    os.path.dirname(__file__),
    os.pardir,
    "vector_store",
)

# Base file name handed to FAISS when saving and loading
# (FAISS saves two files: index.faiss + index.pkl).
INDEX_NAME = "index"
| # ── public API ─────────────────────────────────────────────────────────────── | |
def save_vector_store(
    vector_store: FAISS,
    directory: str = VECTOR_STORE_DIR,
) -> None:
    """
    Write the FAISS index to disk so it is still available after a restart.

    Parameters
    ----------
    vector_store : FAISS
        The store to persist.
    directory : str
        Target folder; it is created first if it does not exist yet.
    """
    # Make sure the target folder is there before FAISS writes into it.
    os.makedirs(directory, exist_ok=True)

    vector_store.save_local(directory, index_name=INDEX_NAME)

    print(f" OK: Vector store saved to '{directory}'")
def load_vector_store(
    embeddings: HuggingFaceEmbeddings,
    directory: str = VECTOR_STORE_DIR,
) -> FAISS | None:
    """
    Read a previously persisted FAISS index back from disk.

    Parameters
    ----------
    embeddings : HuggingFaceEmbeddings
        Embedding model handed to ``FAISS.load_local``.
    directory : str
        Folder expected to contain the saved index files.

    Returns
    -------
    FAISS | None
        The loaded store, or None when ``<INDEX_NAME>.faiss`` is not present
        in ``directory`` (for example on the first run).
    """
    faiss_path = os.path.join(directory, INDEX_NAME + ".faiss")

    if os.path.exists(faiss_path):
        print(f" Loading existing vector store from '{directory}'...")
        # NOTE(review): the metadata file (index.pkl) is a pickle, which is why
        # LangChain asks for this explicit opt-in. Only load index files that
        # this application wrote itself, never ones from an untrusted source.
        loaded = FAISS.load_local(
            directory,
            embeddings,
            index_name=INDEX_NAME,
            allow_dangerous_deserialization=True,
        )
        print(" OK: Vector store loaded.")
        return loaded

    print(" No existing vector store found - will create a new one.")
    return None
def add_documents_to_store(
    documents: list[Document],
    embeddings: HuggingFaceEmbeddings,
    directory: str = VECTOR_STORE_DIR,
) -> FAISS:
    """
    Add new documents to the vector store and persist the result.

    If a store already exists on disk, the new documents are merged into it.
    If no store exists, a fresh one is created from the documents.

    Parameters
    ----------
    documents : list[Document]
        Chunked documents to index.
    embeddings : HuggingFaceEmbeddings
        The embedding model.
    directory : str
        Where the index is loaded from and saved to.

    Returns
    -------
    FAISS
        The updated (or newly created) vector store. When ``documents`` is
        empty and a store already exists, that store is returned unchanged.

    Raises
    ------
    ValueError
        If ``documents`` is empty and there is no existing store on disk,
        because there is nothing to build an index from.
    """
    existing_store = load_vector_store(embeddings, directory)

    if not documents:
        # FAISS.from_documents cannot build an index from zero embeddings and
        # fails with an unhelpful low-level error, so the empty upload is
        # handled here instead of being passed through.
        if existing_store is None:
            raise ValueError(
                "Cannot create a vector store from an empty list of documents."
            )
        print(" No new documents to index - existing vector store left unchanged.")
        return existing_store

    if existing_store is None:
        # First time: create a brand new FAISS index from the documents.
        print(" Creating new FAISS index...")
        new_store = FAISS.from_documents(documents, embeddings)
    else:
        # Embed the new documents into a temporary index, then fold that
        # index into the one loaded from disk.
        print(" Merging new documents into existing FAISS index...")
        incoming_store = FAISS.from_documents(documents, embeddings)
        existing_store.merge_from(incoming_store)
        new_store = existing_store

    save_vector_store(new_store, directory)
    print(f" OK: {len(documents)} document chunk(s) indexed.")
    return new_store
def get_retriever(
    vector_store: FAISS,
    k: int = 4,
):
    """
    Build a similarity-search retriever on top of the vector store.

    Parameters
    ----------
    vector_store : FAISS
        The indexed documents.
    k : int
        Number of chunks to retrieve per query.

    Returns
    -------
    The object returned by ``vector_store.as_retriever`` (a LangChain
    retriever).
    """
    search_options = {"k": k}
    retriever = vector_store.as_retriever(
        search_type="similarity",
        search_kwargs=search_options,
    )
    return retriever