"""Helpers for building a FAISS index from a pandas DataFrame and loading it back."""

import os
import uuid
import faiss
import shutil
import logging
import pandas as pd
from typing import Any
from langchain_core import documents
from langchain_community import embeddings
from langchain_community import vectorstores
from langchain_community.docstore import in_memory

# Probe text that is embedded once solely to discover the embedder's
# output dimension; its content is irrelevant.
DEFAULT_INDEX_QUERY = "hello world"


def build_faiss(
    data_frame: pd.DataFrame, index_path: str, embedder: Any
) -> vectorstores.FAISS:
    """Build a FAISS index from a DataFrame and save it to disk.

    Every cell becomes one document: its text is ``str(cell)`` and its
    metadata records the originating row label (``"row"``) and column name
    (``"column"``).

    The new index is assembled in memory first; anything already stored at
    ``index_path`` is only removed once embedding has succeeded, so a failure
    while embedding does not destroy the previously saved index.

    Args:
        data_frame: DataFrame containing data to index. May be empty, in
            which case an empty index is saved.
        index_path: Path where to save the FAISS index. An existing
            directory at this path is deleted and replaced.
        embedder: Embedder object to generate vectors. It must provide
            ``embed_query``, which is called once to size the index.

    Returns:
        The built FAISS vectorstore object.
    """
    cell_documents = [
        documents.Document(
            page_content=str(cell_val),
            metadata={"row": row_idx, "column": col_name},
        )
        for row_idx, row in data_frame.iterrows()
        for col_name, cell_val in row.items()
    ]

    # The flat index needs its dimensionality up front, so embed a throwaway
    # query and measure the resulting vector.
    embedding_dim = len(embedder.embed_query(DEFAULT_INDEX_QUERY))

    # NOTE(review): IndexFlatIP ranks by inner product, while the vector store
    # is created without an explicit distance strategy - confirm downstream
    # score handling matches.
    vectorstore = vectorstores.FAISS(
        embedding_function=embedder,
        index=faiss.IndexFlatIP(embedding_dim),
        docstore=in_memory.InMemoryDocstore(),
        index_to_docstore_id={},
    )

    # Skip the add call for an empty frame: there is nothing to embed, and an
    # empty batch is presumably rejected by the index (no (n, dim) shape).
    if cell_documents:
        doc_ids = [str(uuid.uuid4()) for _ in cell_documents]
        vectorstore.add_documents(documents=cell_documents, ids=doc_ids)
    logging.debug(f"Added {len(cell_documents)} documents to FAISS index")

    # Only now replace whatever was stored before; removal stays best-effort,
    # but the outcome is checked so the log reflects what actually happened.
    if os.path.exists(index_path):
        shutil.rmtree(index_path, ignore_errors=True)
        if os.path.exists(index_path):
            logging.warning(
                "Could not fully remove existing FAISS index at %s", index_path
            )
        else:
            logging.debug(f"Deleted existing FAISS index at {index_path}")

    os.makedirs(index_path, exist_ok=True)
    vectorstore.save_local(index_path)
    logging.debug(f"FAISS index saved to ./{index_path}/")

    return vectorstore


def load_faiss_index(
    index_path: str, hf_model_name: str
) -> vectorstores.FAISS:
    """Load a previously saved FAISS index.

    A new HuggingFace embedder is created from ``hf_model_name`` and attached
    to the loaded store.

    Args:
        index_path: Path of the saved FAISS index
        hf_model_name: Name of the HuggingFace model for embeddings

    Returns:
        The loaded FAISS vectorstore object.

    Warning:
        Loading is done with ``allow_dangerous_deserialization=True``; only
        call this on index directories that come from a trusted source.
    """
    embedder = embeddings.HuggingFaceEmbeddings(model_name=hf_model_name)
    # SECURITY: the saved store is unpickled on load, which can execute
    # arbitrary code if the files were tampered with. Never point this at an
    # untrusted or user-supplied index directory.
    return vectorstores.FAISS.load_local(
        index_path, embedder, allow_dangerous_deserialization=True
    )