beppeinthesky's picture
feat: Add cluster analysis and semantic filtering modules
7e85729
import os
import uuid
import faiss
import shutil
import logging
import pandas as pd
from typing import Any
from langchain_core import documents
from langchain_community import embeddings
from langchain_community import vectorstores
from langchain_community.docstore import in_memory
# Probe text: build_faiss embeds this string once and uses only the length of
# the resulting vector to size the FAISS index; the text itself is not added
# to the index there.
DEFAULT_INDEX_QUERY = "hello world"
def build_faiss(
    data_frame: pd.DataFrame,
    index_path: str,
    embedder: Any
) -> vectorstores.FAISS:
    """Build a FAISS index from a DataFrame and save it to disk.

    Every cell of the DataFrame becomes one document: its text is
    ``str(cell)`` and its metadata records the originating row label
    (``"row"``) and column name (``"column"``).

    The new index is fully built in memory before anything at ``index_path``
    is removed, so a failure while embedding leaves a previously saved index
    untouched.

    Args:
        data_frame: DataFrame containing data to index. An empty DataFrame
            produces an empty (but valid and saved) index.
        index_path: Directory where the FAISS index is saved. Anything
            already at this path is removed (best effort) before saving.
        embedder: Embedder object to generate vectors. Its ``embed_query``
            method is called here to determine the vector size; the
            vectorstore uses the same object to embed the documents.

    Returns:
        vectorstores.FAISS: Built FAISS vectorstore object
    """
    # One document per cell, tagged with where in the frame it came from.
    embedded_documents = []
    for row_idx, row in data_frame.iterrows():
        embedded_documents.extend(
            documents.Document(
                page_content=str(cell_val),
                metadata={"row": row_idx, "column": col_name},
            )
            for col_name, cell_val in row.items()
        )

    # Embed a throwaway query purely to learn the embedder's output dimension.
    embedding_dim = len(embedder.embed_query(DEFAULT_INDEX_QUERY))

    # NOTE(review): IndexFlatIP scores by inner product, which matches cosine
    # similarity only when the embedder returns normalized vectors, and no
    # distance strategy is passed to the vectorstore here - confirm that the
    # scores callers read from searches are interpreted accordingly.
    vectorstore = vectorstores.FAISS(
        embedding_function=embedder,
        index=faiss.IndexFlatIP(embedding_dim),
        docstore=in_memory.InMemoryDocstore(),
        index_to_docstore_id={},
    )

    # Skip the add step for an empty frame: handing the vectorstore an empty
    # batch would pass FAISS an array with no second dimension.
    if embedded_documents:
        uuids = [str(uuid.uuid4()) for _ in embedded_documents]
        vectorstore.add_documents(documents=embedded_documents, ids=uuids)
    logging.debug("Added %d documents to FAISS index", len(embedded_documents))

    # Only now discard the previous on-disk index; removal stays best effort
    # (errors ignored), as before.
    if os.path.exists(index_path):
        shutil.rmtree(index_path, ignore_errors=True)
        logging.debug("Deleted existing FAISS index at %s", index_path)

    os.makedirs(index_path, exist_ok=True)
    vectorstore.save_local(index_path)
    logging.debug("FAISS index saved to ./%s/", index_path)
    return vectorstore
def load_faiss_index(
    index_path: str,
    hf_model_name: str
) -> vectorstores.FAISS:
    """Restore a FAISS vectorstore that was previously saved to disk.

    A HuggingFace embedder is created from ``hf_model_name`` and attached to
    the loaded vectorstore.

    Args:
        index_path: Path of the saved FAISS index
        hf_model_name: Name of the HuggingFace model for embeddings

    Returns:
        vectorstores.FAISS: Loaded FAISS vectorstore object
    """
    hf_embedder = embeddings.HuggingFaceEmbeddings(model_name=hf_model_name)

    # NOTE(review): allow_dangerous_deserialization=True permits pickle-based
    # loading of the saved docstore; unpickling can execute arbitrary code, so
    # index_path must only ever point at index files this application wrote
    # itself - confirm that callers never pass an untrusted location.
    return vectorstores.FAISS.load_local(
        index_path,
        hf_embedder,
        allow_dangerous_deserialization=True,
    )