File size: 2,376 Bytes
7e85729
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import os
import uuid
import faiss
import shutil
import logging
import pandas as pd
from typing import Any
from langchain_core import documents
from langchain_community import embeddings
from langchain_community import vectorstores
from langchain_community.docstore import in_memory


# Throwaway text that build_faiss embeds once solely to measure the length of
# the embedder's output vector, which sizes the FAISS index. The text itself is
# never added to the index as a document.
DEFAULT_INDEX_QUERY = "hello world"


def build_faiss(
    data_frame: pd.DataFrame,
    index_path: str,
    embedder: Any
) -> vectorstores.FAISS:
    """Build a FAISS index from a DataFrame and save it to disk.

    Every cell of ``data_frame`` becomes one document: its text is
    ``str(cell)`` and its metadata records the row label (``"row"``) and the
    column name (``"column"``) the cell came from. The new vectorstore is
    fully built in memory before anything at ``index_path`` is touched, so a
    failure while embedding leaves a previously saved index intact.

    Args:
        data_frame: DataFrame containing data to index
        index_path: Directory where the FAISS index is saved; existing
            content at this path is removed (best effort) before saving
        embedder: Embedder object to generate vectors; must provide
            ``embed_query`` and be usable as the vectorstore's
            embedding function

    Returns:
        vectorstores.FAISS: Built FAISS vectorstore object
    """
    cell_documents = [
        documents.Document(
            page_content=str(cell_val),
            metadata={"row": row_idx, "column": col_name},
        )
        for row_idx, row in data_frame.iterrows()
        for col_name, cell_val in row.items()
    ]

    # Embed a fixed probe string once: the length of the resulting vector is
    # the dimensionality the FAISS index must be created with.
    embedding_dim = len(embedder.embed_query(DEFAULT_INDEX_QUERY))

    # NOTE(review): the index is inner-product (IndexFlatIP) but no
    # distance_strategy is passed to the vectorstore - confirm the store's
    # default strategy matches if score thresholds or relevance scores are
    # used downstream.
    vectorstore = vectorstores.FAISS(
        embedding_function=embedder,
        index=faiss.IndexFlatIP(embedding_dim),
        docstore=in_memory.InMemoryDocstore(),
        index_to_docstore_id={},
    )

    # Skip the add for an empty frame so that it yields an empty (but still
    # saveable) index instead of handing the store a zero-length batch.
    if cell_documents:
        uuids = [str(uuid.uuid4()) for _ in cell_documents]
        vectorstore.add_documents(documents=cell_documents, ids=uuids)
    logging.debug("Added %d documents to FAISS index", len(cell_documents))

    # Replace the on-disk index only now that the new one is complete.
    if os.path.exists(index_path):
        # Deletion stays best-effort, but report what actually happened.
        shutil.rmtree(index_path, ignore_errors=True)
        if os.path.exists(index_path):
            logging.warning(
                "Could not fully delete existing FAISS index at %s",
                index_path,
            )
        else:
            logging.debug("Deleted existing FAISS index at %s", index_path)

    os.makedirs(index_path, exist_ok=True)
    vectorstore.save_local(index_path)
    logging.debug("FAISS index saved to %s", index_path)
    return vectorstore


def load_faiss_index(
    index_path: str,
    hf_model_name: str
) -> vectorstores.FAISS:
    """Restore a FAISS vectorstore that was previously saved to disk.

    A HuggingFace embedder is created from ``hf_model_name`` and attached to
    the loaded store.

    Args:
        index_path: Path of the saved FAISS index
        hf_model_name: Name of the HuggingFace model for embeddings
            (presumably the same model the index was built with - verify
            against the caller)

    Returns:
        vectorstores.FAISS: Loaded FAISS vectorstore object
    """
    hf_embedder = embeddings.HuggingFaceEmbeddings(model_name=hf_model_name)

    # SECURITY: allow_dangerous_deserialization=True opts in to loading the
    # pickled part of the saved index. Only point this at index directories
    # that come from a trusted source.
    restored_store = vectorstores.FAISS.load_local(
        index_path,
        hf_embedder,
        allow_dangerous_deserialization=True,
    )
    return restored_store