Spaces:

MarioBarbeque
/

MistralAI

Sleeping

File size: 1,754 Bytes

66d6614

"""
Script used to create the FAISS vector store of the glossary using Mistral embeddings
"""

import os
import tqdm
import time
import pandas as pd
import warnings
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_mistralai.embeddings import MistralAIEmbeddings
from langchain_community.docstore.in_memory import InMemoryDocstore

# Ignore any warning whose message starts with
# "Could not download mistral tokenizer from Huggingface"
# (presumably raised by langchain_mistralai when the tokenizer cannot be
# fetched — TODO confirm the emitting package).
warnings.filterwarnings("ignore", message="Could not download mistral tokenizer from Huggingface")

# API key handed to the Mistral embeddings client below; this is None when
# the MISTRAL_API_KEY environment variable is not set.
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")

def load_glossary(path: str = 'glossary-terms.csv') -> pd.DataFrame:
    """Load the glossary CSV and strip the columns that are not indexed.

    Args:
        path: Location of the glossary CSV. Defaults to ``glossary-terms.csv``
            resolved against the current working directory, which is the
            file this script has always read.

    Returns:
        The glossary as a DataFrame with the ``Category`` and ``Notes``
        columns removed.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        KeyError: If the CSV has no ``Category`` or ``Notes`` column.
    """
    # NOTE: the glossary CSV is deliberately not committed to the public repo
    # for security, so it must be supplied locally before running the script.
    df = pd.read_csv(path)
    return df.drop(columns=["Category", "Notes"])

def create_vector_index() -> None:
    """Embed every glossary entry with Mistral and save a FAISS index locally.

    Each glossary row becomes one ``Document`` whose text combines the term's
    name and description; the same two values are also stored as metadata.
    The documents are embedded with the ``mistral-embed`` model and the
    resulting vector store is saved under ``faiss_index``.

    Raises:
        RuntimeError: If the ``MISTRAL_API_KEY`` environment variable is
            unset or empty.
        ValueError: If the glossary contains no rows.
    """
    # Fail fast with a clear message instead of erroring later inside the
    # embeddings client after the documents have already been built.
    if not MISTRAL_API_KEY:
        raise RuntimeError(
            "MISTRAL_API_KEY environment variable is not set; "
            "it is required to create the Mistral embeddings."
        )

    df = load_glossary()

    # Walk the two columns in lockstep rather than materialising a row Series
    # with df.iloc[i] several times per row.
    # NOTE(review): cells that pandas read as missing will be rendered as the
    # text "nan" in page_content — confirm the glossary has no blank cells.
    rows = zip(df["Name"], df["Description"])
    documents = [
        Document(
            page_content=f"Name: {name}\nDescription: {description}",
            metadata={"name": name, "description": description},
        )
        for name, description in tqdm.tqdm(
            rows, total=len(df), desc="Creating documents"
        )
    ]

    # An empty document list gives FAISS no embedding to size the index from.
    if not documents:
        raise ValueError("The glossary contains no rows; nothing to index.")

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments during the (network-bound) embedding step.
    start_time = time.perf_counter()
    print("Starting FAISS vector store creation...")

    vector_store = FAISS.from_documents(
        documents=documents,
        embedding=MistralAIEmbeddings(model="mistral-embed", mistral_api_key=MISTRAL_API_KEY),
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
    )

    elapsed = time.perf_counter() - start_time
    print(f"FAISS vector store created successfully in {elapsed:.2f} seconds.")

    # Save the vector store
    vector_store.save_local("faiss_index")

# Build and save the index only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    create_vector_index()