"""Script used to create the FAISS vector store of the glossary using Mistral embeddings."""
import os
import time
import warnings

import pandas as pd
import tqdm
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_mistralai.embeddings import MistralAIEmbeddings

# Suppress the tokenizer warning
warnings.filterwarnings("ignore", message="Could not download mistral tokenizer from Huggingface")

MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")


def load_glossary() -> pd.DataFrame:
    """Load the glossary CSV and drop the columns not needed for embedding.

    Returns:
        DataFrame with (at least) 'Name' and 'Description' columns.
    """
    df = pd.read_csv('glossary-terms.csv')  # NOTE not adding this explicitly to public repo for security
    df.drop(columns=["Category", "Notes"], inplace=True)
    return df


def create_vector_index() -> None:
    """Embed every glossary row with Mistral and persist a FAISS index to ./faiss_index.

    Raises:
        RuntimeError: if the MISTRAL_API_KEY environment variable is unset —
            fail fast here rather than with an opaque auth error mid-embedding.
    """
    if not MISTRAL_API_KEY:
        raise RuntimeError("MISTRAL_API_KEY environment variable is not set")

    df = load_glossary()

    # itertuples gives one namedtuple per row — avoids four df.iloc[i][...]
    # lookups per iteration that the index-based loop performed.
    documents = [
        Document(
            page_content=f"Name: {row.Name}\nDescription: {row.Description}",
            metadata={"name": row.Name, "description": row.Description},
        )
        for row in tqdm.tqdm(df.itertuples(index=False), total=len(df), desc="Creating documents")
    ]

    start_time = time.time()
    print("Starting FAISS vector store creation...")
    # NOTE(review): docstore/index_to_docstore_id travel via **kwargs here;
    # from_documents normally builds its own docstore, so these may be ignored
    # by the installed langchain_community version — confirm.
    vector_store = FAISS.from_documents(
        documents=documents,
        embedding=MistralAIEmbeddings(model="mistral-embed", mistral_api_key=MISTRAL_API_KEY),
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
    )
    end_time = time.time()
    print(f"FAISS vector store created successfully in {end_time - start_time:.2f} seconds.")

    # Save the vector store
    vector_store.save_local("faiss_index")


if __name__ == "__main__":
    create_vector_index()