Spaces:
Sleeping
Sleeping
| """ | |
| Script used to create the FAISS vector store of the glossary using Mistral embeddings | |
| """ | |
| import os | |
| import tqdm | |
| import time | |
| import pandas as pd | |
| import warnings | |
| from langchain_core.documents import Document | |
| from langchain_community.vectorstores import FAISS | |
| from langchain_mistralai.embeddings import MistralAIEmbeddings | |
| from langchain_community.docstore.in_memory import InMemoryDocstore | |
# The Mistral embeddings client warns when it cannot download its tokenizer
# from Huggingface; silence that specific message so it does not clutter the
# script output.
warnings.filterwarnings(
    action="ignore",
    message="Could not download mistral tokenizer from Huggingface",
)

# API key for the Mistral embeddings service, taken from the environment.
# It is None when the variable is not set.
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")
def load_glossary(path: str = 'glossary-terms.csv') -> pd.DataFrame:
    """Load the glossary CSV and drop the columns that are not indexed.

    Args:
        path: Location of the glossary CSV. Defaults to
            ``glossary-terms.csv`` relative to the current working
            directory, which is the file this script has always read.

    Returns:
        The glossary as a DataFrame with the ``Category`` and ``Notes``
        columns removed.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        KeyError: If the CSV has no ``Category`` or ``Notes`` column.
    """
    # NOTE: the glossary CSV is intentionally not committed to the public
    # repo for security reasons, so it has to be supplied locally.
    glossary = pd.read_csv(path)

    # Return a trimmed copy rather than mutating the frame in place.
    return glossary.drop(columns=["Category", "Notes"])
def create_vector_index() -> None:
    """Embed every glossary entry with Mistral and save a FAISS index locally.

    Loads the glossary through ``load_glossary()``, builds one ``Document``
    per row whose text combines the ``Name`` and ``Description`` columns
    (both values are also stored as metadata), embeds the documents with the
    ``mistral-embed`` model and saves the resulting vector store under
    ``faiss_index``. Progress and the elapsed build time are printed.
    """
    df = load_glossary()

    # Walk the two columns in lockstep instead of calling df.iloc[i] four
    # times per row; `total` lets tqdm keep showing a complete progress bar
    # even though zip() has no length.
    rows = zip(df["Name"], df["Description"])
    documents = [
        Document(
            page_content=f"Name: {name}\nDescription: {description}",
            metadata={"name": name, "description": description},
        )
        for name, description in tqdm.tqdm(
            rows, total=len(df), desc="Creating documents"
        )
    ]

    # perf_counter is monotonic, so the reported duration cannot be skewed
    # by wall-clock adjustments while the embeddings are being fetched.
    start_time = time.perf_counter()
    print("Starting FAISS vector store creation...")
    vector_store = FAISS.from_documents(
        documents=documents,
        embedding=MistralAIEmbeddings(
            model="mistral-embed", mistral_api_key=MISTRAL_API_KEY
        ),
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
    )
    end_time = time.perf_counter()
    print(f"FAISS vector store created successfully in {end_time - start_time:.2f} seconds.")

    # Save the vector store
    vector_store.save_local("faiss_index")
# Script entry point: build and save the glossary index only when this file
# is executed directly, not when it is imported as a module.
if __name__ == "__main__":
    create_vector_index()