Spaces:
Sleeping
Sleeping
File size: 1,754 Bytes
66d6614 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
"""
Script used to create the FAISS vector store of the glossary using Mistral embeddings
"""
import os
import tqdm
import time
import pandas as pd
import warnings
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_mistralai.embeddings import MistralAIEmbeddings
from langchain_community.docstore.in_memory import InMemoryDocstore
# Silence warnings whose message begins with the tokenizer-download notice below.
warnings.filterwarnings(
    action="ignore",
    message="Could not download mistral tokenizer from Huggingface",
)
# Mistral API key read from the environment; None when the variable is unset.
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")
def load_glossary(path: str = 'glossary-terms.csv') -> pd.DataFrame:
    """Read the glossary CSV and return it without the Category and Notes columns.

    Args:
        path: Location of the glossary CSV file. Defaults to
            ``glossary-terms.csv``, resolved against the current working
            directory.

    Returns:
        A DataFrame holding the CSV contents with the "Category" and "Notes"
        columns removed.

    Raises:
        FileNotFoundError: If no file exists at ``path``.
        KeyError: If the CSV has no "Category" or "Notes" column.
    """
    # NOTE: the glossary file is intentionally not added to the public repo
    # for security reasons, so it must be supplied locally.
    df = pd.read_csv(path)
    # Return a new frame rather than mutating in place; the result is the same.
    return df.drop(columns=["Category", "Notes"])
def create_vector_index() -> None:
    """Build a FAISS vector store from the glossary and save it to ``faiss_index``.

    Each glossary row becomes one document whose text combines the "Name" and
    "Description" columns; the same two values are kept as metadata. The
    documents are embedded with the ``mistral-embed`` model and the resulting
    store is written to the local ``faiss_index`` directory.

    Raises:
        RuntimeError: If the ``MISTRAL_API_KEY`` environment variable is unset
            or empty.
    """
    # Fail fast with a clear message instead of building all documents first
    # and only then hitting an error from the embeddings client.
    if not MISTRAL_API_KEY:
        raise RuntimeError("MISTRAL_API_KEY environment variable is not set.")

    df = load_glossary()

    # Walk the two columns in lockstep rather than re-indexing the frame with
    # df.iloc[i] several times per row.
    rows = zip(df["Name"], df["Description"])
    documents = [
        Document(
            page_content=f"Name: {name}\nDescription: {description}",
            metadata={"name": name, "description": description},
        )
        for name, description in tqdm.tqdm(
            rows, total=len(df), desc="Creating documents"
        )
    ]

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments.
    start_time = time.perf_counter()
    print("Starting FAISS vector store creation...")
    vector_store = FAISS.from_documents(
        documents=documents,
        embedding=MistralAIEmbeddings(
            model="mistral-embed", mistral_api_key=MISTRAL_API_KEY
        ),
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
    )
    elapsed = time.perf_counter() - start_time
    print(f"FAISS vector store created successfully in {elapsed:.2f} seconds.")

    # Save the vector store
    vector_store.save_local("faiss_index")
# Build and save the index only when this file is run as a script.
if __name__ == "__main__":
    create_vector_index()