John Graham Reynolds committed on
Commit
66d6614
·
1 Parent(s): 89ac7a4

add script to create FAISS vector index

Browse files
Files changed (1) hide show
  1. src/create_vector_index.py +53 -0
src/create_vector_index.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Script used to create the FAISS vector store of the glossary using Mistral embeddings
3
+ """
4
+
5
+ import os
6
+ import tqdm
7
+ import time
8
+ import pandas as pd
9
+ import warnings
10
+ from langchain_core.documents import Document
11
+ from langchain_community.vectorstores import FAISS
12
+ from langchain_mistralai.embeddings import MistralAIEmbeddings
13
+ from langchain_community.docstore.in_memory import InMemoryDocstore
14
+
15
# The Mistral embeddings client warns when it cannot fetch its tokenizer from
# Huggingface; silence that specific message so it does not clutter the output.
warnings.filterwarnings(
    action="ignore",
    message="Could not download mistral tokenizer from Huggingface",
)

# API key for the Mistral embeddings endpoint; None when the variable is unset.
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")
19
+
20
def load_glossary(path: str = 'glossary-terms.csv') -> pd.DataFrame:
    """Load the glossary CSV and drop the columns that are not indexed.

    Args:
        path: Location of the glossary CSV. Defaults to ``glossary-terms.csv``
            relative to the current working directory, which is the path the
            script previously hard-coded.

    Returns:
        The glossary with the ``Category`` and ``Notes`` columns removed.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        KeyError: If the CSV has no ``Category`` or ``Notes`` column.
    """
    # NOTE not adding this explicitly to public repo for security
    df = pd.read_csv(path)
    # Return the trimmed frame rather than mutating in place.
    return df.drop(columns=["Category", "Notes"])
24
+
25
def create_vector_index() -> None:
    """Embed every glossary entry with Mistral and save a FAISS index to disk.

    Loads the glossary via ``load_glossary``, turns each row into a
    ``Document`` whose text combines the ``Name`` and ``Description`` columns,
    embeds the documents with the ``mistral-embed`` model, and writes the
    resulting vector store to the local ``faiss_index`` directory.

    Raises:
        RuntimeError: If the ``MISTRAL_API_KEY`` environment variable is unset
            or empty.
    """
    if not MISTRAL_API_KEY:
        # Fail up front with an actionable message instead of letting the
        # embedding request fail later with an opaque authentication error.
        raise RuntimeError(
            "MISTRAL_API_KEY environment variable is not set; cannot create embeddings."
        )

    df = load_glossary()

    # Walk the two columns in lockstep rather than doing several positional
    # row lookups (each of which materialises a full row) per entry.
    rows = zip(df["Name"], df["Description"])
    documents = [
        Document(
            page_content=f"Name: {name}\nDescription: {description}",
            metadata={"name": name, "description": description},
        )
        # zip() has no length, so pass the total so the progress bar can show it.
        for name, description in tqdm.tqdm(rows, total=len(df), desc="Creating documents")
    ]

    # perf_counter is monotonic, so the elapsed time is immune to clock changes.
    start_time = time.perf_counter()
    print("Starting FAISS vector store creation...")

    vector_store = FAISS.from_documents(
        documents=documents,
        embedding=MistralAIEmbeddings(model="mistral-embed", mistral_api_key=MISTRAL_API_KEY),
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
    )

    elapsed = time.perf_counter() - start_time
    print(f"FAISS vector store created successfully in {elapsed:.2f} seconds.")

    # Save the vector store
    vector_store.save_local("faiss_index")
51
+
52
# Build the index only when this file is executed as a script; importing the
# module does not trigger index creation.
if __name__ == "__main__":
    create_vector_index()