Alejo760 committed on
Commit
4abc3ae
·
verified ·
1 Parent(s): 453a807

Upload create_vector_db.py

Browse files
Files changed (1) hide show
  1. create_vector_db.py +39 -0
create_vector_db.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ from langchain.document_loaders import PyPDFLoader
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ from langchain_huggingface.embeddings import HuggingFaceEmbeddings
6
+ from langchain.vectorstores import Chroma
7
+
8
def create_vector_db(pdf_path, db_directory):
    """Build a Chroma vector database from a PDF, replacing any existing one.

    The PDF is loaded with ``PyPDFLoader``, split with
    ``RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)``,
    and stored through ``Chroma.from_documents`` using the
    ``sentence-transformers/all-MiniLM-L6-v2`` embeddings model, with
    ``db_directory`` as the persist directory.

    All preparation (loading, splitting, creating the embeddings object) is
    done *before* the old database directory is removed, so a failure in any
    of those steps leaves a previously built database untouched.

    Args:
        pdf_path: Location of the PDF, passed straight to ``PyPDFLoader``.
        db_directory: Directory used as Chroma's ``persist_directory``. If
            it already exists it is deleted and rebuilt from scratch.

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        Any exception raised by the loader, splitter, embeddings model,
        ``shutil.rmtree`` or Chroma propagates unchanged.
    """
    # Load the PDF first: if this fails, the existing database must survive.
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()

    # Split the documents into manageable chunks.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    docs = text_splitter.split_documents(documents)

    # Create the embeddings object (still before anything is deleted).
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    # Only now, with every input ready, delete the existing vector database.
    if os.path.exists(db_directory):
        print(f"The vector database already exists at {db_directory}. Deleting it...")
        shutil.rmtree(db_directory)
        print(f"Deleted the existing vector database at {db_directory}.")

    # Create the vector database without deprecated settings. The returned
    # store object is not needed here, so it is not bound to a name.
    Chroma.from_documents(
        docs,
        embedding=embeddings,
        persist_directory=db_directory,
    )

    print(f"Vector database created at {db_directory}")
34
+
35
# Build one vector database per source PDF, in order: PEI.pdf, then guia.pdf.
for pdf_name, target_dir in (
    ("PEI.pdf", "./vector_db_PEI"),
    ("guia.pdf", "./vector_db_guia"),
):
    create_vector_db(pdf_name, target_dir)