PDF processing and vector database

#3
by Uzaiir - opened
Files changed (1) hide show
  1. src/PDFprocess_sample.py +49 -0
src/PDFprocess_sample.py CHANGED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tempfile
2
+ import streamlit as st
3
+ import pickle
4
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings
5
+ from langchain_community.document_loaders import PyPDFLoader
6
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
7
+ from langchain_community.vectorstores import FAISS
8
+ import faiss
9
+
10
+
11
def process_pdf(uploaded_file):
    """Chunk the uploaded PDF files and build a FAISS vector store from them.

    Args:
        uploaded_file: iterable of Streamlit ``UploadedFile`` objects
            (assumed to be PDFs — non-PDF content will fail inside
            ``PyPDFLoader``; TODO confirm upstream validation).

    Side effects:
        - stores the embeddings model in ``st.session_state.embeddings``
        - on success, stores the vector store in ``st.session_state.vectors``
          and the chunked documents in ``st.session_state.docs``
        - writes the raw FAISS index to ``faiss_index.bin`` in the CWD
        - reports progress via a Streamlit placeholder, or ``st.error``
          when no documents could be extracted
    """
    import os  # local import: only needed for temp-file cleanup

    all_documents = []
    st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

    main_placeholder = st.empty()
    main_placeholder.text("Data Loading...Started...βœ…βœ…βœ…")

    # Hoisted out of the loop: one splitter serves every file (was rebuilt
    # on each iteration in the original).
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

    # Renamed loop variable: it previously shadowed the parameter itself.
    for pdf_file in uploaded_file:
        # PyPDFLoader needs a real filesystem path, so spill the in-memory
        # upload to a temporary file first.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
            temp_file.write(pdf_file.read())
            temp_file_path = temp_file.name

        try:
            loader = PyPDFLoader(temp_file_path)  # document loader
            doc = loader.load()                   # load pages as Documents
            main_placeholder.text("Text Splitter...Started...βœ…βœ…βœ…")
            all_documents.extend(text_splitter.split_documents(doc))
        finally:
            # Bug fix: delete=False files were never removed before,
            # leaking one temp file per uploaded PDF per call.
            os.unlink(temp_file_path)

    if all_documents:
        main_placeholder.text("Embedding Vector Started Building...βœ…βœ…βœ…")
        st.session_state.vectors = FAISS.from_documents(all_documents, st.session_state.embeddings)
        st.session_state.docs = all_documents

        # Persist the raw FAISS index so it can be reloaded later.
        faiss_index = st.session_state.vectors.index
        faiss.write_index(faiss_index, "faiss_index.bin")
        main_placeholder.text("Vector database created!...βœ…βœ…βœ…")
    else:
        st.error("No documents found after processing the uploaded files or the pdf is corrupted / unsupported.")