Uzaiir committed on
Commit cb666fa · verified · 1 Parent(s): 21a73c0

Update src/PDFprocess_sample.py

Files changed (1)
  1. src/PDFprocess_sample.py +70 -33
src/PDFprocess_sample.py CHANGED
@@ -1,57 +1,94 @@
 import tempfile
 import streamlit as st
 import pickle
-# from langchain_google_genai import GoogleGenerativeAIEmbeddings
+from langchain_google_genai import GoogleGenerativeAIEmbeddings
 from langchain_community.document_loaders import PyPDFLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
 import faiss
 
 
-def process_pdf(uploaded_file):
-
-    all_documents = []
-    # st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
-
-    main_placeholder = st.empty()
-    # Creating a temporary file to store the uploaded PDF's
-    main_placeholder.text("Data Loading...Started...✅✅✅")
-    for uploaded_file in uploaded_file:
-
-        # with tempfile.NamedTemporaryFile(delete=False , suffix='.pdf') as temp_file:
-        #     temp_file.write(uploaded_file.read())  ## write file to temporary
-        #     temp_file_path = temp_file.name  # Get the temporary file path
-
-        temp_file_path = os.path.join("/tmp", uploaded_file.name)
-        with open(temp_file_path, "wb") as f:
-            f.write(uploaded_file.read())
-
-        st.write(f"Uploaded files: {[file.name for file in uploaded_file]}")
-
-        # Load the PDF's from the temporary file path
-
-        loader = PyPDFLoader(temp_file_path)  # Document loader
-        doc= loader.load()  # load Document
-        main_placeholder.text("Text Splitter...Started...✅✅✅")
-        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)  # Recursive Character String
-        # final_documents = text_splitter.split_documents(doc)  # splitting
-        final_documents = text_splitter.split_documents(doc)
-        all_documents.extend(final_documents)
-
-    if all_documents:
-        main_placeholder.text("Embedding Vector Started Building...✅✅✅")
-        st.session_state.vectors = FAISS.from_documents(all_documents, st.session_state.embeddings)
-        st.session_state.docs = all_documents
-
-        # Save FAISS vector store to disk
-        faiss_index = st.session_state.vectors.index  # Extract FAISS index
-        faiss.write_index(faiss_index, "faiss_index.bin")  # Save index to a binary file
-        main_placeholder.text("Vector database created!...✅✅✅")
-
-    else:
-        st.error("No documents found after processing the uploaded files or the pdf is corrupted / unsupported.")
+# def process_pdf(uploaded_file):
+#     all_documents = []
+#     # st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
+#
+#     main_placeholder = st.empty()
+#     # Creating a temporary file to store the uploaded PDF's
+#     main_placeholder.text("Data Loading...Started...✅✅✅")
+#     for uploaded_file in uploaded_file:
+#
+#         # with tempfile.NamedTemporaryFile(delete=False , suffix='.pdf') as temp_file:
+#         #     temp_file.write(uploaded_file.read())  ## write file to temporary
+#         #     temp_file_path = temp_file.name  # Get the temporary file path
+#
+#         temp_file_path = os.path.join("/tmp", uploaded_file.name)
+#         with open(temp_file_path, "wb") as f:
+#             f.write(uploaded_file.read())
+#
+#         st.write(f"Uploaded files: {[file.name for file in uploaded_file]}")
+#
+#         # Load the PDF's from the temporary file path
+#
+#         loader = PyPDFLoader(temp_file_path)  # Document loader
+#         doc= loader.load()  # load Document
+#         main_placeholder.text("Text Splitter...Started...✅✅✅")
+#         text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)  # Recursive Character String
+#         # final_documents = text_splitter.split_documents(doc)  # splitting
+#         final_documents = text_splitter.split_documents(doc)
+#         all_documents.extend(final_documents)
+#
+#     if all_documents:
+#         main_placeholder.text("Embedding Vector Started Building...✅✅✅")
+#         st.session_state.vectors = FAISS.from_documents(all_documents, st.session_state.embeddings)
+#         st.session_state.docs = all_documents
+#
+#         # Save FAISS vector store to disk
+#         faiss_index = st.session_state.vectors.index  # Extract FAISS index
+#         faiss.write_index(faiss_index, "faiss_index.bin")  # Save index to a binary file
+#         main_placeholder.text("Vector database created!...✅✅✅")
+#
+#     else:
+#         st.error("No documents found after processing the uploaded files or the pdf is corrupted / unsupported.")
+
+
+def process_pdf(uploaded_files):
+    all_documents = []
+    main_placeholder = st.empty()
+    main_placeholder.text("Data Loading...Started...✅✅✅")
+
+    for uploaded_file in uploaded_files:
+        temp_file_path = os.path.join("/tmp", uploaded_file.name)
+        with open(temp_file_path, "wb") as f:
+            f.write(uploaded_file.read())
+
+        st.write(f"Uploaded files: {[file.name for file in uploaded_files]}")
+
+        loader = PyPDFLoader(temp_file_path)
+        doc = loader.load()
+        main_placeholder.text("Text Splitter...Started...✅✅✅")
+
+        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+        final_documents = text_splitter.split_documents(doc)
+        all_documents.extend(final_documents)
+
+    if all_documents:
+        main_placeholder.text("Embedding Vector Started Building...✅✅✅")
+
+        # ⏬ Move embedding initialization here
+        st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
+        st.session_state.vectors = FAISS.from_documents(all_documents, st.session_state.embeddings)
+        st.session_state.docs = all_documents
+
+        faiss_index = st.session_state.vectors.index
+        faiss.write_index(faiss_index, "faiss_index.bin")
+        main_placeholder.text("Vector database created!...✅✅✅")
+
+    else:
+        st.error("No documents found or the PDF is corrupted.")
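
Review note: as committed, the new process_pdf still calls os.path.join but the module never imports os, so the function raises NameError at runtime; tempfile and pickle are imported but no longer used. A minimal sketch of a corrected import block, keeping only what the new code actually touches (illustrative, not part of this commit):

    import os
    import faiss
    import streamlit as st
    from langchain_google_genai import GoogleGenerativeAIEmbeddings
    from langchain_community.document_loaders import PyPDFLoader
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain_community.vectorstores import FAISS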
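A second caveat: faiss.write_index persists only the raw vector index, while the docstore and index-to-docstore-id mapping that FAISS.from_documents builds are lost, so faiss.read_index("faiss_index.bin") alone cannot reconstruct st.session_state.vectors in a later session. A hedged sketch using the save_local/load_local helpers on langchain_community's FAISS wrapper instead (the "faiss_store" directory name is an assumption):

    # Persist the whole store: index, docstore, and id mapping
    st.session_state.vectors.save_local("faiss_store")  # hypothetical path

    # Rebuild it later; the flag acknowledges the pickle deserialization involved
    vectors = FAISS.load_local(
        "faiss_store",
        GoogleGenerativeAIEmbeddings(model="models/embedding-001"),
        allow_dangerous_deserialization=True,
    )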
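For context, a minimal caller sketch, assuming the Streamlit page wires process_pdf to a multi-file uploader (the widget label and guard are illustrative; note also that the st.write inside the loop will print the full file list once per uploaded file):

    # Hypothetical page wiring for the function in this commit
    uploaded_files = st.file_uploader("Upload PDF files", type="pdf", accept_multiple_files=True)
    if uploaded_files:
        process_pdf(uploaded_files)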