Uzaiir committed on
Commit
9acac0b
·
verified ·
1 Parent(s): 0f4b0b6

Update src/PDFprocess_sample.py

Browse files
Files changed (1) hide show
  1. src/PDFprocess_sample.py +99 -46
src/PDFprocess_sample.py CHANGED
@@ -1,3 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import tempfile
2
  import streamlit as st
3
  import pickle
@@ -12,23 +109,15 @@ import os
12
  def process_pdf(uploaded_file):
13
 
14
  all_documents = []
15
- # st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
16
 
17
  main_placeholder = st.empty()
18
  # Creating a temporary file to store the uploaded PDF's
19
  main_placeholder.text("Data Loading...Started...βœ…βœ…βœ…")
20
  for uploaded_file in uploaded_file:
21
-
22
  with tempfile.NamedTemporaryFile(delete=False , suffix='.pdf') as temp_file:
23
- temp_file.write(uploaded_file) ## write file to temporary
24
  temp_file_path = temp_file.name # Get the temporary file path
25
-
26
- # temp_file_path = os.path.join("/tmp", uploaded_file.name)
27
- # with open(temp_file_path, "wb") as f:
28
- # f.write(uploaded_file.read())
29
-
30
- # st.write(f"Uploaded files: {[file.name for file in uploaded_file]}")
31
-
32
 
33
 
34
  # Load the PDF's from the temporary file path
@@ -55,41 +144,5 @@ def process_pdf(uploaded_file):
55
 
56
  else:
57
  st.error("No documents found after processing the uploaded files or the pdf is corrupted / unsupported.")
58
-
59
-
60
-
61
- # def process_pdf(uploaded_files):
62
- # all_documents = []
63
- # main_placeholder = st.empty()
64
- # main_placeholder.text("Data Loading...Started...βœ…βœ…βœ…")
65
-
66
- # for uploaded_file in uploaded_files:
67
- # temp_file_path = os.path.join("/tmp", uploaded_file.name)
68
- # with open(temp_file_path, "wb") as f:
69
- # f.write(uploaded_file.read())
70
-
71
- # st.write(f"Uploaded files: {[file.name for file in uploaded_files]}")
72
-
73
- # loader = PyPDFLoader(temp_file_path)
74
- # doc = loader.load()
75
- # main_placeholder.text("Text Splitter...Started...βœ…βœ…βœ…")
76
-
77
- # text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
78
- # final_documents = text_splitter.split_documents(doc)
79
- # all_documents.extend(final_documents)
80
-
81
- # if all_documents:
82
- # main_placeholder.text("Embedding Vector Started Building...βœ…βœ…βœ…")
83
 
84
- # # ⏬ Move embedding initialization here
85
- # st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
86
- # st.session_state.vectors = FAISS.from_documents(all_documents, st.session_state.embeddings)
87
- # st.session_state.docs = all_documents
88
-
89
- # faiss_index = st.session_state.vectors.index
90
- # faiss.write_index(faiss_index, "faiss_index.bin")
91
- # main_placeholder.text("Vector database created!...βœ…βœ…βœ…")
92
-
93
- # else:
94
- # st.error("No documents found or the PDF is corrupted.")
95
 
 
1
+ # import tempfile
2
+ # import streamlit as st
3
+ # import pickle
4
+ # from langchain_google_genai import GoogleGenerativeAIEmbeddings
5
+ # from langchain_community.document_loaders import PyPDFLoader
6
+ # from langchain.text_splitter import RecursiveCharacterTextSplitter
7
+ # from langchain_community.vectorstores import FAISS
8
+ # import faiss
9
+ # import os
10
+
11
+
12
+ # def process_pdf(uploaded_file):
13
+
14
+ # all_documents = []
15
+ # # st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
16
+
17
+ # main_placeholder = st.empty()
18
+ # # Creating a temporary file to store the uploaded PDF's
19
+ # main_placeholder.text("Data Loading...Started...βœ…βœ…βœ…")
20
+ # for uploaded_file in uploaded_file:
21
+
22
+ # with tempfile.NamedTemporaryFile(delete=False , suffix='.pdf') as temp_file:
23
+ # temp_file.write(uploaded_file) ## write file to temporary
24
+ # temp_file_path = temp_file.name # Get the temporary file path
25
+
26
+ # # temp_file_path = os.path.join("/tmp", uploaded_file.name)
27
+ # # with open(temp_file_path, "wb") as f:
28
+ # # f.write(uploaded_file.read())
29
+
30
+ # # st.write(f"Uploaded files: {[file.name for file in uploaded_file]}")
31
+
32
+
33
+
34
+ # # Load the PDF's from the temporary file path
35
+
36
+
37
+ # loader = PyPDFLoader(temp_file_path) # Document loader
38
+ # doc= loader.load() # load Document
39
+ # main_placeholder.text("Text Splitter...Started...βœ…βœ…βœ…")
40
+ # text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200) # Recursive Character String
41
+ # #final_documents = text_splitter.split_documents(doc)# splitting
42
+ # final_documents = text_splitter.split_documents(doc)
43
+ # all_documents.extend(final_documents)
44
+
45
+
46
+ # if all_documents:
47
+ # main_placeholder.text("Embedding Vector Started Building...βœ…βœ…βœ…")
48
+ # st.session_state.vectors = FAISS.from_documents(all_documents,st.session_state.embeddings)
49
+ # st.session_state.docs = all_documents
50
+
51
+ # # Save FAISS vector store to disk
52
+ # faiss_index = st.session_state.vectors.index # Extract FAISS index
53
+ # faiss.write_index(faiss_index, "faiss_index.bin") # Save index to a binary file
54
+ # main_placeholder.text("Vector database created!...βœ…βœ…βœ…")
55
+
56
+ # else:
57
+ # st.error("No documents found after processing the uploaded files or the pdf is corrupted / unsupported.")
58
+
59
+
60
+
61
+ # # def process_pdf(uploaded_files):
62
+ # # all_documents = []
63
+ # # main_placeholder = st.empty()
64
+ # # main_placeholder.text("Data Loading...Started...βœ…βœ…βœ…")
65
+
66
+ # # for uploaded_file in uploaded_files:
67
+ # # temp_file_path = os.path.join("/tmp", uploaded_file.name)
68
+ # # with open(temp_file_path, "wb") as f:
69
+ # # f.write(uploaded_file.read())
70
+
71
+ # # st.write(f"Uploaded files: {[file.name for file in uploaded_files]}")
72
+
73
+ # # loader = PyPDFLoader(temp_file_path)
74
+ # # doc = loader.load()
75
+ # # main_placeholder.text("Text Splitter...Started...βœ…βœ…βœ…")
76
+
77
+ # # text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
78
+ # # final_documents = text_splitter.split_documents(doc)
79
+ # # all_documents.extend(final_documents)
80
+
81
+ # # if all_documents:
82
+ # # main_placeholder.text("Embedding Vector Started Building...βœ…βœ…βœ…")
83
+
84
+ # # # ⏬ Move embedding initialization here
85
+ # # st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
86
+ # # st.session_state.vectors = FAISS.from_documents(all_documents, st.session_state.embeddings)
87
+ # # st.session_state.docs = all_documents
88
+
89
+ # # faiss_index = st.session_state.vectors.index
90
+ # # faiss.write_index(faiss_index, "faiss_index.bin")
91
+ # # main_placeholder.text("Vector database created!...βœ…βœ…βœ…")
92
+
93
+ # # else:
94
+ # # st.error("No documents found or the PDF is corrupted.")
95
+
96
+
97
+
98
  import tempfile
99
  import streamlit as st
100
  import pickle
 
109
  def process_pdf(uploaded_file):
110
 
111
  all_documents = []
112
+ st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
113
 
114
  main_placeholder = st.empty()
115
  # Creating a temporary file to store the uploaded PDF's
116
  main_placeholder.text("Data Loading...Started...βœ…βœ…βœ…")
117
  for uploaded_file in uploaded_file:
 
118
  with tempfile.NamedTemporaryFile(delete=False , suffix='.pdf') as temp_file:
119
+ temp_file.write(uploaded_file.read()) ## write file to temporary
120
  temp_file_path = temp_file.name # Get the temporary file path
 
 
 
 
 
 
 
121
 
122
 
123
  # Load the PDF's from the temporary file path
 
144
 
145
  else:
146
  st.error("No documents found after processing the uploaded files or the pdf is corrupted / unsupported.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
 
 
 
 
 
 
 
 
 
 
 
 
148