Uzaiir committed on
Commit
7eceee8
·
verified ·
1 Parent(s): 37567cd

Update src/PDFprocess_sample.py

Browse files
Files changed (1) hide show
  1. src/PDFprocess_sample.py +58 -58
src/PDFprocess_sample.py CHANGED
@@ -8,87 +8,87 @@ from langchain_community.vectorstores import FAISS
8
  import faiss
9
 
10
 
11
- # def process_pdf(uploaded_file):
12
 
13
- # all_documents = []
14
- # # st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
15
 
16
- # main_placeholder = st.empty()
17
- # # Creating a temporary file to store the uploaded PDF's
18
- # main_placeholder.text("Data Loading...Started...✅✅✅")
19
- # for uploaded_file in uploaded_file:
20
 
21
- # # with tempfile.NamedTemporaryFile(delete=False , suffix='.pdf') as temp_file:
22
- # # temp_file.write(uploaded_file.read()) ## write file to temporary
23
- # # temp_file_path = temp_file.name # Get the temporary file path
24
 
25
- # temp_file_path = os.path.join("/tmp", uploaded_file.name)
26
- # with open(temp_file_path, "wb") as f:
27
- # f.write(uploaded_file.read())
28
 
29
- # st.write(f"Uploaded files: {[file.name for file in uploaded_file]}")
30
 
31
 
32
 
33
- # # Load the PDF's from the temporary file path
34
 
35
 
36
- # loader = PyPDFLoader(temp_file_path) # Document loader
37
- # doc= loader.load() # load Document
38
- # main_placeholder.text("Text Splitter...Started...✅✅✅")
39
- # text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200) # Recursive Character String
40
- # #final_documents = text_splitter.split_documents(doc)# splitting
41
- # final_documents = text_splitter.split_documents(doc)
42
- # all_documents.extend(final_documents)
43
 
44
 
45
- # if all_documents:
46
- # main_placeholder.text("Embedding Vector Started Building...✅✅✅")
47
- # st.session_state.vectors = FAISS.from_documents(all_documents,st.session_state.embeddings)
48
- # st.session_state.docs = all_documents
49
 
50
- # # Save FAISS vector store to disk
51
- # faiss_index = st.session_state.vectors.index # Extract FAISS index
52
- # faiss.write_index(faiss_index, "faiss_index.bin") # Save index to a binary file
53
- # main_placeholder.text("Vector database created!...✅✅✅")
54
 
55
- # else:
56
- # st.error("No documents found after processing the uploaded files or the pdf is corrupted / unsupported.")
57
 
58
 
59
 
60
- def process_pdf(uploaded_files):
61
- all_documents = []
62
- main_placeholder = st.empty()
63
- main_placeholder.text("Data Loading...Started...✅✅✅")
64
 
65
- for uploaded_file in uploaded_files:
66
- temp_file_path = os.path.join("/tmp", uploaded_file.name)
67
- with open(temp_file_path, "wb") as f:
68
- f.write(uploaded_file.read())
69
 
70
- st.write(f"Uploaded files: {[file.name for file in uploaded_files]}")
71
 
72
- loader = PyPDFLoader(temp_file_path)
73
- doc = loader.load()
74
- main_placeholder.text("Text Splitter...Started...✅✅✅")
75
 
76
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
77
- final_documents = text_splitter.split_documents(doc)
78
- all_documents.extend(final_documents)
79
 
80
- if all_documents:
81
- main_placeholder.text("Embedding Vector Started Building...✅✅✅")
82
 
83
- # ⏬ Move embedding initialization here
84
- st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
85
- st.session_state.vectors = FAISS.from_documents(all_documents, st.session_state.embeddings)
86
- st.session_state.docs = all_documents
87
 
88
- faiss_index = st.session_state.vectors.index
89
- faiss.write_index(faiss_index, "faiss_index.bin")
90
- main_placeholder.text("Vector database created!...✅✅✅")
91
 
92
- else:
93
- st.error("No documents found or the PDF is corrupted.")
94
 
 
8
  import faiss
9
 
10
 
11
+ def process_pdf(uploaded_file):
12
 
13
+ all_documents = []
14
+ # st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
15
 
16
+ main_placeholder = st.empty()
17
+ # Creating a temporary file to store the uploaded PDF's
18
+ main_placeholder.text("Data Loading...Started...✅✅✅")
19
+ for uploaded_file in uploaded_file:
20
 
21
+ # with tempfile.NamedTemporaryFile(delete=False , suffix='.pdf') as temp_file:
22
+ # temp_file.write(uploaded_file.read()) ## write file to temporary
23
+ # temp_file_path = temp_file.name # Get the temporary file path
24
 
25
+ temp_file_path = os.path.join("/tmp", uploaded_file.name)
26
+ with open(temp_file_path, "wb") as f:
27
+ f.write(uploaded_file.read())
28
 
29
+ st.write(f"Uploaded files: {[file.name for file in uploaded_file]}")
30
 
31
 
32
 
33
+ # Load the PDF's from the temporary file path
34
 
35
 
36
+ loader = PyPDFLoader(temp_file_path) # Document loader
37
+ doc= loader.load() # load Document
38
+ main_placeholder.text("Text Splitter...Started...✅✅✅")
39
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200) # Recursive Character String
40
+ #final_documents = text_splitter.split_documents(doc)# splitting
41
+ final_documents = text_splitter.split_documents(doc)
42
+ all_documents.extend(final_documents)
43
 
44
 
45
+ if all_documents:
46
+ main_placeholder.text("Embedding Vector Started Building...✅✅✅")
47
+ st.session_state.vectors = FAISS.from_documents(all_documents,st.session_state.embeddings)
48
+ st.session_state.docs = all_documents
49
 
50
+ # Save FAISS vector store to disk
51
+ faiss_index = st.session_state.vectors.index # Extract FAISS index
52
+ faiss.write_index(faiss_index, "faiss_index.bin") # Save index to a binary file
53
+ main_placeholder.text("Vector database created!...✅✅✅")
54
 
55
+ else:
56
+ st.error("No documents found after processing the uploaded files or the pdf is corrupted / unsupported.")
57
 
58
 
59
 
60
+ # def process_pdf(uploaded_files):
61
+ # all_documents = []
62
+ # main_placeholder = st.empty()
63
+ # main_placeholder.text("Data Loading...Started...✅✅✅")
64
 
65
+ # for uploaded_file in uploaded_files:
66
+ # temp_file_path = os.path.join("/tmp", uploaded_file.name)
67
+ # with open(temp_file_path, "wb") as f:
68
+ # f.write(uploaded_file.read())
69
 
70
+ # st.write(f"Uploaded files: {[file.name for file in uploaded_files]}")
71
 
72
+ # loader = PyPDFLoader(temp_file_path)
73
+ # doc = loader.load()
74
+ # main_placeholder.text("Text Splitter...Started...✅✅✅")
75
 
76
+ # text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
77
+ # final_documents = text_splitter.split_documents(doc)
78
+ # all_documents.extend(final_documents)
79
 
80
+ # if all_documents:
81
+ # main_placeholder.text("Embedding Vector Started Building...✅✅✅")
82
 
83
+ # # ⏬ Move embedding initialization here
84
+ # st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
85
+ # st.session_state.vectors = FAISS.from_documents(all_documents, st.session_state.embeddings)
86
+ # st.session_state.docs = all_documents
87
 
88
+ # faiss_index = st.session_state.vectors.index
89
+ # faiss.write_index(faiss_index, "faiss_index.bin")
90
+ # main_placeholder.text("Vector database created!...✅✅✅")
91
 
92
+ # else:
93
+ # st.error("No documents found or the PDF is corrupted.")
94