ChinarQ-AI Uzaiir committed on
Commit
daa4678
·
verified ·
1 Parent(s): 0c040d4

Update src/PDFprocess_sample.py (#14)

Browse files

- Update src/PDFprocess_sample.py (2ee3a93666795a50884871e764d7e116c5b65316)


Co-authored-by: Khan <Uzaiir@users.noreply.huggingface.co>

Files changed (1) hide show
  1. src/PDFprocess_sample.py +65 -29
src/PDFprocess_sample.py CHANGED
@@ -8,42 +8,78 @@ from langchain_community.vectorstores import FAISS
8
  import faiss
9
 
10
 
11
- def process_pdf(uploaded_file):
12
 
13
- all_documents = []
14
- st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
15
 
16
- main_placeholder = st.empty()
17
- # Creating a temporary file to store the uploaded PDF's
18
- main_placeholder.text("Data Loading...Started...βœ…βœ…βœ…")
19
- for uploaded_file in uploaded_file:
20
- with tempfile.NamedTemporaryFile(delete=False , suffix='.pdf') as temp_file:
21
- temp_file.write(uploaded_file.read()) ## write file to temporary
22
- temp_file_path = temp_file.name # Get the temporary file path
23
 
24
 
25
- # Load the PDF's from the temporary file path
26
 
27
 
28
- loader = PyPDFLoader(temp_file_path) # Document loader
29
- doc= loader.load() # load Document
30
- main_placeholder.text("Text Splitter...Started...βœ…βœ…βœ…")
31
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200) # Recursive Character String
32
- #final_documents = text_splitter.split_documents(doc)# splitting
33
- final_documents = text_splitter.split_documents(doc)
34
- all_documents.extend(final_documents)
35
 
36
 
37
- if all_documents:
38
- main_placeholder.text("Embedding Vector Started Building...βœ…βœ…βœ…")
39
- st.session_state.vectors = FAISS.from_documents(all_documents,st.session_state.embeddings)
40
- st.session_state.docs = all_documents
41
 
42
- # Save FAISS vector store to disk
43
- faiss_index = st.session_state.vectors.index # Extract FAISS index
44
- faiss.write_index(faiss_index, "faiss_index.bin") # Save index to a binary file
45
- main_placeholder.text("Vector database created!...βœ…βœ…βœ…")
46
 
47
- else:
48
- st.error("No documents found after processing the uploaded files or the pdf is corrupted / unsupported.")
49
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  import faiss
9
 
10
 
11
+ # def process_pdf(uploaded_file):
12
 
13
+ # all_documents = []
14
+ # st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
15
 
16
+ # main_placeholder = st.empty()
17
+ # # Creating a temporary file to store the uploaded PDF's
18
+ # main_placeholder.text("Data Loading...Started...βœ…βœ…βœ…")
19
+ # for uploaded_file in uploaded_file:
20
+ # with tempfile.NamedTemporaryFile(delete=False , suffix='.pdf') as temp_file:
21
+ # temp_file.write(uploaded_file.read()) ## write file to temporary
22
+ # temp_file_path = temp_file.name # Get the temporary file path
23
 
24
 
25
+ # # Load the PDF's from the temporary file path
26
 
27
 
28
+ # loader = PyPDFLoader(temp_file_path) # Document loader
29
+ # doc= loader.load() # load Document
30
+ # main_placeholder.text("Text Splitter...Started...βœ…βœ…βœ…")
31
+ # text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200) # Recursive Character String
32
+ # #final_documents = text_splitter.split_documents(doc)# splitting
33
+ # final_documents = text_splitter.split_documents(doc)
34
+ # all_documents.extend(final_documents)
35
 
36
 
37
+ # if all_documents:
38
+ # main_placeholder.text("Embedding Vector Started Building...βœ…βœ…βœ…")
39
+ # st.session_state.vectors = FAISS.from_documents(all_documents,st.session_state.embeddings)
40
+ # st.session_state.docs = all_documents
41
 
42
+ # # Save FAISS vector store to disk
43
+ # faiss_index = st.session_state.vectors.index # Extract FAISS index
44
+ # faiss.write_index(faiss_index, "faiss_index.bin") # Save index to a binary file
45
+ # main_placeholder.text("Vector database created!...βœ…βœ…βœ…")
46
 
47
+ # else:
48
+ # st.error("No documents found after processing the uploaded files or the pdf is corrupted / unsupported.")
49
+
50
+
51
+
52
+ import streamlit as st
53
+ import pickle
54
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings
55
+ from langchain_community.document_loaders import PyPDFLoader
56
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
57
+ from langchain_community.vectorstores import FAISS
58
+ import faiss
59
+
60
+
61
def process_pdf(file_path):  # Expecting file path string
    """Index one PDF into a FAISS vector store held in the Streamlit session.

    The PDF at ``file_path`` is loaded, split into overlapping chunks
    (1000 characters, 200 overlap), embedded with the Google
    ``models/embedding-001`` model and stored in a FAISS vector store.
    Progress is reported through a single Streamlit placeholder.

    Side effects on success:
        * ``st.session_state.embeddings`` - the embedding client.
        * ``st.session_state.vectors``    - the FAISS vector store.
        * ``st.session_state.docs``       - the list of split chunks.
        * ``faiss_index.bin`` is written to the current working directory.

    On failure (the PDF cannot be loaded, or it yields no chunks) an error
    is shown with ``st.error`` and the function returns without touching
    ``vectors``, ``docs`` or the index file.

    Args:
        file_path: Path to the PDF on disk, as a string.

    Returns:
        None.
    """
    st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

    main_placeholder = st.empty()
    main_placeholder.text("Data Loading...Started...✅✅✅")

    # Load the PDF from the given file path.
    # The loader raises for a bad path or an unreadable/malformed PDF, so
    # without this guard the "corrupted" message below was never reached and
    # the user got a raw traceback instead. This is the UI boundary, hence the
    # broad catch; the original exception text is surfaced to the user.
    try:
        pages = PyPDFLoader(file_path).load()
    except Exception as exc:
        st.error(f"No documents found or the PDF is corrupted. Details: {exc}")
        return

    main_placeholder.text("Text Splitter...Started...✅✅✅")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    final_documents = text_splitter.split_documents(pages)

    # Nothing to embed (e.g. a PDF with no extractable text): stop early.
    if not final_documents:
        st.error("No documents found or the PDF is corrupted.")
        return

    main_placeholder.text("Embedding Vector Started Building...✅✅✅")
    st.session_state.vectors = FAISS.from_documents(final_documents, st.session_state.embeddings)
    st.session_state.docs = final_documents

    # Save FAISS vector store to disk.
    # NOTE(review): this writes only the raw index object taken from the
    # vector store; presumably the chunk texts/metadata are not part of that
    # file - confirm whether `FAISS.save_local` is what is wanted here.
    faiss.write_index(st.session_state.vectors.index, "faiss_index.bin")
    main_placeholder.text("Vector database created!...✅✅✅")