ChinarQ-AI Uzaiir committed on
Commit
aaa5d7e
·
verified ·
1 Parent(s): 8fb1cbc

Update src/PDFprocess_sample.py (#16)

Browse files

- Update src/PDFprocess_sample.py (fbdd134c6e6630672b508e4e3101c94fbc0ac02c)


Co-authored-by: Khan <Uzaiir@users.noreply.huggingface.co>

Files changed (1) hide show
  1. src/PDFprocess_sample.py +81 -62
src/PDFprocess_sample.py CHANGED
@@ -1,85 +1,104 @@
1
- import tempfile
2
- import streamlit as st
3
- import pickle
4
- from langchain_google_genai import GoogleGenerativeAIEmbeddings
5
- from langchain_community.document_loaders import PyPDFLoader
6
- from langchain.text_splitter import RecursiveCharacterTextSplitter
7
- from langchain_community.vectorstores import FAISS
8
- import faiss
9
 
10
 
11
- # def process_pdf(uploaded_file):
12
 
13
- # all_documents = []
14
- # st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
15
 
16
- # main_placeholder = st.empty()
17
- # # Creating a temporary file to store the uploaded PDF's
18
- # main_placeholder.text("Data Loading...Started...βœ…βœ…βœ…")
19
- # for uploaded_file in uploaded_file:
20
- # with tempfile.NamedTemporaryFile(delete=False , suffix='.pdf') as temp_file:
21
- # temp_file.write(uploaded_file.read()) ## write file to temporary
22
- # temp_file_path = temp_file.name # Get the temporary file path
23
 
24
 
25
- # # Load the PDF's from the temporary file path
26
 
27
 
28
- # loader = PyPDFLoader(temp_file_path) # Document loader
29
- # doc= loader.load() # load Document
30
- # main_placeholder.text("Text Splitter...Started...βœ…βœ…βœ…")
31
- # text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200) # Recursive Character String
32
- # #final_documents = text_splitter.split_documents(doc)# splitting
33
- # final_documents = text_splitter.split_documents(doc)
34
- # all_documents.extend(final_documents)
35
 
36
 
37
- # if all_documents:
38
- # main_placeholder.text("Embedding Vector Started Building...βœ…βœ…βœ…")
39
- # st.session_state.vectors = FAISS.from_documents(all_documents,st.session_state.embeddings)
40
- # st.session_state.docs = all_documents
41
 
42
- # # Save FAISS vector store to disk
43
- # faiss_index = st.session_state.vectors.index # Extract FAISS index
44
- # faiss.write_index(faiss_index, "faiss_index.bin") # Save index to a binary file
45
- # main_placeholder.text("Vector database created!...βœ…βœ…βœ…")
46
 
47
- # else:
48
- # st.error("No documents found after processing the uploaded files or the pdf is corrupted / unsupported.")
49
-
50
 
51
 
52
  import streamlit as st
53
- import pickle
54
- from langchain_google_genai import GoogleGenerativeAIEmbeddings
55
  from langchain_community.document_loaders import PyPDFLoader
56
  from langchain.text_splitter import RecursiveCharacterTextSplitter
57
- from langchain_community.vectorstores import FAISS
58
- import faiss
59
 
 
 
 
 
 
 
 
60
 
61
def process_pdf(file_path):
    """Load the PDF at *file_path*, split it into chunks, embed with Gemini,
    and persist the resulting FAISS index to disk.

    Side effects: sets st.session_state.embeddings / .vectors / .docs and
    writes "faiss_index.bin" in the working directory; reports progress via
    a Streamlit placeholder and errors via st.error.
    """
    st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

    status = st.empty()
    status.text("Data Loading...Started...βœ…βœ…βœ…")

    # Load the PDF from the given file path
    pages = PyPDFLoader(file_path).load()

    status.text("Text Splitter...Started...βœ…βœ…βœ…")
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_documents(pages)

    # Guard clause: nothing extractable means a corrupt/unsupported PDF.
    if not chunks:
        st.error("No documents found or the PDF is corrupted.")
        return

    status.text("Embedding Vector Started Building...βœ…βœ…βœ…")
    st.session_state.vectors = FAISS.from_documents(chunks, st.session_state.embeddings)
    st.session_state.docs = chunks

    # Save FAISS vector store to disk so later sessions can reload it.
    faiss.write_index(st.session_state.vectors.index, "faiss_index.bin")
    status.text("Vector database created!...βœ…βœ…βœ…")
 
 
 
 
 
 
 
 
 
 
 
1
+ # import tempfile
2
+ # import streamlit as st
3
+ # import pickle
4
+ # from langchain_google_genai import GoogleGenerativeAIEmbeddings
5
+ # from langchain_community.document_loaders import PyPDFLoader
6
+ # from langchain.text_splitter import RecursiveCharacterTextSplitter
7
+ # from langchain_community.vectorstores import FAISS
8
+ # import faiss
9
 
10
 
11
+ # # def process_pdf(uploaded_file):
12
 
13
+ # # all_documents = []
14
+ # # st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
15
 
16
+ # # main_placeholder = st.empty()
17
+ # # # Creating a temporary file to store the uploaded PDF's
18
+ # # main_placeholder.text("Data Loading...Started...βœ…βœ…βœ…")
19
+ # # for uploaded_file in uploaded_file:
20
+ # # with tempfile.NamedTemporaryFile(delete=False , suffix='.pdf') as temp_file:
21
+ # # temp_file.write(uploaded_file.read()) ## write file to temporary
22
+ # # temp_file_path = temp_file.name # Get the temporary file path
23
 
24
 
25
+ # # # Load the PDF's from the temporary file path
26
 
27
 
28
+ # # loader = PyPDFLoader(temp_file_path) # Document loader
29
+ # # doc= loader.load() # load Document
30
+ # # main_placeholder.text("Text Splitter...Started...βœ…βœ…βœ…")
31
+ # # text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200) # Recursive Character String
32
+ # # #final_documents = text_splitter.split_documents(doc)# splitting
33
+ # # final_documents = text_splitter.split_documents(doc)
34
+ # # all_documents.extend(final_documents)
35
 
36
 
37
+ # # if all_documents:
38
+ # # main_placeholder.text("Embedding Vector Started Building...βœ…βœ…βœ…")
39
+ # # st.session_state.vectors = FAISS.from_documents(all_documents,st.session_state.embeddings)
40
+ # # st.session_state.docs = all_documents
41
 
42
+ # # # Save FAISS vector store to disk
43
+ # # faiss_index = st.session_state.vectors.index # Extract FAISS index
44
+ # # faiss.write_index(faiss_index, "faiss_index.bin") # Save index to a binary file
45
+ # # main_placeholder.text("Vector database created!...βœ…βœ…βœ…")
46
 
47
+ # # else:
48
+ # # st.error("No documents found after processing the uploaded files or the pdf is corrupted / unsupported.")
 
49
 
50
 
51
import streamlit as st
import faiss

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
# FAISS vector store moved out of `langchain.vectorstores` (deprecated since
# langchain 0.1) into `langchain_community.vectorstores` — matching the path
# this file already uses for PyPDFLoader.
from langchain_community.vectorstores import FAISS
57
 
58
def process_pdf_from_path(file_path, index_path="/tmp/faiss_index.bin"):
    """
    Processes a PDF from a given path by:
    - Loading the PDF
    - Splitting it into manageable chunks
    - Creating embeddings with Gemini
    - Saving the FAISS vector index to disk

    Parameters:
        file_path (str): Path to the uploaded PDF file.
        index_path (str): Destination for the serialized FAISS index.
            Defaults to the previously hard-coded "/tmp/faiss_index.bin",
            so existing callers are unaffected.

    Side effects:
        Sets st.session_state.embeddings / .vectors / .docs and writes the
        FAISS index to *index_path*. Failures are reported via st.error
        rather than raised.
    """
    try:
        # Initialize embeddings model
        st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

        main_placeholder = st.empty()
        main_placeholder.text("Loading PDF and preparing text... βœ…")

        # Load PDF document
        documents = PyPDFLoader(file_path).load()

        # Split documents into smaller overlapping chunks for retrieval.
        main_placeholder.text("Splitting text into chunks... βœ…")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        final_documents = text_splitter.split_documents(documents)

        # NOTE: the old multi-file `all_documents` accumulator was dead code
        # for a single path — `final_documents` is tested directly instead.
        if final_documents:
            main_placeholder.text("Creating vector embeddings... βœ…")
            # Generate vector store
            st.session_state.vectors = FAISS.from_documents(final_documents, st.session_state.embeddings)
            st.session_state.docs = final_documents

            # Save FAISS index so it can be reloaded in later sessions.
            faiss.write_index(st.session_state.vectors.index, index_path)

            main_placeholder.text("Vector database created successfully! πŸŽ‰")
        else:
            st.error("No valid documents found in the uploaded PDF.")

    except Exception as e:
        # UI boundary: surface any failure to the user instead of crashing the app.
        st.error(f"An error occurred while processing the PDF: {e}")