File size: 6,494 Bytes
9acac0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9f33131
 
 
37567cd
9f33131
 
 
 
1359360
9f33131
 
7eceee8
9f33131
7eceee8
9acac0b
9f33131
7eceee8
 
 
 
04c37c4
9acac0b
04c37c4
9f33131
 
7eceee8
9f33131
 
7eceee8
 
 
 
 
 
 
9f33131
 
7eceee8
 
 
 
9f33131
7eceee8
 
04c1b26
 
 
 
 
7eceee8
9f33131
7eceee8
 
9f33131
cb666fa
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# import tempfile
# import streamlit as st
# import pickle
# from langchain_google_genai import GoogleGenerativeAIEmbeddings
# from langchain_community.document_loaders import PyPDFLoader
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain_community.vectorstores import FAISS
# import faiss 
# import os


# def process_pdf(uploaded_file):
    
#     all_documents = []
#     # st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    
#     main_placeholder = st.empty()
#     # Creating  a temporary file to store the uploaded PDF's
#     main_placeholder.text("Data Loading...Started...βœ…βœ…βœ…")
#     for uploaded_file in uploaded_file:
        
#         with tempfile.NamedTemporaryFile(delete=False , suffix='.pdf') as temp_file:
#             temp_file.write(uploaded_file) ## write file to temporary
#             temp_file_path = temp_file.name  # Get the temporary file path

#         # temp_file_path = os.path.join("/tmp", uploaded_file.name)
#         # with open(temp_file_path, "wb") as f:
#         #     f.write(uploaded_file.read())

#         # st.write(f"Uploaded files: {[file.name for file in uploaded_file]}")

            
            
#             # Load the PDF's from the temporary file path
            
        
#         loader = PyPDFLoader(temp_file_path) # Document loader
#         doc= loader.load() # load Document 
#         main_placeholder.text("Text Splitter...Started...βœ…βœ…βœ…")
#         text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200) # Recursive Character String
#         #final_documents = text_splitter.split_documents(doc)# splitting
#         final_documents = text_splitter.split_documents(doc)
#         all_documents.extend(final_documents)
        
        
#         if all_documents:
#             main_placeholder.text("Embedding Vector Started Building...βœ…βœ…βœ…")
#             st.session_state.vectors = FAISS.from_documents(all_documents,st.session_state.embeddings)
#             st.session_state.docs = all_documents 
            
#             # Save FAISS vector store to disk
#             faiss_index = st.session_state.vectors.index  # Extract FAISS index
#             faiss.write_index(faiss_index, "faiss_index.bin")  # Save index to a binary file
#             main_placeholder.text("Vector database created!...βœ…βœ…βœ…")   
            
#         else:
#             st.error("No documents found after processing the uploaded files or the pdf is corrupted / unsupported.")



# # def process_pdf(uploaded_files):
# #     all_documents = []
# #     main_placeholder = st.empty()
# #     main_placeholder.text("Data Loading...Started...βœ…βœ…βœ…")

# #     for uploaded_file in uploaded_files:
# #         temp_file_path = os.path.join("/tmp", uploaded_file.name)
# #         with open(temp_file_path, "wb") as f:
# #             f.write(uploaded_file.read())

# #         st.write(f"Uploaded files: {[file.name for file in uploaded_files]}")

# #         loader = PyPDFLoader(temp_file_path)
# #         doc = loader.load()
# #         main_placeholder.text("Text Splitter...Started...βœ…βœ…βœ…")

# #         text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# #         final_documents = text_splitter.split_documents(doc)
# #         all_documents.extend(final_documents)

# #     if all_documents:
# #         main_placeholder.text("Embedding Vector Started Building...βœ…βœ…βœ…")
        
# #         # ⏬ Move embedding initialization here
# #         st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
# #         st.session_state.vectors = FAISS.from_documents(all_documents, st.session_state.embeddings)
# #         st.session_state.docs = all_documents

# #         faiss_index = st.session_state.vectors.index
# #         faiss.write_index(faiss_index, "faiss_index.bin")
# #         main_placeholder.text("Vector database created!...βœ…βœ…βœ…")

# #     else:
# #         st.error("No documents found or the PDF is corrupted.")



import tempfile
import streamlit as st
import pickle
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
import faiss 
import os


def process_pdf(uploaded_file):
    """Build a FAISS vector store from a collection of uploaded PDF files.

    Each upload is written to a temporary file (PyPDFLoader needs a path),
    loaded, split into 1000-char chunks with 200-char overlap, and embedded
    with Google Generative AI embeddings. Results are stored in Streamlit
    session state and the raw FAISS index is persisted to /tmp.

    Args:
        uploaded_file: iterable of Streamlit UploadedFile objects
            (presumably from st.file_uploader with accept_multiple_files=True
            — confirm against the caller).

    Side effects:
        st.session_state.embeddings  - embeddings client
        st.session_state.vectors     - FAISS vector store
        st.session_state.docs        - list of split documents
        /tmp/faiss_index.bin         - serialized FAISS index on disk
    """
    all_documents = []
    st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

    main_placeholder = st.empty()
    main_placeholder.text("Data Loading...Started...βœ…βœ…βœ…")

    # NOTE: loop variable renamed — the original iterated
    # `for uploaded_file in uploaded_file`, shadowing the parameter.
    for pdf_file in uploaded_file:
        # Persist the upload to a temp file so PyPDFLoader can read it by path.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
            temp_file.write(pdf_file.read())
            temp_file_path = temp_file.name  # Get the temporary file path

        try:
            loader = PyPDFLoader(temp_file_path)  # Document loader
            doc = loader.load()  # load Document
        finally:
            # delete=False above means WE own cleanup — the original leaked
            # one temp file per uploaded PDF.
            os.remove(temp_file_path)

        main_placeholder.text("Text Splitter...Started...βœ…βœ…βœ…")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        all_documents.extend(text_splitter.split_documents(doc))

    # Build the vector store ONCE, after all files are processed. The
    # original had this inside the loop, re-embedding and re-writing the
    # entire index after every single file.
    if all_documents:
        main_placeholder.text("Embedding Vector Started Building...βœ…βœ…βœ…")
        st.session_state.vectors = FAISS.from_documents(all_documents, st.session_state.embeddings)
        st.session_state.docs = all_documents

        # Save FAISS vector store to disk (extract the raw index first).
        faiss_index = st.session_state.vectors.index
        output_path = "/tmp/faiss_index.bin"  # or another writable path
        faiss.write_index(faiss_index, output_path)

        main_placeholder.text("Vector database created!...βœ…βœ…βœ…")
    else:
        st.error("No documents found after processing the uploaded files or the pdf is corrupted / unsupported.")