File size: 3,698 Bytes
3ea0a6e
 
 
 
 
 
 
 
 
 
2ee3a93
3ea0a6e
2ee3a93
 
3ea0a6e
2ee3a93
 
 
 
 
 
 
3ea0a6e
 
2ee3a93
3ea0a6e
 
2ee3a93
 
 
 
 
 
 
3ea0a6e
 
2ee3a93
 
 
 
3ea0a6e
2ee3a93
 
 
 
3ea0a6e
2ee3a93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import tempfile
import streamlit as st
import pickle
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
import faiss 


# def process_pdf(uploaded_file):
    
#     all_documents = []
#     st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    
#     main_placeholder = st.empty()
#     # Creating  a temporary file to store the uploaded PDF's
#     main_placeholder.text("Data Loading...Started...βœ…βœ…βœ…")
#     for uploaded_file in uploaded_file:
#         with tempfile.NamedTemporaryFile(delete=False , suffix='.pdf') as temp_file:
#             temp_file.write(uploaded_file.read()) ## write file to temporary
#             temp_file_path = temp_file.name  # Get the temporary file path
            
            
#             # Load the PDF's from the temporary file path
            
        
#         loader = PyPDFLoader(temp_file_path) # Document loader
#         doc= loader.load() # load Document 
#         main_placeholder.text("Text Splitter...Started...βœ…βœ…βœ…")
#         text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200) # Recursive Character String
#         #final_documents = text_splitter.split_documents(doc)# splitting
#         final_documents = text_splitter.split_documents(doc)
#         all_documents.extend(final_documents)
        
        
#         if all_documents:
#             main_placeholder.text("Embedding Vector Started Building...βœ…βœ…βœ…")
#             st.session_state.vectors = FAISS.from_documents(all_documents,st.session_state.embeddings)
#             st.session_state.docs = all_documents 
            
#             # Save FAISS vector store to disk
#             faiss_index = st.session_state.vectors.index  # Extract FAISS index
#             faiss.write_index(faiss_index, "faiss_index.bin")  # Save index to a binary file
#             main_placeholder.text("Vector database created!...βœ…βœ…βœ…")   
            
#         else:
#             st.error("No documents found after processing the uploaded files or the pdf is corrupted / unsupported.")



import streamlit as st
import pickle
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
import faiss


def process_pdf(file_path):  # Expecting file path string
    """Load a PDF, split it into chunks, embed them, and persist a FAISS store.

    Parameters
    ----------
    file_path : str
        Path to a PDF file on disk.

    Side effects (written to Streamlit session state):
        st.session_state.embeddings : the Google Generative AI embedding model
        st.session_state.vectors    : the FAISS vector store built from the PDF
        st.session_state.docs       : the split document chunks

    Also writes the raw FAISS index to "faiss_index.bin" (kept for backward
    compatibility) and the complete store to the "faiss_index" directory.
    Shows progress/error messages in the Streamlit UI.
    """
    st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

    main_placeholder = st.empty()
    main_placeholder.text("Data Loading...Started...βœ…βœ…βœ…")

    # Load the PDF from the given file path
    loader = PyPDFLoader(file_path)
    doc = loader.load()

    main_placeholder.text("Text Splitter...Started...βœ…βœ…βœ…")
    # 1000-char chunks with 200-char overlap so context straddles chunk edges
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    final_documents = text_splitter.split_documents(doc)

    if final_documents:
        main_placeholder.text("Embedding Vector Started Building...βœ…βœ…βœ…")
        st.session_state.vectors = FAISS.from_documents(final_documents, st.session_state.embeddings)
        st.session_state.docs = final_documents

        # Keep writing the raw index for backward compatibility with any
        # existing consumer of "faiss_index.bin" ...
        faiss_index = st.session_state.vectors.index
        faiss.write_index(faiss_index, "faiss_index.bin")
        # ... but the raw index alone loses the docstore and the
        # index->docstore-id mapping, so the store could not be rebuilt from
        # it. save_local() persists everything FAISS.load_local() needs.
        st.session_state.vectors.save_local("faiss_index")
        main_placeholder.text("Vector database created!...βœ…βœ…βœ…")
    else:
        st.error("No documents found or the PDF is corrupted.")