File size: 4,343 Bytes
aaa5d7e
 
 
 
 
 
 
 
3ea0a6e
 
aaa5d7e
3ea0a6e
aaa5d7e
 
3ea0a6e
aaa5d7e
 
 
 
 
 
 
3ea0a6e
 
aaa5d7e
3ea0a6e
 
aaa5d7e
 
 
 
 
 
 
3ea0a6e
 
aaa5d7e
 
 
 
3ea0a6e
aaa5d7e
 
 
 
3ea0a6e
aaa5d7e
 
daa4678
 
 
aaa5d7e
daa4678
 
aaa5d7e
 
daa4678
aaa5d7e
 
 
 
 
 
 
daa4678
aaa5d7e
 
 
 
 
 
 
 
 
daa4678
aaa5d7e
 
daa4678
aaa5d7e
 
 
daa4678
aaa5d7e
 
 
 
 
daa4678
aaa5d7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# import tempfile
# import streamlit as st
# import pickle
# from langchain_google_genai import GoogleGenerativeAIEmbeddings
# from langchain_community.document_loaders import PyPDFLoader
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain_community.vectorstores import FAISS
# import faiss 


# # def process_pdf(uploaded_file):
    
# #     all_documents = []
# #     st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    
# #     main_placeholder = st.empty()
# #     # Creating  a temporary file to store the uploaded PDF's
# #     main_placeholder.text("Data Loading...Started...βœ…βœ…βœ…")
# #     for uploaded_file in uploaded_file:
# #         with tempfile.NamedTemporaryFile(delete=False , suffix='.pdf') as temp_file:
# #             temp_file.write(uploaded_file.read()) ## write file to temporary
# #             temp_file_path = temp_file.name  # Get the temporary file path
            
            
# #             # Load the PDF's from the temporary file path
            
        
# #         loader = PyPDFLoader(temp_file_path) # Document loader
# #         doc= loader.load() # load Document 
# #         main_placeholder.text("Text Splitter...Started...βœ…βœ…βœ…")
# #         text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200) # Recursive Character String
# #         #final_documents = text_splitter.split_documents(doc)# splitting
# #         final_documents = text_splitter.split_documents(doc)
# #         all_documents.extend(final_documents)
        
        
# #         if all_documents:
# #             main_placeholder.text("Embedding Vector Started Building...βœ…βœ…βœ…")
# #             st.session_state.vectors = FAISS.from_documents(all_documents,st.session_state.embeddings)
# #             st.session_state.docs = all_documents 
            
# #             # Save FAISS vector store to disk
# #             faiss_index = st.session_state.vectors.index  # Extract FAISS index
# #             faiss.write_index(faiss_index, "faiss_index.bin")  # Save index to a binary file
# #             main_placeholder.text("Vector database created!...βœ…βœ…βœ…")   
            
# #         else:
# #             st.error("No documents found after processing the uploaded files or the pdf is corrupted / unsupported.")


import streamlit as st
import faiss
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.vectorstores import FAISS

def process_pdf_from_path(file_path, index_path="/tmp/faiss_index.bin"):
    """
    Build a FAISS vector index from a single PDF file.

    Steps:
    - Load the PDF at *file_path* with PyPDFLoader.
    - Split the text into overlapping chunks (1000 chars, 200 overlap).
    - Embed the chunks with Gemini (``models/embedding-001``).
    - Store the vector store and chunks in Streamlit session state and
      serialize the raw FAISS index to *index_path*.

    Parameters:
    file_path (str): Path to the uploaded PDF file.
    index_path (str): Destination for the serialized FAISS index.
        Defaults to "/tmp/faiss_index.bin" (previous hard-coded value).

    Side effects:
    - Sets ``st.session_state.embeddings``, ``st.session_state.vectors``
      and ``st.session_state.docs``.
    - Writes the FAISS index binary to *index_path*.
    - Reports progress / errors through the Streamlit UI.

    NOTE(review): the file imports ``FAISS`` from the deprecated
    ``langchain.vectorstores`` path while other loaders come from
    ``langchain_community`` — consider unifying on
    ``langchain_community.vectorstores`` at the import block.
    """
    try:
        # Embeddings model is (re)created per call and cached on session
        # state so other Streamlit pages can reuse it.
        st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

        main_placeholder = st.empty()
        main_placeholder.text("Loading PDF and preparing text... βœ…")

        # Load the whole PDF into LangChain Document objects (one per page).
        loader = PyPDFLoader(file_path)
        documents = loader.load()

        # Split pages into retrieval-sized chunks; overlap preserves context
        # across chunk boundaries.
        main_placeholder.text("Splitting text into chunks... βœ…")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        final_documents = text_splitter.split_documents(documents)

        if final_documents:
            main_placeholder.text("Creating vector embeddings... βœ…")
            # Build the vector store from all chunks in one call.
            st.session_state.vectors = FAISS.from_documents(final_documents, st.session_state.embeddings)
            st.session_state.docs = final_documents

            # Persist only the raw FAISS index (not the docstore/metadata),
            # matching the original behavior.
            faiss_index = st.session_state.vectors.index
            faiss.write_index(faiss_index, index_path)

            main_placeholder.text("Vector database created successfully! πŸŽ‰")
        else:
            # Empty/corrupted PDF: nothing to embed.
            st.error("No valid documents found in the uploaded PDF.")

    except Exception as e:
        # Broad catch is intentional at this UI boundary: surface any
        # loader/embedding/FAISS failure to the user instead of crashing.
        st.error(f"An error occurred while processing the PDF: {e}")