ChinarQ-AI Uzaiir committed on
Commit
aaa5d7e
·
verified ·
1 Parent(s): 8fb1cbc

Update src/PDFprocess_sample.py (#16)

Browse files

- Update src/PDFprocess_sample.py (fbdd134c6e6630672b508e4e3101c94fbc0ac02c)


Co-authored-by: Khan <Uzaiir@users.noreply.huggingface.co>

Files changed (1) hide show
  1. src/PDFprocess_sample.py +81 -62
src/PDFprocess_sample.py CHANGED
@@ -1,85 +1,104 @@
1
- import tempfile
2
- import streamlit as st
3
- import pickle
4
- from langchain_google_genai import GoogleGenerativeAIEmbeddings
5
- from langchain_community.document_loaders import PyPDFLoader
6
- from langchain.text_splitter import RecursiveCharacterTextSplitter
7
- from langchain_community.vectorstores import FAISS
8
- import faiss
9
 
10
 
11
- # def process_pdf(uploaded_file):
12
 
13
- # all_documents = []
14
- # st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
15
 
16
- # main_placeholder = st.empty()
17
- # # Creating a temporary file to store the uploaded PDF's
18
- # main_placeholder.text("Data Loading...Started...βœ…βœ…βœ…")
19
- # for uploaded_file in uploaded_file:
20
- # with tempfile.NamedTemporaryFile(delete=False , suffix='.pdf') as temp_file:
21
- # temp_file.write(uploaded_file.read()) ## write file to temporary
22
- # temp_file_path = temp_file.name # Get the temporary file path
23
 
24
 
25
- # # Load the PDF's from the temporary file path
26
 
27
 
28
- # loader = PyPDFLoader(temp_file_path) # Document loader
29
- # doc= loader.load() # load Document
30
- # main_placeholder.text("Text Splitter...Started...βœ…βœ…βœ…")
31
- # text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200) # Recursive Character String
32
- # #final_documents = text_splitter.split_documents(doc)# splitting
33
- # final_documents = text_splitter.split_documents(doc)
34
- # all_documents.extend(final_documents)
35
 
36
 
37
- # if all_documents:
38
- # main_placeholder.text("Embedding Vector Started Building...βœ…βœ…βœ…")
39
- # st.session_state.vectors = FAISS.from_documents(all_documents,st.session_state.embeddings)
40
- # st.session_state.docs = all_documents
41
 
42
- # # Save FAISS vector store to disk
43
- # faiss_index = st.session_state.vectors.index # Extract FAISS index
44
- # faiss.write_index(faiss_index, "faiss_index.bin") # Save index to a binary file
45
- # main_placeholder.text("Vector database created!...βœ…βœ…βœ…")
46
 
47
- # else:
48
- # st.error("No documents found after processing the uploaded files or the pdf is corrupted / unsupported.")
49
-
50
 
51
 
52
  import streamlit as st
53
- import pickle
54
- from langchain_google_genai import GoogleGenerativeAIEmbeddings
55
  from langchain_community.document_loaders import PyPDFLoader
56
  from langchain.text_splitter import RecursiveCharacterTextSplitter
57
- from langchain_community.vectorstores import FAISS
58
- import faiss
59
 
 
 
 
 
 
 
 
60
 
61
def process_pdf(file_path):
    """Load the PDF at *file_path*, split it into chunks, embed with Gemini,
    and persist the resulting FAISS index to disk.

    Side effects: sets st.session_state.embeddings / .vectors / .docs and
    writes "faiss_index.bin" in the working directory; reports progress via
    a Streamlit placeholder and errors via st.error.
    """
    st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

    status = st.empty()
    status.text("Data Loading...Started...βœ…βœ…βœ…")

    # Load the PDF from the given file path
    pages = PyPDFLoader(file_path).load()

    status.text("Text Splitter...Started...βœ…βœ…βœ…")
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_documents(pages)

    # Guard clause: nothing extractable means a corrupt/unsupported PDF.
    if not chunks:
        st.error("No documents found or the PDF is corrupted.")
        return

    status.text("Embedding Vector Started Building...βœ…βœ…βœ…")
    st.session_state.vectors = FAISS.from_documents(chunks, st.session_state.embeddings)
    st.session_state.docs = chunks

    # Save FAISS vector store to disk so later sessions can reload it.
    faiss.write_index(st.session_state.vectors.index, "faiss_index.bin")
    status.text("Vector database created!...βœ…βœ…βœ…")
 
 
 
 
 
 
 
 
 
 
 
1
+ # import tempfile
2
+ # import streamlit as st
3
+ # import pickle
4
+ # from langchain_google_genai import GoogleGenerativeAIEmbeddings
5
+ # from langchain_community.document_loaders import PyPDFLoader
6
+ # from langchain.text_splitter import RecursiveCharacterTextSplitter
7
+ # from langchain_community.vectorstores import FAISS
8
+ # import faiss
9
 
10
 
11
+ # # def process_pdf(uploaded_file):
12
 
13
+ # # all_documents = []
14
+ # # st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
15
 
16
+ # # main_placeholder = st.empty()
17
+ # # # Creating a temporary file to store the uploaded PDF's
18
+ # # main_placeholder.text("Data Loading...Started...βœ…βœ…βœ…")
19
+ # # for uploaded_file in uploaded_file:
20
+ # # with tempfile.NamedTemporaryFile(delete=False , suffix='.pdf') as temp_file:
21
+ # # temp_file.write(uploaded_file.read()) ## write file to temporary
22
+ # # temp_file_path = temp_file.name # Get the temporary file path
23
 
24
 
25
+ # # # Load the PDF's from the temporary file path
26
 
27
 
28
+ # # loader = PyPDFLoader(temp_file_path) # Document loader
29
+ # # doc= loader.load() # load Document
30
+ # # main_placeholder.text("Text Splitter...Started...βœ…βœ…βœ…")
31
+ # # text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200) # Recursive Character String
32
+ # # #final_documents = text_splitter.split_documents(doc)# splitting
33
+ # # final_documents = text_splitter.split_documents(doc)
34
+ # # all_documents.extend(final_documents)
35
 
36
 
37
+ # # if all_documents:
38
+ # # main_placeholder.text("Embedding Vector Started Building...βœ…βœ…βœ…")
39
+ # # st.session_state.vectors = FAISS.from_documents(all_documents,st.session_state.embeddings)
40
+ # # st.session_state.docs = all_documents
41
 
42
+ # # # Save FAISS vector store to disk
43
+ # # faiss_index = st.session_state.vectors.index # Extract FAISS index
44
+ # # faiss.write_index(faiss_index, "faiss_index.bin") # Save index to a binary file
45
+ # # main_placeholder.text("Vector database created!...βœ…βœ…βœ…")
46
 
47
+ # # else:
48
+ # # st.error("No documents found after processing the uploaded files or the pdf is corrupted / unsupported.")
 
49
 
50
 
51
import streamlit as st
import faiss

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
# FAISS vector store moved out of `langchain.vectorstores` (deprecated since
# langchain 0.1) into `langchain_community.vectorstores` — matching the path
# this file already uses for PyPDFLoader.
from langchain_community.vectorstores import FAISS
57
 
58
def process_pdf_from_path(file_path, index_path="/tmp/faiss_index.bin"):
    """
    Processes a PDF from a given path by:
    - Loading the PDF
    - Splitting it into manageable chunks
    - Creating embeddings with Gemini
    - Saving the FAISS vector index to disk

    Parameters:
        file_path (str): Path to the uploaded PDF file.
        index_path (str): Destination for the serialized FAISS index.
            Defaults to the previously hard-coded "/tmp/faiss_index.bin",
            so existing callers are unaffected.

    Side effects:
        Sets st.session_state.embeddings / .vectors / .docs and writes the
        FAISS index to *index_path*. Failures are reported via st.error
        rather than raised.
    """
    try:
        # Initialize embeddings model
        st.session_state.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

        main_placeholder = st.empty()
        main_placeholder.text("Loading PDF and preparing text... βœ…")

        # Load PDF document
        documents = PyPDFLoader(file_path).load()

        # Split documents into smaller overlapping chunks for retrieval.
        main_placeholder.text("Splitting text into chunks... βœ…")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        final_documents = text_splitter.split_documents(documents)

        # NOTE: the old multi-file `all_documents` accumulator was dead code
        # for a single path — `final_documents` is tested directly instead.
        if final_documents:
            main_placeholder.text("Creating vector embeddings... βœ…")
            # Generate vector store
            st.session_state.vectors = FAISS.from_documents(final_documents, st.session_state.embeddings)
            st.session_state.docs = final_documents

            # Save FAISS index so it can be reloaded in later sessions.
            faiss.write_index(st.session_state.vectors.index, index_path)

            main_placeholder.text("Vector database created successfully! πŸŽ‰")
        else:
            st.error("No valid documents found in the uploaded PDF.")

    except Exception as e:
        # UI boundary: surface any failure to the user instead of crashing the app.
        st.error(f"An error occurred while processing the PDF: {e}")