Spaces:

sadickam
/

PDF-text-extra

Running

App Files Community

sadickam committed on Oct 16, 2024

Commit

9847598

verified ·

1 Parent(s): 542ad54

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -18

app.py CHANGED Viewed

@@ -3,14 +3,14 @@ import pandas as pd
 import io
 import tempfile
 import os
-from langchain_community.document_loaders import UnstructuredFileLoader
 # Create a temporary directory for storing download files
 temp_dir = tempfile.TemporaryDirectory()
 def extract_text_with_langchain_pdf(pdf_file_path, start_page=None, end_page=None):
     """
-    Extract text from a PDF page by page using LangChain's UnstructuredFileLoader.
     Args:
         pdf_file_path (str): The file path to the uploaded PDF.
@@ -21,7 +21,8 @@ def extract_text_with_langchain_pdf(pdf_file_path, start_page=None, end_page=Non
         tuple: DataFrame containing the extracted text with metadata, and the full concatenated text.
     """
     try:
-        loader = UnstructuredFileLoader(pdf_file_path)
         documents = loader.load()
         total_pages = len(documents)
@@ -29,23 +30,35 @@ def extract_text_with_langchain_pdf(pdf_file_path, start_page=None, end_page=Non
         # Validate and adjust page range
         if start_page is not None and end_page is not None:
             if start_page < 1:
                 start_page = 1
             if end_page > total_pages:
                 end_page = total_pages
             if start_page > end_page:
                 start_page, end_page = end_page, start_page  # Swap if out of order
             selected_docs = documents[start_page - 1:end_page]
         else:
-            selected_docs = documents  # Extract all pages
         # Concatenate selected page contents into a single string
         pdf_pages_content = '\n'.join(doc.page_content for doc in selected_docs)
         extracted_data = []
-        for idx, doc in enumerate(selected_docs, start=1):  # Page numbering starts at 1
-            page_num = idx  # Assigning sequential page numbers based on selection
             paragraphs = doc.page_content.split("\n\n")  # Split into paragraphs
             for paragraph in paragraphs:
@@ -105,8 +118,8 @@ def on_extract(pdf_file_path, extraction_mode, start_page, end_page):
     Args:
         pdf_file_path (str): The file path to the uploaded PDF.
         extraction_mode (str): "All Pages" or "Range of Pages".
-        start_page (int): Starting page number for extraction.
-        end_page (int): Ending page number for extraction.
     Returns:
         tuple: Paths to CSV and TXT files, Status message.
@@ -169,7 +182,7 @@ with gr.Blocks() as demo:
             type="filepath",  # Ensure type is set to "filepath"
             interactive=True
         )
     with gr.Row():
         extraction_mode = gr.Radio(
             label="Extraction Mode",
@@ -177,7 +190,7 @@ with gr.Blocks() as demo:
             value="All Pages",
             interactive=True
         )
     with gr.Row():
         start_page = gr.Number(
             label="Start Page",
@@ -193,18 +206,20 @@ with gr.Blocks() as demo:
             interactive=True,
             visible=False  # Initially hidden
         )
     # Toggle visibility of start_page and end_page based on extraction_mode
     extraction_mode.change(
-        fn=lambda mode: (gr.update(visible=(mode == "Range of Pages")),
-                        gr.update(visible=(mode == "Range of Pages"))),
         inputs=[extraction_mode],
         outputs=[start_page, end_page]
     )
     with gr.Row():
         extract_button = gr.Button("Extract and Download")
     with gr.Row():
         csv_download = gr.File(
             label="Download Extracted CSV",
@@ -214,20 +229,20 @@ with gr.Blocks() as demo:
             label="Download Full Text",
             interactive=False
         )
     with gr.Row():
         status_output = gr.Textbox(
             label="Status",
             interactive=False,
             lines=2
         )
     extract_button.click(
         fn=on_extract,
         inputs=[pdf_input, extraction_mode, start_page, end_page],
         outputs=[csv_download, txt_download, status_output]
     )
     gr.Markdown("""
     ---
     Developed with ❤️ using Gradio and LangChain.

 import io
 import tempfile
 import os
+from langchain_community.document_loaders import UnstructuredPDFLoader
 # Create a temporary directory for storing download files
 temp_dir = tempfile.TemporaryDirectory()
 def extract_text_with_langchain_pdf(pdf_file_path, start_page=None, end_page=None):
     """
+    Extract text from a PDF page by page using LangChain's UnstructuredPDFLoader.
     Args:
         pdf_file_path (str): The file path to the uploaded PDF.
         tuple: DataFrame containing the extracted text with metadata, and the full concatenated text.
     """
     try:
+        # Initialize the loader with split_pages=True to ensure each page is a separate document
+        loader = UnstructuredPDFLoader(pdf_file_path, split_pages=True)
         documents = loader.load()
         total_pages = len(documents)
         # Validate and adjust page range
         if start_page is not None and end_page is not None:
+            # Convert to integers to avoid slicing issues
+            start_page = int(start_page)
+            end_page = int(end_page)
+            # Adjust to valid range
             if start_page < 1:
                 start_page = 1
             if end_page > total_pages:
                 end_page = total_pages
             if start_page > end_page:
                 start_page, end_page = end_page, start_page  # Swap if out of order
+            # Select the subset of documents based on user input
             selected_docs = documents[start_page - 1:end_page]
         else:
+            selected_docs = documents
+            start_page = 1
+            end_page = total_pages
         # Concatenate selected page contents into a single string
         pdf_pages_content = '\n'.join(doc.page_content for doc in selected_docs)
         extracted_data = []
+        for idx, doc in enumerate(selected_docs, start=1):
+            # Assign the actual page number
+            page_num = start_page + idx - 1
+            # Split content into paragraphs
             paragraphs = doc.page_content.split("\n\n")  # Split into paragraphs
             for paragraph in paragraphs:
     Args:
         pdf_file_path (str): The file path to the uploaded PDF.
         extraction_mode (str): "All Pages" or "Range of Pages".
+        start_page (float): Starting page number for extraction.
+        end_page (float): Ending page number for extraction.
     Returns:
         tuple: Paths to CSV and TXT files, Status message.
             type="filepath",  # Ensure type is set to "filepath"
             interactive=True
         )
     with gr.Row():
         extraction_mode = gr.Radio(
             label="Extraction Mode",
             value="All Pages",
             interactive=True
         )
     with gr.Row():
         start_page = gr.Number(
             label="Start Page",
             interactive=True,
             visible=False  # Initially hidden
         )
     # Toggle visibility of start_page and end_page based on extraction_mode
     extraction_mode.change(
+        fn=lambda mode: (
+            gr.update(visible=(mode == "Range of Pages")),
+            gr.update(visible=(mode == "Range of Pages"))
+        ),
         inputs=[extraction_mode],
         outputs=[start_page, end_page]
     )
     with gr.Row():
         extract_button = gr.Button("Extract and Download")
     with gr.Row():
         csv_download = gr.File(
             label="Download Extracted CSV",
             label="Download Full Text",
             interactive=False
         )
     with gr.Row():
         status_output = gr.Textbox(
             label="Status",
             interactive=False,
             lines=2
         )
     extract_button.click(
         fn=on_extract,
         inputs=[pdf_input, extraction_mode, start_page, end_page],
         outputs=[csv_download, txt_download, status_output]
     )
     gr.Markdown("""
     ---
     Developed with ❤️ using Gradio and LangChain.