Spaces:

sadickam
/

PDF-text-extra

Running

App Files Community

sadickam committed on Oct 16, 2024

Commit

ff0f9fc

verified ·

1 Parent(s): 24333e4

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -153

app.py CHANGED Viewed

@@ -1,191 +1,83 @@
 import gradio as gr
 import pandas as pd
 import os
 from langchain_community.document_loaders import UnstructuredPDFLoader
 from PyPDF2 import PdfReader
-def extract_text_by_page(pdf_file_path, page_num):
-    """
-    Extract text from a single page of the PDF and return as a list of dictionaries.
-    Parameters:
-    - pdf_file_path (str): Path to the uploaded PDF file.
-    - page_num (int): Page number to extract (1-based indexing).
-    Returns:
-    - list of dict: Extracted data with Document, Page, and Paragraph.
-    """
     doc_name = os.path.basename(pdf_file_path)
     extracted_data = []
-    try:
         loader = UnstructuredPDFLoader(pdf_file_path, page_numbers=[page_num-1])  # Zero-based indexing
         documents = loader.load()
         if not documents:
             print(f"No content found on Page {page_num}.")
-            return extracted_data  # Empty list
-        # Concatenate all text from the page to preserve column integrity
-        pdf_pages_content = '\n'.join(doc.page_content for doc in documents)
-        # Split content into paragraphs based on double newlines
-        paragraphs = pdf_pages_content.split("\n\n")
-        for para in paragraphs:
-            if para.strip():  # Skip empty paragraphs
-                extracted_data.append({
-                    "Document": doc_name,
-                    "Page": page_num,
-                    "Paragraph": para.strip()
-                })
-    except Exception as e:
-        print(f"Error processing Page {page_num}: {e}")
-        extracted_data.append({
-            "Document": doc_name,
-            "Page": page_num,
-            "Paragraph": f"Error extracting this page: {e}"
-        })
     return extracted_data
 def save_to_csv(data, output_filename="extracted_content.csv"):
-    """
-    Save extracted data to a CSV file.
-    Parameters:
-    - data (list of dict): Extracted data.
-    - output_filename (str): Name of the output CSV file.
-    Returns:
-    - str: Path to the saved CSV file.
-    """
     df = pd.DataFrame(data)
     df.to_csv(output_filename, index=False)
     return output_filename
-def extract_and_save(pdf_file, extraction_option, start_page, end_page):
-    """
-    Main function to extract text based on user options and save to CSV.
-    Parameters:
-    - pdf_file (File): Uploaded PDF file.
-    - extraction_option (str): 'All Pages' or 'Page Range'.
-    - start_page (int): Starting page number (if applicable).
-    - end_page (int): Ending page number (if applicable).
-    Returns:
-    - tuple: (csv_path, message)
-    """
     if pdf_file is None:
-        return None, "❌ No file uploaded."
-    pdf_file_path = pdf_file.name
-    # Initialize PDF reader to get total pages
-    try:
-        reader = PdfReader(pdf_file_path)
-        total_pages = len(reader.pages)
-        if total_pages == 0:
-            return None, "❌ The uploaded PDF has no pages."
-    except Exception as e:
-        return None, f"❌ Error reading PDF: {e}"
-    # Determine extraction parameters
-    if extraction_option == "All Pages":
-        pages_to_extract = list(range(1, total_pages + 1))
-    else:
-        # Validate start and end pages
-        if start_page is None or end_page is None:
-            return None, "❌ Please specify both start and end pages."
-        if start_page < 1 or end_page > total_pages:
-            return None, f"❌ Page range must be between 1 and {total_pages}."
-        if start_page > end_page:
-            return None, "❌ Start page cannot be greater than end page."
-        pages_to_extract = list(range(int(start_page), int(end_page) + 1))
-    extracted_data = []
-    try:
-        for page_num in pages_to_extract:
-            print(f"Processing Page {page_num}/{len(pages_to_extract)}")
-            page_data = extract_text_by_page(pdf_file_path, page_num)
-            extracted_data.extend(page_data)
-    except Exception as e:
-        return None, f"❌ An error occurred during extraction: {e}"
     if not extracted_data:
-        return None, "❌ No text extracted from the specified pages."
     # Save to CSV
-    try:
-        csv_filename = "extracted_content.csv"
-        csv_path = save_to_csv(extracted_data, csv_filename)
-    except Exception as e:
-        return None, f"❌ Error saving CSV: {e}"
-    return csv_path, "✅ Extraction successful! Download your CSV file below."
 # Gradio Interface
 with gr.Blocks() as demo:
-    gr.Markdown("""
-    # 📄 PDF Text Extractor with Page Range Selection and CSV Export
-    Upload a PDF document to extract its text content. Choose to extract text from **all pages** or a **specific range of pages**. The app processes the PDF **page by page**, concatenates column texts to maintain paragraph integrity, splits the text into **paragraphs**, tracks **page numbers** and the **document name**, and compiles the results into a **CSV file** for download.
-    ## How It Works
-    1. **Upload PDF**: Select and upload your PDF file.
-    2. **Choose Extraction Option**:
-       - **All Pages**: Extract text from every page in the PDF.
-       - **Page Range**: Specify the start and end pages to extract text from.
-    3. **Extract**: Click the "Extract and Download CSV" button to begin extraction.
-    4. **Download**: Once complete, download the CSV file containing the extracted data.
-    """)
-    with gr.Row():
-        pdf_input = gr.File(label="📁 Upload PDF", type="filepath")
-    with gr.Row():
-        extraction_option = gr.Radio(
-            choices=["All Pages", "Page Range"],
-            value="All Pages",
-            label="Extraction Option"
-        )
     with gr.Row():
-        start_page = gr.Number(label="📝 Start Page", value=1, precision=0, visible=False)
-        end_page = gr.Number(label="📝 End Page", value=1, precision=0, visible=False)
-    # Show or hide start/end page inputs based on extraction option
-    def toggle_page_range(option):
-        if option == "Page Range":
-            return gr.update(visible=True), gr.update(visible=True)
-        else:
-            return gr.update(visible=False), gr.update(visible=False)
-    extraction_option.change(
-        fn=toggle_page_range,
-        inputs=[extraction_option],
-        outputs=[start_page, end_page]
-    )
     with gr.Row():
-        extract_button = gr.Button("🟢 Extract and Download CSV")
     with gr.Row():
-        download_csv = gr.File(label="📥 Download Extracted CSV")
-        message = gr.Textbox(label="Message", interactive=False, lines=2)
     extract_button.click(
         fn=extract_and_save,
-        inputs=[pdf_input, extraction_option, start_page, end_page],
-        outputs=[download_csv, message],
-        show_progress=False  # Progress tracking removed
     )
-# Launch the Gradio
 demo.queue().launch()

 import gradio as gr
 import pandas as pd
+import time
 import os
 from langchain_community.document_loaders import UnstructuredPDFLoader
 from PyPDF2 import PdfReader
+def extract_text_by_page(pdf_file_path):
+    """Extract text from each page of the PDF and return as a list of dictionaries."""
+    # Initialize PDF reader
+    reader = PdfReader(pdf_file_path)
+    num_pages = len(reader.pages)
     doc_name = os.path.basename(pdf_file_path)
     extracted_data = []
+    for page_num in range(1, num_pages + 1):
+        print(f"Processing Page {page_num}...")
+        # Initialize the loader for the specific page
         loader = UnstructuredPDFLoader(pdf_file_path, page_numbers=[page_num-1])  # Zero-based indexing
         documents = loader.load()
         if not documents:
             print(f"No content found on Page {page_num}.")
+            continue
+        for doc in documents:
+            paragraphs = doc.page_content.split("\n\n")  # Split text into paragraphs
+            for para in paragraphs:
+                if para.strip():  # Skip empty paragraphs
+                    extracted_data.append({
+                        "Document": doc_name,
+                        "Page": page_num,
+                        "Paragraph": para.strip()
+                    })
+        time.sleep(1)  # Optional: Introduce a small delay between pages
     return extracted_data
 def save_to_csv(data, output_filename="extracted_content.csv"):
+    """Save extracted data to a CSV file."""
     df = pd.DataFrame(data)
     df.to_csv(output_filename, index=False)
     return output_filename
+def extract_and_save(pdf_file):
+    """Main function to extract text and save to CSV."""
     if pdf_file is None:
+        return "No file uploaded."
+    # Extract text by page
+    extracted_data = extract_text_by_page(pdf_file.name)
     if not extracted_data:
+        return "No text extracted from the PDF."
     # Save to CSV
+    csv_path = save_to_csv(extracted_data)
+    return csv_path
 # Gradio Interface
 with gr.Blocks() as demo:
+    gr.Markdown("# PDF Text Extractor with Page Tracking and CSV Export")
     with gr.Row():
+        pdf_input = gr.File(label="Upload PDF", type="filepath")
     with gr.Row():
+        extract_button = gr.Button("Extract and Download CSV")
     with gr.Row():
+        download_csv = gr.File(label="Download Extracted CSV")
     extract_button.click(
         fn=extract_and_save,
+        inputs=pdf_input,
+        outputs=download_csv
     )
+# Launch the Gradio app
 demo.queue().launch()