Spaces:

sadickam
/

PDF-text-extra

Running

App Files Community

sadickam committed on Oct 15, 2024

Commit

c4707d0

verified ·

1 Parent(s): af972d0

Update app.py

Browse files

Files changed (1) hide show

app.py +105 -72

app.py CHANGED Viewed

@@ -3,120 +3,153 @@ import pandas as pd
 import os
 from langchain_community.document_loaders import UnstructuredPDFLoader
 from PyPDF2 import PdfReader
-import concurrent.futures
-def extract_and_save(pdf_file, progress=gr.Progress()):
     """
-    Extract text from each page of the PDF, split into paragraphs,
-    track page numbers and document name, append to DataFrame,
-    and save as a CSV file with progress updates.
     """
     # Every failure path below returns a message string; only success returns the CSV filename.
     if pdf_file is None:
         return "No file uploaded."
     pdf_file_path = pdf_file.name
-    doc_name = os.path.basename(pdf_file_path)
-    # Initialize PDF reader to get the number of pages
     try:
         reader = PdfReader(pdf_file_path)
-        num_pages = len(reader.pages)
-        if num_pages == 0:
             return "The uploaded PDF has no pages."
     except Exception as e:
         return f"Error reading PDF: {e}"
     extracted_data = []
-    def process_page(page_num):
-        """
-        Extract paragraphs from a single page.
-        Returns a list of dictionaries with Document, Page, and Paragraph.
-        """
-        try:
-            loader = UnstructuredPDFLoader(pdf_file_path, page_numbers=[page_num-1])  # Zero-based indexing
-            documents = loader.load()
-            if not documents:
-                print(f"No content found on Page {page_num}.")
-                return []
-            page_data = []
-            for doc in documents:
-                # Split content into paragraphs based on double newlines
                 # NOTE(review): str.join iterates its argument; if doc.page_content is a str this
                 # puts "\n" between every character before the split below - confirm.
-                page_text = '\n'.join(doc.page_content)
-                paragraphs = page_text.split("\n\n")
-                for para in paragraphs:
-                    if para.strip():  # Skip empty paragraphs
-                        page_data.append({
-                            "Document": doc_name,
-                            "Page": page_num,
-                            "Paragraph": para.strip()
-                        })
-            return page_data
-        except Exception as e:
             # Any failure on a page is printed and the page contributes no rows.
-            print(f"Error processing Page {page_num}: {e}")
-            return []
-    # Use ThreadPoolExecutor for parallel processing
     # NOTE(review): min(3, 6) always evaluates to 3, so the pool size is a fixed three workers.
-    max_workers = min(3, 6)  # Limit the number of threads to prevent resource exhaustion
-    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-        # Submit all page processing tasks
-        future_to_page = {executor.submit(process_page, page_num): page_num for page_num in range(1, num_pages + 1)}
         # NOTE(review): `completed` is incremented in the loop below but never read.
-        completed = 0
         # Futures are consumed as they finish, so rows are appended in completion order,
         # not necessarily in page order.
-        for future in concurrent.futures.as_completed(future_to_page):
-            page_num = future_to_page[future]
-            try:
-                page_data = future.result()
-                extracted_data.extend(page_data)
-            except Exception as e:
                 # process_page already catches Exception itself, so this is a second safety net.
-                print(f"Error processing Page {page_num}: {e}")
-            completed += 1
-            # Update progress: 1 step per completed page
             # NOTE(review): the first argument is the constant 1 on every call; also confirm that
             # `description` is a keyword gr.Progress accepts.
-            progress(1, description=f"Processed page {page_num}/{num_pages}")
-    if not extracted_data:
-        return "No text extracted from the PDF."
-    # Convert the extracted data to a DataFrame
-    df = pd.DataFrame(extracted_data)
-    # Save the DataFrame to a CSV file
     # The output name is fixed and relative, so each run writes to the same file.
-    output_filename = "extracted_content.csv"
     try:
-        df.to_csv(output_filename, index=False)
     except Exception as e:
         return f"Error saving CSV: {e}"
-    return output_filename
 # Gradio Interface
 with gr.Blocks() as demo:
     # Introductory text shown at the top of the page.
     gr.Markdown("""
-    # 📄 PDF Text Extractor with Metadata and CSV Export
-    Upload a PDF document to extract its text content. The app processes the PDF **page by page**, splits the text into **paragraphs**, tracks **page numbers** and the **document name**, and compiles the results into a **CSV file** for download.
     ## How It Works
     1. **Upload PDF**: Select and upload your PDF file.
-    2. **Extract**: Click the "Extract and Download CSV" button to begin extraction.
-    3. **Progress Updates**: Watch the progress bar as each page is processed.
-    4. **Download**: Once complete, download the CSV file containing the extracted data.
     """)
     # Each control sits in its own row: upload, trigger button, then the download slot.
     with gr.Row():
         pdf_input = gr.File(label="📁 Upload PDF", type="filepath")
     with gr.Row():
         extract_button = gr.Button("🟢 Extract and Download CSV")
     with gr.Row():
         download_csv = gr.File(label="📥 Download Extracted CSV")
-    # Link the button to the extraction function with progress enabled
     # The value returned by extract_and_save is routed to the download_csv component.
     extract_button.click(
         fn=extract_and_save,
-        inputs=pdf_input,
-        outputs=download_csv,
-        show_progress=True  # Enables the progress bar
     )
 # Launch the Gradio

 import os
 from langchain_community.document_loaders import UnstructuredPDFLoader
 from PyPDF2 import PdfReader
def extract_and_save(pdf_file, extraction_option, start_page, end_page, progress=gr.Progress()):
    """
    Extract paragraphs from the selected pages of a PDF and save them to a CSV file.

    Parameters:
    - pdf_file: Uploaded PDF; either a path string or an object exposing the path as `.name`.
    - extraction_option (str): 'All Pages' or 'Page Range'.
    - start_page (int): First page to extract, 1-based (only used for 'Page Range').
    - end_page (int): Last page to extract, 1-based and inclusive (only used for 'Page Range').
    - progress (gr.Progress): Progress tracker. It is declared as a default argument so that
      Gradio can supply the live tracker for the running event; callers normally omit it.

    Returns:
    - str: Path to the saved CSV file or error message.
    """
    if pdf_file is None:
        return "No file uploaded."
    # Accept both a plain path string and a file-like wrapper that carries the path in `.name`.
    pdf_file_path = getattr(pdf_file, "name", pdf_file)

    # The reader is only needed to learn how many pages the document has.
    try:
        total_pages = len(PdfReader(pdf_file_path).pages)
    except Exception as e:
        return f"Error reading PDF: {e}"
    if total_pages == 0:
        return "The uploaded PDF has no pages."

    # Work out which 1-based page numbers to process.
    if extraction_option == "All Pages":
        pages_to_extract = list(range(1, total_pages + 1))
    else:
        if start_page is None or end_page is None:
            return "Please specify both start and end pages."
        if start_page < 1 or end_page > total_pages:
            return f"Page range must be between 1 and {total_pages}."
        if start_page > end_page:
            return "Start page cannot be greater than end page."
        pages_to_extract = list(range(int(start_page), int(end_page) + 1))

    doc_name = os.path.basename(pdf_file_path)
    page_count = len(pages_to_extract)
    extracted_data = []
    try:
        for idx, page_num in enumerate(pages_to_extract, start=1):
            # Report (steps done, total steps) so the bar advances page by page. The tracker is
            # called directly: it is not a context manager and its label keyword is `desc`.
            progress(
                (idx - 1, page_count),
                desc=f"Processing page {page_num} ({idx} of {page_count})",
            )
            try:
                loader = UnstructuredPDFLoader(pdf_file_path, page_numbers=[page_num - 1])  # Zero-based indexing
                documents = loader.load()
                if not documents:
                    print(f"No content found on Page {page_num}.")
                    continue
                # Concatenate all text from the page to preserve column integrity
                page_text = "\n".join(doc.page_content for doc in documents)
                # Split content into paragraphs based on double newlines
                for para in page_text.split("\n\n"):
                    text = para.strip()
                    if text:  # Skip empty paragraphs
                        extracted_data.append({
                            "Document": doc_name,
                            "Page": page_num,
                            "Paragraph": text,
                        })
            except Exception as e:
                # Best effort: a failing page is recorded as a row so the rest still exports.
                print(f"Error processing Page {page_num}: {e}")
                extracted_data.append({
                    "Document": doc_name,
                    "Page": page_num,
                    "Paragraph": f"Error extracting this page: {e}",
                })
    except Exception as e:
        return f"An error occurred during extraction: {e}"

    if not extracted_data:
        return "No text extracted from the specified pages."

    # Save to CSV
    # NOTE(review): the name is fixed and relative, so every run overwrites the same file, and
    # error strings share the return channel with the path - confirm the consuming component
    # handles both as intended.
    csv_filename = "extracted_content.csv"
    try:
        pd.DataFrame(extracted_data).to_csv(csv_filename, index=False)
    except Exception as e:
        return f"Error saving CSV: {e}"
    return csv_filename
 # Gradio Interface
 with gr.Blocks() as demo:
     gr.Markdown("""
+    # 📄 PDF Text Extractor with Page Range Selection and CSV Export
+    Upload a PDF document to extract its text content. Choose to extract text from **all pages** or a **specific range of pages**. The app processes the PDF **page by page**, concatenates column texts to maintain paragraph integrity, splits the text into **paragraphs**, tracks **page numbers** and the **document name**, and compiles the results into a **CSV file** for download.
     ## How It Works
     1. **Upload PDF**: Select and upload your PDF file.
+    2. **Choose Extraction Option**:
+       - **All Pages**: Extract text from every page in the PDF.
+       - **Page Range**: Specify the start and end pages to extract text from.
+    3. **Extract**: Click the "Extract and Download CSV" button to begin extraction.
+    4. **Progress Updates**: Watch the progress bar as each page is processed.
+    5. **Download**: Once complete, download the CSV file containing the extracted data.
     """)
     with gr.Row():
         pdf_input = gr.File(label="📁 Upload PDF", type="filepath")
+    with gr.Row():
+        extraction_option = gr.Radio(
+            choices=["All Pages", "Page Range"],
+            value="All Pages",
+            label="Extraction Option"
+        )
+    with gr.Row():
+        start_page = gr.Number(label="📝 Start Page", value=1, precision=0, visible=False)
+        end_page = gr.Number(label="📝 End Page", value=1, precision=0, visible=False)
+    # Show or hide start/end page inputs based on extraction option
+    def toggle_page_range(option):
+        if option == "Page Range":
+            return gr.update(visible=True), gr.update(visible=True)
+        else:
+            return gr.update(visible=False), gr.update(visible=False)
+    extraction_option.change(
+        fn=toggle_page_range,
+        inputs=[extraction_option],
+        outputs=[start_page, end_page]
+    )
     with gr.Row():
         extract_button = gr.Button("🟢 Extract and Download CSV")
     with gr.Row():
         download_csv = gr.File(label="📥 Download Extracted CSV")
     extract_button.click(
         fn=extract_and_save,
+        inputs=[pdf_input, extraction_option, start_page, end_page],
+        outputs=[download_csv],
+        show_progress=True
     )
 # Launch the Gradio