Spaces:
Running
Running
| import gradio as gr | |
| import pdfplumber | |
| import pandas as pd | |
| import re | |
| import warnings | |
| import logging | |
# Keep pdfminer quiet: raise its logger threshold so that only records at
# ERROR level or above are emitted (warnings are dropped).
logging.getLogger("pdfminer").setLevel(logging.ERROR)
def extract_text_from_pdf(pdf_path, suppress_warnings=True):
    """
    Extract all text from a PDF: each page's text followed by its table cells.

    For every page, the page text (when non-empty) is emitted followed by a
    newline. Every table found on that page is then emitted row by row: each
    string cell is followed by a single space and each row is terminated by a
    newline. Non-string cells (e.g. ``None``) are skipped.

    NOTE(review): table cells are extracted in addition to the page text, so
    table content may show up twice in the result -- confirm this is intended.

    Parameters:
        pdf_path (str): Path to the PDF file.
        suppress_warnings (bool): When True (default), register a warnings
            filter that ignores ``UserWarning`` messages matching
            ``CropBox.*``. The filter is installed process-wide and remains
            active after this function returns.

    Returns:
        str: The concatenated text; an empty string when nothing was extracted.

    Exceptions raised while opening or parsing the PDF are not caught here and
    propagate to the caller.
    """
    if suppress_warnings:
        warnings.filterwarnings("ignore", category=UserWarning, message="CropBox.*")

    # Collect fragments and join once at the end rather than growing a string
    # with ``+=`` inside nested loops.
    parts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                parts.append(page_text)
                parts.append("\n")

            # Tables are flattened to plain text: one line per row.
            for table in page.extract_tables():
                for row in table:
                    parts.extend(cell + " " for cell in row if isinstance(cell, str))
                    parts.append("\n")

    return "".join(parts)
def process_pdf(file):
    """
    Process the uploaded PDF file and return the extracted text.

    Parameters:
        file: The upload as handed over by the UI. Either a filesystem path
            (``str``) or an object exposing that path through a ``.name``
            attribute. ``None`` means nothing was uploaded.

    Returns:
        str: The extracted text on success, a prompt asking for a file when
        ``file`` is ``None``, or a message prefixed with
        ``"Error processing PDF: "`` when extraction raises.
    """
    if file is None:
        return "Please upload a PDF file."

    # Accept both upload shapes: a file-like wrapper carries the path in
    # ``.name``; a plain path string has no such attribute and is used as-is.
    pdf_path = getattr(file, "name", file)

    try:
        return extract_text_from_pdf(pdf_path)
    except Exception as e:
        # UI boundary: report the failure in the output box instead of raising.
        return f"Error processing PDF: {e}"
# Assemble the UI: the first column holds the upload widget and the trigger
# button, the second column shows the extracted text.
with gr.Blocks() as demo:
    gr.Markdown("# PDF Text Extractor")
    gr.Markdown("Upload a PDF file to extract its text content.")

    with gr.Row():
        with gr.Column():
            file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
            submit_btn = gr.Button("Extract Text")
        with gr.Column():
            text_output = gr.Textbox(
                label="Extracted Text",
                lines=30,
                max_lines=50,
                interactive=False,
            )

    # Clicking the button runs the extraction on the uploaded file and writes
    # the returned string into the read-only text box.
    submit_btn.click(fn=process_pdf, inputs=file_input, outputs=text_output)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()