"""Gradio app that extracts the text content of an uploaded PDF file."""

import logging
import re
import warnings

import gradio as gr
import pandas as pd
import pdfplumber

# Configure logging for pdfminer: only show errors, not warnings.
logging.getLogger('pdfminer').setLevel(logging.ERROR)


def extract_text_from_pdf(pdf_path, suppress_warnings=True):
    """Extract the text of every page of a PDF, followed by its table cells.

    For each page the result contains the output of ``page.extract_text()``
    (when non-empty) followed by one line per table row found by
    ``page.extract_tables()``, where each string cell is followed by a
    single space. Text that lives inside a table may therefore appear twice:
    once in the page text and once in the table rows.

    Parameters:
        pdf_path (str): Path to the PDF file.
        suppress_warnings (bool): Whether to suppress ``UserWarning``
            messages starting with "CropBox" (default: True).

    Returns:
        str: The concatenated text of all pages and tables.
    """
    if suppress_warnings:
        # NOTE: this installs a process-wide filter that stays active after
        # the call returns; it is intentionally not scoped with
        # ``warnings.catch_warnings()``, which swaps global state and may
        # race when several requests are handled concurrently.
        warnings.filterwarnings(
            "ignore", category=UserWarning, message="CropBox.*"
        )

    # Collect fragments and join once instead of repeated ``str +=``.
    fragments = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                fragments.append(page_text + "\n")

            for table in page.extract_tables():
                for row in table:
                    # Non-string cells (e.g. ``None`` for empty cells) are skipped.
                    fragments.extend(
                        cell + " " for cell in row if isinstance(cell, str)
                    )
                    fragments.append("\n")

    return "".join(fragments)


def process_pdf(file):
    """Handle an upload from the UI and return the extracted text.

    Parameters:
        file: The value delivered by the ``gr.File`` input: ``None`` when
            nothing was uploaded, otherwise either a path string or an
            object exposing the path as ``.name``.

    Returns:
        str: The extracted text, a prompt to upload a file, or an error
        message if extraction raised an exception.
    """
    if file is None:
        return "Please upload a PDF file."

    # Accept both a plain path string and a file-like wrapper with ``.name``.
    pdf_path = getattr(file, "name", file)

    # UI boundary: report any failure to the user instead of crashing the app.
    try:
        return extract_text_from_pdf(pdf_path)
    except Exception as e:
        return f"Error processing PDF: {str(e)}"


# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# PDF Text Extractor")
    gr.Markdown("Upload a PDF file to extract its text content.")

    with gr.Row():
        with gr.Column():
            file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
            submit_btn = gr.Button("Extract Text")
        with gr.Column():
            text_output = gr.Textbox(
                label="Extracted Text",
                lines=30,
                max_lines=50,
                interactive=False,
            )

    submit_btn.click(
        fn=process_pdf,
        inputs=file_input,
        outputs=text_output,
    )

# Run the app
if __name__ == "__main__":
    demo.launch()