edouardlgp's picture
Update app.py
d8cfa3f verified
raw
history blame
2.28 kB
import gradio as gr
import pdfplumber
import pandas as pd
import re
import warnings
import logging
# Raise the threshold of the 'pdfminer' logger to ERROR so that records below
# that level (warnings, info, debug) logged through it are not emitted.
logging.getLogger('pdfminer').setLevel(logging.ERROR)  # Only show errors, not warnings
def _collect_pdf_text(pdf_path: str) -> str:
    """Return the text of every page of the PDF, each followed by its table cells."""
    parts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Skip pages for which extract_text() returns nothing (falsy).
            page_text = page.extract_text()
            if page_text:
                parts.append(page_text + "\n")
            # Append table contents: one output line per row, each string
            # cell followed by a single space; non-string cells are skipped.
            # NOTE(review): this may repeat text already returned by
            # extract_text() for the same page -- confirm that is intended.
            for table in page.extract_tables():
                for row in table:
                    parts.extend(cell + " " for cell in row if isinstance(cell, str))
                    parts.append("\n")
    # Join once at the end instead of growing a string with repeated "+=".
    return "".join(parts)


def extract_text_from_pdf(pdf_path: str, suppress_warnings: bool = True) -> str:
    """
    Extract all text from a PDF, page by page, including text found in tables.

    For each page the page text is emitted first (followed by a newline),
    then every table row as one line of space-separated string cells.

    Parameters:
    pdf_path (str): Path to the PDF file
    suppress_warnings (bool): Whether to suppress "CropBox..." UserWarnings
        raised while parsing (default: True)

    Returns:
    str: The concatenated text of all pages ("" if nothing was extracted).
    """
    if not suppress_warnings:
        return _collect_pdf_text(pdf_path)
    # Scope the filter to this call: a bare warnings.filterwarnings() would
    # stay installed for the rest of the process, so a later call with
    # suppress_warnings=False (and any unrelated code) would be silenced too.
    # Caveat: catch_warnings() swaps process-global filter state and is not
    # thread-safe; concurrent calls may briefly observe each other's filters.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning, message="CropBox.*")
        return _collect_pdf_text(pdf_path)
def process_pdf(file):
    """
    Process the uploaded PDF file and return the extracted text.

    Parameters:
    file: The uploaded file as delivered by the file input: either a
        filesystem path string, or an object whose ``name`` attribute holds
        the path. None means nothing was uploaded.

    Returns:
    str: The extracted text, a prompt asking for an upload when ``file`` is
        None, or an "Error processing PDF: ..." message if extraction fails.
    """
    if file is None:
        return "Please upload a PDF file."
    try:
        # Accept a plain path string as well as a file-like wrapper exposing
        # the path through ``.name``.
        pdf_path = file if isinstance(file, str) else file.name
        return extract_text_from_pdf(pdf_path)
    except Exception as e:
        # UI boundary: report the failure to the user instead of raising, but
        # keep the traceback in the logs so the cause can be diagnosed.
        logging.getLogger(__name__).exception("PDF extraction failed")
        return f"Error processing PDF: {e}"
# Build the Gradio UI: upload controls on the left, extracted text on the right.
with gr.Blocks() as demo:
    gr.Markdown("# PDF Text Extractor")
    gr.Markdown("Upload a PDF file to extract its text content.")

    with gr.Row():
        # Left column: file picker (PDF only) and the trigger button.
        with gr.Column():
            file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
            submit_btn = gr.Button("Extract Text")
        # Right column: read-only box showing the extraction result.
        with gr.Column():
            text_output = gr.Textbox(
                label="Extracted Text",
                lines=30,
                max_lines=50,
                interactive=False,
            )

    # Clicking the button runs process_pdf on the uploaded file and shows
    # its return value in the text box.
    submit_btn.click(fn=process_pdf, inputs=file_input, outputs=text_output)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()