# Captured from a hosted file view (Spaces status: Running; file size: 2,527 bytes; revision 03582cd).
import logging
import os
import re
import warnings

import gradio as gr
import pandas as pd
import pdfplumber
# Configure logging for pdfminer: raise the threshold of the 'pdfminer' logger
# to ERROR so that lower-severity records (warnings, info, debug) are dropped.
logging.getLogger('pdfminer').setLevel(logging.ERROR) # Only show errors, not warnings
def _collect_pdf_text(pdf_path):
    """Return the page text followed by the table cell text of every page in the PDF."""
    chunks = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Extract text from the page
            page_text = page.extract_text()
            if page_text:
                chunks.append(page_text + "\n")
            # Extract text from tables (if any): one output line per table row,
            # each string cell followed by a space; non-string cells are skipped.
            for table in page.extract_tables():
                for row in table:
                    chunks.extend(cell + " " for cell in row if isinstance(cell, str))
                    chunks.append("\n")
    # Join once at the end instead of growing a string with repeated "+=".
    return "".join(chunks)


def extract_text_from_pdf(pdf_path, suppress_warnings=True):
    """
    Extracts all text from a PDF, including text from nested tables and complex layouts.

    For every page, the page text is emitted first (when non-empty), followed by
    the string cells of each table found on that page, one line per table row.

    Parameters:
    pdf_path (str): Path to the PDF file
    suppress_warnings (bool): Whether to suppress PDF parsing warnings (default: True)

    Returns:
    str: The concatenated text of all pages; an empty string if nothing was extracted.
    """
    if not suppress_warnings:
        return _collect_pdf_text(pdf_path)
    # Scope the "CropBox" filter to this call. Installing it with a bare
    # warnings.filterwarnings() would leave it active process-wide, so a later
    # call with suppress_warnings=False could no longer surface the warning.
    # NOTE: catch_warnings() swaps global state and is documented as not
    # thread-safe; under concurrent calls the filter may outlive this block.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning, message="CropBox.*")
        return _collect_pdf_text(pdf_path)
def process_pdf(file):
    """
    Processes the uploaded PDF file and returns the extracted text.

    Parameters:
    file: The uploaded file as passed in by the UI. May be None (nothing
        uploaded), a filesystem path string, or an object exposing the path
        through a ``name`` attribute.

    Returns:
    str: The extracted text; a prompt to upload a file when ``file`` is None;
        or an "Error processing PDF: ..." message when extraction fails.
    """
    if file is None:
        return "Please upload a PDF file."
    try:
        # A plain path string has no ``.name`` attribute, so use it directly;
        # file-like upload objects carry their path in ``.name``.
        pdf_path = file if isinstance(file, str) else file.name
        return extract_text_from_pdf(pdf_path)
    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing, but
        # keep the traceback in the logs so the cause is not lost.
        logging.getLogger(__name__).exception("Failed to extract text from %r", file)
        return f"Error processing PDF: {str(e)}"
# Create the Gradio interface: an upload column (file picker + button) next to
# a read-only textbox that shows the extracted text.
with gr.Blocks() as demo:
    gr.Markdown("# PDF Text Extractor")
    gr.Markdown("Upload a PDF file to extract its text content.")
    with gr.Row():
        with gr.Column():
            file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
            submit_btn = gr.Button("Extract Text")
        with gr.Column():
            text_output = gr.Textbox(label="Extracted Text", lines=30, max_lines=50, interactive=False)
    submit_btn.click(
        fn=process_pdf,
        inputs=file_input,
        outputs=text_output
    )
    # Replace with actual example files if available. Only files that exist are
    # registered: with cache_examples=True the examples are processed up front,
    # so a missing placeholder file would otherwise break the app at startup.
    example_files = [path for path in ["example.pdf"] if os.path.isfile(path)]
    if example_files:
        gr.Examples(
            examples=example_files,
            inputs=file_input,
            outputs=text_output,
            fn=process_pdf,
            cache_examples=True,
            label="Try an example"
        )
# Run the app only when this file is executed as a script (not when imported).
if __name__ == "__main__":
    demo.launch()