edouardlgp committed on
Commit
03582cd
·
verified ·
1 Parent(s): 565345e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -0
app.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import logging
import re
import warnings
from pathlib import Path

import gradio as gr
import pandas as pd
import pdfplumber
7
+
8
# Raise the "pdfminer" logger threshold so that only ERROR-level (and more
# severe) records are emitted; warnings from that logger are dropped.
logging.getLogger("pdfminer").setLevel(logging.ERROR)
10
+
11
def extract_text_from_pdf(pdf_path, suppress_warnings=True):
    """Extract all text from a PDF, including the text of table cells.

    Pages are visited in order. For each page the plain page text is emitted
    first (when non-empty, followed by a newline), then every table found on
    the page is emitted row by row: each string cell followed by a space, and
    a newline at the end of each row. Non-string cells (e.g. ``None``) are
    skipped.

    Parameters:
        pdf_path (str): Path to the PDF file.
        suppress_warnings (bool): Whether to suppress ``UserWarning``s whose
            message starts with "CropBox" while the PDF is being parsed
            (default: True).

    Returns:
        str: The concatenated text; an empty string if nothing was extracted.

    Raises:
        Whatever ``pdfplumber.open`` or the page extraction calls raise (for
        example when the file is missing or is not a readable PDF).
    """
    chunks = []

    # Scope the warning filter to this call. Installing it with a bare
    # ``warnings.filterwarnings`` would leave it in the process-global filter
    # list, so a later call with ``suppress_warnings=False`` would still be
    # silenced.
    with warnings.catch_warnings():
        if suppress_warnings:
            warnings.filterwarnings("ignore", category=UserWarning, message="CropBox.*")

        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # Plain text of the page.
                page_text = page.extract_text()
                if page_text:
                    chunks.append(page_text + "\n")

                # Text from tables (if any).
                # NOTE(review): table text is presumably also part of
                # extract_text() output, so it may appear twice — confirm
                # this duplication is intended.
                for table in page.extract_tables():
                    for row in table:
                        chunks.extend(cell + " " for cell in row if isinstance(cell, str))
                        chunks.append("\n")

    # Join once at the end instead of growing a string with repeated ``+=``.
    return "".join(chunks)
40
+
41
def process_pdf(file):
    """Process the uploaded PDF file and return the extracted text.

    Parameters:
        file: The uploaded file, or ``None`` when nothing was uploaded. It may
            be a filesystem path given as a ``str`` or an object exposing the
            path through a ``name`` attribute.

    Returns:
        str: The extracted text on success, a prompt asking for an upload when
        ``file`` is ``None``, or a message starting with
        "Error processing PDF: " when anything goes wrong.
    """
    if file is None:
        return "Please upload a PDF file."

    try:
        # Accept a plain path string as well as a file-like wrapper with
        # ``.name``, so the function works with either form of upload value.
        pdf_path = file if isinstance(file, str) else file.name
        return extract_text_from_pdf(pdf_path)
    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing the
        # handler, but keep the traceback in the logs for diagnosis.
        logging.getLogger(__name__).exception("Failed to process PDF upload")
        return f"Error processing PDF: {e}"
53
+
54
# Create the Gradio interface: a file upload and a button on the left, the
# extracted text on the right.
with gr.Blocks() as demo:
    gr.Markdown("# PDF Text Extractor")
    gr.Markdown("Upload a PDF file to extract its text content.")

    with gr.Row():
        with gr.Column():
            file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
            submit_btn = gr.Button("Extract Text")
        with gr.Column():
            text_output = gr.Textbox(
                label="Extracted Text",
                lines=30,
                max_lines=50,
                interactive=False,
            )

    # Run the extraction when the button is clicked.
    submit_btn.click(
        fn=process_pdf,
        inputs=file_input,
        outputs=text_output,
    )

    # "example.pdf" is a placeholder. Only register it when the file is
    # actually present, because cached examples are executed ahead of time and
    # a missing file would otherwise break the app before it can start.
    example_pdf = Path("example.pdf")
    if example_pdf.is_file():
        gr.Examples(
            examples=[str(example_pdf)],
            inputs=file_input,
            outputs=text_output,
            fn=process_pdf,
            cache_examples=True,
            label="Try an example",
        )

# Run the app
if __name__ == "__main__":
    demo.launch()