Spaces:

edouardlgp
/

Job_Classification

Running

App Files Files Community

edouardlgp committed on May 10, 2025

Commit

03582cd

verified ·

1 Parent(s): 565345e

Create app.py

Browse files

Files changed (1) hide show

app.py +83 -0

app.py ADDED Viewed

	@@ -0,0 +1,83 @@

+import logging
+import os
+import re
+import warnings
+
+import gradio as gr
+import pandas as pd
+import pdfplumber
+# Configure logging for pdfminer: raise the threshold of the 'pdfminer' logger
+# so that only records at ERROR level or above are emitted by it.
+# NOTE(review): presumably these records are produced while pdfplumber parses a PDF - confirm.
+logging.getLogger('pdfminer').setLevel(logging.ERROR)  # Only show errors, not warnings
+def extract_text_from_pdf(pdf_path, suppress_warnings=True):
+    """
+    Extracts all text from a PDF, including text from nested tables and complex layouts.
+    Parameters:
+        pdf_path (str): Path to the PDF file
+        suppress_warnings (bool): Whether to suppress PDF parsing warnings (default: True)
+    """
+    text = ""
+    # Create a custom filter for the specific warning
+    if suppress_warnings:
+        warnings.filterwarnings("ignore", category=UserWarning, message="CropBox.*")
+    with pdfplumber.open(pdf_path) as pdf:
+        for page in pdf.pages:
+            # Extract text from the page
+            page_text = page.extract_text()
+            if page_text:
+                text += page_text + "\n"
+            # Extract text from tables (if any)
+            for table in page.extract_tables():
+                for row in table:
+                    for cell in row:
+                        if isinstance(cell, str):
+                            text += cell + " "
+                    text += "\n"
+    return text
+def process_pdf(file):
+    """
+    Processes the uploaded PDF file and returns the extracted text.
+    """
+    if file is None:
+        return "Please upload a PDF file."
+    try:
+        extracted_text = extract_text_from_pdf(file.name)
+        return extracted_text
+    except Exception as e:
+        return f"Error processing PDF: {str(e)}"
+# Create the Gradio interface
+with gr.Blocks() as demo:
+    gr.Markdown("# PDF Text Extractor")
+    gr.Markdown("Upload a PDF file to extract its text content.")
+    with gr.Row():
+        with gr.Column():
+            file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
+            submit_btn = gr.Button("Extract Text")
+        with gr.Column():
+            text_output = gr.Textbox(label="Extracted Text", lines=30, max_lines=50, interactive=False)
+    submit_btn.click(
+        fn=process_pdf,
+        inputs=file_input,
+        outputs=text_output
+    )
+    gr.Examples(
+        examples=["example.pdf"],  # Replace with actual example files if available
+        inputs=file_input,
+        outputs=text_output,
+        fn=process_pdf,
+        cache_examples=True,
+        label="Try an example"
+    )
+# Run the app
+if __name__ == "__main__":
+    demo.launch()