Spaces:

edouardlgp
/

Job_Classification

Running

App Files Files Community

Job_Classification / app.py

edouardlgp

Update app.py

d8cfa3f verified 8 months ago

raw

history blame

2.28 kB

	import gradio as gr
	import pdfplumber
	import pandas as pd
	import re
	import warnings
	import logging

	# Raise the threshold of the 'pdfminer' logger to ERROR so that its records
	# below that level (e.g. warnings) are not emitted; presumably these are
	# noise produced while pdfplumber parses PDFs.
	logging.getLogger('pdfminer').setLevel(logging.ERROR) # errors and above still get through

	def extract_text_from_pdf(pdf_path: str, suppress_warnings: bool = True) -> str:
	    """
	    Extract the text of every page of a PDF, followed by the text of its tables.

	    For each page the result contains the page's plain text (when it is
	    non-empty) terminated by a newline, then one line per table row in which
	    every string cell is followed by a single space. Table text is appended in
	    addition to whatever ``extract_text()`` returned for the page, so table
	    content may appear twice in the output.

	    Parameters:
	    pdf_path (str): Path to the PDF file
	    suppress_warnings (bool): Whether to ignore ``UserWarning``s whose message
	        starts with "CropBox" (default: True)

	    Returns:
	    str: The concatenated text; an empty string when nothing could be extracted.
	    """
	    if suppress_warnings:
	        # NOTE: this registers a process-wide filter in the warnings module;
	        # it stays in effect after this function returns.
	        warnings.filterwarnings("ignore", category=UserWarning, message="CropBox.*")

	    # Collect the pieces in a list and join once at the end instead of
	    # growing a string with ``+=`` inside three nested loops.
	    chunks = []

	    with pdfplumber.open(pdf_path) as pdf:
	        for page in pdf.pages:
	            # Plain text of the page (skipped when None or empty)
	            page_text = page.extract_text()
	            if page_text:
	                chunks.append(page_text + "\n")

	            # Text from tables (if any); non-string cells such as None are skipped
	            for table in page.extract_tables():
	                for row in table:
	                    chunks.extend(cell + " " for cell in row if isinstance(cell, str))
	                    chunks.append("\n")

	    return "".join(chunks)

	def process_pdf(file):
	    """
	    Processes the uploaded PDF file and returns the extracted text.

	    Parameters:
	    file: The value delivered by the upload component. May be ``None`` (nothing
	        uploaded), a path string, or an object exposing the path as ``.name``.

	    Returns:
	    str: The extracted text, a prompt asking for an upload when ``file`` is
	        ``None``, or a message starting with "Error processing PDF: " when
	        extraction fails.
	    """
	    if file is None:
	        return "Please upload a PDF file."

	    # Accept both plain path strings and file-like wrappers that carry the
	    # path in ``.name``; a bare string has no ``.name`` and is used as-is.
	    pdf_path = getattr(file, "name", file)

	    try:
	        return extract_text_from_pdf(pdf_path)
	    except Exception as e:
	        # UI boundary: report any failure to the user as text instead of
	        # letting the handler raise.
	        return f"Error processing PDF: {e}"

	# Assemble the Gradio UI: a two-column layout with the upload controls on
	# one side and the read-only result box on the other.
	with gr.Blocks() as demo:
	    gr.Markdown("# PDF Text Extractor")
	    gr.Markdown("Upload a PDF file to extract its text content.")

	    with gr.Row():
	        # Input column: file picker restricted to .pdf plus the trigger button
	        with gr.Column():
	            file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
	            submit_btn = gr.Button("Extract Text")
	        # Output column: non-editable text area showing the extraction result
	        with gr.Column():
	            text_output = gr.Textbox(
	                label="Extracted Text",
	                lines=30,
	                max_lines=50,
	                interactive=False,
	            )

	    # Clicking the button feeds the uploaded file to process_pdf and writes
	    # the returned string into the output box.
	    submit_btn.click(process_pdf, inputs=file_input, outputs=text_output)

	# Start the app only when this file is executed as a script
	if __name__ == "__main__":
	    demo.launch()