Spaces:

okewunmi
/

pdf-text-extraction

Sleeping

App Files Files Community

pdf-text-extraction / app.py

okewunmi

Update app.py

2d31420 verified 6 months ago

raw

history blame contribute delete

7.05 kB

	import gradio as gr
	import fitz # PyMuPDF
	import requests
	import os
	import tempfile
	import base64
	from typing import Optional, Tuple

	# OCR.space API configuration
	OCR_API_KEY = os.getenv('OCR_API_KEY', 'your_ocr_space_api_key_here')
	OCR_API_URL = 'https://api.ocr.space/parse/image'

	def extract_text_with_ocr(pdf_file_path: str) -> str:
	    """Run OCR on the first page of a PDF through the OCR.space API.

	    The first page is rendered to a PNG at 2x zoom, base64-encoded and
	    posted to ``OCR_API_URL``. Later pages are not processed.

	    Args:
	        pdf_file_path: Filesystem path of the PDF to read.

	    Returns:
	        The recognised text on success. Failures are reported in-band
	        rather than raised, as a string starting with ``"OCR Error:"``
	        (the service flagged a processing error), ``"OCR API Error:"``
	        (non-200 HTTP status) or ``"OCR processing error:"`` (anything
	        raised locally, e.g. an unreadable PDF or a network failure).
	        ``"No text extracted from OCR"`` is returned when the response
	        carries no parsed results.
	    """
	    try:
	        # The context manager closes the document even when rendering fails;
	        # the previous explicit close() was skipped on any exception.
	        with fitz.open(pdf_file_path) as doc:
	            if doc.page_count == 0:
	                return "OCR processing error: PDF has no pages"

	            # 2x zoom on both axes gives the OCR engine more pixels per glyph.
	            pix = doc[0].get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
	            img_data = pix.tobytes("png")

	        # The image is sent inline as a base64 data URI.
	        img_base64 = base64.b64encode(img_data).decode('utf-8')

	        payload = {
	            'apikey': OCR_API_KEY,
	            'language': 'eng',
	            'isOverlayRequired': False,
	            'base64Image': f'data:image/png;base64,{img_base64}',
	            'iscreatesearchablepdf': False,
	            'issearchablepdfhidetextlayer': False,
	        }

	        response = requests.post(OCR_API_URL, data=payload, timeout=60)
	        if response.status_code != 200:
	            return f"OCR API Error: {response.status_code}"

	        result = response.json()
	        if result.get('IsErroredOnProcessing', False):
	            error_message = result.get('ErrorMessage', 'Unknown error')
	            # OCR.space can return ErrorMessage as a list of strings; join it
	            # so the user does not see a Python list repr.
	            if isinstance(error_message, (list, tuple)):
	                error_message = "; ".join(str(part) for part in error_message)
	            return f"OCR Error: {error_message or 'Unknown error'}"

	        parsed_results = result.get('ParsedResults', [])
	        if parsed_results:
	            return parsed_results[0].get('ParsedText', 'No text found')
	        return "No text extracted from OCR"

	    except Exception as e:
	        # Deliberate catch-all: callers expect an error string, not an exception.
	        return f"OCR processing error: {str(e)}"

	def extract_text_from_pdf(pdf_file) -> Tuple[str, str]:
	    """Extract text from an uploaded PDF, falling back to OCR when needed.

	    PyMuPDF text extraction is tried first. If it yields 50 characters or
	    fewer, the PDF is treated as image-based and ``extract_text_with_ocr``
	    is used instead, provided ``OCR_API_KEY`` has been configured.

	    Args:
	        pdf_file: The uploaded file, either as a path (``str`` or
	            ``os.PathLike``) or as an object exposing the path as ``.name``.
	            ``None`` means nothing was uploaded.

	    Returns:
	        A ``(text, status)`` tuple. ``text`` is the extracted text or a
	        user-facing error message; ``status`` is a short status label.
	        Errors are reported through the tuple, never raised.
	    """
	    if pdf_file is None:
	        return "No file uploaded", "❌ Error"

	    status = "✅ Success"
	    # Every in-band failure prefix produced by extract_text_with_ocr. The
	    # HTTP-level "OCR API Error:" used to be missing here, so a failed
	    # request was displayed as if it were extracted text.
	    ocr_failure_prefixes = ("OCR Error:", "OCR API Error:", "OCR processing error:")

	    try:
	        # Accept a plain path as well as a file-like wrapper with a .name path.
	        if isinstance(pdf_file, (str, os.PathLike)):
	            pdf_path = os.fspath(pdf_file)
	        else:
	            pdf_path = pdf_file.name

	        # Primary method: PyMuPDF text extraction. The context manager closes
	        # the document even if a page fails to parse.
	        sections = []
	        with fitz.open(pdf_path) as doc:
	            for page_num, page in enumerate(doc, start=1):
	                page_text = page.get_text("text")
	                if page_text.strip():
	                    sections.append(f"\n--- Page {page_num} ---\n{page_text}\n")

	        text = "".join(sections).strip()

	        # If we got meaningful text, return it
	        if len(text) > 50:  # Arbitrary threshold
	            return text, status

	        # If no text or very little text, try OCR fallback
	        status = "⚠️ Using OCR (Image-based PDF detected)"

	        # Check if OCR API key is configured
	        if OCR_API_KEY == 'your_ocr_space_api_key_here':
	            return ("No extractable text found. This appears to be an image-based PDF.\n"
	                    "To extract text from image-based PDFs, please:\n"
	                    "1. Get a free API key from https://ocr.space/ocrapi\n"
	                    "2. Set the OCR_API_KEY environment variable\n"
	                    "3. Restart the application"), "❌ OCR Not Configured"

	        # Try OCR extraction
	        ocr_text = extract_text_with_ocr(pdf_path)

	        if ocr_text.startswith(ocr_failure_prefixes):
	            return f"Primary extraction failed, OCR fallback error:\n{ocr_text}", "❌ OCR Failed"

	        return f"Extracted using OCR:\n\n{ocr_text}", status

	    except Exception as e:
	        # Top-level UI boundary: turn any failure into a readable message.
	        details = str(e)
	        error_msg = f"Error processing PDF: {details}"

	        # Try to provide helpful error messages
	        if "No such file" in details:
	            error_msg = "File not found. Please try uploading the PDF again."
	        elif "not a PDF" in details:
	            error_msg = "Invalid file format. Please upload a valid PDF file."
	        elif "encrypted" in details.lower():
	            error_msg = "This PDF is password-protected. Please provide an unlocked PDF."
	        elif "corrupted" in details.lower():
	            error_msg = "This PDF file appears to be corrupted. Please try a different file."

	        return error_msg, "❌ Error"

	def clear_output():
	    """Return the reset values for the text box and the status field."""
	    cleared_text = ""
	    idle_status = "🔄 Ready"
	    return cleared_text, idle_status

	# Create the Gradio interface
	# Build the whole UI inside one Blocks context, bound to `demo`.
	# Layout: a header, then one row with two columns (controls on the left,
	# extracted text on the right), then the event wiring and a footer.
	with gr.Blocks(title="PDF Text Extraction App", theme=gr.themes.Soft()) as demo:
	    gr.Markdown("# 📄 PDF Text Extraction App")
	    gr.Markdown("""
	Upload a PDF file to extract its text content.

	Features:
	- ✅ Direct text extraction from text-based PDFs
	- 🔍 OCR fallback for image-based PDFs (requires OCR.space API key)
	- 📊 Status indicators for extraction method used
	""")

	    with gr.Row():
	        # Left column (scale=1): upload widget, action buttons, status, OCR note.
	        with gr.Column(scale=1):
	            # Restricted to .pdf; type="filepath" selects how the upload is
	            # handed to the handler.
	            # NOTE(review): confirm the handler's use of `.name` matches the
	            # value this setting produces in the installed Gradio version.
	            pdf_input = gr.File(
	                label="📎 Upload PDF File",
	                file_types=[".pdf"],
	                type="filepath"
	            )

	            with gr.Row():
	                extract_btn = gr.Button("🔍 Extract Text", variant="primary", size="lg")
	                clear_btn = gr.Button("🗑️ Clear", variant="secondary")

	            # Status indicator
	            status_output = gr.Textbox(
	                label="Status",
	                value="🔄 Ready",
	                interactive=False,
	                max_lines=1
	            )

	            # OCR Configuration info
	            gr.Markdown("""
	OCR Configuration:
	Set `OCR_API_KEY` environment variable for image-based PDF support.
	Get free API key at: https://ocr.space/ocrapi
	""")

	        # Right column (scale=2): the extracted text.
	        with gr.Column(scale=2):
	            text_output = gr.Textbox(
	                label="📝 Extracted Text",
	                lines=25,
	                max_lines=50,
	                placeholder="Extracted text will appear here...",
	                show_copy_button=True
	            )

	    # Event handlers
	    # Each handler returns (text, status), matching the two outputs below.
	    extract_btn.click(
	        fn=extract_text_from_pdf,
	        inputs=pdf_input,
	        outputs=[text_output, status_output]
	    )

	    # Clear resets only the text and status outputs; pdf_input is not among
	    # the outputs, so the uploaded file stays selected.
	    clear_btn.click(
	        fn=clear_output,
	        outputs=[text_output, status_output]
	    )

	    # Auto-extract when file is uploaded
	    # NOTE(review): `change` presumably also fires when the file is removed,
	    # in which case the handler would receive None — confirm.
	    pdf_input.change(
	        fn=extract_text_from_pdf,
	        inputs=pdf_input,
	        outputs=[text_output, status_output]
	    )

	    # Footer
	    gr.Markdown("""
	---
	Tips:
	- For best results with image-based PDFs, ensure good image quality
	- Large PDFs may take longer to process
	- OCR works best with clear, high-contrast text
	""")

	# Launch the app
	if __name__ == "__main__":
	    # Server settings collected in one place, then passed to launch():
	    # bind on all interfaces, port 7860, no public share link, debug on.
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "share": False,
	        "debug": True,
	    }
	    demo.launch(**launch_options)