Spaces:

mbuck17
/

paddleocr-processor

Sleeping

App Files Files Community

paddleocr-processor / archive /app -og.py

mbuckle

Enhanced paddle test

78b142a 9 months ago

raw

history blame contribute delete

14 kB

	# app.py - PyMuPDF compatibility patches are applied BEFORE PaddleOCR is imported

	import os
	import subprocess
	import sys
	import tempfile
	import time
	import base64
	import json

	# Work around PaddleOCR import failures caused by a missing libssl.so.1.1
	def fix_ssl_library():
	    """Make libssl.so.1.1 available, installing it from the Ubuntu archive if needed.

	    Steps, in order:

	    1. If /usr/lib/x86_64-linux-gnu/libssl.so.1.1 already exists, do nothing.
	    2. Download the libssl1.1 .deb with ``wget`` and try ``dpkg -i``.
	    3. If ``dpkg -i`` fails, extract the package into /tmp/ssl_extract,
	       prepend its library directory to ``LD_LIBRARY_PATH`` (for child
	       processes) and preload the shared objects into this process.

	    Returns:
	        bool: True when the library was already present or the install /
	        extraction commands completed; False when any step raised.
	    """
	    import ctypes  # function-scope import: only the extraction fallback needs it

	    try:
	        if os.path.exists('/usr/lib/x86_64-linux-gnu/libssl.so.1.1'):
	            print("libssl.so.1.1 already exists")
	            return True

	        print("Attempting to install libssl1.1...")

	        subprocess.run([
	            'wget', '-q',
	            'http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb',
	            '-O', '/tmp/libssl1.1.deb'
	        ], check=True)

	        result = subprocess.run([
	            'dpkg', '-i', '/tmp/libssl1.1.deb'
	        ], capture_output=True, text=True)

	        if result.returncode != 0:
	            print("dpkg install failed, trying manual extraction...")
	            subprocess.run([
	                'dpkg', '-x', '/tmp/libssl1.1.deb', '/tmp/ssl_extract'
	            ], check=True)

	            lib_path = '/tmp/ssl_extract/usr/lib/x86_64-linux-gnu'
	            current_ld_path = os.environ.get('LD_LIBRARY_PATH', '')
	            if current_ld_path:
	                os.environ['LD_LIBRARY_PATH'] = f"{lib_path}:{current_ld_path}"
	            else:
	                os.environ['LD_LIBRARY_PATH'] = lib_path
	            print(f"Set LD_LIBRARY_PATH to: {os.environ['LD_LIBRARY_PATH']}")

	            # The dynamic loader reads LD_LIBRARY_PATH when the process
	            # starts, so the assignment above only helps child processes.
	            # Preload the extracted libraries so later imports in THIS
	            # process can resolve them. libcrypto goes first because libssl
	            # links against it; the package is expected to ship both, and a
	            # failed preload is reported rather than treated as fatal.
	            for lib_name in ('libcrypto.so.1.1', 'libssl.so.1.1'):
	                lib_file = os.path.join(lib_path, lib_name)
	                try:
	                    ctypes.CDLL(lib_file, mode=ctypes.RTLD_GLOBAL)
	                    print(f"Preloaded {lib_file}")
	                except OSError as load_err:
	                    print(f"Could not preload {lib_file}: {load_err}")

	        return True

	    except Exception as e:
	        # Best effort: callers continue and let the PaddleOCR import decide.
	        print(f"Failed to install libssl1.1: {e}")
	        return False

	# CRITICAL: these patches must be applied BEFORE PaddleOCR is imported (the function imports fitz itself)
	def monkey_patch_pymupdf():
	    """Add legacy camelCase aliases to PyMuPDF classes when they are missing.

	    Installs, only if the attribute does not already exist:

	    * ``fitz.Document.pageCount`` - read-only property returning ``page_count``
	    * ``fitz.Page.getPixmap``     - forwards to ``get_pixmap``
	    * ``fitz.Page.getText``       - forwards to ``get_text``

	    The shims forward any extra keyword arguments and only pass ``matrix`` /
	    ``alpha`` when the caller supplied them, so PyMuPDF's own defaults apply
	    instead of values hard-coded here.
	    """
	    print("Applying PyMuPDF compatibility patches...")

	    # Imported inside the function so the patch is applied to the module
	    # object that every later ``import fitz`` will receive.
	    import fitz

	    # Add pageCount property to Document class if it doesn't exist
	    if not hasattr(fitz.Document, 'pageCount'):
	        def pageCount_property(self):
	            return self.page_count

	        fitz.Document.pageCount = property(pageCount_property)
	        print("✓ Added pageCount compatibility property to PyMuPDF Document class")
	    else:
	        print("✓ pageCount already exists")

	    # Add getPixmap method to Page class if it doesn't exist
	    if not hasattr(fitz.Page, 'getPixmap'):
	        def getPixmap(self, matrix=None, alpha=None, **kwargs):
	            # Only forward what the caller actually set; everything else
	            # (colorspace, clip, ...) travels through **kwargs untouched.
	            if matrix is not None:
	                kwargs['matrix'] = matrix
	            if alpha is not None:
	                kwargs['alpha'] = alpha
	            return self.get_pixmap(**kwargs)

	        fitz.Page.getPixmap = getPixmap
	        print("✓ Added getPixmap compatibility method to PyMuPDF Page class")
	    else:
	        print("✓ getPixmap already exists")

	    # Add getText method if it doesn't exist
	    if not hasattr(fitz.Page, 'getText'):
	        def getText(self, option="text", **kwargs):
	            return self.get_text(option, **kwargs)

	        fitz.Page.getText = getText
	        print("✓ Added getText compatibility method to PyMuPDF Page class")
	    else:
	        print("✓ getText already exists")

	    print("✓ PyMuPDF compatibility patches applied successfully")

	def try_paddle_import():
	    """Return the PaddleOCR class, applying the known workarounds first.

	    Runs the libssl fix and the PyMuPDF patches, sets PADDLE_GIT_DISABLE,
	    then imports paddleocr. If the import fails with a message mentioning
	    libssl.so.1.1, paddlepaddle is reinstalled at 2.4.2 and the import is
	    retried once.

	    Raises:
	        ImportError: the original import error, when no attempt succeeded.
	    """
	    # Workarounds must be in place before paddleocr is imported.
	    fix_ssl_library()
	    monkey_patch_pymupdf()

	    os.environ['PADDLE_GIT_DISABLE'] = '1'

	    try:
	        from paddleocr import PaddleOCR
	    except ImportError as import_error:
	        ssl_related = 'libssl.so.1.1' in str(import_error)
	        if ssl_related:
	            print("Still having SSL issues, trying alternative PaddlePaddle version...")

	            pip_cmd = [sys.executable, '-m', 'pip']
	            try:
	                # The uninstall result is deliberately not checked.
	                subprocess.run(pip_cmd + ['uninstall', 'paddlepaddle', '-y'],
	                               capture_output=True)
	                subprocess.run(pip_cmd + ['install', 'paddlepaddle==2.4.2'],
	                               check=True)
	                from paddleocr import PaddleOCR
	                return PaddleOCR
	            except Exception as retry_error:
	                print(f"Failed to install alternative version: {retry_error}")

	        print(f"PaddleOCR import failed: {import_error}")
	        raise import_error
	    else:
	        return PaddleOCR

	# Import Gradio
	import gradio as gr

	# PyMuPDF is imported here before the patches run. That is fine: the patches
	# are applied inside try_paddle_import() below and set attributes on the
	# classes of this same cached module object.
	import fitz

	# Import PaddleOCR (with the SSL / PyMuPDF workarounds) and load the models.
	# The app cannot work without OCR, so any failure terminates the process.
	print("Attempting to import PaddleOCR...")
	try:
	    PaddleOCR = try_paddle_import()
	    print("Loading PaddleOCR models...")
	    ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
	    print("PaddleOCR models loaded successfully!")
	except Exception as e:
	    print(f"Failed to load PaddleOCR: {e}")
	    print("Application will exit - compatibility issue not resolved")
	    sys.exit(1)

	# Diagnostic self-test: open a minimal one-page PDF from memory and report
	# whether the pageCount alias is visible. Failures are printed, not raised.
	print("Testing monkey patch...")
	test_doc = None
	try:
	    # NOTE(review): io is not referenced in this section; kept so the module
	    # namespace is unchanged.
	    import io
	    pdf_content = b"%PDF-1.4\n1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >>\nendobj\nxref\n0 4\n0000000000 65535 f \n0000000010 00000 n \n0000000053 00000 n \n0000000100 00000 n \ntrailer\n<< /Size 4 /Root 1 0 R >>\nstartxref\n179\n%%EOF"
	    test_doc = fitz.open(stream=pdf_content, filetype="pdf")

	    if hasattr(test_doc, 'pageCount'):
	        print(f"✓ Monkey patch successful! pageCount = {test_doc.pageCount}")
	    else:
	        print("✗ Monkey patch failed - pageCount not found")
	        print(f"Available attributes: {[attr for attr in dir(test_doc) if 'count' in attr.lower()]}")
	except Exception as e:
	    print(f"Monkey patch test failed: {e}")
	finally:
	    # Single cleanup path. Compare against None explicitly: a Document's
	    # truthiness goes through len(), which is not a "was it opened" test.
	    if test_doc is not None:
	        try:
	            test_doc.close()
	        except Exception as close_err:
	            print(f"Monkey patch test cleanup failed: {close_err}")
	# Request handlers: document processing for the upload tab and for API-style JSON requests
	def process_document(file):
	    """Run PaddleOCR on an uploaded document and summarise the result.

	    Args:
	        file: The upload handed over by the UI. Either an object exposing the
	            on-disk path as ``.name`` or a plain path (``str`` /
	            ``os.PathLike``). ``None`` means nothing was uploaded.

	    Returns:
	        tuple[str, str, str]: ``(summary, extracted_text, api_response)``.
	        ``api_response`` is a JSON string. On failure the first element is an
	        error message, the text is empty and the JSON has ``success: false``.
	    """
	    if file is None:
	        return "No file uploaded", "", ""

	    start_time = time.time()

	    try:
	        # Accept both path-like values and objects carrying the path in .name.
	        if isinstance(file, (str, os.PathLike)):
	            file_path = os.fspath(file)
	        else:
	            file_path = file.name

	        filename = os.path.basename(file_path)
	        print(f"Processing: {filename}")
	        print(f"File path: {file_path}")

	        # Count pages if PDF (images count as a single page)
	        total_pages = 1
	        if filename.lower().endswith('.pdf'):
	            try:
	                print(f"Opening PDF: {file_path}")
	                doc = fitz.open(file_path)
	                try:
	                    # Report which page-count spelling this PyMuPDF offers.
	                    print(f"Document has pageCount attribute: {hasattr(doc, 'pageCount')}")
	                    print(f"Document has page_count attribute: {hasattr(doc, 'page_count')}")

	                    if hasattr(doc, 'pageCount'):
	                        total_pages = doc.pageCount
	                        print(f"Used pageCount: {total_pages}")
	                    elif hasattr(doc, 'page_count'):
	                        total_pages = doc.page_count
	                        print(f"Used page_count: {total_pages}")
	                    else:
	                        total_pages = len(doc)
	                        print(f"Used len(): {total_pages}")
	                finally:
	                    # Always release the handle, even if counting failed.
	                    doc.close()
	            except Exception as e:
	                # Page counting is informational only; fall back to 1.
	                print(f"PDF page counting error: {e}")
	                total_pages = 1

	        # Run OCR
	        print(f"Running OCR on: {file_path}")
	        result = ocr.ocr(file_path, cls=True)

	        # Extract text. Each line is indexed as line[1] == (text, confidence);
	        # only lines with confidence above 0.5 are kept.
	        kept_lines = []
	        pages_processed = 0

	        for page_result in result or []:
	            if not page_result:
	                continue
	            pages_processed += 1
	            for line in page_result:
	                if len(line) >= 2 and line[1][1] > 0.5:
	                    kept_lines.append(line[1][0] + "\n")

	        extracted_text = "".join(kept_lines)

	        processing_time = time.time() - start_time

	        summary = f"""
	📄 File: {filename}
	📊 Pages Processed: {pages_processed}/{total_pages}
	⏱️ Processing Time: {processing_time:.2f} seconds
	📝 Text Length: {len(extracted_text)} characters
	🔧 OCR Engine: PaddleOCR
	"""

	        api_response = json.dumps({
	            "success": True,
	            "text": extracted_text,
	            "filename": filename,
	            "pages_processed": pages_processed,
	            "total_pages": total_pages,
	            "processing_time": processing_time,
	            "ocr_engine": "PaddleOCR"
	        }, indent=2)

	        return summary, extracted_text, api_response

	    except Exception as e:
	        # UI boundary: report the failure in all three outputs, never raise.
	        error_msg = f"Error processing file: {str(e)}"
	        print(f"Full error: {e}")
	        import traceback
	        traceback.print_exc()
	        return error_msg, "", json.dumps({"success": False, "error": str(e)})

	def process_api_request(api_data):
	    """Handle an API-style OCR request and return a JSON string.

	    Args:
	        api_data: The request, either a JSON string or an already-decoded
	            dict, with keys ``file`` (base64-encoded file content, required)
	            and ``filename`` (optional, default ``unknown.pdf``; only its
	            extension is used, as the temp-file suffix).

	    Returns:
	        str: JSON with ``success``, ``text``, ``filename`` and ``ocr_engine``
	        on success, or ``{"success": false, "error": ...}`` on any failure.
	        This function never raises.
	    """
	    try:
	        # Accept a decoded object as well as its JSON text.
	        data = api_data if isinstance(api_data, dict) else json.loads(api_data)

	        if 'file' not in data:
	            return json.dumps({"success": False, "error": "No file data provided"})

	        # Decode base64 file
	        file_data = base64.b64decode(data['file'])
	        filename = data.get('filename', 'unknown.pdf')

	        # Save to temp file. delete=False because OCR reopens it by path;
	        # the finally below removes it, including when the write fails.
	        tmp_file = tempfile.NamedTemporaryFile(
	            delete=False, suffix=os.path.splitext(filename)[1]
	        )
	        tmp_file_path = tmp_file.name

	        try:
	            with tmp_file:
	                tmp_file.write(file_data)

	            # Run OCR
	            result = ocr.ocr(tmp_file_path, cls=True)

	            # Extract text; `result or []` tolerates an empty / None result.
	            # NOTE(review): no confidence threshold is applied here, unlike
	            # the 0.5 cut-off in process_document - confirm this is intended.
	            text = "".join(
	                line[1][0] + "\n"
	                for page_result in result or []
	                if page_result
	                for line in page_result
	                if len(line) >= 2
	            )

	            return json.dumps({
	                "success": True,
	                "text": text,
	                "filename": filename,
	                "ocr_engine": "PaddleOCR"
	            })

	        finally:
	            os.unlink(tmp_file_path)

	    except Exception as e:
	        return json.dumps({"success": False, "error": str(e)})

	# Create Gradio interface with multiple tabs.
	# Components appear in the order they are created inside each `with` context,
	# so the statement order below is the page layout.
	with gr.Blocks(title="PaddleOCR Medical Document Processor") as demo:
	    gr.Markdown("# 🏥 PaddleOCR Medical Document Processor")
	    gr.Markdown("Upload medical documents (PDF/images) to extract text using PaddleOCR")

	    # Tab 1: upload a file and run process_document on it.
	    with gr.Tab("📄 File Upload"):
	        with gr.Row():
	            with gr.Column():
	                file_input = gr.File(
	                    label="Upload Document (PDF, JPG, PNG)",
	                    file_types=[".pdf", ".jpg", ".jpeg", ".png"]
	                )
	                process_btn = gr.Button("🔍 Process Document", variant="primary")

	            with gr.Column():
	                summary_output = gr.Markdown(label="📊 Processing Summary")

	        with gr.Row():
	            text_output = gr.Textbox(
	                label="📝 Extracted Text",
	                lines=15,
	                max_lines=20
	            )

	        # process_document returns (summary, text, api_response); the third
	        # value is routed to a hidden Textbox so it is not shown on this tab.
	        process_btn.click(
	            fn=process_document,
	            inputs=[file_input],
	            outputs=[summary_output, text_output, gr.Textbox(visible=False)]
	        )

	    # Tab 2: API documentation plus a manual tester for process_api_request.
	    with gr.Tab("🔌 API Integration"):
	        gr.Markdown("### For integration with your Vercel app:")
	        gr.Markdown("Endpoint: `https://mbuck17-paddleocr-processor.hf.space/api/predict`")
	        gr.Markdown("Method: POST")
	        gr.Markdown("Headers: `Content-Type: application/json`")

	        # NOTE(review): the sample shows an object payload while the test box
	        # below takes a JSON string - confirm which form API clients send.
	        with gr.Row():
	            with gr.Column():
	                gr.Markdown("Sample Request:")
	                gr.Code('''
	{
	"data": [
	{
	"file": "base64_encoded_file_data_here",
	"filename": "lab_report.pdf"
	}
	]
	}
	''', language="json")

	            with gr.Column():
	                gr.Markdown("Sample Response:")
	                gr.Code('''
	{
	"data": [
	{
	"success": true,
	"text": "Extracted text content...",
	"filename": "lab_report.pdf",
	"ocr_engine": "PaddleOCR"
	}
	]
	}
	''', language="json")

	        gr.Markdown("### Test API Request:")
	        api_input = gr.Textbox(
	            label="API Request (JSON)",
	            placeholder='{"file": "base64_encoded_file_data", "filename": "document.pdf"}',
	            lines=5
	        )
	        api_btn = gr.Button("🧪 Test API Request")
	        api_output = gr.Textbox(
	            label="API Response (JSON)",
	            lines=10
	        )

	        # The textbox content is passed to process_api_request as one argument
	        # and the returned JSON string is shown in api_output.
	        api_btn.click(
	            fn=process_api_request,
	            inputs=[api_input],
	            outputs=[api_output]
	        )

	    # Tab 3: static description of the service.
	    with gr.Tab("ℹ️ About"):
	        gr.Markdown("""
	### 🎯 Purpose
	This service extracts text from medical documents using PaddleOCR, specifically designed for lab reports and medical forms.

	### 🔧 Integration
	This Hugging Face Space can be integrated with your Vercel app as an external OCR service.

	### 📚 Supported Formats
	- PDF documents (multi-page)
	- JPEG/JPG images
	- PNG images

	### 🚀 Features
	- High accuracy OCR with PaddleOCR
	- Medical document optimization
	- Multi-page PDF support
	- RESTful API integration
	- Free hosting on Hugging Face
	- SSL compatibility fixes included

	### 🔗 Integration URL
	`https://mbuck17-paddleocr-processor.hf.space/api/predict`
	""")

	# Launch the app only when run as a script; listens on all interfaces, port 7860.
	if __name__ == "__main__":
	    demo.launch(server_name="0.0.0.0", server_port=7860)