# mbuckle's picture
# Enhanced paddle test
# 78b142a
# app.py - Correct structure with monkey patch BEFORE any fitz imports
import os
import subprocess
import sys
import tempfile
import time
import base64
import json
# SSL fix function (keep as is)
def fix_ssl_library():
    """Make OpenSSL 1.1 (``libssl.so.1.1``) available to this process.

    If the library is already present under ``/usr/lib/x86_64-linux-gnu``
    nothing is done.  Otherwise the Ubuntu ``libssl1.1`` package is
    downloaded with ``wget`` and installed with ``dpkg -i``.  When that
    install fails, the package is unpacked into ``/tmp/ssl_extract`` and the
    extracted libraries are made visible in two ways:

    * ``LD_LIBRARY_PATH`` is extended (inherited by child processes), and
    * the shared objects are preloaded with ``ctypes``, because the dynamic
      loader of the *current* process reads ``LD_LIBRARY_PATH`` only at
      start-up and would otherwise never look in the new directory.

    Returns:
        bool: ``True`` if the library was found or every step completed,
        ``False`` if any step raised (the error is printed, not re-raised).
    """
    import ctypes  # local import: only needed by the extraction fallback

    try:
        if os.path.exists('/usr/lib/x86_64-linux-gnu/libssl.so.1.1'):
            print("libssl.so.1.1 already exists")
            return True

        print("Attempting to install libssl1.1...")
        # NOTE(review): the package is fetched over plain HTTP and installed
        # without any checksum/signature verification - consider pinning a
        # known digest before trusting it.
        subprocess.run([
            'wget', '-q',
            'http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb',
            '-O', '/tmp/libssl1.1.deb'
        ], check=True)

        # A regular install needs root; failure is expected in sandboxes and
        # is handled by the extraction fallback below.
        result = subprocess.run([
            'dpkg', '-i', '/tmp/libssl1.1.deb'
        ], capture_output=True, text=True)

        if result.returncode != 0:
            print("dpkg install failed, trying manual extraction...")
            subprocess.run([
                'dpkg', '-x', '/tmp/libssl1.1.deb', '/tmp/ssl_extract'
            ], check=True)

            lib_path = '/tmp/ssl_extract/usr/lib/x86_64-linux-gnu'
            current_ld_path = os.environ.get('LD_LIBRARY_PATH', '')
            if current_ld_path:
                os.environ['LD_LIBRARY_PATH'] = f"{lib_path}:{current_ld_path}"
            else:
                os.environ['LD_LIBRARY_PATH'] = lib_path
            print(f"Set LD_LIBRARY_PATH to: {os.environ['LD_LIBRARY_PATH']}")

            # Preload into this process so later native imports that need
            # these sonames resolve against the already-loaded copies.
            # libcrypto goes first because libssl depends on it.
            for soname in ('libcrypto.so.1.1', 'libssl.so.1.1'):
                candidate = os.path.join(lib_path, soname)
                if os.path.exists(candidate):
                    ctypes.CDLL(candidate, mode=ctypes.RTLD_GLOBAL)
                    print(f"Preloaded {candidate}")

        return True
    except Exception as e:
        # Best-effort helper: report and let the caller decide what to do.
        print(f"Failed to install libssl1.1: {e}")
        return False
# CRITICAL: Apply monkey patch BEFORE importing fitz/PyMuPDF
def monkey_patch_pymupdf():
    """Add legacy camelCase aliases to PyMuPDF classes when they are missing.

    The following names are installed only if the class does not already
    provide them:

    * ``fitz.Document.pageCount`` - read-only property returning
      ``page_count``.
    * ``fitz.Page.getPixmap`` - forwards to ``get_pixmap``.
    * ``fitz.Page.getText`` - forwards to ``get_text``.

    The shims forward only the arguments the caller actually supplied, so
    the defaults of the underlying snake_case methods apply and additional
    keyword arguments (e.g. ``colorspace``, ``clip``) are passed through
    instead of raising ``TypeError``.

    The patch mutates the classes in place, so it is visible through every
    reference to the ``fitz`` module, including ones imported earlier.
    """
    print("Applying PyMuPDF compatibility patches...")

    # Imported here so that merely defining this function does not load fitz.
    import fitz

    # Document.pageCount -> Document.page_count
    if not hasattr(fitz.Document, 'pageCount'):
        def pageCount_property(self):
            return self.page_count

        fitz.Document.pageCount = property(pageCount_property)
        print("✓ Added pageCount compatibility property to PyMuPDF Document class")
    else:
        print("✓ pageCount already exists")

    # Page.getPixmap -> Page.get_pixmap
    if not hasattr(fitz.Page, 'getPixmap'):
        def getPixmap(self, matrix=None, alpha=None, **kwargs):
            # Forward only what was given; leaving matrix/alpha out lets
            # get_pixmap use its own defaults rather than None / True.
            if matrix is not None:
                kwargs['matrix'] = matrix
            if alpha is not None:
                kwargs['alpha'] = alpha
            return self.get_pixmap(**kwargs)

        fitz.Page.getPixmap = getPixmap
        print("✓ Added getPixmap compatibility method to PyMuPDF Page class")
    else:
        print("✓ getPixmap already exists")

    # Page.getText -> Page.get_text
    if not hasattr(fitz.Page, 'getText'):
        def getText(self, option="text", **kwargs):
            return self.get_text(option, **kwargs)

        fitz.Page.getText = getText
        print("✓ Added getText compatibility method to PyMuPDF Page class")
    else:
        print("✓ getText already exists")

    print("✓ PyMuPDF compatibility patches applied successfully")
def try_paddle_import():
    """Import and return the ``PaddleOCR`` class, applying workarounds first.

    Steps, in order:

    1. ``fix_ssl_library()`` tries to make ``libssl.so.1.1`` available.
    2. ``monkey_patch_pymupdf()`` adds the legacy PyMuPDF names; this must
       run before ``paddleocr`` is imported.
    3. ``PADDLE_GIT_DISABLE=1`` is exported and ``paddleocr`` is imported.
    4. If the import fails with a message mentioning ``libssl.so.1.1``,
       ``paddlepaddle`` is reinstalled at version 2.4.2 via pip and the
       import is retried once.

    Returns:
        The ``PaddleOCR`` class (not an instance).

    Raises:
        ImportError: the original import error, when the import and the
            optional retry both fail.
    """
    import importlib  # local import: only used by the retry path

    # First try the SSL fix (result intentionally ignored: the import below
    # is the real test of whether it worked).
    fix_ssl_library()

    # CRITICAL: apply PyMuPDF compatibility patches BEFORE importing PaddleOCR
    monkey_patch_pymupdf()

    os.environ['PADDLE_GIT_DISABLE'] = '1'

    try:
        from paddleocr import PaddleOCR
        return PaddleOCR
    except ImportError as e:
        if 'libssl.so.1.1' in str(e):
            print("Still having SSL issues, trying alternative PaddlePaddle version...")
            try:
                # Uninstall is best-effort; a missing package is not an error.
                subprocess.run([sys.executable, '-m', 'pip', 'uninstall', 'paddlepaddle', '-y'],
                               capture_output=True)
                subprocess.run([sys.executable, '-m', 'pip', 'install', 'paddlepaddle==2.4.2'],
                               check=True)
                # Packages installed after interpreter start may be missed by
                # the import system's cached directory listings.
                importlib.invalidate_caches()
                from paddleocr import PaddleOCR
                return PaddleOCR
            except Exception as inner_e:
                print(f"Failed to install alternative version: {inner_e}")
        print(f"PaddleOCR import failed: {e}")
        # Bare raise keeps the original traceback intact.
        raise
# Import Gradio
import gradio as gr
# Import PyMuPDF AFTER monkey patch is defined but BEFORE PaddleOCR
import fitz # This import will use the patched version
# Load the OCR engine once at import time. Everything below depends on the
# module-level ``ocr`` object, so any failure here terminates the process.
print("Attempting to import PaddleOCR...")
try:
    PaddleOCR = try_paddle_import()
    print("Loading PaddleOCR models...")
    ocr = PaddleOCR(
        use_angle_cls=True,
        lang='en',
        show_log=False,
    )
    print("PaddleOCR models loaded successfully!")
except Exception as load_error:
    # Report the cause, then exit with a non-zero status.
    print(f"Failed to load PaddleOCR: {load_error}")
    print("Application will exit - compatibility issue not resolved")
    sys.exit(1)
# Start-up self-test: open a tiny in-memory PDF and check that the legacy
# ``pageCount`` attribute is reachable on a Document instance. The outcome is
# only reported; a failure here does not stop the application.
print("Testing monkey patch...")
test_doc = None
try:
    import io
    # Minimal single-page PDF used purely as a probe document.
    pdf_content = b"%PDF-1.4\n1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >>\nendobj\nxref\n0 4\n0000000000 65535 f \n0000000010 00000 n \n0000000053 00000 n \n0000000100 00000 n \ntrailer\n<< /Size 4 /Root 1 0 R >>\nstartxref\n179\n%%EOF"
    test_doc = fitz.open(stream=pdf_content, filetype="pdf")
    if hasattr(test_doc, 'pageCount'):
        print(f"✓ Monkey patch successful! pageCount = {test_doc.pageCount}")
    else:
        print("✗ Monkey patch failed - pageCount not found")
        print(f"Available attributes: {[attr for attr in dir(test_doc) if 'count' in attr.lower()]}")
except Exception as e:
    print(f"Monkey patch test failed: {e}")
finally:
    # Compare against None explicitly: a Document's truthiness follows its
    # page count, so ``if test_doc:`` could skip closing an open handle.
    if test_doc is not None:
        try:
            test_doc.close()
        except Exception as e:
            # Keep the self-test best-effort even if closing fails.
            print(f"Monkey patch test failed: {e}")
# Rest of your app code (process_document, API functions, Gradio interface, etc.)
def process_document(file, min_confidence=0.5):
    """Run OCR on an uploaded file and build the three UI outputs.

    Args:
        file: The upload handed over by the Gradio ``File`` component.
            Either an object exposing the path as ``.name`` or a plain path
            string is accepted; ``None`` means nothing was uploaded.
        min_confidence: Recognised lines are kept only when their confidence
            score is strictly greater than this value (default ``0.5``).

    Returns:
        tuple[str, str, str]: ``(summary_markdown, extracted_text,
        api_response_json)``.  On failure the first element is an error
        message, the second is empty and the third is a JSON object with
        ``"success": false``.  Errors are reported this way rather than
        raised so the UI always receives three values.
    """
    if file is None:
        return "No file uploaded", "", ""

    start_time = time.time()
    try:
        # Accept both an object with ``.name`` and a bare path string.
        file_path = file if isinstance(file, str) else file.name
        filename = os.path.basename(file_path)
        print(f"Processing: {filename}")
        print(f"File path: {file_path}")

        # Count pages if PDF (used only for the "processed/total" report).
        total_pages = 1
        if filename.lower().endswith('.pdf'):
            try:
                print(f"Opening PDF: {file_path}")
                doc = fitz.open(file_path)
                try:
                    print(f"Document has pageCount attribute: {hasattr(doc, 'pageCount')}")
                    print(f"Document has page_count attribute: {hasattr(doc, 'page_count')}")
                    if hasattr(doc, 'pageCount'):
                        total_pages = doc.pageCount
                        print(f"Used pageCount: {total_pages}")
                    elif hasattr(doc, 'page_count'):
                        total_pages = doc.page_count
                        print(f"Used page_count: {total_pages}")
                    else:
                        total_pages = len(doc)
                        print(f"Used len(): {total_pages}")
                finally:
                    # Always release the handle, even if probing raised.
                    doc.close()
            except Exception as e:
                # Page counting is informational; fall back to 1 and go on.
                print(f"PDF page counting error: {e}")
                total_pages = 1

        # Run OCR
        print(f"Running OCR on: {file_path}")
        result = ocr.ocr(file_path, cls=True)

        # Each kept entry is indexed as line[1] == (text, confidence).
        kept_lines = []
        pages_processed = 0
        for page_result in result or []:
            if not page_result:
                # Pages with no detections do not count as processed.
                continue
            pages_processed += 1
            for line in page_result:
                if len(line) >= 2 and line[1][1] > min_confidence:
                    kept_lines.append(line[1][0] + "\n")
        extracted_text = "".join(kept_lines)

        processing_time = time.time() - start_time
        summary = f"""
📄 **File**: {filename}
📊 **Pages Processed**: {pages_processed}/{total_pages}
⏱️ **Processing Time**: {processing_time:.2f} seconds
📝 **Text Length**: {len(extracted_text)} characters
🔧 **OCR Engine**: PaddleOCR
"""
        api_response = json.dumps({
            "success": True,
            "text": extracted_text,
            "filename": filename,
            "pages_processed": pages_processed,
            "total_pages": total_pages,
            "processing_time": processing_time,
            "ocr_engine": "PaddleOCR"
        }, indent=2)
        return summary, extracted_text, api_response
    except Exception as e:
        error_msg = f"Error processing file: {str(e)}"
        print(f"Full error: {e}")
        import traceback
        traceback.print_exc()
        return error_msg, "", json.dumps({"success": False, "error": str(e)})
def process_api_request(api_data):
    """Handle an API-style OCR request and return a JSON string.

    Args:
        api_data: The request, either as a JSON string or as an already
            decoded ``dict``.  Expected keys: ``"file"`` (base64-encoded
            file content, required) and ``"filename"`` (optional, defaults
            to ``"unknown.pdf"``; only its extension is used, as the suffix
            of the temporary file).

    Returns:
        str: A JSON object.  On success it contains ``success``, ``text``,
        ``filename`` and ``ocr_engine``; on any failure it contains
        ``"success": false`` and an ``error`` message.  No exception is
        propagated to the caller.
    """
    try:
        data = api_data if isinstance(api_data, dict) else json.loads(api_data)
        if not isinstance(data, dict) or 'file' not in data:
            return json.dumps({"success": False, "error": "No file data provided"})

        # Decode base64 file
        file_data = base64.b64decode(data['file'])
        filename = data.get('filename', 'unknown.pdf')
        suffix = os.path.splitext(filename)[1]

        tmp_file_path = None
        try:
            # delete=False: the OCR engine reopens the file by path, so it
            # must outlive the ``with`` block; removal happens in ``finally``.
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
                tmp_file_path = tmp_file.name
                tmp_file.write(file_data)

            # Run OCR
            result = ocr.ocr(tmp_file_path, cls=True)

            # Extract text: line[1][0] is the recognised string.
            pieces = []
            for page_result in result or []:
                if not page_result:
                    continue
                for line in page_result:
                    if len(line) >= 2:
                        pieces.append(line[1][0] + "\n")
            text = "".join(pieces)

            return json.dumps({
                "success": True,
                "text": text,
                "filename": filename,
                "ocr_engine": "PaddleOCR"
            })
        finally:
            # Remove the temp file on every path, including a failed write.
            if tmp_file_path is not None:
                os.unlink(tmp_file_path)
    except Exception as e:
        return json.dumps({"success": False, "error": str(e)})
# Build the Gradio UI: three tabs (file upload, API test, about) wired to
# process_document and process_api_request defined above.
with gr.Blocks(title="PaddleOCR Medical Document Processor") as demo:
    gr.Markdown("# 🏥 PaddleOCR Medical Document Processor")
    gr.Markdown("Upload medical documents (PDF/images) to extract text using PaddleOCR")

    # --- Tab 1: interactive upload -------------------------------------
    with gr.Tab("📄 File Upload"):
        with gr.Row():
            with gr.Column():
                file_input = gr.File(
                    label="Upload Document (PDF, JPG, PNG)",
                    file_types=[".pdf", ".jpg", ".jpeg", ".png"]
                )
                process_btn = gr.Button("🔍 Process Document", variant="primary")
            with gr.Column():
                summary_output = gr.Markdown(label="📊 Processing Summary")
        with gr.Row():
            text_output = gr.Textbox(
                label="📝 Extracted Text",
                lines=15,
                max_lines=20
            )
        # process_document returns three values; the third (JSON payload) is
        # routed to a hidden textbox because this tab does not display it.
        process_btn.click(
            fn=process_document,
            inputs=[file_input],
            outputs=[summary_output, text_output, gr.Textbox(visible=False)]
        )

    # --- Tab 2: API documentation and manual test ------------------------
    with gr.Tab("🔌 API Integration"):
        gr.Markdown("### For integration with your Vercel app:")
        gr.Markdown("**Endpoint**: `https://mbuck17-paddleocr-processor.hf.space/api/predict`")
        gr.Markdown("**Method**: POST")
        gr.Markdown("**Headers**: `Content-Type: application/json`")
        with gr.Row():
            with gr.Column():
                gr.Markdown("**Sample Request:**")
                gr.Code('''
{
  "data": [
    {
      "file": "base64_encoded_file_data_here",
      "filename": "lab_report.pdf"
    }
  ]
}
''', language="json")
            with gr.Column():
                gr.Markdown("**Sample Response:**")
                gr.Code('''
{
  "data": [
    {
      "success": true,
      "text": "Extracted text content...",
      "filename": "lab_report.pdf",
      "ocr_engine": "PaddleOCR"
    }
  ]
}
''', language="json")
        gr.Markdown("### Test API Request:")
        api_input = gr.Textbox(
            label="API Request (JSON)",
            placeholder='{"file": "base64_encoded_file_data", "filename": "document.pdf"}',
            lines=5
        )
        api_btn = gr.Button("🧪 Test API Request")
        api_output = gr.Textbox(
            label="API Response (JSON)",
            lines=10
        )
        api_btn.click(
            fn=process_api_request,
            inputs=[api_input],
            outputs=[api_output]
        )

    # --- Tab 3: static description ---------------------------------------
    with gr.Tab("ℹ️ About"):
        # Markdown lines stay at column 0 so they are not rendered as code.
        gr.Markdown("""
### 🎯 Purpose
This service extracts text from medical documents using PaddleOCR, specifically designed for lab reports and medical forms.
### 🔧 Integration
This Hugging Face Space can be integrated with your Vercel app as an external OCR service.
### 📚 Supported Formats
- PDF documents (multi-page)
- JPEG/JPG images
- PNG images
### 🚀 Features
- High accuracy OCR with PaddleOCR
- Medical document optimization
- Multi-page PDF support
- RESTful API integration
- Free hosting on Hugging Face
- SSL compatibility fixes included
### 🔗 Integration URL
`https://mbuck17-paddleocr-processor.hf.space/api/predict`
""")

# Launch the app (listen on all interfaces, port 7860) when run as a script.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)