# app.py - PaddleOCR document processor served through a Gradio UI.
#
# Start-up order (performed at import time, in this order):
#   1. try_paddle_import() makes libssl 1.1 available, installs the PyMuPDF
#      legacy-name shims, then imports PaddleOCR.
#   2. The OCR models are loaded; the process exits with status 1 on failure.
#   3. A tiny in-memory PDF is opened to confirm the shims are in place.
#   4. The Gradio interface is built; it is launched only when run as a script.

import base64
import ctypes
import importlib
import io
import json
import os
import subprocess
import sys
import tempfile
import time
import traceback

# Locations used by fix_ssl_library().
_LIBSSL_SYSTEM_PATH = '/usr/lib/x86_64-linux-gnu/libssl.so.1.1'
_LIBSSL_DEB_URL = (
    'http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/'
    'libssl1.1_1.1.1f-1ubuntu2_amd64.deb'
)
_LIBSSL_DEB_PATH = '/tmp/libssl1.1.deb'
_LIBSSL_EXTRACT_DIR = '/tmp/ssl_extract'
_LIBSSL_EXTRACT_LIB_DIR = '/tmp/ssl_extract/usr/lib/x86_64-linux-gnu'

# Upper bound (seconds) for each wget/dpkg call so start-up cannot hang forever.
_SSL_FIX_TIMEOUT = 120

# OCR lines at or below this confidence are dropped by process_document().
_MIN_CONFIDENCE = 0.5


def _preload_libssl(lib_dir):
    """Best-effort load of the extracted OpenSSL 1.1 libraries into this process.

    Changing LD_LIBRARY_PATH through os.environ only affects child processes;
    the dynamic loader of the already-running interpreter does not re-read it.
    Loading the libraries explicitly by path makes them available to
    extension modules imported later that depend on them.

    Failures are reported and otherwise ignored.
    """
    # libcrypto first: libssl depends on it.
    for lib_name in ('libcrypto.so.1.1', 'libssl.so.1.1'):
        lib_file = os.path.join(lib_dir, lib_name)
        if not os.path.exists(lib_file):
            continue
        try:
            ctypes.CDLL(lib_file)
        except OSError as exc:
            print(f"Could not preload {lib_file}: {exc}")


def fix_ssl_library():
    """Download and install libssl1.1 if not present.

    Tries ``dpkg -i`` first; if that fails, the package is unpacked under
    /tmp/ssl_extract, that directory is prepended to LD_LIBRARY_PATH (for
    child processes) and the libraries are preloaded into this process.

    Returns:
        True if the library already exists or the install/extract steps ran
        without raising, False if any step raised (the error is printed).

    NOTE(review): the package is fetched over plain HTTP and installed
    without any checksum verification.
    """
    try:
        if os.path.exists(_LIBSSL_SYSTEM_PATH):
            print("libssl.so.1.1 already exists")
            return True

        print("Attempting to install libssl1.1...")
        subprocess.run(
            ['wget', '-q', _LIBSSL_DEB_URL, '-O', _LIBSSL_DEB_PATH],
            check=True,
            timeout=_SSL_FIX_TIMEOUT,
        )

        result = subprocess.run(
            ['dpkg', '-i', _LIBSSL_DEB_PATH],
            capture_output=True,
            text=True,
            timeout=_SSL_FIX_TIMEOUT,
        )

        if result.returncode != 0:
            print("dpkg install failed, trying manual extraction...")
            subprocess.run(
                ['dpkg', '-x', _LIBSSL_DEB_PATH, _LIBSSL_EXTRACT_DIR],
                check=True,
                timeout=_SSL_FIX_TIMEOUT,
            )

            lib_path = _LIBSSL_EXTRACT_LIB_DIR
            current_ld_path = os.environ.get('LD_LIBRARY_PATH', '')
            if current_ld_path:
                os.environ['LD_LIBRARY_PATH'] = f"{lib_path}:{current_ld_path}"
            else:
                os.environ['LD_LIBRARY_PATH'] = lib_path
            print(f"Set LD_LIBRARY_PATH to: {os.environ['LD_LIBRARY_PATH']}")

            # The environment change alone does not help the current process.
            _preload_libssl(lib_path)

        return True
    except Exception as e:
        # Deliberately best-effort: the caller proceeds and lets the
        # PaddleOCR import decide whether the environment is usable.
        print(f"Failed to install libssl1.1: {e}")
        return False


def monkey_patch_pymupdf():
    """Fix PaddleOCR compatibility with newer PyMuPDF versions.

    Adds the camelCase names ``Document.pageCount``, ``Page.getPixmap`` and
    ``Page.getText`` when the installed PyMuPDF does not provide them, each
    delegating to its snake_case counterpart. Names that already exist are
    left untouched.
    """
    print("Applying PyMuPDF compatibility patches...")

    # Imported here so the function works regardless of module import order.
    import fitz

    if not hasattr(fitz.Document, 'pageCount'):
        def pageCount_property(self):
            return self.page_count

        fitz.Document.pageCount = property(pageCount_property)
        print("✓ Added pageCount compatibility property to PyMuPDF Document class")
    else:
        print("✓ pageCount already exists")

    if not hasattr(fitz.Page, 'getPixmap'):
        def getPixmap(self, matrix=None, alpha=True):
            return self.get_pixmap(matrix=matrix, alpha=alpha)

        fitz.Page.getPixmap = getPixmap
        print("✓ Added getPixmap compatibility method to PyMuPDF Page class")
    else:
        print("✓ getPixmap already exists")

    if not hasattr(fitz.Page, 'getText'):
        def getText(self, option="text"):
            return self.get_text(option)

        fitz.Page.getText = getText
        print("✓ Added getText compatibility method to PyMuPDF Page class")
    else:
        print("✓ getText already exists")

    print("✓ PyMuPDF compatibility patches applied successfully")


def try_paddle_import():
    """Try different approaches to import PaddleOCR.

    Runs the SSL fix and the PyMuPDF patches, then imports PaddleOCR. If the
    import fails with an error mentioning ``libssl.so.1.1``, paddlepaddle is
    reinstalled at version 2.4.2 and the import is retried once.

    Returns:
        The ``PaddleOCR`` class.

    Raises:
        ImportError: the original import error, when PaddleOCR could not be
            imported even after the fallback.
    """
    fix_ssl_library()

    # The shims must be in place before PaddleOCR is imported.
    monkey_patch_pymupdf()

    os.environ['PADDLE_GIT_DISABLE'] = '1'

    try:
        from paddleocr import PaddleOCR
        return PaddleOCR
    except ImportError as e:
        if 'libssl.so.1.1' in str(e):
            print("Still having SSL issues, trying alternative PaddlePaddle version...")
            try:
                subprocess.run(
                    [sys.executable, '-m', 'pip', 'uninstall', 'paddlepaddle', '-y'],
                    capture_output=True,
                )
                subprocess.run(
                    [sys.executable, '-m', 'pip', 'install', 'paddlepaddle==2.4.2'],
                    check=True,
                )
                # Packages installed while the interpreter is running may be
                # invisible to the import system until its caches are reset.
                importlib.invalidate_caches()
                from paddleocr import PaddleOCR
                return PaddleOCR
            except Exception as inner_e:
                print(f"Failed to install alternative version: {inner_e}")

        print(f"PaddleOCR import failed: {e}")
        raise


# Import Gradio
import gradio as gr

# PyMuPDF, used below for the self-test and for PDF page counting.
# monkey_patch_pymupdf() patches the classes of this same module object, so
# the patches are visible here once try_paddle_import() has run.
import fitz

print("Attempting to import PaddleOCR...")
try:
    PaddleOCR = try_paddle_import()
    print("Loading PaddleOCR models...")
    ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
    print("PaddleOCR models loaded successfully!")
except Exception as e:
    print(f"Failed to load PaddleOCR: {e}")
    print("Application will exit - compatibility issue not resolved")
    sys.exit(1)

# Self-test: open a minimal one-page PDF and check that pageCount is present.
print("Testing monkey patch...")
test_doc = None
try:
    pdf_content = b"%PDF-1.4\n1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >>\nendobj\nxref\n0 4\n0000000000 65535 f \n0000000010 00000 n \n0000000053 00000 n \n0000000100 00000 n \ntrailer\n<< /Size 4 /Root 1 0 R >>\nstartxref\n179\n%%EOF"
    test_doc = fitz.open(stream=pdf_content, filetype="pdf")

    if hasattr(test_doc, 'pageCount'):
        print(f"✓ Monkey patch successful! pageCount = {test_doc.pageCount}")
    else:
        print("✗ Monkey patch failed - pageCount not found")
        print(f"Available attributes: {[attr for attr in dir(test_doc) if 'count' in attr.lower()]}")
except Exception as e:
    print(f"Monkey patch test failed: {e}")
finally:
    # Close exactly once; a failure to close must not abort start-up.
    if test_doc is not None:
        try:
            test_doc.close()
        except Exception as close_error:
            print(f"Monkey patch test failed: {close_error}")


def _count_pdf_pages(file_path):
    """Return the page count of the PDF at *file_path*, or 1 if it cannot be read."""
    try:
        print(f"Opening PDF: {file_path}")
        doc = fitz.open(file_path)
        try:
            print(f"Document has pageCount attribute: {hasattr(doc, 'pageCount')}")
            print(f"Document has page_count attribute: {hasattr(doc, 'page_count')}")

            if hasattr(doc, 'pageCount'):
                total_pages = doc.pageCount
                print(f"Used pageCount: {total_pages}")
            elif hasattr(doc, 'page_count'):
                total_pages = doc.page_count
                print(f"Used page_count: {total_pages}")
            else:
                total_pages = len(doc)
                print(f"Used len(): {total_pages}")
            return total_pages
        finally:
            # Release the document even when reading the count fails.
            doc.close()
    except Exception as e:
        print(f"PDF page counting error: {e}")
        return 1


def _extract_text(result, min_confidence=None):
    """Collect recognised text from an ``ocr.ocr`` result.

    The result is treated as a sequence of per-page results, each a sequence
    of lines where ``line[1][0]`` is the text and ``line[1][1]`` its
    confidence. Empty/None pages and lines with fewer than two elements are
    skipped.

    Args:
        result: value returned by ``ocr.ocr``; None or empty yields no text.
        min_confidence: when given, only lines whose confidence is strictly
            greater than this value are kept; when None, confidence is not
            inspected.

    Returns:
        ``(text, pages_with_results)`` where *text* has one recognised line
        per output line, each terminated by a newline.
    """
    pieces = []
    pages_with_results = 0
    for page_result in result or []:
        if not page_result:
            continue
        pages_with_results += 1
        for line in page_result:
            if len(line) < 2:
                continue
            if min_confidence is not None and not line[1][1] > min_confidence:
                continue
            pieces.append(line[1][0] + "\n")
    return "".join(pieces), pages_with_results


def process_document(file):
    """Process uploaded document with PaddleOCR.

    Args:
        file: the value Gradio passes for the upload; either a path string
            or an object exposing the path as ``.name``. None means nothing
            was uploaded.

    Returns:
        A 3-tuple ``(summary_markdown, extracted_text, api_response_json)``.
        On failure the first element is an error message, the second is
        empty and the third is a JSON object with ``success: false``.
    """
    if file is None:
        return "No file uploaded", "", ""

    start_time = time.time()

    try:
        # Accept both a plain path and a file-like wrapper with a .name.
        if isinstance(file, (str, os.PathLike)):
            file_path = os.fspath(file)
        else:
            file_path = file.name

        filename = os.path.basename(file_path)
        print(f"Processing: {filename}")
        print(f"File path: {file_path}")

        # Only PDFs can have more than one page.
        total_pages = 1
        if filename.lower().endswith('.pdf'):
            total_pages = _count_pdf_pages(file_path)

        print(f"Running OCR on: {file_path}")
        result = ocr.ocr(file_path, cls=True)

        extracted_text, pages_processed = _extract_text(
            result, min_confidence=_MIN_CONFIDENCE
        )

        processing_time = time.time() - start_time

        summary = "\n".join([
            "",
            f"📄 **File**: {filename}",
            f"📊 **Pages Processed**: {pages_processed}/{total_pages}",
            f"⏱️ **Processing Time**: {processing_time:.2f} seconds",
            f"📝 **Text Length**: {len(extracted_text)} characters",
            "🔧 **OCR Engine**: PaddleOCR",
            "",
        ])

        api_response = json.dumps({
            "success": True,
            "text": extracted_text,
            "filename": filename,
            "pages_processed": pages_processed,
            "total_pages": total_pages,
            "processing_time": processing_time,
            "ocr_engine": "PaddleOCR"
        }, indent=2)

        return summary, extracted_text, api_response

    except Exception as e:
        error_msg = f"Error processing file: {str(e)}"
        print(f"Full error: {e}")
        traceback.print_exc()
        return error_msg, "", json.dumps({"success": False, "error": str(e)})


def process_api_request(api_data):
    """Process API-style requests (for integration with your Vercel app).

    Args:
        api_data: a JSON string (or an already-decoded dict) with a
            base64-encoded ``file`` and an optional ``filename`` (default
            ``unknown.pdf``), whose extension is used for the temp file.

    Returns:
        A JSON string. On success it holds ``success``, ``text``,
        ``filename`` and ``ocr_engine``; on any error it holds
        ``success: false`` and the error message. No confidence filtering is
        applied to the recognised lines.
    """
    try:
        data = api_data if isinstance(api_data, dict) else json.loads(api_data)

        if 'file' not in data:
            return json.dumps({"success": False, "error": "No file data provided"})

        file_data = base64.b64decode(data['file'])
        filename = data.get('filename', 'unknown.pdf')
        suffix = os.path.splitext(filename)[1]

        tmp_file_path = None
        try:
            # delete=False: the file is handed to the OCR engine by path
            # after being closed, then removed in the finally clause below.
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
                tmp_file_path = tmp_file.name
                tmp_file.write(file_data)

            result = ocr.ocr(tmp_file_path, cls=True)
            text, _ = _extract_text(result)

            return json.dumps({
                "success": True,
                "text": text,
                "filename": filename,
                "ocr_engine": "PaddleOCR"
            })
        finally:
            # Also runs when writing the temp file fails, so it never leaks.
            if tmp_file_path is not None:
                os.unlink(tmp_file_path)

    except Exception as e:
        return json.dumps({"success": False, "error": str(e)})


# Text shown on the "API Integration" and "About" tabs.
_SAMPLE_REQUEST = '''
{
  "data": [
    {
      "file": "base64_encoded_file_data_here",
      "filename": "lab_report.pdf"
    }
  ]
}
'''

_SAMPLE_RESPONSE = '''
{
  "data": [
    {
      "success": true,
      "text": "Extracted text content...",
      "filename": "lab_report.pdf",
      "ocr_engine": "PaddleOCR"
    }
  ]
}
'''

_ABOUT_MARKDOWN = """
### 🎯 Purpose
This service extracts text from medical documents using PaddleOCR, specifically designed for lab reports and medical forms.

### 🔧 Integration
This Hugging Face Space can be integrated with your Vercel app as an external OCR service.

### 📚 Supported Formats
- PDF documents (multi-page)
- JPEG/JPG images
- PNG images

### 🚀 Features
- High accuracy OCR with PaddleOCR
- Medical document optimization
- Multi-page PDF support
- RESTful API integration
- Free hosting on Hugging Face
- SSL compatibility fixes included

### 🔗 Integration URL
`https://mbuck17-paddleocr-processor.hf.space/api/predict`
"""

# Create Gradio interface with multiple tabs
with gr.Blocks(title="PaddleOCR Medical Document Processor") as demo:
    gr.Markdown("# 🏥 PaddleOCR Medical Document Processor")
    gr.Markdown("Upload medical documents (PDF/images) to extract text using PaddleOCR")

    with gr.Tab("📄 File Upload"):
        with gr.Row():
            with gr.Column():
                file_input = gr.File(
                    label="Upload Document (PDF, JPG, PNG)",
                    file_types=[".pdf", ".jpg", ".jpeg", ".png"]
                )
                process_btn = gr.Button("🔍 Process Document", variant="primary")

            with gr.Column():
                summary_output = gr.Markdown(label="📊 Processing Summary")

        with gr.Row():
            text_output = gr.Textbox(
                label="📝 Extracted Text",
                lines=15,
                max_lines=20
            )

        # process_document returns three values; the JSON one is not shown
        # on this tab, so it is routed to a hidden textbox.
        hidden_json_output = gr.Textbox(visible=False)

        process_btn.click(
            fn=process_document,
            inputs=[file_input],
            outputs=[summary_output, text_output, hidden_json_output]
        )

    with gr.Tab("🔌 API Integration"):
        gr.Markdown("### For integration with your Vercel app:")
        gr.Markdown("**Endpoint**: `https://mbuck17-paddleocr-processor.hf.space/api/predict`")
        gr.Markdown("**Method**: POST")
        gr.Markdown("**Headers**: `Content-Type: application/json`")

        with gr.Row():
            with gr.Column():
                gr.Markdown("**Sample Request:**")
                gr.Code(_SAMPLE_REQUEST, language="json")

            with gr.Column():
                gr.Markdown("**Sample Response:**")
                gr.Code(_SAMPLE_RESPONSE, language="json")

        gr.Markdown("### Test API Request:")
        api_input = gr.Textbox(
            label="API Request (JSON)",
            placeholder='{"file": "base64_encoded_file_data", "filename": "document.pdf"}',
            lines=5
        )
        api_btn = gr.Button("🧪 Test API Request")
        api_output = gr.Textbox(
            label="API Response (JSON)",
            lines=10
        )

        api_btn.click(
            fn=process_api_request,
            inputs=[api_input],
            outputs=[api_output]
        )

    with gr.Tab("ℹ️ About"):
        gr.Markdown(_ABOUT_MARKDOWN)

# Launch the app
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)