Spaces:
Sleeping
Sleeping
| # app.py - Correct structure with monkey patch BEFORE any fitz imports | |
| import os | |
| import subprocess | |
| import sys | |
| import tempfile | |
| import time | |
| import base64 | |
| import json | |
| # SSL fix function (keep as is) | |
def fix_ssl_library():
    """Make ``libssl.so.1.1`` available, installing it on demand.

    If the library is already present at the system path nothing is done.
    Otherwise the libssl1.1 ``.deb`` is downloaded with ``wget`` and installed
    with ``dpkg -i``. When that install fails, the package is unpacked with
    ``dpkg -x`` into ``/tmp/ssl_extract``, that directory is prepended to
    ``LD_LIBRARY_PATH``, and the extracted shared objects are preloaded into
    the current process.

    Returns:
        bool: True when the library was found or the install/extract steps
        completed without raising; False when any step raised.
    """
    import ctypes  # local import: only needed for the preload fallback

    system_lib = '/usr/lib/x86_64-linux-gnu/libssl.so.1.1'
    deb_url = ('http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/'
               'libssl1.1_1.1.1f-1ubuntu2_amd64.deb')
    deb_path = '/tmp/libssl1.1.deb'
    extract_dir = '/tmp/ssl_extract'

    try:
        if os.path.exists(system_lib):
            print("libssl.so.1.1 already exists")
            return True

        print("Attempting to install libssl1.1...")
        # NOTE(review): the package is fetched over plain HTTP and its
        # checksum is not verified before it is installed.
        # The timeout keeps a stalled download from blocking start-up forever;
        # a timeout is reported like any other failure (returns False).
        subprocess.run(
            ['wget', '-q', deb_url, '-O', deb_path],
            check=True,
            timeout=120,
        )

        result = subprocess.run(
            ['dpkg', '-i', deb_path],
            capture_output=True,
            text=True,
        )
        if result.returncode != 0:
            print("dpkg install failed, trying manual extraction...")
            subprocess.run(['dpkg', '-x', deb_path, extract_dir], check=True)

            lib_path = f'{extract_dir}/usr/lib/x86_64-linux-gnu'
            current_ld_path = os.environ.get('LD_LIBRARY_PATH', '')
            if current_ld_path:
                os.environ['LD_LIBRARY_PATH'] = f"{lib_path}:{current_ld_path}"
            else:
                os.environ['LD_LIBRARY_PATH'] = lib_path
            print(f"Set LD_LIBRARY_PATH to: {os.environ['LD_LIBRARY_PATH']}")

            # Changing LD_LIBRARY_PATH here only helps child processes: the
            # dynamic loader of the running interpreter has already read it.
            # Preload the extracted libraries so later imports that need them
            # can resolve against the already-loaded copies. libcrypto goes
            # first because libssl depends on it.
            for soname in ('libcrypto.so.1.1', 'libssl.so.1.1'):
                candidate = os.path.join(lib_path, soname)
                if not os.path.exists(candidate):
                    continue
                try:
                    ctypes.CDLL(candidate, mode=ctypes.RTLD_GLOBAL)
                except OSError as load_err:
                    # Best effort: keep going, the import may still succeed.
                    print(f"Could not preload {candidate}: {load_err}")

        return True
    except Exception as e:
        # Deliberate catch-all: this is a best-effort start-up fix and the
        # caller only needs a success flag.
        print(f"Failed to install libssl1.1: {e}")
        return False
# CRITICAL: Apply the PyMuPDF monkey patch BEFORE importing PaddleOCR (the patch function imports fitz itself)
def monkey_patch_pymupdf():
    """Add legacy camelCase aliases to PyMuPDF for PaddleOCR compatibility.

    Imports ``fitz`` and, only where the attribute is missing, installs:

    * ``Document.pageCount`` - read-only property returning ``page_count``
    * ``Page.getPixmap``     - forwards to ``Page.get_pixmap``
    * ``Page.getText``       - forwards to ``Page.get_text``

    The classes are patched in place, so every later use of ``fitz`` in this
    process sees the aliases regardless of where it was imported.
    """
    print("Applying PyMuPDF compatibility patches...")

    # Imported here so the patch is self-contained.
    import fitz

    if not hasattr(fitz.Document, 'pageCount'):
        def pageCount_property(self):
            return self.page_count

        fitz.Document.pageCount = property(pageCount_property)
        print("✓ Added pageCount compatibility property to PyMuPDF Document class")
    else:
        print("✓ pageCount already exists")

    if not hasattr(fitz.Page, 'getPixmap'):
        def getPixmap(self, matrix=None, alpha=False, **kwargs):
            # alpha defaults to False to match get_pixmap's own default.
            # matrix is forwarded only when supplied, so get_pixmap falls back
            # to its own default instead of receiving an explicit None.
            if matrix is not None:
                kwargs['matrix'] = matrix
            return self.get_pixmap(alpha=alpha, **kwargs)

        fitz.Page.getPixmap = getPixmap
        print("✓ Added getPixmap compatibility method to PyMuPDF Page class")
    else:
        print("✓ getPixmap already exists")

    if not hasattr(fitz.Page, 'getText'):
        def getText(self, option="text", **kwargs):
            # Extra keyword arguments are passed through unchanged.
            return self.get_text(option, **kwargs)

        fitz.Page.getText = getText
        print("✓ Added getText compatibility method to PyMuPDF Page class")
    else:
        print("✓ getText already exists")

    print("✓ PyMuPDF compatibility patches applied successfully")
def try_paddle_import():
    """Import and return the ``PaddleOCR`` class, applying fixes first.

    Steps, in order:

    1. ``fix_ssl_library()`` - best-effort libssl1.1 install.
    2. ``monkey_patch_pymupdf()`` - legacy PyMuPDF aliases, applied before
       PaddleOCR is imported.
    3. Set ``PADDLE_GIT_DISABLE=1`` in the environment.
    4. Import ``paddleocr``. If that fails with an error mentioning
       ``libssl.so.1.1``, reinstall ``paddlepaddle==2.4.2`` with pip and try
       the import once more.

    Returns:
        type: the ``paddleocr.PaddleOCR`` class.

    Raises:
        ImportError: the original import error, when the import still cannot
            be satisfied.
    """
    import importlib  # local import: only needed on the retry path

    fix_ssl_library()
    monkey_patch_pymupdf()
    os.environ['PADDLE_GIT_DISABLE'] = '1'

    try:
        from paddleocr import PaddleOCR
        return PaddleOCR
    except ImportError as e:
        if 'libssl.so.1.1' in str(e):
            print("Still having SSL issues, trying alternative PaddlePaddle version...")
            try:
                # The uninstall result is intentionally not checked: the
                # package may simply not be installed.
                subprocess.run(
                    [sys.executable, '-m', 'pip', 'uninstall', 'paddlepaddle', '-y'],
                    capture_output=True,
                )
                subprocess.run(
                    [sys.executable, '-m', 'pip', 'install', 'paddlepaddle==2.4.2'],
                    check=True,
                )
                # Packages installed while the interpreter is running may be
                # invisible to the import system until its finder caches are
                # cleared.
                importlib.invalidate_caches()
                from paddleocr import PaddleOCR
                return PaddleOCR
            except Exception as inner_e:
                print(f"Failed to install alternative version: {inner_e}")
        print(f"PaddleOCR import failed: {e}")
        # Bare raise re-raises the original ImportError with its traceback.
        raise
# Import Gradio
import gradio as gr
# PyMuPDF for this module's own use. The compatibility patch is applied later,
# inside try_paddle_import(); it mutates the fitz classes in place, so this
# reference sees the patched classes once that call has run.
import fitz

# Load PaddleOCR once at start-up; the app cannot work without it, so any
# failure terminates the process with a non-zero exit code.
print("Attempting to import PaddleOCR...")
try:
    PaddleOCR = try_paddle_import()
    print("Loading PaddleOCR models...")
    ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
    print("PaddleOCR models loaded successfully!")
except Exception as e:
    print(f"Failed to load PaddleOCR: {e}")
    print("Application will exit - compatibility issue not resolved")
    sys.exit(1)

# Self-test: open a minimal one-page in-memory PDF and check that the
# pageCount alias is present. Failures are only reported, never fatal.
print("Testing monkey patch...")
test_doc = None
try:
    import io
    pdf_content = b"%PDF-1.4\n1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >>\nendobj\nxref\n0 4\n0000000000 65535 f \n0000000010 00000 n \n0000000053 00000 n \n0000000100 00000 n \ntrailer\n<< /Size 4 /Root 1 0 R >>\nstartxref\n179\n%%EOF"
    test_doc = fitz.open(stream=pdf_content, filetype="pdf")
    if hasattr(test_doc, 'pageCount'):
        print(f"✅ Monkey patch successful! pageCount = {test_doc.pageCount}")
    else:
        print("❌ Monkey patch failed - pageCount not found")
        print(f"Available attributes: {[attr for attr in dir(test_doc) if 'count' in attr.lower()]}")
except Exception as e:
    print(f"Monkey patch test failed: {e}")
finally:
    # Close exactly once on every path. The check is "is not None" because
    # the truth value of a document object is not a reliable "was opened"
    # signal.
    if test_doc is not None:
        test_doc.close()
| # Rest of your app code (process_document, API functions, Gradio interface, etc.) | |
def process_document(file):
    """Run PaddleOCR on an uploaded document and summarise the outcome.

    Args:
        file: the upload handed over by the Gradio file component. Either an
            object exposing the on-disk path as ``.name``, a plain path
            (``str`` / ``os.PathLike``), or ``None`` when nothing was uploaded.

    Returns:
        tuple[str, str, str]: ``(summary, extracted_text, api_response)``.
        ``summary`` is Markdown, ``extracted_text`` holds one recognised line
        per row, and ``api_response`` is a JSON string. On failure the summary
        carries the error message, the text is empty and the JSON is
        ``{"success": false, "error": ...}``.
    """
    if file is None:
        return "No file uploaded", "", ""

    # Recognised lines at or below this confidence are discarded.
    min_confidence = 0.5

    start_time = time.time()
    try:
        # A plain path must be checked first: pathlib paths also have a
        # ``.name`` attribute, but it holds only the final path component.
        if isinstance(file, (str, os.PathLike)):
            file_path = os.fspath(file)
        else:
            file_path = file.name
        filename = os.path.basename(file_path)
        print(f"Processing: {filename}")
        print(f"File path: {file_path}")

        # Count pages if PDF; every other input is treated as a single page.
        total_pages = 1
        if filename.lower().endswith('.pdf'):
            try:
                print(f"Opening PDF: {file_path}")
                doc = fitz.open(file_path)
                try:
                    total_pages = len(doc)
                finally:
                    # Release the handle even if counting fails.
                    doc.close()
                print(f"PDF page count: {total_pages}")
            except Exception as e:
                # Page counting is informational only; OCR still runs.
                print(f"PDF page counting error: {e}")
                total_pages = 1

        # Run OCR
        print(f"Running OCR on: {file_path}")
        result = ocr.ocr(file_path, cls=True)

        # Extract text. The indexing below treats each entry as
        # (box, (text, confidence)); a falsy result or page is skipped.
        kept_lines = []
        pages_processed = 0
        for page_result in result or []:
            if not page_result:
                continue
            pages_processed += 1
            for line in page_result:
                if len(line) >= 2 and line[1][1] > min_confidence:
                    kept_lines.append(line[1][0])
        extracted_text = "".join(f"{text}\n" for text in kept_lines)

        processing_time = time.time() - start_time
        summary = (
            "\n"
            f"📄 **File**: {filename}\n"
            f"📊 **Pages Processed**: {pages_processed}/{total_pages}\n"
            f"⏱️ **Processing Time**: {processing_time:.2f} seconds\n"
            f"📝 **Text Length**: {len(extracted_text)} characters\n"
            "🔧 **OCR Engine**: PaddleOCR\n"
        )
        api_response = json.dumps({
            "success": True,
            "text": extracted_text,
            "filename": filename,
            "pages_processed": pages_processed,
            "total_pages": total_pages,
            "processing_time": processing_time,
            "ocr_engine": "PaddleOCR"
        }, indent=2)
        return summary, extracted_text, api_response
    except Exception as e:
        # UI boundary: report the failure to the user instead of raising.
        error_msg = f"Error processing file: {str(e)}"
        print(f"Full error: {e}")
        import traceback
        traceback.print_exc()
        return error_msg, "", json.dumps({"success": False, "error": str(e)})
def process_api_request(api_data):
    """Process an API-style OCR request and return a JSON string.

    Args:
        api_data: JSON text of an object with a required ``"file"`` key
            (base64-encoded document bytes) and an optional ``"filename"``
            (defaults to ``"unknown.pdf"``; its extension is used as the
            suffix of the temporary file).

    Returns:
        str: JSON. On success ``{"success": true, "text", "filename",
        "ocr_engine"}``; on any failure ``{"success": false, "error": ...}``.
        This function does not raise for ordinary errors.
    """
    try:
        data = json.loads(api_data)
        if not isinstance(data, dict):
            return json.dumps({"success": False, "error": "Request must be a JSON object"})
        if 'file' not in data:
            return json.dumps({"success": False, "error": "No file data provided"})

        # Decode base64 file
        file_data = base64.b64decode(data['file'])
        filename = data.get('filename', 'unknown.pdf')
        suffix = os.path.splitext(filename)[1]

        # OCR is given a path, so the payload goes to a temp file. The file is
        # created before the try block so that the finally clause removes it
        # even when writing the payload fails.
        tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
        tmp_file_path = tmp_file.name
        try:
            with tmp_file:
                tmp_file.write(file_data)

            # Run OCR
            result = ocr.ocr(tmp_file_path, cls=True)

            # Extract text; a falsy result or page yields no text.
            # NOTE(review): unlike process_document, no confidence threshold
            # is applied here - confirm whether that is intended.
            recognised = []
            for page_result in result or []:
                if not page_result:
                    continue
                for line in page_result:
                    if len(line) >= 2:
                        recognised.append(line[1][0])
            text = "".join(f"{entry}\n" for entry in recognised)

            return json.dumps({
                "success": True,
                "text": text,
                "filename": filename,
                "ocr_engine": "PaddleOCR"
            })
        finally:
            os.unlink(tmp_file_path)
    except Exception as e:
        # API boundary: every failure is reported in the JSON payload.
        return json.dumps({"success": False, "error": str(e)})
# Long literals live at column 0 so that neither the JSON samples nor the
# Markdown pick up indentation from the surrounding code (leading spaces
# would change how Markdown renders).
# NOTE(review): process_api_request takes the request as a JSON *string*,
# while the sample below shows a JSON object inside "data"; confirm the shape
# callers must send before relying on this documentation.
SAMPLE_API_REQUEST = '''
{
  "data": [
    {
      "file": "base64_encoded_file_data_here",
      "filename": "lab_report.pdf"
    }
  ]
}
'''

SAMPLE_API_RESPONSE = '''
{
  "data": [
    {
      "success": true,
      "text": "Extracted text content...",
      "filename": "lab_report.pdf",
      "ocr_engine": "PaddleOCR"
    }
  ]
}
'''

ABOUT_MARKDOWN = """
### 🎯 Purpose
This service extracts text from medical documents using PaddleOCR, specifically designed for lab reports and medical forms.
### 🔧 Integration
This Hugging Face Space can be integrated with your Vercel app as an external OCR service.
### 📋 Supported Formats
- PDF documents (multi-page)
- JPEG/JPG images
- PNG images
### 🚀 Features
- High accuracy OCR with PaddleOCR
- Medical document optimization
- Multi-page PDF support
- RESTful API integration
- Free hosting on Hugging Face
- SSL compatibility fixes included
### 🔗 Integration URL
`https://mbuck17-paddleocr-processor.hf.space/api/predict`
"""

# Create Gradio interface with multiple tabs
with gr.Blocks(title="PaddleOCR Medical Document Processor") as demo:
    gr.Markdown("# 🏥 PaddleOCR Medical Document Processor")
    gr.Markdown("Upload medical documents (PDF/images) to extract text using PaddleOCR")

    with gr.Tab("📄 File Upload"):
        with gr.Row():
            with gr.Column():
                file_input = gr.File(
                    label="Upload Document (PDF, JPG, PNG)",
                    file_types=[".pdf", ".jpg", ".jpeg", ".png"]
                )
                process_btn = gr.Button("🚀 Process Document", variant="primary")
            with gr.Column():
                summary_output = gr.Markdown(label="📋 Processing Summary")
        with gr.Row():
            text_output = gr.Textbox(
                label="📝 Extracted Text",
                lines=15,
                max_lines=20
            )
        # process_document returns three values; the JSON payload goes to a
        # hidden textbox so it is part of the outputs without being displayed.
        api_json_output = gr.Textbox(visible=False)
        process_btn.click(
            fn=process_document,
            inputs=[file_input],
            outputs=[summary_output, text_output, api_json_output]
        )

    with gr.Tab("🔗 API Integration"):
        gr.Markdown("### For integration with your Vercel app:")
        gr.Markdown("**Endpoint**: `https://mbuck17-paddleocr-processor.hf.space/api/predict`")
        gr.Markdown("**Method**: POST")
        gr.Markdown("**Headers**: `Content-Type: application/json`")
        with gr.Row():
            with gr.Column():
                gr.Markdown("**Sample Request:**")
                gr.Code(SAMPLE_API_REQUEST, language="json")
            with gr.Column():
                gr.Markdown("**Sample Response:**")
                gr.Code(SAMPLE_API_RESPONSE, language="json")
        gr.Markdown("### Test API Request:")
        api_input = gr.Textbox(
            label="API Request (JSON)",
            placeholder='{"file": "base64_encoded_file_data", "filename": "document.pdf"}',
            lines=5
        )
        api_btn = gr.Button("🧪 Test API Request")
        api_output = gr.Textbox(
            label="API Response (JSON)",
            lines=10
        )
        api_btn.click(
            fn=process_api_request,
            inputs=[api_input],
            outputs=[api_output]
        )

    with gr.Tab("ℹ️ About"):
        gr.Markdown(ABOUT_MARKDOWN)

# Launch the app
if __name__ == "__main__":
    # Listen on all interfaces, port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)