Spaces:

mbuck17
/

paddleocr-processor

Sleeping

App Files Files Community

mbuckle committed on Jun 3, 2025

Commit

56630b3

1 Parent(s): e02f05d

Update monkey patch

Browse files

Files changed (1) hide show

app.py +59 -53

app.py CHANGED Viewed

@@ -1,4 +1,5 @@
-# app.py - Complete Hugging Face Spaces app with SSL fix
 import os
 import subprocess
 import sys
@@ -7,37 +8,32 @@ import time
 import base64
 import json
-# Try to fix SSL library issue before importing PaddleOCR
 def fix_ssl_library():
     """Download and install libssl1.1 if not present"""
     try:
-        # Check if libssl1.1 already exists
         if os.path.exists('/usr/lib/x86_64-linux-gnu/libssl.so.1.1'):
             print("libssl.so.1.1 already exists")
             return True
         print("Attempting to install libssl1.1...")
-        # Download libssl1.1 from Ubuntu repos
         subprocess.run([
             'wget', '-q',
             'http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb',
             '-O', '/tmp/libssl1.1.deb'
         ], check=True)
-        # Try to install the package
         result = subprocess.run([
             'dpkg', '-i', '/tmp/libssl1.1.deb'
         ], capture_output=True, text=True)
-        # If dpkg install failed, try extracting manually
         if result.returncode != 0:
             print("dpkg install failed, trying manual extraction...")
             subprocess.run([
                 'dpkg', '-x', '/tmp/libssl1.1.deb', '/tmp/ssl_extract'
             ], check=True)
-            # Set LD_LIBRARY_PATH to include the extracted libraries
             lib_path = '/tmp/ssl_extract/usr/lib/x86_64-linux-gnu'
             current_ld_path = os.environ.get('LD_LIBRARY_PATH', '')
             if current_ld_path:
@@ -52,8 +48,12 @@ def fix_ssl_library():
         print(f"Failed to install libssl1.1: {e}")
         return False
 def monkey_patch_pymupdf():
     """Fix PaddleOCR compatibility with newer PyMuPDF versions"""
     import fitz
     # Add pageCount property to Document class if it doesn't exist
@@ -62,7 +62,9 @@ def monkey_patch_pymupdf():
             return self.page_count
         fitz.Document.pageCount = property(pageCount_property)
-        print("Added pageCount compatibility property to PyMuPDF Document class")
     # Add getPixmap method to Page class if it doesn't exist
     if not hasattr(fitz.Page, 'getPixmap'):
@@ -70,25 +72,31 @@ def monkey_patch_pymupdf():
             return self.get_pixmap(matrix=matrix, alpha=alpha)
         fitz.Page.getPixmap = getPixmap
-        print("Added getPixmap compatibility method to PyMuPDF Page class")
-    # Add other common compatibility methods if needed
     if not hasattr(fitz.Page, 'getText'):
         def getText(self, option="text"):
             return self.get_text(option)
         fitz.Page.getText = getText
-        print("Added getText compatibility method to PyMuPDF Page class")
-    print("PyMuPDF compatibility patches applied successfully")
-# Try alternative PaddlePaddle versions
 def try_paddle_import():
     """Try different approaches to import PaddleOCR"""
     # First try the SSL fix
     fix_ssl_library()
     # Try importing with different environment variables
     os.environ['PADDLE_GIT_DISABLE'] = '1'
@@ -99,7 +107,6 @@ def try_paddle_import():
         if 'libssl.so.1.1' in str(e):
             print("Still having SSL issues, trying alternative PaddlePaddle version...")
-            # Try installing older version
             try:
                 subprocess.run([sys.executable, '-m', 'pip', 'uninstall', 'paddlepaddle', '-y'],
                              capture_output=True)
@@ -113,9 +120,11 @@ def try_paddle_import():
         print(f"PaddleOCR import failed: {e}")
         raise e
-# Import other required libraries
 import gradio as gr
-import fitz  # PyMuPDF
 # Try to import PaddleOCR with fixes
 print("Attempting to import PaddleOCR...")
@@ -126,44 +135,43 @@ try:
     print("PaddleOCR models loaded successfully!")
 except Exception as e:
     print(f"Failed to load PaddleOCR: {e}")
-    print("Application will exit - SSL library issue not resolved")
     sys.exit(1)
 def process_document(file):
-    """Process uploaded document with PaddleOCR - Debug Version"""
     if file is None:
         return "No file uploaded", "", ""
     start_time = time.time()
     try:
-        # Debug file object
-        print(f"File object type: {type(file)}")
-        print(f"File object attributes: {dir(file)}")
-        # Try different ways to get filename
-        try:
-            filename = os.path.basename(file.name)
-        except AttributeError:
-            try:
-                filename = file.orig_name if hasattr(file, 'orig_name') else 'unknown.pdf'
-            except:
-                filename = 'unknown.pdf'
         print(f"Processing: {filename}")
-        # Try different ways to access file path
-        file_path = None
-        if hasattr(file, 'name'):
-            file_path = file.name
-        elif hasattr(file, 'path'):
-            file_path = file.path
-        elif hasattr(file, 'file'):
-            file_path = file.file.name if hasattr(file.file, 'name') else None
-        if not file_path:
-            return "Error: Could not access file path", "", json.dumps({"success": False, "error": "File path not accessible"})
         print(f"File path: {file_path}")
         # Count pages if PDF
@@ -173,17 +181,16 @@ def process_document(file):
                 print(f"Opening PDF: {file_path}")
                 doc = fitz.open(file_path)
-                # Debug document object
-                print(f"Document object type: {type(doc)}")
-                print(f"Document attributes: {[attr for attr in dir(doc) if not attr.startswith('_')]}")
-                # Try all possible ways to get page count
-                if hasattr(doc, 'page_count'):
-                    total_pages = doc.page_count
-                    print(f"Used page_count: {total_pages}")
-                elif hasattr(doc, 'pageCount'):
                     total_pages = doc.pageCount
                     print(f"Used pageCount: {total_pages}")
                 else:
                     total_pages = len(doc)
                     print(f"Used len(): {total_pages}")
@@ -196,7 +203,6 @@ def process_document(file):
         # Run OCR
         print(f"Running OCR on: {file_path}")
         result = ocr.ocr(file_path, cls=True)
-        print(f"OCR result type: {type(result)}")
         # Extract text
         extracted_text = ""
@@ -238,7 +244,7 @@ def process_document(file):
         import traceback
         traceback.print_exc()
         return error_msg, "", json.dumps({"success": False, "error": str(e)})
 def process_api_request(api_data):
     """Process API-style requests (for integration with your Vercel app)"""
     try:

+# app.py - Correct structure: PyMuPDF monkey patch is applied BEFORE PaddleOCR is imported
 import os
 import subprocess
 import sys
 import base64
 import json
+# SSL fix function (keep as is)
 def fix_ssl_library():
     """Download and install libssl1.1 if not present"""
     try:
         if os.path.exists('/usr/lib/x86_64-linux-gnu/libssl.so.1.1'):
             print("libssl.so.1.1 already exists")
             return True
         print("Attempting to install libssl1.1...")
         subprocess.run([
             'wget', '-q',
             'http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb',
             '-O', '/tmp/libssl1.1.deb'
         ], check=True)
         result = subprocess.run([
             'dpkg', '-i', '/tmp/libssl1.1.deb'
         ], capture_output=True, text=True)
         if result.returncode != 0:
             print("dpkg install failed, trying manual extraction...")
             subprocess.run([
                 'dpkg', '-x', '/tmp/libssl1.1.deb', '/tmp/ssl_extract'
             ], check=True)
             lib_path = '/tmp/ssl_extract/usr/lib/x86_64-linux-gnu'
             current_ld_path = os.environ.get('LD_LIBRARY_PATH', '')
             if current_ld_path:
         print(f"Failed to install libssl1.1: {e}")
         return False
+# CRITICAL: Apply monkey patch BEFORE PaddleOCR is imported (the patch itself imports fitz/PyMuPDF)
 def monkey_patch_pymupdf():
     """Fix PaddleOCR compatibility with newer PyMuPDF versions"""
+    print("Applying PyMuPDF compatibility patches...")
+    # Import fitz here to apply patches
     import fitz
     # Add pageCount property to Document class if it doesn't exist
             return self.page_count
         fitz.Document.pageCount = property(pageCount_property)
+        print("✓ Added pageCount compatibility property to PyMuPDF Document class")
+    else:
+        print("✓ pageCount already exists")
     # Add getPixmap method to Page class if it doesn't exist
     if not hasattr(fitz.Page, 'getPixmap'):
             return self.get_pixmap(matrix=matrix, alpha=alpha)
         fitz.Page.getPixmap = getPixmap
+        print("✓ Added getPixmap compatibility method to PyMuPDF Page class")
+    else:
+        print("✓ getPixmap already exists")
+    # Add getText method if it doesn't exist
     if not hasattr(fitz.Page, 'getText'):
         def getText(self, option="text"):
             return self.get_text(option)
         fitz.Page.getText = getText
+        print("✓ Added getText compatibility method to PyMuPDF Page class")
+    else:
+        print("✓ getText already exists")
+    print("✓ PyMuPDF compatibility patches applied successfully")
 def try_paddle_import():
     """Try different approaches to import PaddleOCR"""
     # First try the SSL fix
     fix_ssl_library()
+    # CRITICAL: Apply PyMuPDF compatibility patches BEFORE importing PaddleOCR
+    monkey_patch_pymupdf()
     # Try importing with different environment variables
     os.environ['PADDLE_GIT_DISABLE'] = '1'
         if 'libssl.so.1.1' in str(e):
             print("Still having SSL issues, trying alternative PaddlePaddle version...")
             try:
                 subprocess.run([sys.executable, '-m', 'pip', 'uninstall', 'paddlepaddle', '-y'],
                              capture_output=True)
         print(f"PaddleOCR import failed: {e}")
         raise e
+# Import Gradio
 import gradio as gr
+# Import PyMuPDF AFTER monkey patch is defined but BEFORE PaddleOCR
+import fitz  # This import will use the patched version
 # Try to import PaddleOCR with fixes
 print("Attempting to import PaddleOCR...")
     print("PaddleOCR models loaded successfully!")
 except Exception as e:
     print(f"Failed to load PaddleOCR: {e}")
+    print("Application will exit - compatibility issue not resolved")
     sys.exit(1)
+# Test the monkey patch
+print("Testing monkey patch...")
+test_doc = None
+try:
+    # Create a simple test to verify pageCount exists
+    import io
+    pdf_content = b"%PDF-1.4\n1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >>\nendobj\nxref\n0 4\n0000000000 65535 f \n0000000010 00000 n \n0000000053 00000 n \n0000000100 00000 n \ntrailer\n<< /Size 4 /Root 1 0 R >>\nstartxref\n179\n%%EOF"
+    test_doc = fitz.open(stream=pdf_content, filetype="pdf")
+    if hasattr(test_doc, 'pageCount'):
+        print(f"✓ Monkey patch successful! pageCount = {test_doc.pageCount}")
+    else:
+        print("✗ Monkey patch failed - pageCount not found")
+        print(f"Available attributes: {[attr for attr in dir(test_doc) if 'count' in attr.lower()]}")
+    test_doc.close()
+except Exception as e:
+    print(f"Monkey patch test failed: {e}")
+    if test_doc:
+        test_doc.close()
+# Rest of your app code (process_document, API functions, Gradio interface, etc.)
 def process_document(file):
+    """Process uploaded document with PaddleOCR"""
     if file is None:
         return "No file uploaded", "", ""
     start_time = time.time()
     try:
+        filename = os.path.basename(file.name)
         print(f"Processing: {filename}")
+        file_path = file.name
         print(f"File path: {file_path}")
         # Count pages if PDF
                 print(f"Opening PDF: {file_path}")
                 doc = fitz.open(file_path)
+                # Test pageCount attribute
+                print(f"Document has pageCount attribute: {hasattr(doc, 'pageCount')}")
+                print(f"Document has page_count attribute: {hasattr(doc, 'page_count')}")
+                if hasattr(doc, 'pageCount'):
                     total_pages = doc.pageCount
                     print(f"Used pageCount: {total_pages}")
+                elif hasattr(doc, 'page_count'):
+                    total_pages = doc.page_count
+                    print(f"Used page_count: {total_pages}")
                 else:
                     total_pages = len(doc)
                     print(f"Used len(): {total_pages}")
         # Run OCR
         print(f"Running OCR on: {file_path}")
         result = ocr.ocr(file_path, cls=True)
         # Extract text
         extracted_text = ""
         import traceback
         traceback.print_exc()
         return error_msg, "", json.dumps({"success": False, "error": str(e)})
 def process_api_request(api_data):
     """Process API-style requests (for integration with your Vercel app)"""
     try: