Spaces:

mbuck17
/

paddleocr-processor

Sleeping

App Files Files Community

mbuckle committed on Jun 2, 2025

Commit

9b66525

1 Parent(s): 8c10e64

Page Count fix

Browse files

Files changed (1) hide show

app.py +140 -13

app.py CHANGED Viewed

@@ -1,17 +1,103 @@
-# app.py - Hugging Face Spaces version
-import gradio as gr
-import tempfile
 import os
 import time
 import base64
 import json
-from paddleocr import PaddleOCR
 import fitz  # PyMuPDF
-# Initialize PaddleOCR
-print("Loading PaddleOCR models...")
-ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
-print("PaddleOCR models loaded!")
 def process_document(file):
     """Process uploaded document with PaddleOCR"""
@@ -21,7 +107,6 @@ def process_document(file):
     start_time = time.time()
     try:
-        # Get file info
         filename = os.path.basename(file.name)
         print(f"Processing: {filename}")
@@ -30,7 +115,11 @@ def process_document(file):
         if filename.lower().endswith('.pdf'):
             try:
                 doc = fitz.open(file.name)
-                total_pages = len(doc)
                 doc.close()
             except Exception as e:
                 print(f"Could not count PDF pages: {e}")
@@ -57,6 +146,7 @@ def process_document(file):
 📊 **Pages Processed**: {pages_processed}/{total_pages}
 ⏱️ **Processing Time**: {processing_time:.2f} seconds
 📝 **Text Length**: {len(extracted_text)} characters
         """
         # For API compatibility, also return JSON format
@@ -66,13 +156,15 @@ def process_document(file):
             "filename": filename,
             "pages_processed": pages_processed,
             "total_pages": total_pages,
-            "processing_time": processing_time
         }, indent=2)
         return summary, extracted_text, api_response
     except Exception as e:
         error_msg = f"Error processing file: {str(e)}"
         return error_msg, "", json.dumps({"success": False, "error": str(e)})
 def process_api_request(api_data):
@@ -107,7 +199,8 @@ def process_api_request(api_data):
             return json.dumps({
                 "success": True,
                 "text": text,
-                "filename": filename
             })
         finally:
@@ -148,10 +241,40 @@ with gr.Blocks(title="PaddleOCR Medical Document Processor") as demo:
     with gr.Tab("🔌 API Integration"):
         gr.Markdown("### For integration with your Vercel app:")
-        gr.Markdown("**Endpoint**: `https://your-space-name-your-username.hf.space/api/predict`")
         gr.Markdown("**Method**: POST")
         gr.Markdown("**Headers**: `Content-Type: application/json`")
         api_input = gr.Textbox(
             label="API Request (JSON)",
             placeholder='{"file": "base64_encoded_file_data", "filename": "document.pdf"}',
@@ -188,6 +311,10 @@ with gr.Blocks(title="PaddleOCR Medical Document Processor") as demo:
         - Multi-page PDF support
         - RESTful API integration
         - Free hosting on Hugging Face
         """)
 # Launch the app

+# app.py - Complete Hugging Face Spaces app with SSL fix
 import os
+import subprocess
+import sys
+import tempfile
 import time
 import base64
 import json
+# Try to fix SSL library issue before importing PaddleOCR
def fix_ssl_library(
    system_lib_dir='/usr/lib/x86_64-linux-gnu',
    deb_url='http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb',
    work_dir='/tmp',
    timeout=120,
):
    """Make libssl.so.1.1 available to this process, installing it if needed.

    If ``libssl.so.1.1`` is already present in ``system_lib_dir`` nothing is
    done. Otherwise the libssl1.1 ``.deb`` is downloaded with ``wget`` and
    installed with ``dpkg -i``; when that fails, the package is unpacked with
    ``dpkg -x`` under ``work_dir`` and the unpacked libraries are loaded into
    the current process.

    Args:
        system_lib_dir: Directory checked for an existing ``libssl.so.1.1``.
        deb_url: URL of the libssl1.1 Debian package to download.
        work_dir: Directory receiving the downloaded ``.deb`` and the
            ``ssl_extract`` tree.
        timeout: Seconds allowed for each external command, so a stalled
            download cannot block startup forever.

    Returns:
        True when the library was already present or the install/extract
        steps completed; False when any step raised (the error is printed,
        never propagated).
    """
    import ctypes  # local import: only needed on the manual-extraction path

    deb_path = os.path.join(work_dir, 'libssl1.1.deb')
    extract_dir = os.path.join(work_dir, 'ssl_extract')

    try:
        # Check if libssl1.1 already exists
        if os.path.exists(os.path.join(system_lib_dir, 'libssl.so.1.1')):
            print("libssl.so.1.1 already exists")
            return True

        print("Attempting to install libssl1.1...")

        # NOTE(security): the default URL is plain HTTP and the package is not
        # checksum-verified before being installed — consider pinning a hash.
        subprocess.run(
            ['wget', '-q', deb_url, '-O', deb_path],
            check=True,
            timeout=timeout,
        )

        # Try to install the package
        result = subprocess.run(
            ['dpkg', '-i', deb_path],
            capture_output=True,
            text=True,
            timeout=timeout,
        )

        # If dpkg install failed, try extracting manually
        if result.returncode != 0:
            print("dpkg install failed, trying manual extraction...")
            subprocess.run(
                ['dpkg', '-x', deb_path, extract_dir],
                check=True,
                timeout=timeout,
            )

            # Export the directory for any child processes we spawn later.
            lib_path = os.path.join(extract_dir, 'usr/lib/x86_64-linux-gnu')
            current_ld_path = os.environ.get('LD_LIBRARY_PATH', '')
            if current_ld_path:
                os.environ['LD_LIBRARY_PATH'] = f"{lib_path}:{current_ld_path}"
            else:
                os.environ['LD_LIBRARY_PATH'] = lib_path
            print(f"Set LD_LIBRARY_PATH to: {os.environ['LD_LIBRARY_PATH']}")

            # The dynamic loader reads LD_LIBRARY_PATH at process start, so
            # the assignment above does not help the running interpreter.
            # Preload the libraries by absolute path instead; a later load of
            # an extension that needs libssl.so.1.1 then reuses them.
            # libcrypto goes first because libssl depends on it.
            crypto_path = os.path.join(lib_path, 'libcrypto.so.1.1')
            if os.path.exists(crypto_path):
                ctypes.CDLL(crypto_path, mode=ctypes.RTLD_GLOBAL)
            ctypes.CDLL(
                os.path.join(lib_path, 'libssl.so.1.1'),
                mode=ctypes.RTLD_GLOBAL,
            )

        return True

    except Exception as e:
        # Best-effort helper: report the failure and let the caller decide.
        print(f"Failed to install libssl1.1: {e}")
        return False
+# Try alternative PaddlePaddle versions
def try_paddle_import():
    """Import and return the ``PaddleOCR`` class, applying known workarounds.

    Runs ``fix_ssl_library()`` first, then attempts the import. If the import
    fails with an error mentioning ``libssl.so.1.1``, ``paddlepaddle`` is
    reinstalled at version 2.4.2 via pip and the import is retried once.

    Returns:
        The ``PaddleOCR`` class object.

    Raises:
        ImportError: The original import error, when the import still fails
            after all workarounds (or the error was unrelated to libssl).
    """
    import importlib  # local import: only used on the reinstall path

    # First try the SSL fix (best-effort; its result does not gate the import)
    fix_ssl_library()

    # NOTE(review): the effect of PADDLE_GIT_DISABLE is not evident from this
    # file — confirm it is still required.
    os.environ['PADDLE_GIT_DISABLE'] = '1'

    try:
        from paddleocr import PaddleOCR
        return PaddleOCR
    except ImportError as e:
        if 'libssl.so.1.1' in str(e):
            print("Still having SSL issues, trying alternative PaddlePaddle version...")
            # Try installing older version
            try:
                # Uninstall is best-effort; its exit status is deliberately ignored.
                subprocess.run(
                    [sys.executable, '-m', 'pip', 'uninstall', 'paddlepaddle', '-y'],
                    capture_output=True,
                )
                subprocess.run(
                    [sys.executable, '-m', 'pip', 'install', 'paddlepaddle==2.4.2'],
                    check=True,
                )

                # Submodules imported before the first failure remain cached in
                # sys.modules, and the import finders cache directory listings.
                # Drop both so the retry sees the freshly installed package.
                for mod_name in list(sys.modules):
                    if mod_name.split('.')[0] in ('paddle', 'paddleocr'):
                        del sys.modules[mod_name]
                importlib.invalidate_caches()

                from paddleocr import PaddleOCR
                return PaddleOCR
            except Exception as inner_e:
                print(f"Failed to install alternative version: {inner_e}")

        print(f"PaddleOCR import failed: {e}")
        # Bare raise re-raises the original ImportError with its traceback intact.
        raise
+# Import other required libraries
+import gradio as gr
 import fitz  # PyMuPDF
+# Try to import PaddleOCR with fixes
# Import PaddleOCR (with workarounds) and load the models once at startup.
# Any failure here is fatal: the app cannot serve requests without the engine.
print("Attempting to import PaddleOCR...")
try:
    PaddleOCR = try_paddle_import()
    print("Loading PaddleOCR models...")
    ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
    print("PaddleOCR models loaded successfully!")
except Exception as e:
    print(f"Failed to load PaddleOCR: {e}")
    # This handler also catches model-loading errors, so do not blame SSL
    # specifically; the line above carries the actual cause.
    print("Application will exit - PaddleOCR could not be initialized")
    sys.exit(1)
 def process_document(file):
     """Process uploaded document with PaddleOCR"""
     start_time = time.time()
     try:
         filename = os.path.basename(file.name)
         print(f"Processing: {filename}")
         if filename.lower().endswith('.pdf'):
             try:
                 doc = fitz.open(file.name)
+                # Handle different PyMuPDF versions
+                try:
+                    total_pages = doc.page_count  # Newer versions
+                except AttributeError:
+                    total_pages = len(doc)  # Older versions or alternative
                 doc.close()
             except Exception as e:
                 print(f"Could not count PDF pages: {e}")
 📊 **Pages Processed**: {pages_processed}/{total_pages}
 ⏱️ **Processing Time**: {processing_time:.2f} seconds
 📝 **Text Length**: {len(extracted_text)} characters
+🔧 **OCR Engine**: PaddleOCR
         """
         # For API compatibility, also return JSON format
             "filename": filename,
             "pages_processed": pages_processed,
             "total_pages": total_pages,
+            "processing_time": processing_time,
+            "ocr_engine": "PaddleOCR"
         }, indent=2)
         return summary, extracted_text, api_response
     except Exception as e:
         error_msg = f"Error processing file: {str(e)}"
+        print(f"Processing error: {e}")
         return error_msg, "", json.dumps({"success": False, "error": str(e)})
 def process_api_request(api_data):
             return json.dumps({
                 "success": True,
                 "text": text,
+                "filename": filename,
+                "ocr_engine": "PaddleOCR"
             })
         finally:
     with gr.Tab("🔌 API Integration"):
         gr.Markdown("### For integration with your Vercel app:")
+        gr.Markdown("**Endpoint**: `https://mbuck17-paddleocr-processor.hf.space/api/predict`")
         gr.Markdown("**Method**: POST")
         gr.Markdown("**Headers**: `Content-Type: application/json`")
+        with gr.Row():
+            with gr.Column():
+                gr.Markdown("**Sample Request:**")
+                gr.Code('''
+{
+  "data": [
+    {
+      "file": "base64_encoded_file_data_here",
+      "filename": "lab_report.pdf"
+    }
+  ]
+}
+                ''', language="json")
+            with gr.Column():
+                gr.Markdown("**Sample Response:**")
+                gr.Code('''
+{
+  "data": [
+    {
+      "success": true,
+      "text": "Extracted text content...",
+      "filename": "lab_report.pdf",
+      "ocr_engine": "PaddleOCR"
+    }
+  ]
+}
+                ''', language="json")
+        gr.Markdown("### Test API Request:")
         api_input = gr.Textbox(
             label="API Request (JSON)",
             placeholder='{"file": "base64_encoded_file_data", "filename": "document.pdf"}',
         - Multi-page PDF support
         - RESTful API integration
         - Free hosting on Hugging Face
+        - SSL compatibility fixes included
+        ### 🔗 Integration URL
+        `https://mbuck17-paddleocr-processor.hf.space/api/predict`
         """)
 # Launch the app