Spaces:

mbuck17
/

paddleocr-processor

Sleeping

App Files Files Community

mbuckle committed on Jun 3, 2025

Commit

b7aa35b

1 Parent(s): 728c43f

Add standalone script

Browse files

Files changed (2) hide show

app.py +102 -189
paddle_ocr_standalone.py +93 -0

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# app.py - Simple fix mirroring your local implementation
 import os
 import subprocess
@@ -8,143 +8,83 @@ import time
 import base64
 import json
-# SSL fix function (keep as is)
-def fix_ssl_library():
-    """Download and install libssl1.1 if not present"""
     try:
-        if os.path.exists('/usr/lib/x86_64-linux-gnu/libssl.so.1.1'):
-            print("libssl.so.1.1 already exists")
-            return True
-        print("Attempting to install libssl1.1...")
-        subprocess.run([
-            'wget', '-q',
-            'http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb',
-            '-O', '/tmp/libssl1.1.deb'
-        ], check=True)
-        result = subprocess.run([
-            'dpkg', '-i', '/tmp/libssl1.1.deb'
-        ], capture_output=True, text=True)
-        if result.returncode != 0:
-            print("dpkg install failed, trying manual extraction...")
-            subprocess.run([
-                'dpkg', '-x', '/tmp/libssl1.1.deb', '/tmp/ssl_extract'
-            ], check=True)
-            lib_path = '/tmp/ssl_extract/usr/lib/x86_64-linux-gnu'
-            current_ld_path = os.environ.get('LD_LIBRARY_PATH', '')
-            if current_ld_path:
-                os.environ['LD_LIBRARY_PATH'] = f"{lib_path}:{current_ld_path}"
-            else:
-                os.environ['LD_LIBRARY_PATH'] = lib_path
-            print(f"Set LD_LIBRARY_PATH to: {os.environ['LD_LIBRARY_PATH']}")
-        return True
-    except Exception as e:
-        print(f"Failed to install libssl1.1: {e}")
-        return False
-def try_paddle_import():
-    """Try different approaches to import PaddleOCR"""
-    # First try the SSL fix
-    fix_ssl_library()
-    # Try importing with different environment variables
-    os.environ['PADDLE_GIT_DISABLE'] = '1'
-    try:
-        from paddleocr import PaddleOCR
-        return PaddleOCR
-    except ImportError as e:
-        if 'libssl.so.1.1' in str(e):
-            print("Still having SSL issues, trying alternative PaddlePaddle version...")
-            try:
-                subprocess.run([sys.executable, '-m', 'pip', 'uninstall', 'paddlepaddle', '-y'],
-                             capture_output=True)
-                subprocess.run([sys.executable, '-m', 'pip', 'install', 'paddlepaddle==2.4.2'],
-                             check=True)
-                from paddleocr import PaddleOCR
-                return PaddleOCR
-            except Exception as inner_e:
-                print(f"Failed to install alternative version: {inner_e}")
-        print(f"PaddleOCR import failed: {e}")
-        raise e
-# Import Gradio
-import gradio as gr
-# CRITICAL: Apply PyMuPDF compatibility patch BEFORE importing PaddleOCR
-print("Applying PyMuPDF compatibility patches...")
-import fitz
-# Add pageCount property to Document class if it doesn't exist
-if not hasattr(fitz.Document, 'pageCount'):
-    def pageCount_property(self):
-        return self.page_count
-    fitz.Document.pageCount = property(pageCount_property)
-    print("✓ Added pageCount compatibility property to PyMuPDF Document class")
-else:
-    print("✓ pageCount already exists")
-# Add getPixmap method to Page class if it doesn't exist
-if not hasattr(fitz.Page, 'getPixmap'):
-    def getPixmap(self, matrix=None, alpha=True):
-        return self.get_pixmap(matrix=matrix, alpha=alpha)
-    fitz.Page.getPixmap = getPixmap
-    print("✓ Added getPixmap compatibility method to PyMuPDF Page class")
-else:
-    print("✓ getPixmap already exists")
-# Add getText method if it doesn't exist
-if not hasattr(fitz.Page, 'getText'):
-    def getText(self, option="text"):
-        return self.get_text(option)
-    fitz.Page.getText = getText
-    print("✓ Added getText compatibility method to PyMuPDF Page class")
-else:
-    print("✓ getText already exists")
-print("✓ PyMuPDF compatibility patches applied successfully")
-# NOW import PaddleOCR after the patches are applied
-print("Attempting to import PaddleOCR...")
-try:
-    PaddleOCR = try_paddle_import()
-    print("Loading PaddleOCR models...")
-    # Use the same settings as your local implementation
-    ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
-    print("PaddleOCR models loaded successfully!")
-except Exception as e:
-    print(f"Failed to load PaddleOCR: {e}")
-    print("Application will exit - compatibility issue not resolved")
-    sys.exit(1)
-def count_pdf_pages(file_path):
-    """Count pages in PDF - mirrors your local implementation"""
-    try:
-        if file_path.lower().endswith('.pdf'):
-            doc = fitz.open(file_path)
-            page_count = len(doc)
-            doc.close()
-            return page_count
-        else:
-            return 1  # Images are considered as 1 page
     except Exception as e:
-        print(f"Error counting PDF pages: {e}")
-        return 1  # Default to 1 if we can't determine
 def process_document(file):
-    """Process uploaded document with PaddleOCR - mirrors your local implementation"""
     if file is None:
         return "No file uploaded", "", ""
@@ -157,49 +97,27 @@ def process_document(file):
         file_path = file.name
         print(f"File path: {file_path}")
-        # Count total pages - exactly like your local implementation
-        total_pages = count_pdf_pages(file_path)
-        print(f"Total pages detected: {total_pages}")
-        # Run OCR directly on the file path - just like your local version
-        print(f"Running OCR on: {file_path}")
-        # This is the exact same call as in your paddle_ocr.py
-        result = ocr.ocr(file_path, cls=True)
-        # Extract text - same logic as your local implementation
-        extracted_text = ""
-        pages_processed = 0
-        if result:
-            for page_idx, page_result in enumerate(result):
-                current_page = page_idx + 1
-                print(f"Processing page {current_page} of {total_pages}")
-                if page_result:
-                    pages_processed += 1
-                    page_text = ""
-                    for line in page_result:
-                        if len(line) >= 2:
-                            # Add confidence check like your local version might have
-                            confidence = line[1][1] if len(line[1]) > 1 else 1.0
-                            if confidence > 0.5:  # Only include high-confidence text
-                                page_text += line[1][0] + "\n"
-                    if page_text.strip():
-                        extracted_text += f"\n--- Page {current_page} ---\n"
-                        extracted_text += page_text
         processing_time = time.time() - start_time
-        print(f"Completed processing {total_pages} pages in {processing_time:.2f}s")
         summary = f"""
 📄 **File**: {filename}
 📊 **Pages Processed**: {pages_processed}/{total_pages}
 ⏱️ **Processing Time**: {processing_time:.2f} seconds
 📝 **Text Length**: {len(extracted_text)} characters
-🔧 **OCR Engine**: PaddleOCR (Direct PDF Processing)
         """
         api_response = json.dumps({
@@ -209,13 +127,14 @@ def process_document(file):
             "pages_processed": pages_processed,
             "total_pages": total_pages,
             "processing_time": processing_time,
-            "ocr_engine": "PaddleOCR"
         }, indent=2)
         return summary, extracted_text, api_response
     except Exception as e:
-        error_msg = f"Error processing file: {str(e)}"
         print(f"Full error: {e}")
         import traceback
         traceback.print_exc()
@@ -239,32 +158,21 @@ def process_api_request(api_data):
             tmp_file_path = tmp_file.name
         try:
-            # Count pages
-            total_pages = count_pdf_pages(tmp_file_path)
-            # Run OCR - same as your local implementation
-            result = ocr.ocr(tmp_file_path, cls=True)
-            # Extract text
-            text = ""
-            pages_processed = 0
-            if result:
-                for page_idx, page_result in enumerate(result):
-                    if page_result:
-                        pages_processed += 1
-                        for line in page_result:
-                            if len(line) >= 2:
-                                text += line[1][0] + "\n"
-            return json.dumps({
-                "success": True,
-                "text": text,
-                "filename": filename,
-                "pages_processed": pages_processed,
-                "total_pages": total_pages,
-                "ocr_engine": "PaddleOCR"
-            })
         finally:
             os.unlink(tmp_file_path)
@@ -331,7 +239,8 @@ with gr.Blocks(title="PaddleOCR Medical Document Processor") as demo:
       "success": true,
       "text": "Extracted text content...",
       "filename": "lab_report.pdf",
-      "ocr_engine": "PaddleOCR"
     }
   ]
 }
@@ -370,7 +279,7 @@ with gr.Blocks(title="PaddleOCR Medical Document Processor") as demo:
         ### 🚀 Features
         - High accuracy OCR with PaddleOCR
-        - Direct PDF processing (like your local implementation)
         - Medical document optimization
         - Multi-page PDF support
         - RESTful API integration
@@ -378,6 +287,10 @@ with gr.Blocks(title="PaddleOCR Medical Document Processor") as demo:
         ### 🔗 Integration URL
         `https://mbuck17-paddleocr-processor.hf.space/api/predict`
         """)
 # Launch the app

+# app.py - Using subprocess approach like your local Node.js implementation
 import os
 import subprocess
 import base64
 import json
+# Import Gradio
+import gradio as gr
def _load_ocr_json(raw_stdout):
    """Decode the JSON document printed on stdout by the OCR script.

    The whole output is tried first. If that fails and the output has more
    than one line, the last line is tried on its own, so stray lines printed
    before the result do not make an otherwise valid run unparseable.

    Args:
        raw_stdout: Everything the child process wrote to stdout.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If neither attempt yields valid JSON.
    """
    text = raw_stdout.strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        lines = text.splitlines()
        if len(lines) < 2:
            raise
        return json.loads(lines[-1])


def run_paddle_ocr_subprocess(file_path):
    """Run paddle_ocr_standalone.py on ``file_path`` and return its result.

    The script is looked up next to this module and started with the current
    interpreter. Its stderr is scanned line by line for ``TOTAL_PAGES:<n>``
    and ``CURRENT_PAGE:<n>`` progress markers, which are echoed with
    ``print``; its stdout is expected to carry one JSON object.

    Args:
        file_path: Path of the document handed to the OCR script.

    Returns:
        The dict decoded from the script's stdout on success. On any failure
        a dict ``{"success": False, "error": <message>}``; this function does
        not raise.
    """
    # Function-scope imports keep the block self-contained regardless of
    # what the module header imports.
    import sys
    import threading

    process = None
    try:
        # Get the path to our standalone OCR script
        script_path = os.path.join(os.path.dirname(__file__), 'paddle_ocr_standalone.py')
        command = [sys.executable, script_path, file_path]
        print(f"Running command: {' '.join(command)}")

        total_pages = 1
        process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            bufsize=1,
        )

        # Drain stdout concurrently. Reading stderr to EOF first while stdout
        # is an unread pipe deadlocks as soon as the child's output exceeds
        # the OS pipe buffer: the child blocks on write and never closes
        # stderr, so the parent never sees EOF.
        stdout_parts = []

        def _drain_stdout():
            stdout_parts.append(process.stdout.read())

        stdout_thread = threading.Thread(target=_drain_stdout, daemon=True)
        stdout_thread.start()

        # Read stderr for progress updates
        stderr_lines = []
        for stderr_line in process.stderr:
            stderr_lines.append(stderr_line)
            is_total = stderr_line.startswith('TOTAL_PAGES:')
            if not is_total and not stderr_line.startswith('CURRENT_PAGE:'):
                continue
            try:
                value = int(stderr_line.split(':')[1].strip())
            except ValueError:
                # A malformed progress marker is cosmetic; do not abort OCR.
                continue
            if is_total:
                total_pages = value
                print(f"Processing document with {total_pages} pages")
            else:
                print(f"Processing page {value} of {total_pages}")

        stdout_thread.join()
        process.wait()
        stdout = ''.join(stdout_parts)
        stderr_output = ''.join(stderr_lines)

        if process.returncode != 0:
            print(f"OCR process failed with return code {process.returncode}")
            print(f"stderr: {stderr_output}")
            # The child may report its failure as JSON on stdout; prefer that
            # message, then stderr, then the bare exit code.
            try:
                payload = _load_ocr_json(stdout)
            except json.JSONDecodeError:
                payload = None
            if isinstance(payload, dict) and payload.get("error"):
                detail = str(payload["error"])
            else:
                detail = stderr_output or f"exit code {process.returncode}"
            return {
                "success": False,
                "error": f"OCR process failed: {detail}"
            }

        # Parse the JSON result from stdout
        try:
            result = _load_ocr_json(stdout)
        except json.JSONDecodeError as e:
            print(f"Failed to parse OCR result: {e}")
            print(f"stdout: {stdout}")
            return {
                "success": False,
                "error": f"Failed to parse OCR result: {str(e)}"
            }
        if not isinstance(result, dict):
            print(f"stdout: {stdout}")
            return {
                "success": False,
                "error": "Failed to parse OCR result: expected a JSON object"
            }

        print(f"OCR completed successfully: {result.get('pages_processed', 0)}/{result.get('total_pages', 0)} pages")
        return result

    except Exception as e:
        print(f"Error running OCR subprocess: {e}")
        return {
            "success": False,
            "error": str(e)
        }
    finally:
        # Never leave a child running if we bailed out early.
        if process is not None and process.poll() is None:
            process.kill()
            process.wait()
 def process_document(file):
+    """Process uploaded document using subprocess OCR"""
     if file is None:
         return "No file uploaded", "", ""
         file_path = file.name
         print(f"File path: {file_path}")
+        # Run OCR using subprocess (like your Node.js implementation)
+        ocr_result = run_paddle_ocr_subprocess(file_path)
+        if not ocr_result.get("success", False):
+            error_msg = f"❌ OCR failed: {ocr_result.get('error', 'Unknown error')}"
+            return error_msg, "", json.dumps(ocr_result)
+        # Extract results
+        extracted_text = ocr_result.get("text", "")
+        pages_processed = ocr_result.get("pages_processed", 0)
+        total_pages = ocr_result.get("total_pages", 1)
         processing_time = time.time() - start_time
         summary = f"""
 📄 **File**: {filename}
 📊 **Pages Processed**: {pages_processed}/{total_pages}
 ⏱️ **Processing Time**: {processing_time:.2f} seconds
 📝 **Text Length**: {len(extracted_text)} characters
+🔧 **OCR Engine**: PaddleOCR (Subprocess)
+✅ **Method**: Subprocess execution (like your local Node.js implementation)
         """
         api_response = json.dumps({
             "pages_processed": pages_processed,
             "total_pages": total_pages,
             "processing_time": processing_time,
+            "ocr_engine": "PaddleOCR",
+            "method": "subprocess"
         }, indent=2)
         return summary, extracted_text, api_response
     except Exception as e:
+        error_msg = f"❌ Error processing file: {str(e)}"
         print(f"Full error: {e}")
         import traceback
         traceback.print_exc()
             tmp_file_path = tmp_file.name
         try:
+            # Run OCR using subprocess
+            ocr_result = run_paddle_ocr_subprocess(tmp_file_path)
+            if ocr_result.get("success", False):
+                return json.dumps({
+                    "success": True,
+                    "text": ocr_result.get("text", ""),
+                    "filename": filename,
+                    "pages_processed": ocr_result.get("pages_processed", 0),
+                    "total_pages": ocr_result.get("total_pages", 1),
+                    "ocr_engine": "PaddleOCR",
+                    "method": "subprocess"
+                })
+            else:
+                return json.dumps(ocr_result)
         finally:
             os.unlink(tmp_file_path)
       "success": true,
       "text": "Extracted text content...",
       "filename": "lab_report.pdf",
+      "ocr_engine": "PaddleOCR",
+      "method": "subprocess"
     }
   ]
 }
         ### 🚀 Features
         - High accuracy OCR with PaddleOCR
+        - Subprocess execution (mirrors your local Node.js implementation)
         - Medical document optimization
         - Multi-page PDF support
         - RESTful API integration
         ### 🔗 Integration URL
         `https://mbuck17-paddleocr-processor.hf.space/api/predict`
+        ### ⚙️ Architecture
+        This implementation uses subprocess execution just like your local Node.js version,
+        ensuring maximum compatibility with PaddleOCR's PDF processing capabilities.
         """)
 # Launch the app

paddle_ocr_standalone.py ADDED Viewed

	@@ -0,0 +1,93 @@

#!/usr/bin/env python3
# paddle_ocr_standalone.py - Standalone OCR worker script.
#
# Usage:  python paddle_ocr_standalone.py <file_path>
# stdout: a single JSON object, either
#           {"success": true, "text", "total_pages", "pages_processed"} or
#           {"success": false, "error"}
# stderr: progress lines "TOTAL_PAGES:<n>" and "CURRENT_PAGE:<n>", plus an
#         "ERROR:<message>" line on failure
# exit:   0 on success, 1 on failure
import sys
import os
import json
from paddleocr import PaddleOCR
import fitz  # PyMuPDF for PDF page counting

# Apply monkey patch for PyMuPDF compatibility: add the camelCase names
# (pageCount, getPixmap, getText) as aliases of the snake_case API when the
# installed PyMuPDF does not provide them.
# NOTE(review): presumably needed because PaddleOCR's PDF path calls the
# camelCase names - confirm against the pinned paddleocr version.
if not hasattr(fitz.Document, 'pageCount'):
    def pageCount_property(self):
        return self.page_count
    fitz.Document.pageCount = property(pageCount_property)

if not hasattr(fitz.Page, 'getPixmap'):
    def getPixmap(self, matrix=None, alpha=True):
        return self.get_pixmap(matrix=matrix, alpha=alpha)
    fitz.Page.getPixmap = getPixmap

if not hasattr(fitz.Page, 'getText'):
    def getText(self, option="text"):
        return self.get_text(option)
    fitz.Page.getText = getText


def count_pdf_pages(file_path):
    """Return the page count of a PDF, or 1 for anything else.

    Non-PDF paths are treated as single-page images. If the PDF cannot be
    opened or measured, 1 is returned as a best-effort default.
    """
    if not file_path.lower().endswith('.pdf'):
        return 1  # Images are considered as 1 page
    try:
        doc = fitz.open(file_path)
        try:
            return len(doc)
        finally:
            doc.close()
    except Exception:
        return 1  # Default to 1 if we can't determine


def extract_text(result):
    """Flatten an OCR result into text with per-page headers.

    Emits a ``CURRENT_PAGE:<n>`` line on stderr for every page entry. A page
    counts as processed when its entry is non-empty; its text is included
    (under a ``--- Page n ---`` header) only when it is not blank.

    Args:
        result: Value returned by ``ocr.ocr``; iterated as one entry per
            page. ``None`` or an empty value yields no pages.

    Returns:
        Tuple ``(extracted_text, pages_processed)``.
    """
    sections = []
    pages_processed = 0
    # `result or []` guards against the engine returning None when nothing
    # was recognised.
    for current_page, page_result in enumerate(result or [], start=1):
        print(f"CURRENT_PAGE:{current_page}", file=sys.stderr, flush=True)
        if not page_result:
            continue
        pages_processed += 1
        # Each usable line is indexed as line[1][0] for its text.
        page_text = "".join(
            line[1][0] + "\n" for line in page_result if len(line) >= 2
        )
        if page_text.strip():
            sections.append(f"\n--- Page {current_page} ---\n{page_text}")
    return "".join(sections), pages_processed


def main(argv):
    """Run OCR on ``argv[1]``, print the JSON result, return the exit code."""
    # Check if file path was provided
    if len(argv) < 2:
        print(json.dumps({
            "success": False,
            "error": "Usage: python paddle_ocr_standalone.py <file_path>"
        }))
        return 1

    file_path = argv[1]
    try:
        # Initialize PaddleOCR
        ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)

        # Get total pages
        total_pages = count_pdf_pages(file_path)
        print(f"TOTAL_PAGES:{total_pages}", file=sys.stderr, flush=True)

        # Process the file
        result = ocr.ocr(file_path, cls=True)
        extracted_text, pages_processed = extract_text(result)

        # Output the final result as JSON to stdout
        print(json.dumps({
            "success": True,
            "text": extracted_text,
            "total_pages": total_pages,
            "pages_processed": pages_processed
        }))
        return 0
    except Exception as e:
        # Mirror the message on stderr so a caller that only inspects stderr
        # on a non-zero exit still sees why the run failed.
        print(f"ERROR:{e}", file=sys.stderr, flush=True)
        print(json.dumps({
            "success": False,
            "error": str(e)
        }))
        return 1


if __name__ == "__main__":
    sys.exit(main(sys.argv))