Spaces:

mbuck17
/

paddleocr-processor

Sleeping

App Files Files Community

mbuckle committed on Jun 3, 2025

Commit

4fd5951

1 Parent(s): 5745364

Fixed PDF to Image conversion

Browse files

Files changed (1) hide show

paddle_ocr_standalone.py +307 -150

paddle_ocr_standalone.py CHANGED Viewed

@@ -1,172 +1,329 @@
-#!/usr/bin/env python3
-# paddle_ocr_standalone.py - Fixed version with PDF to image conversion
-import sys
 import os
-import json
 import tempfile
-# Apply monkey patch for PyMuPDF compatibility BEFORE importing anything
-import fitz  # PyMuPDF for PDF page counting
# Compatibility shims for PyMuPDF's camelCase -> snake_case rename.
# Each legacy name is only installed when the running PyMuPDF no longer
# provides it, so a release that still ships the old name keeps its native
# implementation. The shims must be in place before any code that looks the
# legacy names up is imported.
if not hasattr(fitz.Document, 'pageCount'):
    def pageCount_property(self):
        """Legacy read-only alias for ``Document.page_count``."""
        return self.page_count
    fitz.Document.pageCount = property(pageCount_property)
if not hasattr(fitz.Page, 'getPixmap'):
    def getPixmap(self, matrix=None, alpha=False):
        """Legacy alias for ``Page.get_pixmap``.

        ``alpha`` defaults to False (opaque rendering) to match the default
        of ``get_pixmap`` itself, so callers get the same image whether they
        reach the renderer through this shim or directly. ``matrix`` is only
        forwarded when the caller supplied one, letting the library's own
        default transform apply otherwise.
        """
        render_kwargs = {"alpha": alpha}
        if matrix is not None:
            render_kwargs["matrix"] = matrix
        return self.get_pixmap(**render_kwargs)
    fitz.Page.getPixmap = getPixmap
if not hasattr(fitz.Page, 'getText'):
    def getText(self, option="text"):
        """Legacy alias for ``Page.get_text``."""
        return self.get_text(option)
    fitz.Page.getText = getText
-# NOW import PaddleOCR after applying the patches
-from paddleocr import PaddleOCR
def pdf_to_images(pdf_path, dpi=200):
    """Render every page of a PDF to a temporary PNG file.

    Args:
        pdf_path: Path of the PDF document to convert.
        dpi: Target rendering resolution; PDF user space is 72 units per
            inch, so the page is scaled by ``dpi / 72``.

    Returns:
        A list with one PNG path per page, in page order. On any failure the
        error is reported on stderr, images rendered so far are deleted, and
        an empty list is returned.
    """
    image_paths = []
    try:
        zoom = dpi / 72
        mat = fitz.Matrix(zoom, zoom)
        doc = fitz.open(pdf_path)
        try:
            for page_num in range(len(doc)):
                page = doc[page_num]
                # Render opaque (alpha=False) regardless of which API name the
                # installed PyMuPDF exposes, so the output does not depend on
                # the library version.
                if hasattr(page, 'get_pixmap'):
                    pix = page.get_pixmap(matrix=mat, alpha=False)
                else:
                    pix = page.getPixmap(matrix=mat, alpha=False)
                # mkstemp gives a unique file in the platform temp directory.
                # The path is recorded before saving so a failed save is still
                # cleaned up below.
                fd, temp_img_path = tempfile.mkstemp(
                    prefix=f"ocr_page_{page_num}_{os.getpid()}_",
                    suffix=".png",
                )
                os.close(fd)
                image_paths.append(temp_img_path)
                pix.save(temp_img_path)
                print(f"Converted page {page_num + 1} to: {temp_img_path}", file=sys.stderr)
        finally:
            # Release the document handle even when a page fails to render.
            doc.close()
        return image_paths
    except Exception as e:
        print(f"Error converting PDF to images: {e}", file=sys.stderr)
        # The caller only receives [], so partial output must be removed here.
        for leftover in image_paths:
            try:
                os.unlink(leftover)
            except OSError:
                pass
        return []
def cleanup_temp_files(file_paths):
    """Delete each existing path in *file_paths*, never raising.

    Paths that do not exist are skipped silently. A successful deletion and
    any failure are both reported on stderr; a failure on one path does not
    stop the remaining paths from being processed.
    """
    for temp_path in file_paths:
        try:
            if not os.path.exists(temp_path):
                continue
            os.unlink(temp_path)
            print(f"Cleaned up: {temp_path}", file=sys.stderr)
        except Exception as cleanup_error:
            print(f"Warning: Could not clean up {temp_path}: {cleanup_error}", file=sys.stderr)
# Script body: OCR the file named on the command line.
#
# Output protocol:
#   * stdout carries exactly one JSON object (the result or the error).
#   * stderr carries human-readable progress plus the machine-readable
#     markers "TOTAL_PAGES:<n>" and "CURRENT_PAGE:<n>".
# Exit status is 0 on success and 1 on a usage error or processing failure.
import contextlib  # local to the script body; used to keep stdout JSON-only

# Check if file path was provided
if len(sys.argv) < 2:
    result = {"success": False, "error": "Usage: python paddle_ocr_standalone.py <file_path>"}
    print(json.dumps(result))
    sys.exit(1)
file_path = sys.argv[1]
temp_files = []
try:
    print(f"Starting OCR processing for: {os.path.basename(file_path)}", file=sys.stderr)
    # Anything PaddleOCR writes through Python's sys.stdout while it is
    # constructed is sent to stderr instead, so it cannot mix with the JSON.
    with contextlib.redirect_stdout(sys.stderr):
        ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
    print("PaddleOCR initialized successfully", file=sys.stderr)
    # Check if it's a PDF or image
    is_pdf = file_path.lower().endswith('.pdf')
    if is_pdf:
        print("Converting PDF to images for OCR processing...", file=sys.stderr)
        image_paths = pdf_to_images(file_path)
        temp_files = image_paths
        if not image_paths:
            raise Exception("Failed to convert PDF to images")
        total_pages = len(image_paths)
    else:
        # For image files, use directly
        image_paths = [file_path]
        total_pages = 1
    print(f"TOTAL_PAGES:{total_pages}", file=sys.stderr)
    # Process each image with OCR, collecting one text section per page
    page_sections = []
    pages_processed = 0
    for current_page, img_path in enumerate(image_paths, start=1):
        try:
            print(f"CURRENT_PAGE:{current_page}", file=sys.stderr)
            print(f"Processing image: {img_path}", file=sys.stderr)
            with contextlib.redirect_stdout(sys.stderr):
                result = ocr.ocr(img_path, cls=True)
            if result and result[0]:  # one image per call, so only result[0] is used
                pages_processed += 1
                # Keep only lines whose recognition confidence exceeds 0.5
                page_text = "".join(
                    line[1][0] + "\n"
                    for line in result[0]
                    if len(line) >= 2 and line[1][1] > 0.5
                )
                if page_text.strip():
                    page_sections.append(f"\n--- Page {current_page} ---\n" + page_text)
                print(f"Page {current_page} processed successfully", file=sys.stderr)
            else:
                print(f"No text found on page {current_page}", file=sys.stderr)
        except Exception as page_error:
            # A failure on one page must not abort the remaining pages.
            print(f"Error processing page {current_page}: {page_error}", file=sys.stderr)
            continue
    extracted_text = "".join(page_sections)
    # Output the final result as JSON to stdout
    result_data = {
        "success": True,
        "text": extracted_text,
        "total_pages": total_pages,
        "pages_processed": pages_processed,
        "method": "pdf_to_images" if is_pdf else "direct_image"
    }
    print(json.dumps(result_data))
    print(f"Successfully processed {pages_processed}/{total_pages} pages", file=sys.stderr)
except Exception as e:
    print(f"Error during OCR processing: {e}", file=sys.stderr)
    import traceback
    traceback.print_exc(file=sys.stderr)
    error_data = {
        "success": False,
        "error": str(e)
    }
    print(json.dumps(error_data))
    sys.exit(1)
finally:
    # Single cleanup point: runs after success, after a failure, and while
    # the SystemExit raised by sys.exit(1) above is propagating.
    if temp_files:
        cleanup_temp_files(temp_files)

+# app.py - Using subprocess approach like your local Node.js implementation
 import os
+import subprocess
+import sys
 import tempfile
+import time
+import base64
+import json
+# Import Gradio
+import gradio as gr
def _parse_progress_value(stderr_line):
    """Return the integer after the first ':' of a progress line, or None if malformed."""
    try:
        return int(stderr_line.split(':')[1].strip())
    except (IndexError, ValueError):
        return None


def _extract_json_result(stdout):
    """Locate and decode the JSON result inside possibly noisy stdout text.

    Tries, in order: the last line that looks like a JSON object, the whole
    stripped text, and finally a greedy regex match around a "success" key.
    Raises json.JSONDecodeError when none of these decode.
    """
    import re

    for line in reversed(stdout.strip().split('\n')):
        line = line.strip()
        if line.startswith('{') and line.endswith('}'):
            try:
                return json.loads(line)
            except json.JSONDecodeError:
                continue
    try:
        return json.loads(stdout.strip())
    except json.JSONDecodeError as e:
        print(f"Failed to parse OCR result: {e}")
        print(f"stdout: {stdout}")
        print("Trying to extract JSON from mixed output...")
        json_match = re.search(r'\{.*"success".*\}', stdout, re.DOTALL)
        if json_match:
            try:
                result = json.loads(json_match.group())
                print("Successfully extracted JSON from mixed output")
                return result
            except json.JSONDecodeError:
                pass
        raise


def run_paddle_ocr_subprocess(file_path):
    """Run ``paddle_ocr_standalone.py`` on *file_path* in a child interpreter.

    The child's stderr is read line by line; lines starting with
    ``TOTAL_PAGES:`` / ``CURRENT_PAGE:`` are echoed as progress messages. The
    child's stdout is expected to contain a JSON object describing the result.

    Args:
        file_path: Path of the document to hand to the OCR script.

    Returns:
        The decoded JSON object printed by the child on success. Otherwise a
        dict of the form ``{"success": False, "error": <message>}``. This
        function does not raise for ordinary failures.
    """
    import threading  # local import: only needed for the stdout drain below

    try:
        # The OCR script is expected to live next to this module.
        script_path = os.path.join(os.path.dirname(__file__), 'paddle_ocr_standalone.py')
        command = [sys.executable, script_path, file_path]
        print(f"Running command: {' '.join(command)}")
        total_pages = 1
        stdout_parts = []
        stderr_lines = []

        def _drain_stdout(stream):
            stdout_parts.append(stream.read())

        with subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            bufsize=1,
        ) as process:
            # stdout is drained on its own thread. Otherwise a child that
            # writes more than one pipe buffer of output would block on
            # stdout while this thread waits for stderr to reach EOF.
            stdout_reader = threading.Thread(
                target=_drain_stdout, args=(process.stdout,), daemon=True
            )
            stdout_reader.start()
            try:
                for stderr_line in process.stderr:
                    stderr_lines.append(stderr_line)
                    if stderr_line.startswith('TOTAL_PAGES:'):
                        value = _parse_progress_value(stderr_line)
                        if value is not None:
                            total_pages = value
                            print(f"Processing document with {total_pages} pages")
                    elif stderr_line.startswith('CURRENT_PAGE:'):
                        value = _parse_progress_value(stderr_line)
                        if value is not None:
                            print(f"Processing page {value} of {total_pages}")
                stdout_reader.join()
                process.wait()
            except BaseException:
                # Do not leave an orphaned OCR process behind on any error.
                process.kill()
                raise

        stdout = "".join(stdout_parts)
        stderr_output = "".join(stderr_lines)
        if process.returncode != 0:
            print(f"OCR process failed with return code {process.returncode}")
            print(f"stderr: {stderr_output}")
            return {
                "success": False,
                "error": f"OCR process failed: {stderr_output}"
            }
        try:
            json_result = _extract_json_result(stdout)
        except json.JSONDecodeError as e:
            return {
                "success": False,
                "error": f"Failed to parse OCR result: {str(e)}"
            }
        if not isinstance(json_result, dict):
            return {
                "success": False,
                "error": "Failed to parse OCR result: output is not a JSON object"
            }
        print(f"OCR completed successfully: {json_result.get('pages_processed', 0)}/{json_result.get('total_pages', 0)} pages")
        return json_result
    except Exception as e:
        print(f"Error running OCR subprocess: {e}")
        return {
            "success": False,
            "error": str(e)
        }
def process_document(file):
    """Run OCR on an uploaded document and format the result for the UI.

    Args:
        file: The upload. Either an object exposing the on-disk path as
            ``.name``, or the path itself as a ``str`` / ``os.PathLike``.
            ``None`` means nothing was uploaded.

    Returns:
        A ``(summary, extracted_text, api_response)`` tuple of strings: a
        Markdown summary (or an error message), the recognised text, and a
        JSON document describing the outcome.
    """
    if file is None:
        return "No file uploaded", "", ""
    start_time = time.time()
    try:
        # Accept a bare path as well as an object with a .name attribute.
        if isinstance(file, (str, os.PathLike)):
            file_path = os.fspath(file)
        else:
            file_path = file.name
        filename = os.path.basename(file_path)
        print(f"Processing: {filename}")
        print(f"File path: {file_path}")
        ocr_result = run_paddle_ocr_subprocess(file_path)
        if not ocr_result.get("success", False):
            error_msg = f"❌ OCR failed: {ocr_result.get('error', 'Unknown error')}"
            return error_msg, "", json.dumps(ocr_result)
        # Extract results
        extracted_text = ocr_result.get("text", "")
        pages_processed = ocr_result.get("pages_processed", 0)
        total_pages = ocr_result.get("total_pages", 1)
        processing_time = time.time() - start_time
        summary = f"""
📄 **File**: {filename}
📊 **Pages Processed**: {pages_processed}/{total_pages}
⏱️ **Processing Time**: {processing_time:.2f} seconds
📝 **Text Length**: {len(extracted_text)} characters
🔧 **OCR Engine**: PaddleOCR (Subprocess)
✅ **Method**: Subprocess execution (like your local Node.js implementation)
        """
        api_response = json.dumps({
            "success": True,
            "text": extracted_text,
            "filename": filename,
            "pages_processed": pages_processed,
            "total_pages": total_pages,
            "processing_time": processing_time,
            "ocr_engine": "PaddleOCR",
            "method": "subprocess"
        }, indent=2)
        return summary, extracted_text, api_response
    except Exception as e:
        # UI boundary: report the failure in the outputs instead of raising.
        error_msg = f"❌ Error processing file: {str(e)}"
        print(f"Full error: {e}")
        import traceback
        traceback.print_exc()
        return error_msg, "", json.dumps({"success": False, "error": str(e)})
def process_api_request(api_data):
    """Handle an API-style OCR request and return the response as JSON text.

    Args:
        api_data: The request, either as JSON text or as an already-decoded
            dict. It must be an object with a base64-encoded ``"file"`` entry
            and may carry a ``"filename"`` (default ``"unknown.pdf"``) whose
            extension is used as the suffix of the temporary file.

    Returns:
        A JSON string. On success it contains ``success``, ``text``,
        ``filename``, ``pages_processed``, ``total_pages``, ``ocr_engine`` and
        ``method``. On failure it is the OCR error result or
        ``{"success": false, "error": <message>}``. This function never raises.
    """
    try:
        data = api_data if isinstance(api_data, dict) else json.loads(api_data)
        if not isinstance(data, dict):
            return json.dumps({"success": False, "error": "Request must be a JSON object"})
        if 'file' not in data:
            return json.dumps({"success": False, "error": "No file data provided"})
        # Decode base64 file
        file_data = base64.b64decode(data['file'])
        filename = data.get('filename', 'unknown.pdf')
        tmp_file_path = None
        try:
            # The path is recorded before writing so a failed write is still
            # cleaned up by the finally clause.
            with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(filename)[1]) as tmp_file:
                tmp_file_path = tmp_file.name
                tmp_file.write(file_data)
            ocr_result = run_paddle_ocr_subprocess(tmp_file_path)
            if not ocr_result.get("success", False):
                return json.dumps(ocr_result)
            return json.dumps({
                "success": True,
                "text": ocr_result.get("text", ""),
                "filename": filename,
                "pages_processed": ocr_result.get("pages_processed", 0),
                "total_pages": ocr_result.get("total_pages", 1),
                "ocr_engine": "PaddleOCR",
                "method": "subprocess"
            })
        finally:
            if tmp_file_path is not None:
                try:
                    os.unlink(tmp_file_path)
                except OSError:
                    # A failed cleanup must not replace the real response.
                    pass
    except Exception as e:
        return json.dumps({"success": False, "error": str(e)})
# Create Gradio interface: three tabs (file upload, API test, about) wired to
# process_document and process_api_request.
with gr.Blocks(title="PaddleOCR Medical Document Processor") as demo:
    gr.Markdown("# 🏥 PaddleOCR Medical Document Processor")
    gr.Markdown("Upload medical documents (PDF/images) to extract text using PaddleOCR")
    with gr.Tab("📄 File Upload"):
        with gr.Row():
            with gr.Column():
                file_input = gr.File(
                    label="Upload Document (PDF, JPG, PNG)",
                    file_types=[".pdf", ".jpg", ".jpeg", ".png"]
                )
                process_btn = gr.Button("🔍 Process Document", variant="primary")
            with gr.Column():
                summary_output = gr.Markdown(label="📊 Processing Summary")
        with gr.Row():
            text_output = gr.Textbox(
                label="📝 Extracted Text",
                lines=15,
                max_lines=20
            )
        # process_document returns three values; the third (the JSON response)
        # goes to a hidden textbox because this tab does not display it.
        process_btn.click(
            fn=process_document,
            inputs=[file_input],
            outputs=[summary_output, text_output, gr.Textbox(visible=False)]
        )
    with gr.Tab("🔌 API Integration"):
        gr.Markdown("### For integration with your Vercel app:")
        gr.Markdown("**Endpoint**: `https://mbuck17-paddleocr-processor.hf.space/api/predict`")
        gr.Markdown("**Method**: POST")
        gr.Markdown("**Headers**: `Content-Type: application/json`")
        with gr.Row():
            with gr.Column():
                gr.Markdown("**Sample Request:**")
                gr.Code('''
{
  "data": [
    {
      "file": "base64_encoded_file_data_here",
      "filename": "lab_report.pdf"
    }
  ]
}
                ''', language="json")
            with gr.Column():
                gr.Markdown("**Sample Response:**")
                gr.Code('''
{
  "data": [
    {
      "success": true,
      "text": "Extracted text content...",
      "filename": "lab_report.pdf",
      "ocr_engine": "PaddleOCR",
      "method": "subprocess"
    }
  ]
}
                ''', language="json")
        gr.Markdown("### Test API Request:")
        api_input = gr.Textbox(
            label="API Request (JSON)",
            placeholder='{"file": "base64_encoded_file_data", "filename": "document.pdf"}',
            lines=5
        )
        api_btn = gr.Button("🧪 Test API Request")
        api_output = gr.Textbox(
            label="API Response (JSON)",
            lines=10
        )
        # Name this event "predict" so the API handler is the one bound to
        # the "predict" endpoint name advertised on this tab.
        # NOTE(review): confirm the resulting route against the Gradio
        # version pinned for this Space.
        api_btn.click(
            fn=process_api_request,
            inputs=[api_input],
            outputs=[api_output],
            api_name="predict"
        )
    with gr.Tab("ℹ️ About"):
        gr.Markdown("""
        ### 🎯 Purpose
        This service extracts text from medical documents using PaddleOCR, specifically designed for lab reports and medical forms.
        ### 🔧 Integration
        This Hugging Face Space can be integrated with your Vercel app as an external OCR service.
        ### 📚 Supported Formats
        - PDF documents (multi-page)
        - JPEG/JPG images
        - PNG images
        ### 🚀 Features
        - High accuracy OCR with PaddleOCR
        - Subprocess execution (mirrors your local Node.js implementation)
        - Medical document optimization
        - Multi-page PDF support
        - RESTful API integration
        - Free hosting on Hugging Face
        ### 🔗 Integration URL
        `https://mbuck17-paddleocr-processor.hf.space/api/predict`
        ### ⚙️ Architecture
        This implementation uses subprocess execution just like your local Node.js version,
        ensuring maximum compatibility with PaddleOCR's PDF processing capabilities.
        """)
# Launch the app, listening on all interfaces on port 7860
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)