Spaces:

mbuck17
/

paddleocr-processor

Sleeping

App Files Files Community

mbuckle committed on Jun 3, 2025

Commit

9ae0d8b

1 Parent(s): 4fd5951

Version 6

Browse files

Files changed (1) hide show

paddle_ocr_standalone.py +175 -304

paddle_ocr_standalone.py CHANGED Viewed

@@ -1,329 +1,200 @@
-# app.py - Using subprocess approach like your local Node.js implementation
-import os
-import subprocess
 import sys
-import tempfile
-import time
-import base64
 import json
-# Import Gradio
-import gradio as gr
-def run_paddle_ocr_subprocess(file_path):
-    """Run PaddleOCR as a subprocess - mirrors your local Node.js approach"""
     try:
-        # Get the path to our standalone OCR script
-        script_path = os.path.join(os.path.dirname(__file__), 'paddle_ocr_standalone.py')
-        # Run the subprocess - exactly like your Node.js implementation
-        command = [sys.executable, script_path, file_path]
-        print(f"Running command: {' '.join(command)}")
-        # Track progress
-        total_pages = 1
-        current_page = 0
-        process = subprocess.Popen(
-            command,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            text=True,
-            bufsize=1,
-            universal_newlines=True
-        )
-        # Read stderr for progress updates (like your Node.js implementation)
-        stderr_output = ""
-        while True:
-            stderr_line = process.stderr.readline()
-            if not stderr_line:
-                break
-            stderr_output += stderr_line
-            if stderr_line.startswith('TOTAL_PAGES:'):
-                total_pages = int(stderr_line.split(':')[1].strip())
-                print(f"Processing document with {total_pages} pages")
-            elif stderr_line.startswith('CURRENT_PAGE:'):
-                current_page = int(stderr_line.split(':')[1].strip())
-                print(f"Processing page {current_page} of {total_pages}")
-        # Wait for process to complete and get stdout
-        stdout, remaining_stderr = process.communicate()
-        if process.returncode != 0:
-            print(f"OCR process failed with return code {process.returncode}")
-            print(f"stderr: {stderr_output + remaining_stderr}")
-            return {
-                "success": False,
-                "error": f"OCR process failed: {stderr_output + remaining_stderr}"
-            }
-        # Parse the JSON result from stdout - handle mixed output
-        try:
-            # PaddleOCR might output download messages to stdout, find the JSON
-            stdout_lines = stdout.strip().split('\n')
-            json_result = None
-            # Look for the JSON result (usually the last line that starts with {)
-            for line in reversed(stdout_lines):
-                line = line.strip()
-                if line.startswith('{') and line.endswith('}'):
-                    try:
-                        json_result = json.loads(line)
-                        break
-                    except json.JSONDecodeError:
-                        continue
-            if json_result is None:
-                # If no valid JSON found, try the entire stdout
-                json_result = json.loads(stdout.strip())
-            print(f"OCR completed successfully: {json_result.get('pages_processed', 0)}/{json_result.get('total_pages', 0)} pages")
-            return json_result
-        except json.JSONDecodeError as e:
-            print(f"Failed to parse OCR result: {e}")
-            print(f"stdout: {stdout}")
-            print(f"Trying to extract JSON from mixed output...")
-            # Try to find JSON in the mixed output
-            import re
-            json_match = re.search(r'\{.*"success".*\}', stdout, re.DOTALL)
-            if json_match:
-                try:
-                    result = json.loads(json_match.group())
-                    print("Successfully extracted JSON from mixed output")
-                    return result
-                except json.JSONDecodeError:
-                    pass
-            return {
-                "success": False,
-                "error": f"Failed to parse OCR result: {str(e)}"
-            }
-    except Exception as e:
-        print(f"Error running OCR subprocess: {e}")
-        return {
-            "success": False,
-            "error": str(e)
-        }
-def process_document(file):
-    """Process uploaded document using subprocess OCR"""
-    if file is None:
-        return "No file uploaded", "", ""
-    start_time = time.time()
-    try:
-        filename = os.path.basename(file.name)
-        print(f"Processing: {filename}")
-        file_path = file.name
-        print(f"File path: {file_path}")
-        # Run OCR using subprocess (like your Node.js implementation)
-        ocr_result = run_paddle_ocr_subprocess(file_path)
-        if not ocr_result.get("success", False):
-            error_msg = f"❌ OCR failed: {ocr_result.get('error', 'Unknown error')}"
-            return error_msg, "", json.dumps(ocr_result)
-        # Extract results
-        extracted_text = ocr_result.get("text", "")
-        pages_processed = ocr_result.get("pages_processed", 0)
-        total_pages = ocr_result.get("total_pages", 1)
-        processing_time = time.time() - start_time
-        summary = f"""
-📄 **File**: {filename}
-📊 **Pages Processed**: {pages_processed}/{total_pages}
-⏱️ **Processing Time**: {processing_time:.2f} seconds
-📝 **Text Length**: {len(extracted_text)} characters
-🔧 **OCR Engine**: PaddleOCR (Subprocess)
-✅ **Method**: Subprocess execution (like your local Node.js implementation)
-        """
-        api_response = json.dumps({
-            "success": True,
-            "text": extracted_text,
-            "filename": filename,
-            "pages_processed": pages_processed,
-            "total_pages": total_pages,
-            "processing_time": processing_time,
-            "ocr_engine": "PaddleOCR",
-            "method": "subprocess"
-        }, indent=2)
-        return summary, extracted_text, api_response
     except Exception as e:
-        error_msg = f"❌ Error processing file: {str(e)}"
-        print(f"Full error: {e}")
         import traceback
-        traceback.print_exc()
-        return error_msg, "", json.dumps({"success": False, "error": str(e)})
-def process_api_request(api_data):
-    """Process API-style requests (for integration with your Vercel app)"""
-    try:
-        data = json.loads(api_data)
-        if 'file' not in data:
-            return json.dumps({"success": False, "error": "No file data provided"})
-        # Decode base64 file
-        file_data = base64.b64decode(data['file'])
-        filename = data.get('filename', 'unknown.pdf')
-        # Save to temp file
-        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(filename)[1]) as tmp_file:
-            tmp_file.write(file_data)
-            tmp_file_path = tmp_file.name
         try:
-            # Run OCR using subprocess
-            ocr_result = run_paddle_ocr_subprocess(tmp_file_path)
-            if ocr_result.get("success", False):
-                return json.dumps({
-                    "success": True,
-                    "text": ocr_result.get("text", ""),
-                    "filename": filename,
-                    "pages_processed": ocr_result.get("pages_processed", 0),
-                    "total_pages": ocr_result.get("total_pages", 1),
-                    "ocr_engine": "PaddleOCR",
-                    "method": "subprocess"
-                })
-            else:
-                return json.dumps(ocr_result)
-        finally:
-            os.unlink(tmp_file_path)
-    except Exception as e:
-        return json.dumps({"success": False, "error": str(e)})
-# Create Gradio interface
-with gr.Blocks(title="PaddleOCR Medical Document Processor") as demo:
-    gr.Markdown("# 🏥 PaddleOCR Medical Document Processor")
-    gr.Markdown("Upload medical documents (PDF/images) to extract text using PaddleOCR")
-    with gr.Tab("📄 File Upload"):
-        with gr.Row():
-            with gr.Column():
-                file_input = gr.File(
-                    label="Upload Document (PDF, JPG, PNG)",
-                    file_types=[".pdf", ".jpg", ".jpeg", ".png"]
-                )
-                process_btn = gr.Button("🔍 Process Document", variant="primary")
-            with gr.Column():
-                summary_output = gr.Markdown(label="📊 Processing Summary")
-        with gr.Row():
-            text_output = gr.Textbox(
-                label="📝 Extracted Text",
-                lines=15,
-                max_lines=20
-            )
-        process_btn.click(
-            fn=process_document,
-            inputs=[file_input],
-            outputs=[summary_output, text_output, gr.Textbox(visible=False)]
-        )
-    with gr.Tab("🔌 API Integration"):
-        gr.Markdown("### For integration with your Vercel app:")
-        gr.Markdown("**Endpoint**: `https://mbuck17-paddleocr-processor.hf.space/api/predict`")
-        gr.Markdown("**Method**: POST")
-        gr.Markdown("**Headers**: `Content-Type: application/json`")
-        with gr.Row():
-            with gr.Column():
-                gr.Markdown("**Sample Request:**")
-                gr.Code('''
-{
-  "data": [
-    {
-      "file": "base64_encoded_file_data_here",
-      "filename": "lab_report.pdf"
-    }
-  ]
-}
-                ''', language="json")
-            with gr.Column():
-                gr.Markdown("**Sample Response:**")
-                gr.Code('''
-{
-  "data": [
-    {
-      "success": true,
-      "text": "Extracted text content...",
-      "filename": "lab_report.pdf",
-      "ocr_engine": "PaddleOCR",
-      "method": "subprocess"
     }
-  ]
-}
-                ''', language="json")
-        gr.Markdown("### Test API Request:")
-        api_input = gr.Textbox(
-            label="API Request (JSON)",
-            placeholder='{"file": "base64_encoded_file_data", "filename": "document.pdf"}',
-            lines=5
-        )
-        api_btn = gr.Button("🧪 Test API Request")
-        api_output = gr.Textbox(
-            label="API Response (JSON)",
-            lines=10
-        )
-        api_btn.click(
-            fn=process_api_request,
-            inputs=[api_input],
-            outputs=[api_output]
-        )
-    with gr.Tab("ℹ️ About"):
-        gr.Markdown("""
-        ### 🎯 Purpose
-        This service extracts text from medical documents using PaddleOCR, specifically designed for lab reports and medical forms.
-        ### 🔧 Integration
-        This Hugging Face Space can be integrated with your Vercel app as an external OCR service.
-        ### 📚 Supported Formats
-        - PDF documents (multi-page)
-        - JPEG/JPG images
-        - PNG images
-        ### 🚀 Features
-        - High accuracy OCR with PaddleOCR
-        - Subprocess execution (mirrors your local Node.js implementation)
-        - Medical document optimization
-        - Multi-page PDF support
-        - RESTful API integration
-        - Free hosting on Hugging Face
-        ### 🔗 Integration URL
-        `https://mbuck17-paddleocr-processor.hf.space/api/predict`
-        ### ⚙️ Architecture
-        This implementation uses subprocess execution just like your local Node.js version,
-        ensuring maximum compatibility with PaddleOCR's PDF processing capabilities.
-        """)
-# Launch the app
-if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=7860)

+#!/usr/bin/env python3
+# paddle_ocr_standalone.py - Fixed version with PDF to image conversion
 import sys
+import os
 import json
+import tempfile
+# Apply monkey patch for PyMuPDF compatibility BEFORE importing anything
+import fitz  # PyMuPDF for PDF page counting
# Compatibility shims for PyMuPDF builds that expose only the snake_case API.
# Each legacy camelCase name is installed only when it is missing, so an
# implementation that PyMuPDF already provides is never replaced.
def _legacy_page_count(self):
    """Return the document's page_count (backs the ``pageCount`` property)."""
    return self.page_count


def _legacy_get_pixmap(self, matrix=None, alpha=True):
    """Forward ``getPixmap`` calls to ``get_pixmap`` with the same arguments."""
    return self.get_pixmap(matrix=matrix, alpha=alpha)


def _legacy_get_text(self, option="text"):
    """Forward ``getText`` calls to ``get_text`` with the same option."""
    return self.get_text(option)


for _owner, _legacy_name, _shim in (
    (fitz.Document, 'pageCount', property(_legacy_page_count)),
    (fitz.Page, 'getPixmap', _legacy_get_pixmap),
    (fitz.Page, 'getText', _legacy_get_text),
):
    if not hasattr(_owner, _legacy_name):
        setattr(_owner, _legacy_name, _shim)
+# NOW import PaddleOCR after applying the patches
+from paddleocr import PaddleOCR
def pdf_to_images(pdf_path, dpi=200):
    """Render each page of a PDF to a temporary PNG file for OCR.

    Args:
        pdf_path: Path of the PDF document to open with PyMuPDF.
        dpi: Target rendering resolution. Each page is scaled by
            ``dpi / 72`` in both directions.

    Returns:
        A list of paths to the PNG files that were written, in page order.
        A page whose image file ends up empty is skipped. If the document
        cannot be opened or rendering raises, an empty list is returned and
        any images already written are deleted, because the caller never
        learns their paths and could not clean them up.

    The caller owns the returned files and is responsible for deleting them.
    Progress and errors are reported on stderr; this function does not raise.
    """
    image_paths = []
    doc = None
    try:
        doc = fitz.open(pdf_path)
        print(f"PDF has {len(doc)} pages", file=sys.stderr)

        # The scale is the same for every page, so build the matrix once.
        zoom = dpi / 72
        mat = fitz.Matrix(zoom, zoom)

        for page_num, page in enumerate(doc):
            # Use the current API name when available, the legacy one otherwise.
            render = getattr(page, 'get_pixmap', None) or page.getPixmap
            # alpha=False gives an opaque image; transparency is of no use
            # to OCR and only adds a fourth channel to decode.
            pix = render(matrix=mat, alpha=False)

            # mkstemp creates a uniquely named, private file atomically,
            # instead of writing to a guessable fixed path under /tmp.
            fd, temp_img_path = tempfile.mkstemp(
                prefix=f"ocr_page_{page_num}_", suffix=".png"
            )
            os.close(fd)
            # Track the path before saving so a failed save is still cleaned up.
            image_paths.append(temp_img_path)
            pix.save(temp_img_path)

            file_size = os.path.getsize(temp_img_path)
            if file_size == 0:
                print(f"Failed to create image file: {temp_img_path}", file=sys.stderr)
                image_paths.pop()
                os.unlink(temp_img_path)
                continue
            print(f"Converted page {page_num + 1} to: {temp_img_path} (size: {file_size} bytes, dimensions: {pix.width}x{pix.height})", file=sys.stderr)

        print(f"Successfully converted {len(image_paths)} pages to images", file=sys.stderr)
        return image_paths
    except Exception as e:
        print(f"Error converting PDF to images: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc(file=sys.stderr)
        # The caller only receives [], so remove partial output here.
        for leftover in image_paths:
            try:
                os.unlink(leftover)
            except OSError:
                pass  # best effort: the file may already be gone
        return []
    finally:
        if doc is not None:
            try:
                doc.close()
            except Exception as close_error:
                print(f"Warning: Could not close PDF: {close_error}", file=sys.stderr)
def cleanup_temp_files(file_paths):
    """Delete temporary files, reporting each outcome on stderr.

    Args:
        file_paths: Iterable of file paths to delete. ``None`` or an empty
            iterable is accepted and does nothing.

    Returns:
        None. Cleanup is best effort: a path that no longer exists is
        skipped silently, and any other failure is logged as a warning
        instead of being raised.
    """
    for file_path in file_paths or ():
        try:
            # Attempt the delete directly rather than checking existence
            # first, which would leave a window for the file to disappear.
            os.unlink(file_path)
        except FileNotFoundError:
            continue
        except (OSError, TypeError, ValueError) as e:
            print(f"Warning: Could not clean up {file_path}: {e}", file=sys.stderr)
        else:
            print(f"Cleaned up: {file_path}", file=sys.stderr)
# Command-line driver: OCR the file named in argv[1].
#
# Output protocol, as implemented below:
#   stdout - a single JSON object (the result, or {"success": false, ...})
#   stderr - progress/debug text, including the TOTAL_PAGES:<n> and
#            CURRENT_PAGE:<n> marker lines
# The process exits with status 1 on a usage error or on any failure that
# aborts processing.
import contextlib

if len(sys.argv) < 2:
    result = {"success": False, "error": "Usage: python paddle_ocr_standalone.py <file_path>"}
    print(json.dumps(result))
    sys.exit(1)

file_path = sys.argv[1]
temp_files = []
try:
    print(f"Starting OCR processing for: {os.path.basename(file_path)}", file=sys.stderr)

    # Anything PaddleOCR print()s while starting up is sent to stderr so that
    # stdout carries only the final JSON. show_log=False is passed as well
    # (presumably to quiet its logger - confirm against the PaddleOCR docs).
    with contextlib.redirect_stdout(sys.stderr):
        ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
    print("PaddleOCR initialized successfully", file=sys.stderr)

    # PDFs are rendered to one image per page; other files are used as-is.
    is_pdf = file_path.lower().endswith('.pdf')
    if is_pdf:
        print("Converting PDF to images for OCR processing...", file=sys.stderr)
        image_paths = pdf_to_images(file_path)
        temp_files = image_paths
        if not image_paths:
            raise Exception("Failed to convert PDF to images")
        total_pages = len(image_paths)
    else:
        image_paths = [file_path]
        total_pages = 1
    print(f"TOTAL_PAGES:{total_pages}", file=sys.stderr)

    text_chunks = []
    pages_processed = 0
    for current_page, img_path in enumerate(image_paths, start=1):
        # A failure on one page is logged and skipped; the remaining pages
        # are still processed.
        try:
            print(f"CURRENT_PAGE:{current_page}", file=sys.stderr)
            print(f"Processing image: {img_path}", file=sys.stderr)

            with contextlib.redirect_stdout(sys.stderr):
                result = ocr.ocr(img_path, cls=True)
            print(f"OCR result for page {current_page}: {type(result)}, length: {len(result) if result else 'None'}", file=sys.stderr)

            # One image is passed per call, so only result[0] is read.
            if result and result[0]:
                print(f"Page {current_page} has {len(result[0])} text lines detected", file=sys.stderr)
                pages_processed += 1
                page_lines = []
                for line_idx, line in enumerate(result[0]):
                    if len(line) >= 2:
                        # line[1] is read as (text, confidence); anything else
                        # is stringified and treated as fully confident.
                        text_content = line[1][0] if isinstance(line[1], (list, tuple)) else str(line[1])
                        confidence = line[1][1] if isinstance(line[1], (list, tuple)) and len(line[1]) > 1 else 1.0
                        print(f"Line {line_idx}: '{text_content}' (confidence: {confidence})", file=sys.stderr)
                        if confidence > 0.3:
                            page_lines.append(text_content + "\n")
                page_text = "".join(page_lines)
                if page_text.strip():
                    text_chunks.append(f"\n--- Page {current_page} ---\n")
                    text_chunks.append(page_text)
                    print(f"Page {current_page} text added: {len(page_text)} characters", file=sys.stderr)
                else:
                    print(f"Page {current_page}: No text above confidence threshold", file=sys.stderr)
                print(f"Page {current_page} processed successfully", file=sys.stderr)
            else:
                print(f"No OCR results returned for page {current_page}", file=sys.stderr)
                if result:
                    print(f"Result structure: {result}", file=sys.stderr)
        except Exception as page_error:
            print(f"Error processing page {current_page}: {page_error}", file=sys.stderr)
            continue

    extracted_text = "".join(text_chunks)

    # The final result is the only thing this script writes to stdout.
    result_data = {
        "success": True,
        "text": extracted_text,
        "total_pages": total_pages,
        "pages_processed": pages_processed,
        "method": "pdf_to_images" if is_pdf else "direct_image"
    }
    print(json.dumps(result_data))
    print(f"Successfully processed {pages_processed}/{total_pages} pages", file=sys.stderr)
except Exception as e:
    print(f"Error during OCR processing: {e}", file=sys.stderr)
    import traceback
    traceback.print_exc(file=sys.stderr)
    error_data = {
        "success": False,
        "error": str(e)
    }
    print(json.dumps(error_data))
    sys.exit(1)
finally:
    # Runs on success, on failure, and on the sys.exit above, so rendered
    # page images are removed exactly once on every path.
    if temp_files:
        cleanup_temp_files(temp_files)