Spaces:

mbuck17
/

paddleocr-processor

Sleeping

App Files Files Community

mbuckle committed on Jun 3, 2025

Commit

04fbc97

1 Parent(s): 7fa5932

Version 9

Browse files

Files changed (1) hide show

paddle_ocr_standalone.py +209 -126

paddle_ocr_standalone.py CHANGED Viewed

@@ -1,185 +1,262 @@
 #!/usr/bin/env python3
-# paddle_ocr_standalone.py - Fixed version with PDF to image conversion
 import sys
 import os
 import json
 import tempfile
-# Apply monkey patch for PyMuPDF compatibility BEFORE importing anything
-import fitz  # PyMuPDF for PDF page counting
-if not hasattr(fitz.Document, 'pageCount'):
-    def pageCount_property(self):
-        return self.page_count
-    fitz.Document.pageCount = property(pageCount_property)
-if not hasattr(fitz.Page, 'getPixmap'):
-    def getPixmap(self, matrix=None, alpha=True):
-        return self.get_pixmap(matrix=matrix, alpha=alpha)
-    fitz.Page.getPixmap = getPixmap
-if not hasattr(fitz.Page, 'getText'):
-    def getText(self, option="text"):
-        return self.get_text(option)
-    fitz.Page.getText = getText
-# NOW import PaddleOCR after applying the patches
-from paddleocr import PaddleOCR
-def pdf_to_images(pdf_path, dpi=200):
-    """Convert PDF pages to images since PaddleOCR can't read PDFs directly"""
     try:
-        doc = fitz.open(pdf_path)
-        image_paths = []
-        print(f"PDF has {len(doc)} pages", file=sys.stderr)
-        for page_num in range(len(doc)):
-            page = doc[page_num]
-            # Create a transformation matrix for higher DPI
-            mat = fitz.Matrix(dpi/72, dpi/72)  # 200 DPI for better OCR accuracy
-            # Render page to pixmap
-            if hasattr(page, 'getPixmap'):
-                pix = page.getPixmap(matrix=mat)
-            else:
-                pix = page.get_pixmap(matrix=mat)
-            # Save to temporary file
-            temp_img_path = f"/tmp/ocr_page_{page_num}_{os.getpid()}.png"
-            pix.save(temp_img_path)
-            # Check if file was created and get its size
-            if os.path.exists(temp_img_path):
-                file_size = os.path.getsize(temp_img_path)
-                print(f"Converted page {page_num + 1} to: {temp_img_path} (size: {file_size} bytes, dimensions: {pix.width}x{pix.height})", file=sys.stderr)
-            else:
-                print(f"Failed to create image file: {temp_img_path}", file=sys.stderr)
-                continue
-            image_paths.append(temp_img_path)
-        doc.close()
-        print(f"Successfully converted {len(image_paths)} pages to images", file=sys.stderr)
-        return image_paths
     except Exception as e:
-        print(f"Error converting PDF to images: {e}", file=sys.stderr)
-        import traceback
-        traceback.print_exc(file=sys.stderr)
-        return []
-def cleanup_temp_files(file_paths):
-    """Clean up temporary image files"""
-    for file_path in file_paths:
-        try:
-            if os.path.exists(file_path):
-                os.unlink(file_path)
-                print(f"Cleaned up: {file_path}", file=sys.stderr)
-        except Exception as e:
-            print(f"Warning: Could not clean up {file_path}: {e}", file=sys.stderr)
 # Check if file path was provided
 if len(sys.argv) < 2:
-    result = {"success": False, "error": "Usage: python paddle_ocr_standalone.py <file_path>"}
-    print(json.dumps(result))
     sys.exit(1)
 file_path = sys.argv[1]
 temp_files = []
 try:
-    # Print progress to stderr (like your local implementation)
-    print(f"Starting OCR processing for: {os.path.basename(file_path)}", file=sys.stderr)
-    # Initialize PaddleOCR - try different settings for better text detection
-    # Redirect PaddleOCR's stdout to stderr to avoid JSON pollution
-    ocr = PaddleOCR(
-        use_angle_cls=True,
-        lang='en',
-        show_log=False,
-        det_model_dir=None,  # Use default detection model
-        rec_model_dir=None,  # Use default recognition model
-        use_gpu=False        # Ensure CPU usage in serverless environment
-    )
-    print("PaddleOCR initialized successfully", file=sys.stderr)
-    # Check if it's a PDF or image
     is_pdf = file_path.lower().endswith('.pdf')
     if is_pdf:
-        print("Converting PDF to images for OCR processing...", file=sys.stderr)
-        # Try lower DPI first to see if it helps
-        image_paths = pdf_to_images(file_path, dpi=150)  # Reduced from 200
         temp_files = image_paths
         if not image_paths:
-            raise Exception("Failed to convert PDF to images")
         total_pages = len(image_paths)
     else:
-        # For image files, use directly
         image_paths = [file_path]
         total_pages = 1
-    print(f"TOTAL_PAGES:{total_pages}", file=sys.stderr)
     # Process each image with OCR
     extracted_text = ""
     pages_processed = 0
     for i, img_path in enumerate(image_paths):
         try:
             current_page = i + 1
-            print(f"CURRENT_PAGE:{current_page}", file=sys.stderr)
-            print(f"Processing image: {img_path}", file=sys.stderr)
             # Run OCR on the image
             result = ocr.ocr(img_path, cls=True)
-            # Debug: print what OCR returns
-            print(f"OCR result for page {current_page}: {type(result)}, length: {len(result) if result else 'None'}", file=sys.stderr)
-            if result and result[0]:  # result is a list of pages, we have one page per image
-                print(f"Page {current_page} has {len(result[0])} text lines detected", file=sys.stderr)
                 pages_processed += 1
                 page_text = ""
                 for line_idx, line in enumerate(result[0]):
-                    if len(line) >= 2:
-                        text_content = line[1][0] if isinstance(line[1], (list, tuple)) else str(line[1])
-                        confidence = line[1][1] if isinstance(line[1], (list, tuple)) and len(line[1]) > 1 else 1.0
-                        print(f"Line {line_idx}: '{text_content}' (confidence: {confidence})", file=sys.stderr)
-                        if confidence > 0.3:  # Lower confidence threshold for debugging
-                            page_text += text_content + "\n"
                 if page_text.strip():
                     extracted_text += f"\n--- Page {current_page} ---\n"
                     extracted_text += page_text
-                    print(f"Page {current_page} text added: {len(page_text)} characters", file=sys.stderr)
                 else:
-                    print(f"Page {current_page}: No text above confidence threshold", file=sys.stderr)
-                print(f"Page {current_page} processed successfully", file=sys.stderr)
             else:
-                print(f"No OCR results returned for page {current_page}", file=sys.stderr)
-                if result:
-                    print(f"Result structure: {result}", file=sys.stderr)
         except Exception as page_error:
-            print(f"Error processing page {current_page}: {page_error}", file=sys.stderr)
             continue
     # Clean up temporary files
     if temp_files:
         cleanup_temp_files(temp_files)
-    # Output the final result as JSON to stdout
     result_data = {
         "success": True,
         "text": extracted_text,
@@ -188,21 +265,27 @@ try:
         "method": "pdf_to_images" if is_pdf else "direct_image"
     }
-    print(json.dumps(result_data))
-    print(f"Successfully processed {pages_processed}/{total_pages} pages", file=sys.stderr)
 except Exception as e:
     # Clean up on error
     if temp_files:
-        cleanup_temp_files(temp_files)
-    print(f"Error during OCR processing: {e}", file=sys.stderr)
-    import traceback
     traceback.print_exc(file=sys.stderr)
     error_data = {
         "success": False,
         "error": str(e)
     }
-    print(json.dumps(error_data))
     sys.exit(1)

 #!/usr/bin/env python3
+# paddle_ocr_standalone.py - Robust version with comprehensive error handling
 import sys
 import os
 import json
 import tempfile
+import traceback
+def safe_print_stderr(message):
+    """Write one diagnostic line to stderr, never raising on failure.
+
+    Progress and error logging goes through this helper so that a broken
+    or closed stderr cannot abort the OCR run.
+
+    Args:
+        message: Text to print; the stream is flushed after each line.
+    """
+    try:
+        print(message, file=sys.stderr, flush=True)
+    except Exception:
+        # Logging is best-effort. Only ordinary errors are swallowed, so
+        # KeyboardInterrupt and SystemExit still propagate.
+        pass
+def safe_print_json(data):
+    """Print `data` as a single JSON line on stdout, never raising.
+
+    Serialization and output are handled separately so that a failed
+    write is not misreported as a serialization failure.
+
+    Args:
+        data: Object to serialize with json.dumps. If it cannot be
+            serialized, a fixed failure document is printed instead.
+    """
     try:
+        payload = json.dumps(data)
     except Exception as e:
+        safe_print_stderr(f"Error printing JSON: {e}")
+        payload = '{"success": false, "error": "JSON serialization failed"}'
+    try:
+        # Flush explicitly so the result is not left in the buffer
+        print(payload, flush=True)
+    except Exception as e:
+        # stdout itself is unusable; report on stderr instead of raising
+        safe_print_stderr(f"Error printing JSON: {e}")
 # Check if file path was provided
 if len(sys.argv) < 2:
+    safe_print_json({"success": False, "error": "Usage: python paddle_ocr_standalone.py <file_path>"})
     sys.exit(1)
 file_path = sys.argv[1]
 temp_files = []
 try:
+    safe_print_stderr(f"=== Starting OCR processing for: {os.path.basename(file_path)} ===")
+    # Check if file exists and is readable
+    if not os.path.exists(file_path):
+        raise Exception(f"File does not exist: {file_path}")
+    if not os.access(file_path, os.R_OK):
+        raise Exception(f"File is not readable: {file_path}")
+    file_size = os.path.getsize(file_path)
+    safe_print_stderr(f"File size: {file_size} bytes")
+    # Import dependencies one by one with error handling
+    safe_print_stderr("Importing PyMuPDF...")
+    try:
+        import fitz
+        safe_print_stderr("✓ PyMuPDF imported successfully")
+    except Exception as e:
+        raise Exception(f"Failed to import PyMuPDF: {e}")
+    # Apply monkey patch for PyMuPDF compatibility
+    safe_print_stderr("Applying PyMuPDF compatibility patches...")
+    try:
+        if not hasattr(fitz.Document, 'pageCount'):
+            def pageCount_property(self):
+                return self.page_count
+            fitz.Document.pageCount = property(pageCount_property)
+            safe_print_stderr("✓ Added pageCount property")
+        if not hasattr(fitz.Page, 'getPixmap'):
+            def getPixmap(self, matrix=None, alpha=True):
+                return self.get_pixmap(matrix=matrix, alpha=alpha)
+            fitz.Page.getPixmap = getPixmap
+            safe_print_stderr("✓ Added getPixmap method")
+        if not hasattr(fitz.Page, 'getText'):
+            def getText(self, option="text"):
+                return self.get_text(option)
+            fitz.Page.getText = getText
+            safe_print_stderr("✓ Added getText method")
+    except Exception as e:
+        safe_print_stderr(f"Warning: Monkey patch failed: {e}")
+    # Test PDF opening. Only PDFs are probed with PyMuPDF: other inputs are
+    # passed to PaddleOCR unchanged further down, so they must not be
+    # rejected (or reported as PDFs) here.
+    if file_path.lower().endswith('.pdf'):
+        safe_print_stderr("Testing PDF opening...")
+        try:
+            test_doc = fitz.open(file_path)
+            try:
+                page_count = len(test_doc)
+            finally:
+                # Close the probe document even if reading the count failed
+                test_doc.close()
+            safe_print_stderr(f"✓ PDF opened successfully, {page_count} pages detected")
+        except Exception as e:
+            # Chain the cause so the original traceback stays visible
+            raise Exception(f"Failed to open PDF: {e}") from e
+    # Import PaddleOCR
+    safe_print_stderr("Importing PaddleOCR...")
+    try:
+        from paddleocr import PaddleOCR
+        safe_print_stderr("✓ PaddleOCR imported successfully")
+    except Exception as e:
+        raise Exception(f"Failed to import PaddleOCR: {e}")
+    # Initialize PaddleOCR
+    safe_print_stderr("Initializing PaddleOCR...")
+    try:
+        ocr = PaddleOCR(
+            use_angle_cls=True,
+            lang='en',
+            show_log=False,
+            use_gpu=False
+        )
+        safe_print_stderr("✓ PaddleOCR initialized successfully")
+    except Exception as e:
+        raise Exception(f"Failed to initialize PaddleOCR: {e}")
+    def pdf_to_images(pdf_path, dpi=150):
+        """Render each page of a PDF to a PNG file and return the file paths.
+
+        Args:
+            pdf_path: Path of the PDF to rasterize.
+            dpi: Render resolution; pages are scaled by dpi/72.
+
+        Returns:
+            Paths of the PNG files written, in page order. A page that fails
+            to render is logged and skipped. Returns [] when the document
+            cannot be opened or the conversion fails as a whole.
+        """
+        try:
+            safe_print_stderr(f"Converting PDF to images (DPI: {dpi})...")
+            doc = fitz.open(pdf_path)
+            image_paths = []
+            try:
+                # Capture the page count while the document is still open.
+                # The summary below is logged after close(), and PyMuPDF
+                # raises "document closed" for len() on a closed document,
+                # which previously discarded every converted page.
+                page_total = len(doc)
+                safe_print_stderr(f"PDF has {page_total} pages")
+                # The scale matrix is identical for every page; build it once
+                mat = fitz.Matrix(dpi/72, dpi/72)
+                for page_num in range(page_total):
+                    try:
+                        safe_print_stderr(f"Converting page {page_num + 1}...")
+                        page = doc[page_num]
+                        # Use the camelCase method name when the page has it
+                        if hasattr(page, 'getPixmap'):
+                            pix = page.getPixmap(matrix=mat)
+                        else:
+                            pix = page.get_pixmap(matrix=mat)
+                        # Page number and PID make the temp file name unique
+                        temp_img_path = f"/tmp/ocr_page_{page_num}_{os.getpid()}.png"
+                        pix.save(temp_img_path)
+                        # Only report pages whose image actually exists on disk
+                        if os.path.exists(temp_img_path):
+                            file_size = os.path.getsize(temp_img_path)
+                            safe_print_stderr(f"✓ Page {page_num + 1} converted: {temp_img_path} (size: {file_size} bytes, {pix.width}x{pix.height})")
+                            image_paths.append(temp_img_path)
+                        else:
+                            safe_print_stderr(f"✗ Failed to create image: {temp_img_path}")
+                    except Exception as page_error:
+                        # One bad page must not abort the remaining pages
+                        safe_print_stderr(f"✗ Error converting page {page_num + 1}: {page_error}")
+            finally:
+                # Release the document even if reading it failed part-way
+                doc.close()
+            safe_print_stderr(f"✓ Successfully converted {len(image_paths)}/{page_total} pages")
+            return image_paths
+        except Exception as e:
+            safe_print_stderr(f"✗ PDF conversion failed: {e}")
+            traceback.print_exc(file=sys.stderr)
+            return []
+    def cleanup_temp_files(file_paths):
+        """Delete each listed file that still exists, logging the outcome.
+
+        A failure to delete one file is logged as a warning and does not
+        stop the remaining files from being processed.
+        """
+        for temp_path in file_paths:
+            try:
+                # Nothing to do (and nothing to log) for a missing file
+                if not os.path.exists(temp_path):
+                    continue
+                os.unlink(temp_path)
+                safe_print_stderr(f"✓ Cleaned up: {temp_path}")
+            except Exception as cleanup_error:
+                safe_print_stderr(f"Warning: Could not clean up {temp_path}: {cleanup_error}")
+    # Determine file type and convert if needed
     is_pdf = file_path.lower().endswith('.pdf')
     if is_pdf:
+        safe_print_stderr("Processing PDF file...")
+        image_paths = pdf_to_images(file_path)
         temp_files = image_paths
         if not image_paths:
+            raise Exception("PDF conversion produced no images")
         total_pages = len(image_paths)
+        safe_print_stderr(f"Will process {total_pages} images")
     else:
+        safe_print_stderr("Processing image file...")
         image_paths = [file_path]
         total_pages = 1
+    safe_print_stderr(f"TOTAL_PAGES:{total_pages}")
     # Process each image with OCR
+    safe_print_stderr("Starting OCR processing...")
     extracted_text = ""
     pages_processed = 0
     for i, img_path in enumerate(image_paths):
         try:
             current_page = i + 1
+            safe_print_stderr(f"CURRENT_PAGE:{current_page}")
+            safe_print_stderr(f"Processing image: {img_path}")
+            # Verify image exists and is readable
+            if not os.path.exists(img_path):
+                safe_print_stderr(f"✗ Image file does not exist: {img_path}")
+                continue
+            img_size = os.path.getsize(img_path)
+            safe_print_stderr(f"Image size: {img_size} bytes")
             # Run OCR on the image
+            safe_print_stderr(f"Running OCR on page {current_page}...")
             result = ocr.ocr(img_path, cls=True)
+            safe_print_stderr(f"OCR result type: {type(result)}")
+            if result:
+                safe_print_stderr(f"OCR result length: {len(result)}")
+                if result[0]:
+                    safe_print_stderr(f"Page {current_page} has {len(result[0])} text regions detected")
+                else:
+                    safe_print_stderr(f"Page {current_page}: OCR returned empty result")
+            else:
+                safe_print_stderr(f"Page {current_page}: OCR returned None")
+                continue
+            if result and result[0]:
                 pages_processed += 1
                 page_text = ""
                 for line_idx, line in enumerate(result[0]):
+                    try:
+                        if len(line) >= 2:
+                            text_content = line[1][0] if isinstance(line[1], (list, tuple)) else str(line[1])
+                            confidence = line[1][1] if isinstance(line[1], (list, tuple)) and len(line[1]) > 1 else 1.0
+                            safe_print_stderr(f"Line {line_idx}: '{text_content}' (confidence: {confidence:.2f})")
+                            if confidence > 0.3:
+                                page_text += text_content + "\n"
+                    except Exception as line_error:
+                        safe_print_stderr(f"Error processing line {line_idx}: {line_error}")
+                        continue
                 if page_text.strip():
                     extracted_text += f"\n--- Page {current_page} ---\n"
                     extracted_text += page_text
+                    safe_print_stderr(f"✓ Page {current_page}: Added {len(page_text)} characters of text")
                 else:
+                    safe_print_stderr(f"Page {current_page}: No text above confidence threshold")
             else:
+                safe_print_stderr(f"Page {current_page}: No OCR results")
         except Exception as page_error:
+            safe_print_stderr(f"✗ Error processing page {current_page}: {page_error}")
+            traceback.print_exc(file=sys.stderr)
             continue
     # Clean up temporary files
     if temp_files:
+        safe_print_stderr("Cleaning up temporary files...")
         cleanup_temp_files(temp_files)
+    # Prepare final result
     result_data = {
         "success": True,
         "text": extracted_text,
         "method": "pdf_to_images" if is_pdf else "direct_image"
     }
+    safe_print_stderr(f"=== OCR Complete: {pages_processed}/{total_pages} pages processed ===")
+    safe_print_stderr(f"Total text length: {len(extracted_text)} characters")
+    # Output final JSON result
+    safe_print_json(result_data)
 except Exception as e:
     # Clean up on error
     if temp_files:
+        try:
+            cleanup_temp_files(temp_files)
+        except:
+            pass
+    safe_print_stderr(f"=== FATAL ERROR ===")
+    safe_print_stderr(f"Error: {e}")
     traceback.print_exc(file=sys.stderr)
     error_data = {
         "success": False,
         "error": str(e)
     }
+    safe_print_json(error_data)
     sys.exit(1)