#!/usr/bin/env python3
# paddle_ocr_standalone.py - Robust version with comprehensive error handling
"""Run PaddleOCR on a PDF or image file and report the text as JSON.

Usage:
    python paddle_ocr_standalone.py <file_path>

Protocol (relied on by whatever process launches this script):

* stdout carries exactly one JSON object. On success it has the keys
  ``success``, ``text``, ``total_pages``, ``pages_processed`` and ``method``;
  on failure it has ``success`` (false) and ``error``.
* stderr carries human-readable progress logs, including the machine-readable
  marker lines ``TOTAL_PAGES:<n>`` and ``CURRENT_PAGE:<n>``.
* The exit status is 0 on success and 1 on any fatal error.
"""
import sys
import os
import json
import tempfile
import traceback

# OCR lines whose confidence is not strictly above this value are discarded.
MIN_CONFIDENCE = 0.3


class OCRError(Exception):
    """Fatal pipeline failure whose message is reported in the JSON output."""


def safe_print_stderr(message):
    """Safely print to stderr.

    Logging is best-effort: a broken or closed stderr must never abort the
    OCR run, so every failure here is deliberately swallowed.
    """
    try:
        print(message, file=sys.stderr, flush=True)
    except UnicodeEncodeError:
        # stderr cannot encode characters such as the check mark; fall back
        # to an ASCII rendering rather than silently dropping the message.
        try:
            ascii_message = str(message).encode("ascii", "backslashreplace").decode("ascii")
            print(ascii_message, file=sys.stderr, flush=True)
        except Exception:
            pass
    except Exception:
        pass


def safe_print_json(data):
    """Safely print JSON to stdout.

    If ``data`` cannot be serialized, a fixed failure object is printed
    instead so the consumer always receives valid JSON.
    """
    try:
        payload = json.dumps(data)
    except Exception as e:
        safe_print_stderr(f"Error printing JSON: {e}")
        payload = '{"success": false, "error": "JSON serialization failed"}'
    try:
        print(payload, flush=True)
    except Exception as e:
        # stdout is gone (e.g. broken pipe); nothing more can be reported there.
        safe_print_stderr(f"Error printing JSON: {e}")


def cleanup_temp_files(file_paths):
    """Clean up temporary files.

    Paths that no longer exist are skipped silently, so calling this more
    than once with the same paths is harmless. ``file_paths`` is not modified.
    """
    for path in file_paths:
        try:
            os.unlink(path)
        except FileNotFoundError:
            continue
        except Exception as e:
            safe_print_stderr(f"Warning: Could not clean up {path}: {e}")
        else:
            safe_print_stderr(f"✓ Cleaned up: {path}")


def _load_pymupdf():
    """Import PyMuPDF (``fitz``), logging progress; raise OCRError on failure."""
    safe_print_stderr("Importing PyMuPDF...")
    try:
        import fitz
    except Exception as e:
        raise OCRError(f"Failed to import PyMuPDF: {e}") from e
    safe_print_stderr("✓ PyMuPDF imported successfully")
    return fitz


def _apply_pymupdf_patches(fitz):
    """Add legacy camelCase aliases to PyMuPDF classes when they are missing.

    NOTE(review): presumably needed because some PaddleOCR releases still call
    the old camelCase PyMuPDF API - confirm against the installed versions.
    A failure here is logged as a warning and is not fatal.
    """
    safe_print_stderr("Applying PyMuPDF compatibility patches...")
    try:
        if not hasattr(fitz.Document, 'pageCount'):
            fitz.Document.pageCount = property(lambda self: self.page_count)
            safe_print_stderr("✓ Added pageCount property")

        if not hasattr(fitz.Page, 'getPixmap'):
            # alpha defaults to False to match get_pixmap(): an alpha channel
            # yields a transparent page background, which OCRs poorly.
            def getPixmap(self, matrix=None, alpha=False):
                if matrix is None:
                    return self.get_pixmap(alpha=alpha)
                return self.get_pixmap(matrix=matrix, alpha=alpha)
            fitz.Page.getPixmap = getPixmap
            safe_print_stderr("✓ Added getPixmap method")

        if not hasattr(fitz.Page, 'getText'):
            def getText(self, option="text"):
                return self.get_text(option)
            fitz.Page.getText = getText
            safe_print_stderr("✓ Added getText method")
    except Exception as e:
        safe_print_stderr(f"Warning: Monkey patch failed: {e}")


def _check_document_opens(fitz, file_path):
    """Open ``file_path`` once with PyMuPDF to fail fast on unreadable input.

    NOTE(review): as in the original script this runs for image inputs too,
    not only PDFs - confirm that is intended.
    """
    safe_print_stderr("Testing PDF opening...")
    try:
        test_doc = fitz.open(file_path)
        try:
            page_count = len(test_doc)
        finally:
            test_doc.close()
    except Exception as e:
        raise OCRError(f"Failed to open PDF: {e}") from e
    safe_print_stderr(f"✓ PDF opened successfully, {page_count} pages detected")


def _create_ocr_engine():
    """Import and initialize PaddleOCR; raise OCRError on failure."""
    safe_print_stderr("Importing PaddleOCR...")
    try:
        from paddleocr import PaddleOCR
    except Exception as e:
        raise OCRError(f"Failed to import PaddleOCR: {e}") from e
    safe_print_stderr("✓ PaddleOCR imported successfully")

    safe_print_stderr("Initializing PaddleOCR...")
    try:
        ocr = PaddleOCR(
            use_angle_cls=True,
            lang='en',
            show_log=False,
            use_gpu=False,
        )
    except Exception as e:
        raise OCRError(f"Failed to initialize PaddleOCR: {e}") from e
    safe_print_stderr("✓ PaddleOCR initialized successfully")
    return ocr


def _render_page(doc, page_num, mat):
    """Render one page of ``doc`` to a temporary PNG.

    Returns the path of the PNG, or None if the page could not be rendered.
    Any partially written temp file is removed before returning None.
    """
    temp_img_path = None
    try:
        safe_print_stderr(f"Converting page {page_num + 1}...")
        page = doc[page_num]

        # Render without an alpha channel so the page background is opaque.
        if hasattr(page, 'get_pixmap'):
            pix = page.get_pixmap(matrix=mat, alpha=False)
        else:
            pix = page.getPixmap(matrix=mat, alpha=False)

        # mkstemp gives a unique, privately created file in the platform's
        # temp directory instead of a predictable hard-coded /tmp path.
        fd, temp_img_path = tempfile.mkstemp(
            prefix=f"ocr_page_{page_num}_{os.getpid()}_", suffix=".png"
        )
        os.close(fd)
        pix.save(temp_img_path)

        # mkstemp already created the file, so check for content, not existence.
        file_size = os.path.getsize(temp_img_path)
        if file_size > 0:
            safe_print_stderr(f"✓ Page {page_num + 1} converted: {temp_img_path} (size: {file_size} bytes, {pix.width}x{pix.height})")
            return temp_img_path
        safe_print_stderr(f"✗ Failed to create image: {temp_img_path}")
    except Exception as page_error:
        safe_print_stderr(f"✗ Error converting page {page_num + 1}: {page_error}")

    if temp_img_path is not None:
        try:
            os.unlink(temp_img_path)
        except OSError:
            pass
    return None


def pdf_to_images(pdf_path, dpi=150, created_paths=None):
    """Convert PDF pages to images.

    Args:
        pdf_path: Path of the PDF to render.
        dpi: Render resolution; pages are scaled by ``dpi / 72``.
        created_paths: Optional list that each image path is appended to as
            soon as the image exists, so a caller can clean up even if the
            conversion is interrupted part-way.

    Returns:
        A list of temporary PNG paths, one per successfully rendered page.
        Pages that fail are skipped. If the conversion fails as a whole, the
        images created so far are deleted and an empty list is returned.
    """
    converted = []
    try:
        import fitz

        safe_print_stderr(f"Converting PDF to images (DPI: {dpi})...")
        doc = fitz.open(pdf_path)
        try:
            total_pages = len(doc)
            safe_print_stderr(f"PDF has {total_pages} pages")

            zoom = dpi / 72
            mat = fitz.Matrix(zoom, zoom)  # same transform for every page

            for page_num in range(total_pages):
                temp_img_path = _render_page(doc, page_num, mat)
                if temp_img_path is None:
                    continue
                converted.append(temp_img_path)
                if created_paths is not None:
                    created_paths.append(temp_img_path)
        finally:
            doc.close()

        safe_print_stderr(f"✓ Successfully converted {len(converted)}/{total_pages} pages")
        return converted
    except Exception as e:
        safe_print_stderr(f"✗ PDF conversion failed: {e}")
        traceback.print_exc(file=sys.stderr)
        # Do not leak images rendered before the failure.
        cleanup_temp_files(converted)
        return []


def _extract_page_text(ocr_lines, min_confidence=MIN_CONFIDENCE):
    """Join the text of OCR lines whose confidence exceeds ``min_confidence``.

    Each entry needs at least two items; the second is either a
    ``(text, confidence)`` sequence or a bare value, which is treated as text
    with confidence 1.0. Malformed entries are logged and skipped. Every kept
    line is terminated with a newline.
    """
    kept = []
    for line_idx, line in enumerate(ocr_lines):
        try:
            if len(line) < 2:
                continue
            payload = line[1]
            # Handle both string and float text values
            if isinstance(payload, (list, tuple)):
                text_content = str(payload[0])
                confidence = float(payload[1]) if len(payload) > 1 else 1.0
            else:
                text_content = str(payload)
                confidence = 1.0

            safe_print_stderr(f"Line {line_idx}: '{text_content}' (confidence: {confidence:.2f})")

            if confidence > min_confidence:
                kept.append(text_content)
        except Exception as line_error:
            safe_print_stderr(f"Error processing line {line_idx}: {line_error}")
    return "".join(f"{text}\n" for text in kept)


def _ocr_images(ocr, image_paths):
    """Run OCR over ``image_paths``; return ``(extracted_text, pages_processed)``.

    A page counts as processed when OCR returned at least one text region,
    even if none of the regions cleared the confidence threshold. Errors on
    one page are logged and do not stop the remaining pages.
    """
    chunks = []
    pages_processed = 0

    for current_page, img_path in enumerate(image_paths, start=1):
        try:
            safe_print_stderr(f"CURRENT_PAGE:{current_page}")
            safe_print_stderr(f"Processing image: {img_path}")

            # Verify image exists and is readable
            if not os.path.exists(img_path):
                safe_print_stderr(f"✗ Image file does not exist: {img_path}")
                continue

            img_size = os.path.getsize(img_path)
            safe_print_stderr(f"Image size: {img_size} bytes")

            safe_print_stderr(f"Running OCR on page {current_page}...")
            result = ocr.ocr(img_path, cls=True)

            safe_print_stderr(f"OCR result type: {type(result)}")
            if not result:
                safe_print_stderr(f"Page {current_page}: OCR returned None")
                continue

            safe_print_stderr(f"OCR result length: {len(result)}")
            if not result[0]:
                safe_print_stderr(f"Page {current_page}: OCR returned empty result")
                safe_print_stderr(f"Page {current_page}: No OCR results")
                continue

            safe_print_stderr(f"Page {current_page} has {len(result[0])} text regions detected")
            pages_processed += 1

            page_text = _extract_page_text(result[0])
            if page_text.strip():
                chunks.append(f"\n--- Page {current_page} ---\n")
                chunks.append(page_text)
                safe_print_stderr(f"✓ Page {current_page}: Added {len(page_text)} characters of text")
            else:
                safe_print_stderr(f"Page {current_page}: No text above confidence threshold")
        except Exception as page_error:
            safe_print_stderr(f"✗ Error processing page {current_page}: {page_error}")
            traceback.print_exc(file=sys.stderr)

    return "".join(chunks), pages_processed


def _process_file(file_path, temp_files):
    """Run the whole pipeline for ``file_path`` and return the result dict.

    Temporary image paths are appended to ``temp_files`` as they are created;
    the caller is responsible for deleting them.

    Raises:
        OCRError: the input is missing or unreadable, a dependency cannot be
            imported or initialized, or a PDF yields no images.
    """
    safe_print_stderr(f"=== Starting OCR processing for: {os.path.basename(file_path)} ===")

    # Check if file exists and is readable
    if not os.path.exists(file_path):
        raise OCRError(f"File does not exist: {file_path}")
    if not os.access(file_path, os.R_OK):
        raise OCRError(f"File is not readable: {file_path}")

    file_size = os.path.getsize(file_path)
    safe_print_stderr(f"File size: {file_size} bytes")

    fitz = _load_pymupdf()
    _apply_pymupdf_patches(fitz)
    _check_document_opens(fitz, file_path)
    ocr = _create_ocr_engine()

    # Determine file type and convert if needed
    is_pdf = file_path.lower().endswith('.pdf')
    if is_pdf:
        safe_print_stderr("Processing PDF file...")
        image_paths = pdf_to_images(file_path, created_paths=temp_files)
        if not image_paths:
            raise OCRError("PDF conversion produced no images")
        total_pages = len(image_paths)
        safe_print_stderr(f"Will process {total_pages} images")
    else:
        safe_print_stderr("Processing image file...")
        image_paths = [file_path]
        total_pages = 1

    safe_print_stderr(f"TOTAL_PAGES:{total_pages}")

    safe_print_stderr("Starting OCR processing...")
    extracted_text, pages_processed = _ocr_images(ocr, image_paths)

    return {
        "success": True,
        "text": extracted_text,
        "total_pages": total_pages,
        "pages_processed": pages_processed,
        "method": "pdf_to_images" if is_pdf else "direct_image",
    }


def main(argv=None):
    """Script entry point.

    Args:
        argv: Argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        The process exit status: 0 on success, 1 on a usage or fatal error.
        In every case one JSON object is written to stdout.
    """
    argv = sys.argv if argv is None else argv

    # Check if file path was provided
    if len(argv) < 2:
        safe_print_json({"success": False, "error": "Usage: python paddle_ocr_standalone.py <file_path>"})
        return 1

    file_path = argv[1]
    temp_files = []

    try:
        try:
            result_data = _process_file(file_path, temp_files)
        finally:
            # Runs on success, on error and on interruption, and before the
            # fatal-error report below, so temp images never outlive the run.
            if temp_files:
                safe_print_stderr("Cleaning up temporary files...")
                cleanup_temp_files(temp_files)

        safe_print_stderr(f"=== OCR Complete: {result_data['pages_processed']}/{result_data['total_pages']} pages processed ===")
        safe_print_stderr(f"Total text length: {len(result_data['text'])} characters")

        # Output final JSON result
        safe_print_json(result_data)
        return 0
    except Exception as e:
        # Top-level boundary: report any failure as JSON plus a logged traceback.
        safe_print_stderr("=== FATAL ERROR ===")
        safe_print_stderr(f"Error: {e}")
        traceback.print_exc(file=sys.stderr)
        safe_print_json({"success": False, "error": str(e)})
        return 1


if __name__ == "__main__":
    sys.exit(main())