#!/usr/bin/env python3
# diagnostic_test.py - Debug PaddleOCR performance issues
"""Diagnose PaddleOCR detection quality on the first page of a PDF.

Usage: diagnostic_test.py <file_path>

The first page is rendered at 72 DPI, at 300 DPI, and as a contrast-enhanced
grayscale image; each rendering is run through a differently configured
PaddleOCR instance. Progress is written to stderr and a single JSON document
with the detection counts is written to stdout.
"""

import json
import os
import sys
import tempfile
import traceback

import fitz
import cv2
import numpy as np
from paddleocr import PaddleOCR


def _log(message):
    """Write a progress line to stderr so stdout carries only the JSON result."""
    print(message, file=sys.stderr)


def _count_detections(result):
    """Return the number of detections for the first image of an OCR result.

    The result may be empty/None, or its first entry may be empty/None when
    nothing was detected; both cases count as zero.
    """
    return len(result[0]) if result and result[0] else 0


def _render_test_images(file_path, work_dir):
    """Render page 0 of the PDF into the three test images inside work_dir.

    Returns a tuple (total_pages, path_72dpi, path_300dpi, path_enhanced).
    Raises ValueError when the document has no pages. The document is closed
    even if rendering fails.
    """
    doc = fitz.open(file_path)
    try:
        total_pages = len(doc)
        _log(f"PDF pages: {total_pages}")
        if total_pages == 0:
            raise ValueError("PDF has no pages")

        # Test different extraction methods on first page
        page = doc[0]

        # Method 1: Standard quality (72 DPI)
        _log("\n=== METHOD 1: Standard 72 DPI ===")
        pix_72 = page.get_pixmap(alpha=False)
        temp_72 = os.path.join(work_dir, "test_72dpi.png")
        pix_72.save(temp_72)
        _log(f"72 DPI image: {pix_72.width}x{pix_72.height}, size: {os.path.getsize(temp_72)}")

        # Method 2: High quality (300 DPI); PDF user space is 72 units per inch
        _log("\n=== METHOD 2: High 300 DPI ===")
        mat = fitz.Matrix(300 / 72, 300 / 72)
        pix_300 = page.get_pixmap(matrix=mat, alpha=False)
        temp_300 = os.path.join(work_dir, "test_300dpi.png")
        pix_300.save(temp_300)
        _log(f"300 DPI image: {pix_300.width}x{pix_300.height}, size: {os.path.getsize(temp_300)}")

        # Method 3: Try different preprocessing
        _log("\n=== METHOD 3: Preprocessed Image ===")
        img_array = np.frombuffer(pix_300.samples, dtype=np.uint8).reshape(
            pix_300.height, pix_300.width, 3
        )
        # The pixmap buffer is already in RGB channel order (PyMuPDF's default
        # colorspace), so convert straight to grayscale. Swapping channels
        # first would apply the red/blue luminance weights the wrong way round.
        gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)

        # Increase contrast
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)

        temp_enhanced = os.path.join(work_dir, "test_enhanced.png")
        cv2.imwrite(temp_enhanced, enhanced)
        _log(f"Enhanced image saved: {os.path.getsize(temp_enhanced)} bytes")
    finally:
        doc.close()

    return total_pages, temp_72, temp_300, temp_enhanced


def _collect_samples(best_result, limit=10):
    """Log and return up to `limit` "'text' (conf)" samples from an OCR result."""
    sample_texts = []
    if not (best_result and best_result[0]):
        return sample_texts

    for i, detection in enumerate(best_result[0][:limit]):
        if len(detection) < 2:
            continue
        text_info = detection[1]
        if isinstance(text_info, (list, tuple)) and len(text_info) >= 2:
            text = str(text_info[0])
            conf = float(text_info[1])
        else:
            # No separate confidence value present; report the entry as-is.
            text = str(text_info)
            conf = 1.0
        sample_texts.append(f"'{text}' ({conf:.2f})")
        _log(f"Sample {i}: '{text}' (conf: {conf:.2f})")
    return sample_texts


def diagnostic_test():
    """Run the OCR diagnostics for the PDF named in sys.argv[1].

    Prints one JSON object to stdout: on success
    {"success": True, "diagnostics": {...}}, otherwise
    {"success": False, "error": "<message>"}. All progress output and any
    traceback go to stderr. Nothing is raised to the caller for ordinary
    failures; they are reported through the JSON payload.
    """
    if len(sys.argv) < 2:
        print(json.dumps({"success": False, "error": "No file path provided"}))
        return

    file_path = sys.argv[1]

    try:
        _log("=== DIAGNOSTIC TEST START ===")

        # Check system info
        _log(f"Python version: {sys.version}")
        _log(f"OpenCV version: {cv2.__version__}")

        # Check PaddleOCR installation (best effort: report and carry on)
        try:
            import paddle
            _log(f"PaddlePaddle version: {paddle.__version__}")
        except Exception:
            _log("PaddlePaddle not available")

        # A private temporary directory avoids name clashes between concurrent
        # runs and is removed even when a later step fails.
        with tempfile.TemporaryDirectory(prefix="paddleocr_diag_") as work_dir:
            total_pages, temp_72, temp_300, temp_enhanced = _render_test_images(
                file_path, work_dir
            )

            # Test OCR with minimal settings first
            _log("\n=== OCR TEST 1: Minimal Settings ===")
            ocr_minimal = PaddleOCR(use_angle_cls=False, lang='en', show_log=False)
            result_minimal = ocr_minimal.ocr(temp_72, cls=False)
            minimal_detections = _count_detections(result_minimal)
            _log(f"Minimal OCR on 72 DPI: {minimal_detections} detections")

            # Test OCR with your current settings
            _log("\n=== OCR TEST 2: Current Settings ===")
            ocr_current = PaddleOCR(
                use_angle_cls=True,
                lang='en',
                show_log=False,
                use_gpu=False,
                det_limit_side_len=2880,
                det_limit_type='max',
                rec_batch_num=8,
                max_text_length=50,
                use_space_char=True,
                drop_score=0.1,
            )
            result_current = ocr_current.ocr(temp_300, cls=True)
            current_detections = _count_detections(result_current)
            _log(f"Current OCR on 300 DPI: {current_detections} detections")

            # Test OCR with more aggressive settings
            _log("\n=== OCR TEST 3: Aggressive Settings ===")
            ocr_aggressive = PaddleOCR(
                use_angle_cls=True,
                lang='en',
                show_log=False,
                use_gpu=False,
                det_limit_side_len=4000,  # Even higher
                det_limit_type='max',
                rec_batch_num=1,  # Lower batch for memory
                max_text_length=100,  # Longer text
                use_space_char=True,
                drop_score=0.05,  # Very low threshold
            )
            result_aggressive = ocr_aggressive.ocr(temp_enhanced, cls=True)
            aggressive_detections = _count_detections(result_aggressive)
            _log(f"Aggressive OCR on enhanced: {aggressive_detections} detections")

        # Pick the best performing method. The strict ">" means a tie keeps
        # the earlier method, and zero detections everywhere leaves "none".
        best_result = None
        best_count = 0
        best_method = "none"
        candidates = (
            ("minimal", minimal_detections, result_minimal),
            ("current", current_detections, result_current),
            ("aggressive", aggressive_detections, result_aggressive),
        )
        for method, count, ocr_result in candidates:
            if count > best_count:
                best_result = ocr_result
                best_count = count
                best_method = method

        _log(f"\nBest method: {best_method} with {best_count} detections")

        # Extract and show sample text from best result (first 10 only)
        sample_texts = _collect_samples(best_result, limit=10)

        # Return diagnostic results
        result = {
            "success": True,
            "diagnostics": {
                "total_pages": total_pages,
                "minimal_detections": minimal_detections,
                "current_detections": current_detections,
                "aggressive_detections": aggressive_detections,
                "best_method": best_method,
                "best_count": best_count,
                "sample_texts": sample_texts,
            },
        }

        print(json.dumps(result))

    except Exception as e:
        # Top-level boundary: report the failure as JSON for the caller and
        # keep the full traceback on stderr for debugging.
        _log(f"Diagnostic error: {e}")
        traceback.print_exc(file=sys.stderr)
        print(json.dumps({"success": False, "error": str(e)}))


if __name__ == "__main__":
    diagnostic_test()