Spaces:

mbuck17
/

paddleocr-processor

Sleeping

File size: 7,337 Bytes

2a0cc07

#!/usr/bin/env python3
# diagnostic_test.py - Debug PaddleOCR performance issues

import sys
import os
import json
import fitz
import cv2
import numpy as np
from paddleocr import PaddleOCR

def _count_detections(ocr_result):
    """Return how many detections the first entry of an OCR result holds.

    Only ``ocr_result[0]`` is inspected; a falsy result or a falsy first
    entry (``None`` or empty) counts as zero detections.
    """
    if ocr_result and ocr_result[0]:
        return len(ocr_result[0])
    return 0


def _collect_sample_texts(ocr_result, limit=10):
    """Format the first ``limit`` detections and echo each one to stderr.

    Each detection is expected to carry its text information at index 1,
    either as a ``(text, confidence)`` pair or as a bare value (which is
    reported with confidence 1.0). Detections shorter than two items are
    skipped.

    Returns:
        A list of strings shaped like ``'text' (0.97)``.
    """
    samples = []
    if not (ocr_result and ocr_result[0]):
        return samples

    for i, detection in enumerate(ocr_result[0][:limit]):
        if len(detection) < 2:
            continue
        text_info = detection[1]
        if isinstance(text_info, (list, tuple)) and len(text_info) >= 2:
            text = str(text_info[0])
            conf = float(text_info[1])
        else:
            text = str(text_info)
            conf = 1.0

        samples.append(f"'{text}' ({conf:.2f})")
        print(f"Sample {i}: '{text}' (conf: {conf:.2f})", file=sys.stderr)
    return samples


def diagnostic_test():
    """Compare three PaddleOCR configurations on the first page of a PDF.

    The PDF path is read from ``sys.argv[1]``. The first page is rendered at
    72 DPI, at 300 DPI, and as a CLAHE contrast-enhanced grayscale image;
    each image is run through a differently configured ``PaddleOCR``
    instance ("minimal", "current", "aggressive"). Progress is logged to
    stderr and a single JSON document is printed to stdout:

    * ``{"error": ...}`` when no path argument was supplied,
    * ``{"success": True, "diagnostics": {...}}`` on success,
    * ``{"success": False, "error": ...}`` when any step raises.

    Intermediate images live in a private temporary directory that is
    removed on both the success and the failure path.

    Returns:
        None. All results are reported through stdout/stderr.
    """
    import tempfile

    if len(sys.argv) < 2:
        print(json.dumps({"error": "No file path provided"}))
        return

    file_path = sys.argv[1]

    try:
        print("=== DIAGNOSTIC TEST START ===", file=sys.stderr)

        # Check system info
        print(f"Python version: {sys.version}", file=sys.stderr)
        print(f"OpenCV version: {cv2.__version__}", file=sys.stderr)

        # Check PaddleOCR installation. This is best-effort: any import-time
        # failure is reported, but Ctrl-C / SystemExit are not swallowed.
        try:
            import paddle
            print(f"PaddlePaddle version: {paddle.__version__}", file=sys.stderr)
        except Exception:
            print("PaddlePaddle not available", file=sys.stderr)

        # A private temp dir avoids collisions between concurrent runs and is
        # cleaned up automatically even if a later step raises.
        with tempfile.TemporaryDirectory(prefix="paddleocr_diag_") as work_dir:
            temp_72 = os.path.join(work_dir, "test_72dpi.png")
            temp_300 = os.path.join(work_dir, "test_300dpi.png")
            temp_enhanced = os.path.join(work_dir, "test_enhanced.png")

            # Open PDF and get basic info
            doc = fitz.open(file_path)
            try:
                total_pages = len(doc)
                print(f"PDF pages: {total_pages}", file=sys.stderr)
                if total_pages == 0:
                    raise ValueError("PDF has no pages")

                # Test different extraction methods on first page
                page = doc[0]

                # Method 1: Standard quality (72 DPI)
                print("\n=== METHOD 1: Standard 72 DPI ===", file=sys.stderr)
                pix_72 = page.get_pixmap(alpha=False)
                pix_72.save(temp_72)
                print(f"72 DPI image: {pix_72.width}x{pix_72.height}, size: {os.path.getsize(temp_72)}", file=sys.stderr)

                # Method 2: High quality (300 DPI)
                print("\n=== METHOD 2: High 300 DPI ===", file=sys.stderr)
                mat = fitz.Matrix(300/72, 300/72)
                pix_300 = page.get_pixmap(matrix=mat, alpha=False)
                pix_300.save(temp_300)
                print(f"300 DPI image: {pix_300.width}x{pix_300.height}, size: {os.path.getsize(temp_300)}", file=sys.stderr)

                # Method 3: Try different preprocessing
                print("\n=== METHOD 3: Preprocessed Image ===", file=sys.stderr)
                img_array = np.frombuffer(pix_300.samples, dtype=np.uint8).reshape(
                    pix_300.height, pix_300.width, pix_300.n
                )
                # PyMuPDF pixmap samples are already in RGB order, so convert
                # straight to grayscale. Swapping channels first would apply
                # the red/blue luminance weights to the wrong channels.
                gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
                # Increase contrast
                clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
                enhanced = clahe.apply(gray)
                cv2.imwrite(temp_enhanced, enhanced)
                print(f"Enhanced image saved: {os.path.getsize(temp_enhanced)} bytes", file=sys.stderr)
            finally:
                # Release the document even when rendering fails.
                doc.close()

            # Test OCR with minimal settings first
            print("\n=== OCR TEST 1: Minimal Settings ===", file=sys.stderr)
            ocr_minimal = PaddleOCR(use_angle_cls=False, lang='en', show_log=False)
            result_minimal = ocr_minimal.ocr(temp_72, cls=False)
            minimal_detections = _count_detections(result_minimal)
            print(f"Minimal OCR on 72 DPI: {minimal_detections} detections", file=sys.stderr)

            # Test OCR with your current settings
            print("\n=== OCR TEST 2: Current Settings ===", file=sys.stderr)
            ocr_current = PaddleOCR(
                use_angle_cls=True,
                lang='en',
                show_log=False,
                use_gpu=False,
                det_limit_side_len=2880,
                det_limit_type='max',
                rec_batch_num=8,
                max_text_length=50,
                use_space_char=True,
                drop_score=0.1
            )
            result_current = ocr_current.ocr(temp_300, cls=True)
            current_detections = _count_detections(result_current)
            print(f"Current OCR on 300 DPI: {current_detections} detections", file=sys.stderr)

            # Test OCR with more aggressive settings
            print("\n=== OCR TEST 3: Aggressive Settings ===", file=sys.stderr)
            ocr_aggressive = PaddleOCR(
                use_angle_cls=True,
                lang='en',
                show_log=False,
                use_gpu=False,
                det_limit_side_len=4000,    # Even higher
                det_limit_type='max',
                rec_batch_num=1,            # Lower batch for memory
                max_text_length=100,        # Longer text
                use_space_char=True,
                drop_score=0.05             # Very low threshold
            )
            result_aggressive = ocr_aggressive.ocr(temp_enhanced, cls=True)
            aggressive_detections = _count_detections(result_aggressive)
            print(f"Aggressive OCR on enhanced: {aggressive_detections} detections", file=sys.stderr)

        # Pick the method with the most detections. The strict ">" keeps the
        # earliest method on ties and leaves "none" when nothing was detected.
        best_result = None
        best_count = 0
        best_method = "none"
        candidates = (
            ("minimal", result_minimal, minimal_detections),
            ("current", result_current, current_detections),
            ("aggressive", result_aggressive, aggressive_detections),
        )
        for method_name, method_result, method_count in candidates:
            if method_count > best_count:
                best_result = method_result
                best_count = method_count
                best_method = method_name

        print(f"\nBest method: {best_method} with {best_count} detections", file=sys.stderr)

        # Extract and show sample text from best result (first 10 only)
        sample_texts = _collect_sample_texts(best_result, limit=10)

        # Return diagnostic results
        result = {
            "success": True,
            "diagnostics": {
                "total_pages": total_pages,
                "minimal_detections": minimal_detections,
                "current_detections": current_detections,
                "aggressive_detections": aggressive_detections,
                "best_method": best_method,
                "best_count": best_count,
                "sample_texts": sample_texts
            }
        }

        print(json.dumps(result))

    except Exception as e:
        # Top-level boundary: report the failure on both channels so the
        # caller always receives a JSON document on stdout.
        print(f"Diagnostic error: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc(file=sys.stderr)
        print(json.dumps({"success": False, "error": str(e)}))

# Script entry point: run the diagnostic only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    diagnostic_test()