File size: 7,337 Bytes
2a0cc07
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
#!/usr/bin/env python3
# diagnostic_test.py - Debug PaddleOCR performance issues

import sys
import os
import json
import fitz
import cv2
import numpy as np
from paddleocr import PaddleOCR

def _count_detections(ocr_result):
    """Return how many detections the first entry of an OCR result holds.

    A falsy result, or a falsy first entry, counts as zero detections.
    """
    if ocr_result and ocr_result[0]:
        return len(ocr_result[0])
    return 0


def diagnostic_test():
    """Compare three PaddleOCR configurations on the first page of a PDF.

    The PDF path is taken from ``sys.argv[1]``. The first page is rendered
    at the default resolution and at 300 DPI, and a CLAHE contrast-enhanced
    grayscale copy of the 300 DPI render is produced. Three ``PaddleOCR``
    instances ("minimal", "current", "aggressive") are then run, one per
    image, and the configuration with the most detections is reported
    together with up to ten sample texts.

    Human-readable progress is written to stderr. Exactly one JSON object is
    written to stdout:

    * ``{"success": true, "diagnostics": {...}}`` when the run completes;
    * ``{"success": false, "error": "..."}`` when no path was supplied or
      any step raised.

    Returns:
        None. All results are reported through stdout/stderr.
    """
    # Local import: only this function needs it, and it keeps the block
    # self-contained without touching the module-level import list.
    import tempfile

    if len(sys.argv) < 2:
        # Carry the same "success" flag as every other JSON payload so
        # callers can branch on one key.
        print(json.dumps({"success": False, "error": "No file path provided"}))
        return

    file_path = sys.argv[1]

    try:
        print("=== DIAGNOSTIC TEST START ===", file=sys.stderr)

        # Check system info
        print(f"Python version: {sys.version}", file=sys.stderr)
        print(f"OpenCV version: {cv2.__version__}", file=sys.stderr)

        # Check PaddleOCR installation. Best-effort: any import failure is
        # reported, but Ctrl-C / SystemExit are no longer swallowed.
        try:
            import paddle
            print(f"PaddlePaddle version: {paddle.__version__}", file=sys.stderr)
        except Exception:
            print("PaddlePaddle not available", file=sys.stderr)

        # A private temporary directory avoids clashes between concurrent
        # runs and is removed automatically, even when a later step raises.
        with tempfile.TemporaryDirectory(prefix="paddleocr_diag_") as tmp_dir:
            temp_72 = os.path.join(tmp_dir, "test_72dpi.png")
            temp_300 = os.path.join(tmp_dir, "test_300dpi.png")
            temp_enhanced = os.path.join(tmp_dir, "test_enhanced.png")

            # Open PDF and get basic info
            doc = fitz.open(file_path)
            try:
                total_pages = len(doc)
                print(f"PDF pages: {total_pages}", file=sys.stderr)
                if total_pages == 0:
                    raise ValueError("PDF has no pages")

                # Test different extraction methods on first page
                page = doc[0]

                # Method 1: Standard quality (72 DPI)
                print("\n=== METHOD 1: Standard 72 DPI ===", file=sys.stderr)
                pix_72 = page.get_pixmap(alpha=False)
                pix_72.save(temp_72)
                print(f"72 DPI image: {pix_72.width}x{pix_72.height}, size: {os.path.getsize(temp_72)}", file=sys.stderr)

                # Method 2: High quality (300 DPI)
                print("\n=== METHOD 2: High 300 DPI ===", file=sys.stderr)
                mat = fitz.Matrix(300/72, 300/72)
                pix_300 = page.get_pixmap(matrix=mat, alpha=False)
                pix_300.save(temp_300)
                print(f"300 DPI image: {pix_300.width}x{pix_300.height}, size: {os.path.getsize(temp_300)}", file=sys.stderr)

                # Method 3: Try different preprocessing
                print("\n=== METHOD 3: Preprocessed Image ===", file=sys.stderr)
                img_array = np.frombuffer(pix_300.samples, dtype=np.uint8).reshape(pix_300.height, pix_300.width, 3)
                # The pixmap buffer is already in RGB order, so convert to
                # grayscale directly; a BGR<->RGB swap first would apply the
                # red and blue luminance weights to the wrong channels.
                gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
                # Increase contrast
                clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
                enhanced = clahe.apply(gray)
                cv2.imwrite(temp_enhanced, enhanced)
                print(f"Enhanced image saved: {os.path.getsize(temp_enhanced)} bytes", file=sys.stderr)
            finally:
                # Release the document even if rendering failed.
                doc.close()

            # Test OCR with minimal settings first
            print("\n=== OCR TEST 1: Minimal Settings ===", file=sys.stderr)
            ocr_minimal = PaddleOCR(use_angle_cls=False, lang='en', show_log=False)
            result_minimal = ocr_minimal.ocr(temp_72, cls=False)
            minimal_detections = _count_detections(result_minimal)
            print(f"Minimal OCR on 72 DPI: {minimal_detections} detections", file=sys.stderr)

            # Test OCR with your current settings
            print("\n=== OCR TEST 2: Current Settings ===", file=sys.stderr)
            ocr_current = PaddleOCR(
                use_angle_cls=True,
                lang='en',
                show_log=False,
                use_gpu=False,
                det_limit_side_len=2880,
                det_limit_type='max',
                rec_batch_num=8,
                max_text_length=50,
                use_space_char=True,
                drop_score=0.1
            )
            result_current = ocr_current.ocr(temp_300, cls=True)
            current_detections = _count_detections(result_current)
            print(f"Current OCR on 300 DPI: {current_detections} detections", file=sys.stderr)

            # Test OCR with more aggressive settings
            print("\n=== OCR TEST 3: Aggressive Settings ===", file=sys.stderr)
            ocr_aggressive = PaddleOCR(
                use_angle_cls=True,
                lang='en',
                show_log=False,
                use_gpu=False,
                det_limit_side_len=4000,    # Even higher
                det_limit_type='max',
                rec_batch_num=1,            # Lower batch for memory
                max_text_length=100,        # Longer text
                use_space_char=True,
                drop_score=0.05             # Very low threshold
            )
            result_aggressive = ocr_aggressive.ocr(temp_enhanced, cls=True)
            aggressive_detections = _count_detections(result_aggressive)
            print(f"Aggressive OCR on enhanced: {aggressive_detections} detections", file=sys.stderr)

        # Pick the method with the most detections. The strict ">" keeps the
        # earliest method on ties, and "none" when nothing was detected.
        best_result = None
        best_count = 0
        best_method = "none"
        candidates = (
            ("minimal", result_minimal, minimal_detections),
            ("current", result_current, current_detections),
            ("aggressive", result_aggressive, aggressive_detections),
        )
        for method_name, method_result, method_count in candidates:
            if method_count > best_count:
                best_result = method_result
                best_count = method_count
                best_method = method_name

        print(f"\nBest method: {best_method} with {best_count} detections", file=sys.stderr)

        # Extract and show sample text from best result
        sample_texts = []
        if best_result and best_result[0]:
            for i, detection in enumerate(best_result[0][:10]):  # First 10 only
                if len(detection) < 2:
                    continue
                text_info = detection[1]
                if isinstance(text_info, (list, tuple)) and len(text_info) >= 2:
                    text = str(text_info[0])
                    conf = float(text_info[1])
                else:
                    # No separate confidence value present; report 1.0.
                    text = str(text_info)
                    conf = 1.0

                sample_texts.append(f"'{text}' ({conf:.2f})")
                print(f"Sample {i}: '{text}' (conf: {conf:.2f})", file=sys.stderr)

        # Return diagnostic results
        result = {
            "success": True,
            "diagnostics": {
                "total_pages": total_pages,
                "minimal_detections": minimal_detections,
                "current_detections": current_detections,
                "aggressive_detections": aggressive_detections,
                "best_method": best_method,
                "best_count": best_count,
                "sample_texts": sample_texts
            }
        }

        print(json.dumps(result))

    except Exception as e:
        # Top-level boundary: report the failure on both channels so the
        # caller always receives a JSON object on stdout.
        print(f"Diagnostic error: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc(file=sys.stderr)
        print(json.dumps({"success": False, "error": str(e)}))

# Script entry point: run the diagnostic only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    diagnostic_test()