#!/usr/bin/env python3
# diagnostic_test.py - Debug PaddleOCR performance issues
import sys
import os
import json
import fitz
import cv2
import numpy as np
from paddleocr import PaddleOCR
def _count_detections(ocr_result):
    """Return how many entries the first element of *ocr_result* holds.

    The result is treated as a sequence whose first element is either a list
    of detections or a falsy value (``None``/empty) when nothing was found;
    the latter, as well as an empty/``None`` result, counts as 0.
    """
    if ocr_result and ocr_result[0]:
        return len(ocr_result[0])
    return 0


def _collect_samples(ocr_result, limit=10):
    """Format up to *limit* detections as ``"'text' (conf)"`` strings.

    Each formatted sample is also echoed to stderr. Detections with fewer
    than two elements are skipped. When the second element of a detection is
    not a ``(text, confidence)`` pair, it is stringified as the text and the
    confidence defaults to 1.0.
    """
    samples = []
    if not (ocr_result and ocr_result[0]):
        return samples
    for index, detection in enumerate(ocr_result[0][:limit]):
        if len(detection) < 2:
            continue
        text_info = detection[1]
        if isinstance(text_info, (list, tuple)) and len(text_info) >= 2:
            text = str(text_info[0])
            conf = float(text_info[1])
        else:
            text = str(text_info)
            conf = 1.0
        samples.append(f"'{text}' ({conf:.2f})")
        print(f"Sample {index}: '{text}' (conf: {conf:.2f})", file=sys.stderr)
    return samples


def diagnostic_test():
    """Compare three OCR configurations on the first page of a PDF.

    The PDF path is read from ``sys.argv[1]``. The first page is rendered at
    the default resolution and at 300 DPI, a contrast-enhanced grayscale
    variant of the 300 DPI render is produced, and PaddleOCR is run with
    "minimal", "current" and "aggressive" settings on those three images.

    Progress is written to stderr. Exactly one JSON object is written to
    stdout: ``{"error": ...}`` when no path was given, ``{"success": True,
    "diagnostics": {...}}`` on success, or ``{"success": False, "error": ...}``
    when anything fails. The function itself never raises for ordinary
    errors and returns ``None``.
    """
    if len(sys.argv) < 2:
        print(json.dumps({"error": "No file path provided"}))
        return
    file_path = sys.argv[1]

    # Function-scope imports, matching the local-import style already used
    # in this function; keeps the new stdlib dependency self-contained.
    import tempfile
    import traceback

    try:
        print("=== DIAGNOSTIC TEST START ===", file=sys.stderr)

        # Check system info
        print(f"Python version: {sys.version}", file=sys.stderr)
        print(f"OpenCV version: {cv2.__version__}", file=sys.stderr)

        # Check PaddleOCR installation. Best effort: any import-time failure
        # is reported, but KeyboardInterrupt/SystemExit are no longer
        # swallowed as they were with a bare ``except:``.
        try:
            import paddle
            print(f"PaddlePaddle version: {paddle.__version__}", file=sys.stderr)
        except Exception:
            print("PaddlePaddle not available", file=sys.stderr)

        # A private temporary directory avoids clashes between concurrent
        # runs, works on platforms without /tmp, and is removed even when a
        # later step raises.
        with tempfile.TemporaryDirectory(prefix="ocr_diag_") as work_dir:
            temp_72 = os.path.join(work_dir, "test_72dpi.png")
            temp_300 = os.path.join(work_dir, "test_300dpi.png")
            temp_enhanced = os.path.join(work_dir, "test_enhanced.png")

            # Open PDF and get basic info; the context manager closes the
            # document even if rendering fails.
            with fitz.open(file_path) as doc:
                total_pages = len(doc)
                print(f"PDF pages: {total_pages}", file=sys.stderr)
                if total_pages == 0:
                    raise ValueError("PDF contains no pages")

                # Test different extraction methods on first page
                page = doc[0]

                # Method 1: Standard quality (72 DPI)
                print("\n=== METHOD 1: Standard 72 DPI ===", file=sys.stderr)
                pix_72 = page.get_pixmap(alpha=False)
                pix_72.save(temp_72)
                print(
                    f"72 DPI image: {pix_72.width}x{pix_72.height}, "
                    f"size: {os.path.getsize(temp_72)}",
                    file=sys.stderr,
                )

                # Method 2: High quality (300 DPI)
                print("\n=== METHOD 2: High 300 DPI ===", file=sys.stderr)
                mat = fitz.Matrix(300 / 72, 300 / 72)
                pix_300 = page.get_pixmap(matrix=mat, alpha=False)
                pix_300.save(temp_300)
                print(
                    f"300 DPI image: {pix_300.width}x{pix_300.height}, "
                    f"size: {os.path.getsize(temp_300)}",
                    file=sys.stderr,
                )

                # Method 3: Try different preprocessing
                print("\n=== METHOD 3: Preprocessed Image ===", file=sys.stderr)
                img_array = np.frombuffer(pix_300.samples, dtype=np.uint8).reshape(
                    pix_300.height, pix_300.width, 3
                )
                # Pixmap samples are already in RGB order, so convert straight
                # to grayscale. The previous BGR->RGB swap followed by
                # RGB2GRAY exchanged the red and blue luminance weights.
                gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)

                # Increase contrast
                clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
                enhanced = clahe.apply(gray)
                cv2.imwrite(temp_enhanced, enhanced)
                print(
                    f"Enhanced image saved: {os.path.getsize(temp_enhanced)} bytes",
                    file=sys.stderr,
                )

            # Test OCR with minimal settings first
            print("\n=== OCR TEST 1: Minimal Settings ===", file=sys.stderr)
            ocr_minimal = PaddleOCR(use_angle_cls=False, lang='en', show_log=False)
            result_minimal = ocr_minimal.ocr(temp_72, cls=False)
            minimal_detections = _count_detections(result_minimal)
            print(
                f"Minimal OCR on 72 DPI: {minimal_detections} detections",
                file=sys.stderr,
            )

            # Test OCR with your current settings
            print("\n=== OCR TEST 2: Current Settings ===", file=sys.stderr)
            ocr_current = PaddleOCR(
                use_angle_cls=True,
                lang='en',
                show_log=False,
                use_gpu=False,
                det_limit_side_len=2880,
                det_limit_type='max',
                rec_batch_num=8,
                max_text_length=50,
                use_space_char=True,
                drop_score=0.1,
            )
            result_current = ocr_current.ocr(temp_300, cls=True)
            current_detections = _count_detections(result_current)
            print(
                f"Current OCR on 300 DPI: {current_detections} detections",
                file=sys.stderr,
            )

            # Test OCR with more aggressive settings
            print("\n=== OCR TEST 3: Aggressive Settings ===", file=sys.stderr)
            ocr_aggressive = PaddleOCR(
                use_angle_cls=True,
                lang='en',
                show_log=False,
                use_gpu=False,
                det_limit_side_len=4000,  # Even higher
                det_limit_type='max',
                rec_batch_num=1,  # Lower batch for memory
                max_text_length=100,  # Longer text
                use_space_char=True,
                drop_score=0.05,  # Very low threshold
            )
            result_aggressive = ocr_aggressive.ocr(temp_enhanced, cls=True)
            aggressive_detections = _count_detections(result_aggressive)
            print(
                f"Aggressive OCR on enhanced: {aggressive_detections} detections",
                file=sys.stderr,
            )

        # Pick the method with the most detections. The strict ">" keeps the
        # earlier method on ties and leaves "none" when nothing was detected.
        best_result = None
        best_count = 0
        best_method = "none"
        candidates = (
            ("minimal", minimal_detections, result_minimal),
            ("current", current_detections, result_current),
            ("aggressive", aggressive_detections, result_aggressive),
        )
        for method, count, ocr_result in candidates:
            if count > best_count:
                best_result = ocr_result
                best_count = count
                best_method = method
        print(
            f"\nBest method: {best_method} with {best_count} detections",
            file=sys.stderr,
        )

        # Extract and show sample text from best result (first 10 only)
        sample_texts = _collect_samples(best_result, limit=10)

        # Return diagnostic results
        result = {
            "success": True,
            "diagnostics": {
                "total_pages": total_pages,
                "minimal_detections": minimal_detections,
                "current_detections": current_detections,
                "aggressive_detections": aggressive_detections,
                "best_method": best_method,
                "best_count": best_count,
                "sample_texts": sample_texts,
            },
        }
        print(json.dumps(result))
    except Exception as e:
        # Top-level boundary: report the failure as JSON on stdout so the
        # caller always receives a parseable object; details go to stderr.
        print(f"Diagnostic error: {e}", file=sys.stderr)
        traceback.print_exc(file=sys.stderr)
        print(json.dumps({"success": False, "error": str(e)}))
if __name__ == "__main__":
    # Run the diagnostics only when executed as a script, not on import.
    diagnostic_test()