paddleocr-processor / diagnostic_test.py
mbuckle's picture
Diagnostic test
2a0cc07
#!/usr/bin/env python3
# diagnostic_test.py - Debug PaddleOCR performance issues
import json
import os
import sys
import tempfile
import traceback

import cv2
import fitz
import numpy as np
from paddleocr import PaddleOCR
def _log(message):
    """Write one diagnostic line to stderr, keeping stdout for the JSON result."""
    print(message, file=sys.stderr)


def _detection_count(ocr_result):
    """Return the number of detections in the first entry of an OCR result.

    Returns 0 when the result, or its first entry, is empty or ``None``.
    """
    if ocr_result and ocr_result[0]:
        return len(ocr_result[0])
    return 0


def _log_system_info():
    """Log interpreter, OpenCV and (when importable) PaddlePaddle versions."""
    _log(f"Python version: {sys.version}")
    _log(f"OpenCV version: {cv2.__version__}")
    try:
        import paddle
        _log(f"PaddlePaddle version: {paddle.__version__}")
    except Exception as exc:
        # Best effort only: a missing or broken paddle install must not abort
        # the diagnostic, but the reason is worth recording.
        _log(f"PaddlePaddle not available ({exc!r})")


def _render_test_images(file_path, work_dir):
    """Render page 0 of the PDF into three test images inside *work_dir*.

    Produces a default-resolution render, a 300 DPI render, and a
    contrast-enhanced grayscale version of the 300 DPI render.

    Returns:
        Tuple ``(total_pages, path_72, path_300, path_enhanced)``.

    Raises:
        ValueError: if the document has no pages.
    """
    path_72 = os.path.join(work_dir, "test_72dpi.png")
    path_300 = os.path.join(work_dir, "test_300dpi.png")
    path_enhanced = os.path.join(work_dir, "test_enhanced.png")

    doc = fitz.open(file_path)
    try:
        total_pages = len(doc)
        _log(f"PDF pages: {total_pages}")
        if total_pages == 0:
            raise ValueError("PDF contains no pages")

        # Test different extraction methods on the first page only.
        page = doc[0]

        # Method 1: Standard quality (72 DPI)
        _log("\n=== METHOD 1: Standard 72 DPI ===")
        pix_72 = page.get_pixmap(alpha=False)
        pix_72.save(path_72)
        _log(
            f"72 DPI image: {pix_72.width}x{pix_72.height}, "
            f"size: {os.path.getsize(path_72)}"
        )

        # Method 2: High quality (300 DPI)
        _log("\n=== METHOD 2: High 300 DPI ===")
        zoom = 300 / 72
        pix_300 = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
        pix_300.save(path_300)
        _log(
            f"300 DPI image: {pix_300.width}x{pix_300.height}, "
            f"size: {os.path.getsize(path_300)}"
        )

        # Method 3: Preprocessed image
        _log("\n=== METHOD 3: Preprocessed Image ===")
        img_rgb = np.frombuffer(pix_300.samples, dtype=np.uint8).reshape(
            pix_300.height, pix_300.width, 3
        )
        # The pixmap samples are already in RGB order, so convert straight to
        # grayscale; swapping channels first would exchange the red and blue
        # luminance weights.
        gray = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2GRAY)
        # Increase local contrast.
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)
        cv2.imwrite(path_enhanced, enhanced)
        _log(f"Enhanced image saved: {os.path.getsize(path_enhanced)} bytes")
    finally:
        # Close the document even when rendering fails.
        doc.close()

    return total_pages, path_72, path_300, path_enhanced


def _run_ocr_tests(path_72, path_300, path_enhanced):
    """Run the three OCR configurations and return ``[(method, result), ...]``.

    The order (minimal, current, aggressive) matters: on a tie the earlier
    method is kept as the best one by the caller.
    """
    # Test OCR with minimal settings first
    _log("\n=== OCR TEST 1: Minimal Settings ===")
    ocr_minimal = PaddleOCR(use_angle_cls=False, lang='en', show_log=False)
    result_minimal = ocr_minimal.ocr(path_72, cls=False)
    _log(f"Minimal OCR on 72 DPI: {_detection_count(result_minimal)} detections")

    # Test OCR with the currently deployed settings
    _log("\n=== OCR TEST 2: Current Settings ===")
    ocr_current = PaddleOCR(
        use_angle_cls=True,
        lang='en',
        show_log=False,
        use_gpu=False,
        det_limit_side_len=2880,
        det_limit_type='max',
        rec_batch_num=8,
        max_text_length=50,
        use_space_char=True,
        drop_score=0.1,
    )
    result_current = ocr_current.ocr(path_300, cls=True)
    _log(f"Current OCR on 300 DPI: {_detection_count(result_current)} detections")

    # Test OCR with more aggressive settings
    _log("\n=== OCR TEST 3: Aggressive Settings ===")
    ocr_aggressive = PaddleOCR(
        use_angle_cls=True,
        lang='en',
        show_log=False,
        use_gpu=False,
        det_limit_side_len=4000,  # Even higher
        det_limit_type='max',
        rec_batch_num=1,  # Lower batch for memory
        max_text_length=100,  # Longer text
        use_space_char=True,
        drop_score=0.05,  # Very low threshold
    )
    result_aggressive = ocr_aggressive.ocr(path_enhanced, cls=True)
    _log(
        f"Aggressive OCR on enhanced: "
        f"{_detection_count(result_aggressive)} detections"
    )

    return [
        ("minimal", result_minimal),
        ("current", result_current),
        ("aggressive", result_aggressive),
    ]


def _collect_sample_texts(best_result, limit=10):
    """Log and return up to *limit* ``"'text' (conf)"`` samples from a result."""
    sample_texts = []
    if not (best_result and best_result[0]):
        return sample_texts
    for i, detection in enumerate(best_result[0][:limit]):
        if len(detection) < 2:
            continue
        text_info = detection[1]
        if isinstance(text_info, (list, tuple)) and len(text_info) >= 2:
            text = str(text_info[0])
            conf = float(text_info[1])
        else:
            # No separate confidence value present; report it as 1.0.
            text = str(text_info)
            conf = 1.0
        sample_texts.append(f"'{text}' ({conf:.2f})")
        _log(f"Sample {i}: '{text}' (conf: {conf:.2f})")
    return sample_texts


def diagnostic_test():
    """Compare three render/OCR configurations on page 0 of a PDF.

    The PDF path is read from ``sys.argv[1]``. Progress and diagnostics go to
    stderr; exactly one JSON object is printed to stdout:

    * on success: ``{"success": true, "diagnostics": {...}}``
    * on failure: ``{"success": false, "error": "<message>"}``

    Intermediate images are written to a private temporary directory that is
    removed on exit, whether or not the run succeeded. Nothing is returned and
    no exception propagates to the caller.
    """
    if len(sys.argv) < 2:
        print(json.dumps({"success": False, "error": "No file path provided"}))
        return
    file_path = sys.argv[1]

    try:
        _log("=== DIAGNOSTIC TEST START ===")
        _log_system_info()

        # A private temp directory avoids clashes between concurrent runs and
        # guarantees cleanup even if rendering or OCR raises.
        with tempfile.TemporaryDirectory(prefix="paddleocr_diag_") as work_dir:
            total_pages, path_72, path_300, path_enhanced = _render_test_images(
                file_path, work_dir
            )
            runs = _run_ocr_tests(path_72, path_300, path_enhanced)

        counts = {method: _detection_count(res) for method, res in runs}

        # Pick the method with the most detections; strict '>' keeps the
        # earliest method on ties and "none" when nothing was detected.
        best_method, best_result, best_count = "none", None, 0
        for method, res in runs:
            if counts[method] > best_count:
                best_method, best_result, best_count = method, res, counts[method]
        _log(f"\nBest method: {best_method} with {best_count} detections")

        sample_texts = _collect_sample_texts(best_result)

        result = {
            "success": True,
            "diagnostics": {
                "total_pages": total_pages,
                "minimal_detections": counts["minimal"],
                "current_detections": counts["current"],
                "aggressive_detections": counts["aggressive"],
                "best_method": best_method,
                "best_count": best_count,
                "sample_texts": sample_texts,
            },
        }
        print(json.dumps(result))
    except Exception as e:
        # Top-level boundary: report the failure as JSON instead of crashing.
        _log(f"Diagnostic error: {e}")
        traceback.print_exc(file=sys.stderr)
        print(json.dumps({"success": False, "error": str(e)}))
# Script entry point: run the diagnostic only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    diagnostic_test()