Spaces:

mbuck17
/

paddleocr-processor

Sleeping

App Files Files Community

paddleocr-processor / diagnostic_test.py

mbuckle

Diagnostic test

2a0cc07 9 months ago

raw

history blame contribute delete

7.34 kB

	#!/usr/bin/env python3
	# diagnostic_test.py - Debug PaddleOCR performance issues

	import sys
	import os
	import json
	import fitz
	import cv2
	import numpy as np
	from paddleocr import PaddleOCR

	def _count_detections(ocr_result):
	    """Return how many text detections the first page of an OCR result holds.

	    An empty/None result, or a first page that is empty/None, counts as 0.
	    """
	    return len(ocr_result[0]) if ocr_result and ocr_result[0] else 0


	def diagnostic_test():
	    """Compare three render/OCR configurations on the first page of a PDF.

	    The PDF path is read from ``sys.argv[1]``. The first page is rendered at
	    the default resolution and at 300 DPI, and a CLAHE contrast-enhanced
	    grayscale copy of the 300 DPI render is produced. Three PaddleOCR
	    configurations ("minimal", "current", "aggressive") are then run, one per
	    image, and their detection counts are compared.

	    Progress and sample text go to stderr; exactly one JSON object is printed
	    to stdout:

	    * ``{"error": ...}`` when no path argument was supplied,
	    * ``{"success": True, "diagnostics": {...}}`` on success,
	    * ``{"success": False, "error": ...}`` when anything raises.

	    Returns:
	        None. All results are reported through stdout/stderr.
	    """
	    # Local import so this function does not depend on the file-level imports
	    # being extended.
	    import tempfile

	    if len(sys.argv) < 2:
	        print(json.dumps({"error": "No file path provided"}))
	        return

	    file_path = sys.argv[1]

	    try:
	        print("=== DIAGNOSTIC TEST START ===", file=sys.stderr)

	        # Check system info
	        print(f"Python version: {sys.version}", file=sys.stderr)
	        print(f"OpenCV version: {cv2.__version__}", file=sys.stderr)

	        # Check PaddleOCR installation. Best-effort: any ordinary failure is
	        # reported and ignored, but KeyboardInterrupt/SystemExit still propagate.
	        try:
	            import paddle
	            print(f"PaddlePaddle version: {paddle.__version__}", file=sys.stderr)
	        except Exception as paddle_err:
	            print(f"PaddlePaddle not available: {paddle_err}", file=sys.stderr)

	        # A private scratch directory avoids collisions between concurrent runs
	        # and guarantees the images are removed even when a later step fails.
	        with tempfile.TemporaryDirectory(prefix="paddleocr_diag_") as tmp_dir:
	            temp_72 = os.path.join(tmp_dir, "test_72dpi.png")
	            temp_300 = os.path.join(tmp_dir, "test_300dpi.png")
	            temp_enhanced = os.path.join(tmp_dir, "test_enhanced.png")

	            # Open PDF and get basic info
	            doc = fitz.open(file_path)
	            try:
	                total_pages = len(doc)
	                print(f"PDF pages: {total_pages}", file=sys.stderr)
	                if total_pages == 0:
	                    raise ValueError("PDF has no pages")

	                # Test different extraction methods on first page
	                page = doc[0]

	                # Method 1: Standard quality (72 DPI)
	                print("\n=== METHOD 1: Standard 72 DPI ===", file=sys.stderr)
	                pix_72 = page.get_pixmap(alpha=False)
	                pix_72.save(temp_72)
	                print(f"72 DPI image: {pix_72.width}x{pix_72.height}, size: {os.path.getsize(temp_72)}", file=sys.stderr)

	                # Method 2: High quality (300 DPI)
	                print("\n=== METHOD 2: High 300 DPI ===", file=sys.stderr)
	                mat = fitz.Matrix(300 / 72, 300 / 72)
	                pix_300 = page.get_pixmap(matrix=mat, alpha=False)
	                pix_300.save(temp_300)
	                print(f"300 DPI image: {pix_300.width}x{pix_300.height}, size: {os.path.getsize(temp_300)}", file=sys.stderr)

	                # Method 3: Try different preprocessing
	                print("\n=== METHOD 3: Preprocessed Image ===", file=sys.stderr)
	                img_array = np.frombuffer(pix_300.samples, dtype=np.uint8).reshape(
	                    pix_300.height, pix_300.width, 3
	                )
	                # Pixmap samples are already in RGB order, so convert straight
	                # to grayscale; swapping channels first would apply the red and
	                # blue luminance weights to the wrong channels.
	                gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
	                # Increase contrast
	                clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
	                enhanced = clahe.apply(gray)
	                cv2.imwrite(temp_enhanced, enhanced)
	                print(f"Enhanced image saved: {os.path.getsize(temp_enhanced)} bytes", file=sys.stderr)
	            finally:
	                # Release the document even if rendering failed part-way.
	                doc.close()

	            # Test OCR with minimal settings first
	            print("\n=== OCR TEST 1: Minimal Settings ===", file=sys.stderr)
	            ocr_minimal = PaddleOCR(use_angle_cls=False, lang='en', show_log=False)
	            result_minimal = ocr_minimal.ocr(temp_72, cls=False)
	            minimal_detections = _count_detections(result_minimal)
	            print(f"Minimal OCR on 72 DPI: {minimal_detections} detections", file=sys.stderr)

	            # Test OCR with your current settings
	            print("\n=== OCR TEST 2: Current Settings ===", file=sys.stderr)
	            ocr_current = PaddleOCR(
	                use_angle_cls=True,
	                lang='en',
	                show_log=False,
	                use_gpu=False,
	                det_limit_side_len=2880,
	                det_limit_type='max',
	                rec_batch_num=8,
	                max_text_length=50,
	                use_space_char=True,
	                drop_score=0.1,
	            )
	            result_current = ocr_current.ocr(temp_300, cls=True)
	            current_detections = _count_detections(result_current)
	            print(f"Current OCR on 300 DPI: {current_detections} detections", file=sys.stderr)

	            # Test OCR with more aggressive settings
	            print("\n=== OCR TEST 3: Aggressive Settings ===", file=sys.stderr)
	            ocr_aggressive = PaddleOCR(
	                use_angle_cls=True,
	                lang='en',
	                show_log=False,
	                use_gpu=False,
	                det_limit_side_len=4000,  # Even higher
	                det_limit_type='max',
	                rec_batch_num=1,  # Lower batch for memory
	                max_text_length=100,  # Longer text
	                use_space_char=True,
	                drop_score=0.05,  # Very low threshold
	            )
	            result_aggressive = ocr_aggressive.ocr(temp_enhanced, cls=True)
	            aggressive_detections = _count_detections(result_aggressive)
	            print(f"Aggressive OCR on enhanced: {aggressive_detections} detections", file=sys.stderr)

	        # Pick the method with the most detections. The strict ">" means an
	        # earlier method wins ties and "none" is reported when all found nothing.
	        best_result = None
	        best_count = 0
	        best_method = "none"
	        candidates = (
	            ("minimal", result_minimal, minimal_detections),
	            ("current", result_current, current_detections),
	            ("aggressive", result_aggressive, aggressive_detections),
	        )
	        for method_name, ocr_result, count in candidates:
	            if count > best_count:
	                best_result = ocr_result
	                best_count = count
	                best_method = method_name

	        print(f"\nBest method: {best_method} with {best_count} detections", file=sys.stderr)

	        # Extract and show sample text from best result
	        sample_texts = []
	        if best_result and best_result[0]:
	            for i, detection in enumerate(best_result[0][:10]):  # First 10 only
	                if len(detection) < 2:
	                    continue
	                text_info = detection[1]
	                if isinstance(text_info, (list, tuple)) and len(text_info) >= 2:
	                    text = str(text_info[0])
	                    conf = float(text_info[1])
	                else:
	                    # No separate confidence value present; report full confidence.
	                    text = str(text_info)
	                    conf = 1.0

	                sample_texts.append(f"'{text}' ({conf:.2f})")
	                print(f"Sample {i}: '{text}' (conf: {conf:.2f})", file=sys.stderr)

	        # Return diagnostic results
	        result = {
	            "success": True,
	            "diagnostics": {
	                "total_pages": total_pages,
	                "minimal_detections": minimal_detections,
	                "current_detections": current_detections,
	                "aggressive_detections": aggressive_detections,
	                "best_method": best_method,
	                "best_count": best_count,
	                "sample_texts": sample_texts,
	            },
	        }

	        print(json.dumps(result))

	    except Exception as e:
	        # Top-level boundary: report every failure as JSON on stdout so the
	        # caller always receives a parseable result; details go to stderr.
	        print(f"Diagnostic error: {e}", file=sys.stderr)
	        import traceback
	        traceback.print_exc(file=sys.stderr)
	        print(json.dumps({"success": False, "error": str(e)}))

	if __name__ == "__main__":
	    # Run the diagnostic only when this file is executed directly; the PDF
	    # path is taken from sys.argv[1] inside diagnostic_test().
	    diagnostic_test()