Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| # diagnostic_test.py - Debug PaddleOCR performance issues | |
| import sys | |
| import os | |
| import json | |
| import fitz | |
| import cv2 | |
| import numpy as np | |
| from paddleocr import PaddleOCR | |
def _count_detections(ocr_result):
    """Return how many detections the first page of an OCR result holds.

    A falsy result, or a result whose first element is falsy, counts as
    zero detections.
    """
    if ocr_result and ocr_result[0]:
        return len(ocr_result[0])
    return 0


def _render_test_images(page, out_dir):
    """Render *page* into the three PNG variants consumed by the OCR tests.

    Args:
        page: The PyMuPDF page to rasterise.
        out_dir: Existing directory that receives the PNG files.

    Returns:
        Tuple ``(path_72, path_300, path_enhanced)`` of the files written.

    Raises:
        OSError: If OpenCV fails to write the enhanced image.
    """
    # Method 1: Standard quality (72 DPI)
    print("\n=== METHOD 1: Standard 72 DPI ===", file=sys.stderr)
    pix_72 = page.get_pixmap(alpha=False)
    path_72 = os.path.join(out_dir, "test_72dpi.png")
    pix_72.save(path_72)
    print(
        f"72 DPI image: {pix_72.width}x{pix_72.height}, "
        f"size: {os.path.getsize(path_72)}",
        file=sys.stderr,
    )

    # Method 2: High quality (300 DPI)
    print("\n=== METHOD 2: High 300 DPI ===", file=sys.stderr)
    zoom = 300 / 72
    pix_300 = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
    path_300 = os.path.join(out_dir, "test_300dpi.png")
    pix_300.save(path_300)
    print(
        f"300 DPI image: {pix_300.width}x{pix_300.height}, "
        f"size: {os.path.getsize(path_300)}",
        file=sys.stderr,
    )

    # Method 3: Contrast-enhanced grayscale built from the 300 DPI render
    print("\n=== METHOD 3: Preprocessed Image ===", file=sys.stderr)
    img_array = np.frombuffer(pix_300.samples, dtype=np.uint8).reshape(
        pix_300.height, pix_300.width, 3
    )
    # The pixmap samples are already in RGB order, so convert straight to
    # grayscale. Going through BGR2RGB first would swap the red and blue
    # luminance weights.
    gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    enhanced = clahe.apply(gray)
    path_enhanced = os.path.join(out_dir, "test_enhanced.png")
    # cv2.imwrite reports failure through its return value, not an exception.
    if not cv2.imwrite(path_enhanced, enhanced):
        raise OSError(f"Could not write enhanced image to {path_enhanced}")
    print(
        f"Enhanced image saved: {os.path.getsize(path_enhanced)} bytes",
        file=sys.stderr,
    )

    return path_72, path_300, path_enhanced


def _sample_texts(ocr_result, limit=10):
    """Format up to *limit* detections as ``'text' (conf)`` and log each one."""
    samples = []
    if not (ocr_result and ocr_result[0]):
        return samples
    for i, detection in enumerate(ocr_result[0][:limit]):
        if len(detection) < 2:
            continue
        text_info = detection[1]
        if isinstance(text_info, (list, tuple)) and len(text_info) >= 2:
            text = str(text_info[0])
            conf = float(text_info[1])
        else:
            # No separate confidence available; report the raw value as text.
            text = str(text_info)
            conf = 1.0
        samples.append(f"'{text}' ({conf:.2f})")
        print(f"Sample {i}: '{text}' (conf: {conf:.2f})", file=sys.stderr)
    return samples


def diagnostic_test():
    """Compare three PaddleOCR configurations on the first page of a PDF.

    The PDF path is read from ``sys.argv[1]``. The first page is rendered at
    the default resolution, at 300 DPI, and as a CLAHE-enhanced grayscale
    image; each image is run through a different PaddleOCR configuration
    ("minimal", "current", "aggressive").

    Progress and diagnostics go to stderr. Exactly one JSON document is
    printed to stdout: ``{"success": true, "diagnostics": {...}}`` on
    success, or ``{"success": false, "error": "..."}`` on any failure
    (including a missing path argument). The function never raises for
    ordinary errors and always returns ``None``.
    """
    import tempfile
    import traceback

    if len(sys.argv) < 2:
        # Same shape as the failure payload below so callers can always
        # branch on "success".
        print(json.dumps({"success": False, "error": "No file path provided"}))
        return

    file_path = sys.argv[1]

    try:
        print("=== DIAGNOSTIC TEST START ===", file=sys.stderr)

        # Check system info
        print(f"Python version: {sys.version}", file=sys.stderr)
        print(f"OpenCV version: {cv2.__version__}", file=sys.stderr)

        # Check PaddleOCR installation. This is best-effort: any ordinary
        # failure is reported and skipped, but KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        try:
            import paddle
            print(f"PaddlePaddle version: {paddle.__version__}", file=sys.stderr)
        except Exception:
            print("PaddlePaddle not available", file=sys.stderr)

        # A private temporary directory avoids name collisions between
        # concurrent runs and is removed even when a step below raises.
        with tempfile.TemporaryDirectory(prefix="paddleocr_diag_") as work_dir:
            # Open PDF and get basic info
            doc = fitz.open(file_path)
            try:
                total_pages = len(doc)
                print(f"PDF pages: {total_pages}", file=sys.stderr)
                if total_pages == 0:
                    raise ValueError("PDF contains no pages")
                # Test different extraction methods on first page
                temp_72, temp_300, temp_enhanced = _render_test_images(
                    doc[0], work_dir
                )
            finally:
                doc.close()

            # Test OCR with minimal settings first
            print("\n=== OCR TEST 1: Minimal Settings ===", file=sys.stderr)
            ocr_minimal = PaddleOCR(use_angle_cls=False, lang='en', show_log=False)
            result_minimal = ocr_minimal.ocr(temp_72, cls=False)
            minimal_detections = _count_detections(result_minimal)
            print(
                f"Minimal OCR on 72 DPI: {minimal_detections} detections",
                file=sys.stderr,
            )

            # Test OCR with the current production settings
            print("\n=== OCR TEST 2: Current Settings ===", file=sys.stderr)
            ocr_current = PaddleOCR(
                use_angle_cls=True,
                lang='en',
                show_log=False,
                use_gpu=False,
                det_limit_side_len=2880,
                det_limit_type='max',
                rec_batch_num=8,
                max_text_length=50,
                use_space_char=True,
                drop_score=0.1,
            )
            result_current = ocr_current.ocr(temp_300, cls=True)
            current_detections = _count_detections(result_current)
            print(
                f"Current OCR on 300 DPI: {current_detections} detections",
                file=sys.stderr,
            )

            # Test OCR with more aggressive settings
            print("\n=== OCR TEST 3: Aggressive Settings ===", file=sys.stderr)
            ocr_aggressive = PaddleOCR(
                use_angle_cls=True,
                lang='en',
                show_log=False,
                use_gpu=False,
                det_limit_side_len=4000,  # Even higher
                det_limit_type='max',
                rec_batch_num=1,  # Lower batch for memory
                max_text_length=100,  # Longer text
                use_space_char=True,
                drop_score=0.05,  # Very low threshold
            )
            result_aggressive = ocr_aggressive.ocr(temp_enhanced, cls=True)
            aggressive_detections = _count_detections(result_aggressive)
            print(
                f"Aggressive OCR on enhanced: {aggressive_detections} detections",
                file=sys.stderr,
            )

        # Pick the method with the most detections. The strict ">" keeps the
        # earliest method on ties and leaves "none" when nothing was detected.
        best_result = None
        best_count = 0
        best_method = "none"
        candidates = (
            ("minimal", result_minimal, minimal_detections),
            ("current", result_current, current_detections),
            ("aggressive", result_aggressive, aggressive_detections),
        )
        for method, ocr_result, count in candidates:
            if count > best_count:
                best_result, best_count, best_method = ocr_result, count, method

        print(
            f"\nBest method: {best_method} with {best_count} detections",
            file=sys.stderr,
        )

        # Extract and show sample text from best result (first 10 only)
        sample_texts = _sample_texts(best_result)

        # Return diagnostic results
        result = {
            "success": True,
            "diagnostics": {
                "total_pages": total_pages,
                "minimal_detections": minimal_detections,
                "current_detections": current_detections,
                "aggressive_detections": aggressive_detections,
                "best_method": best_method,
                "best_count": best_count,
                "sample_texts": sample_texts,
            },
        }
        print(json.dumps(result))

    except Exception as e:
        # Top-level boundary: whatever went wrong, the caller still gets a
        # single JSON document on stdout, with details on stderr.
        print(f"Diagnostic error: {e}", file=sys.stderr)
        traceback.print_exc(file=sys.stderr)
        print(json.dumps({"success": False, "error": str(e)}))
# Script entry point: run the diagnostic only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    diagnostic_test()