#!/usr/bin/env python3
# enhanced_paddle_test.py - Improved to match local implementation
#
# CLI script: takes a PDF path as argv[1], renders every page at 300 DPI,
# runs PaddleOCR tuned for medical documents, extracts candidate lab values
# with regex patterns, and prints a single JSON result object to stdout.
# All diagnostics go to stderr so stdout stays machine-parseable.
import glob
import json
import os
import re
import sys

# Keep only detections at or above this confidence (mirrors PaddleOCR's
# drop_score below; deliberately low to catch faint scanned text).
MIN_CONFIDENCE = 0.1
# Where per-page rendered images are written; '{}' is the 0-based page number.
TEMP_IMAGE_TEMPLATE = "/tmp/high_quality_page_{}.png"


def _create_ocr():
    """Build a PaddleOCR engine tuned for high-resolution medical documents.

    paddleocr is imported lazily so that importing this module (e.g. to use
    apply_basic_patterns in tests) does not require the heavy dependency.
    """
    from paddleocr import PaddleOCR

    print("Initializing OCR with medical document settings...", file=sys.stderr)
    ocr = PaddleOCR(
        use_angle_cls=True,       # Detect text orientation
        lang='en',                # English language
        show_log=False,           # Suppress logs
        use_gpu=False,            # CPU mode for serverless
        det_limit_side_len=2880,  # Higher detection limit for high-res images
        det_limit_type='max',     # Max side length limit
        rec_batch_num=8,          # Process more text regions at once
        max_text_length=50,       # Allow longer text detection
        use_space_char=True,      # Preserve spaces in text
        drop_score=0.1,           # Much lower threshold to catch more text
    )
    print("OCR initialized with medical settings", file=sys.stderr)
    return ocr


def _render_page(page, page_num):
    """Render one PDF page to a temp PNG at 300 DPI.

    Returns the image path, or None if the file could not be written.
    """
    import fitz

    mat = fitz.Matrix(300 / 72, 300 / 72)  # 300 DPI like professional scanners
    pix = page.get_pixmap(matrix=mat, alpha=False)  # No alpha for better OCR
    temp_img = TEMP_IMAGE_TEMPLATE.format(page_num)
    pix.save(temp_img)
    if not os.path.exists(temp_img):
        print(f"Failed to create high quality image for page {page_num}", file=sys.stderr)
        return None
    img_size = os.path.getsize(temp_img)
    print(f"High quality image: {temp_img} (size: {img_size} bytes, {pix.width}x{pix.height})", file=sys.stderr)
    return temp_img


def _parse_detection(detection):
    """Return (text, confidence) from one raw PaddleOCR detection, or None.

    PaddleOCR normally yields [box, (text, score)]; some versions/paths yield
    a bare text payload, in which case confidence defaults to 1.0.
    """
    if len(detection) < 2:
        return None
    text_info = detection[1]
    if isinstance(text_info, (list, tuple)) and len(text_info) >= 2:
        return str(text_info[0]), float(text_info[1])
    return str(text_info), 1.0


def test_high_quality_ocr():
    """OCR the PDF named on the command line and print a JSON summary to stdout.

    On success prints {"success": True, "text": ..., "lab_values": ...};
    on failure prints {"success": False, "error": ...}. Never raises.
    """
    if len(sys.argv) < 2:
        print(json.dumps({"error": "No file path provided"}))
        return

    file_path = sys.argv[1]

    try:
        import fitz  # lazy: heavy dependency, only needed on the OCR path

        print(f"Testing high-quality OCR on: {file_path}", file=sys.stderr)

        doc = fitz.open(file_path)
        try:
            total_pages = len(doc)
            print(f"PDF has {total_pages} pages", file=sys.stderr)

            all_text_parts = []
            all_numbers = []
            all_medical_terms = []
            total_detections = 0

            # Initialize OCR once and reuse it for every page.
            ocr = _create_ocr()

            # Process all pages (not just first page).
            for page_num in range(total_pages):
                print(f"Processing page {page_num + 1} of {total_pages}", file=sys.stderr)

                temp_img = _render_page(doc[page_num], page_num)
                if temp_img is None:
                    continue

                try:
                    print(f"Running optimized OCR on page {page_num + 1}...", file=sys.stderr)
                    result = ocr.ocr(temp_img, cls=True)

                    if result and result[0]:
                        page_detections = len(result[0])
                        total_detections += page_detections
                        print(f"Page {page_num + 1}: found {page_detections} detections", file=sys.stderr)

                        page_text_parts = []
                        for i, detection in enumerate(result[0]):
                            parsed = _parse_detection(detection)
                            if parsed is None:
                                continue
                            text, conf = parsed

                            # Show some detections for debugging (first page only).
                            if page_num == 0 and i < 20:
                                print(f" {i}: '{text}' (confidence: {conf:.2f})", file=sys.stderr)

                            # Very low confidence threshold to keep faint text.
                            if conf > MIN_CONFIDENCE and text.strip():
                                page_text_parts.append(text)
                                all_text_parts.append(text)
                                # Categorize: anything with a digit is a candidate
                                # value; longer alphabetic strings are candidate
                                # medical terms.
                                if any(c.isdigit() for c in text):
                                    all_numbers.append(text)
                                elif len(text) > 2 and any(c.isalpha() for c in text):
                                    all_medical_terms.append(text)

                        print(f"Page {page_num + 1}: extracted {len(page_text_parts)} text pieces", file=sys.stderr)
                finally:
                    # Always remove the page image, even if OCR raised.
                    if os.path.exists(temp_img):
                        os.unlink(temp_img)
        finally:
            doc.close()

        # Combine all text.
        full_text = '\n'.join(all_text_parts)

        print(f"Total extracted: {len(all_text_parts)} text pieces ({len(all_numbers)} numbers, {len(all_medical_terms)} terms)", file=sys.stderr)
        print(f"Total detections across {total_pages} pages: {total_detections}", file=sys.stderr)

        # Apply basic lab patterns similar to local implementation.
        lab_values = apply_basic_patterns(full_text)

        result_data = {
            "success": True,
            "text": full_text,
            "total_detections": total_detections,
            "pages_processed": total_pages,
            "numbers_found": all_numbers[:20],        # First 20 numbers
            "terms_found": all_medical_terms[:20],    # First 20 terms
            "lab_values": lab_values,
            "settings": f"High-quality 300 DPI with medical optimization, {total_pages} pages",
        }
        print(json.dumps(result_data))

    except Exception as e:
        # Best-effort cleanup of any temp page images left behind, regardless
        # of how many pages were rendered before the failure.
        for temp_file in glob.glob(TEMP_IMAGE_TEMPLATE.format('*')):
            try:
                os.unlink(temp_file)
            except OSError:
                pass
        print(f"Error: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc(file=sys.stderr)
        print(json.dumps({"success": False, "error": str(e)}))


def apply_basic_patterns(text):
    """Apply basic lab value patterns similar to local implementation.

    Args:
        text: OCR-extracted text (newline-joined pieces); may be empty or None.

    Returns:
        Dict mapping test name -> {"value": float, "raw_text": str,
        "confidence": 0.8} for the first case-insensitive match of each
        pattern. Unmatched tests are omitted.
    """
    lab_values = {}
    if not text:
        return lab_values

    # Basic patterns for common lab values; group(1) captures the number.
    patterns = {
        'TSH': r'TSH[:\s]*(\d+\.?\d*)',
        'Testosterone': r'Testosterone[:\s]*(\d+\.?\d*)',
        'C-Reactive Protein': r'C[-\s]*Reactive[-\s]*Protein[:\s]*(\d+\.?\d*)',
        'HDL': r'HDL[-\s]*C?[:\s]*(\d+\.?\d*)',
        'LDL': r'LDL[-\s]*C?[:\s]*(\d+\.?\d*)',
        'Triglycerides': r'Triglycerides[:\s]*(\d+\.?\d*)',
        'Glucose': r'Glucose[:\s]*(\d+\.?\d*)',
        'Creatinine': r'Creatinine[:\s]*(\d+\.?\d*)',
        'Hemoglobin': r'Hemoglobin[:\s]*(\d+\.?\d*)',
        'WBC': r'WBC[:\s]*(\d+\.?\d*)',
        'RBC': r'RBC[:\s]*(\d+\.?\d*)',
    }

    # Collapse whitespace so values split across OCR lines still match.
    normalized_text = re.sub(r'\s+', ' ', text)

    for test_name, pattern in patterns.items():
        try:
            match = re.search(pattern, normalized_text, re.IGNORECASE)
            if match:
                value = float(match.group(1))
                lab_values[test_name] = {
                    "value": value,
                    "raw_text": match.group(0),
                    "confidence": 0.8,
                }
                print(f"Found {test_name}: {value}", file=sys.stderr)
        except (ValueError, IndexError) as e:
            # A pattern that matched but failed to parse should not abort
            # the remaining tests.
            print(f"Error parsing {test_name}: {e}", file=sys.stderr)
            continue

    return lab_values


if __name__ == "__main__":
    test_high_quality_ocr()