Spaces:
Sleeping
Sleeping
File size: 8,151 Bytes
#!/usr/bin/env python3
# enhanced_paddle_test.py - Improved to match local implementation
import sys
import os
import json
import fitz
from paddleocr import PaddleOCR
def _detection_text_conf(detection):
    """Return (text, confidence) for one PaddleOCR detection entry.

    Each entry is usually [box, (text, confidence)]; older result formats may
    carry a bare string instead of a pair, which gets a default confidence
    of 1.0.
    """
    text_info = detection[1]
    if isinstance(text_info, (list, tuple)) and len(text_info) >= 2:
        return str(text_info[0]), float(text_info[1])
    return str(text_info), 1.0


def test_high_quality_ocr():
    """OCR every page of the PDF named in ``sys.argv[1]`` at 300 DPI and
    print a single JSON result object to stdout.

    All progress/diagnostic output goes to stderr so that stdout stays
    machine-readable.  Temporary page images are written under /tmp and are
    removed again even when OCR of a page raises.
    """
    if len(sys.argv) < 2:
        print(json.dumps({"error": "No file path provided"}))
        return
    file_path = sys.argv[1]
    total_pages = 0  # remembered so the error path knows how many temp files to sweep
    try:
        print(f"Testing high-quality OCR on: {file_path}", file=sys.stderr)
        # Open PDF
        doc = fitz.open(file_path)
        try:
            total_pages = len(doc)
            print(f"PDF has {total_pages} pages", file=sys.stderr)
            all_text_parts = []
            all_numbers = []
            all_medical_terms = []
            total_detections = 0
            # Initialize OCR once (model load is expensive) with settings
            # tuned for dense medical documents.
            print("Initializing OCR with medical document settings...", file=sys.stderr)
            ocr = PaddleOCR(
                use_angle_cls=True,       # Detect text orientation
                lang='en',                # English language
                show_log=False,           # Suppress logs
                use_gpu=False,            # CPU mode for serverless
                det_limit_side_len=2880,  # Higher detection limit for high-res images
                det_limit_type='max',     # Max side length limit
                rec_batch_num=8,          # Process more text regions at once
                max_text_length=50,       # Allow longer text detection
                use_space_char=True,      # Preserve spaces in text
                drop_score=0.1            # Much lower threshold to catch more text
            )
            print("OCR initialized with medical settings", file=sys.stderr)
            # Process all pages (not just the first).
            for page_num in range(total_pages):
                print(f"Processing page {page_num + 1} of {total_pages}", file=sys.stderr)
                page = doc[page_num]
                # 300 DPI render (professional-scanner quality); no alpha
                # channel, which OCR handles better.
                mat = fitz.Matrix(300 / 72, 300 / 72)
                pix = page.get_pixmap(matrix=mat, alpha=False)
                temp_img = f"/tmp/high_quality_page_{page_num}.png"
                pix.save(temp_img)
                if not os.path.exists(temp_img):
                    print(f"Failed to create high quality image for page {page_num}", file=sys.stderr)
                    continue
                img_size = os.path.getsize(temp_img)
                print(f"High quality image: {temp_img} (size: {img_size} bytes, {pix.width}x{pix.height})", file=sys.stderr)
                try:
                    print(f"Running optimized OCR on page {page_num + 1}...", file=sys.stderr)
                    result = ocr.ocr(temp_img, cls=True)
                    if result and result[0]:
                        page_detections = len(result[0])
                        total_detections += page_detections
                        print(f"Page {page_num + 1}: found {page_detections} detections", file=sys.stderr)
                        page_text_parts = []
                        for i, detection in enumerate(result[0]):
                            if len(detection) < 2:
                                continue
                            text, conf = _detection_text_conf(detection)
                            # Show some detections for debugging (first page only)
                            if page_num == 0 and i < 20:
                                print(f" {i}: '{text}' (confidence: {conf:.2f})", file=sys.stderr)
                            # Very low confidence threshold (0.1) to keep faint text.
                            if conf > 0.1 and text.strip():
                                page_text_parts.append(text)
                                all_text_parts.append(text)
                                # Categorize: any digit-bearing string counts as a
                                # number (the previous nested '.'-check was always
                                # true at this point, so behavior is unchanged);
                                # otherwise 3+ chars with a letter is a candidate
                                # medical term.
                                if any(c.isdigit() for c in text):
                                    all_numbers.append(text)
                                elif len(text) > 2 and any(c.isalpha() for c in text):
                                    all_medical_terms.append(text)
                        print(f"Page {page_num + 1}: extracted {len(page_text_parts)} text pieces", file=sys.stderr)
                finally:
                    # Always remove the page image, even if OCR raised
                    # (the original leaked it on a mid-loop exception).
                    if os.path.exists(temp_img):
                        os.unlink(temp_img)
        finally:
            doc.close()
        # Combine all text
        full_text = '\n'.join(all_text_parts)
        print(f"Total extracted: {len(all_text_parts)} text pieces ({len(all_numbers)} numbers, {len(all_medical_terms)} terms)", file=sys.stderr)
        print(f"Total detections across {total_pages} pages: {total_detections}", file=sys.stderr)
        # Apply basic lab patterns similar to local implementation
        lab_values = apply_basic_patterns(full_text)
        # Return comprehensive result
        result_data = {
            "success": True,
            "text": full_text,
            "total_detections": total_detections,
            "pages_processed": total_pages,
            "numbers_found": all_numbers[:20],      # First 20 numbers
            "terms_found": all_medical_terms[:20],  # First 20 terms
            "lab_values": lab_values,
            "settings": f"High-quality 300 DPI with medical optimization, {total_pages} pages"
        }
        print(json.dumps(result_data))
    except Exception as e:
        # Best-effort sweep of any leftover temp page images.  Cover at
        # least the pages seen so far (the old fixed range(10) missed
        # longer PDFs).
        for i in range(max(total_pages, 10)):
            temp_file = f"/tmp/high_quality_page_{i}.png"
            if os.path.exists(temp_file):
                os.unlink(temp_file)
        print(f"Error: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc(file=sys.stderr)
        print(json.dumps({"success": False, "error": str(e)}))
def apply_basic_patterns(text):
    """Scan OCR text for common lab-test readings with simple regexes.

    Parameters:
        text: raw OCR output; may be empty or None.

    Returns:
        dict mapping test name -> {"value": float, "raw_text": matched span,
        "confidence": 0.8}.  Empty dict when nothing matches.
    """
    import re

    found = {}
    if not text:
        return found

    # Test name -> pattern; each captures the numeric reading that follows
    # the test name (optionally separated by ':' / whitespace / '-').
    patterns = {
        'TSH': r'TSH[:\s]*(\d+\.?\d*)',
        'Testosterone': r'Testosterone[:\s]*(\d+\.?\d*)',
        'C-Reactive Protein': r'C[-\s]*Reactive[-\s]*Protein[:\s]*(\d+\.?\d*)',
        'HDL': r'HDL[-\s]*C?[:\s]*(\d+\.?\d*)',
        'LDL': r'LDL[-\s]*C?[:\s]*(\d+\.?\d*)',
        'Triglycerides': r'Triglycerides[:\s]*(\d+\.?\d*)',
        'Glucose': r'Glucose[:\s]*(\d+\.?\d*)',
        'Creatinine': r'Creatinine[:\s]*(\d+\.?\d*)',
        'Hemoglobin': r'Hemoglobin[:\s]*(\d+\.?\d*)',
        'WBC': r'WBC[:\s]*(\d+\.?\d*)',
        'RBC': r'RBC[:\s]*(\d+\.?\d*)',
    }

    # Collapse whitespace runs so labels and values split across OCR lines
    # still sit on one searchable line.
    flat = re.sub(r'\s+', ' ', text)

    for name, pattern in patterns.items():
        hit = re.search(pattern, flat, re.IGNORECASE)
        if hit is None:
            continue
        try:
            reading = float(hit.group(1))
        except (ValueError, IndexError) as exc:
            print(f"Error parsing {name}: {exc}", file=sys.stderr)
            continue
        found[name] = {
            "value": reading,
            "raw_text": hit.group(0),
            "confidence": 0.8,
        }
        print(f"Found {name}: {reading}", file=sys.stderr)

    return found
# Script entry point: run the OCR test only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    test_high_quality_ocr()