File size: 8,151 Bytes
78b142a
b92fc27
78b142a
 
 
 
 
 
 
3ca6417
78b142a
 
 
 
 
 
 
3ca6417
78b142a
3ca6417
78b142a
b92fc27
 
78b142a
b92fc27
 
 
 
78b142a
b92fc27
3ca6417
 
 
 
 
 
b92fc27
3ca6417
b92fc27
 
3ca6417
b92fc27
3ca6417
 
 
b92fc27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78b142a
3ca6417
b92fc27
78b142a
3ca6417
 
b92fc27
 
 
 
 
 
 
3ca6417
b92fc27
 
3ca6417
 
b92fc27
 
 
 
78b142a
3ca6417
b92fc27
 
 
 
 
 
 
78b142a
b92fc27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78b142a
 
3ca6417
b92fc27
 
 
 
 
78b142a
 
 
 
 
b92fc27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78b142a
3ca6417
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
#!/usr/bin/env python3
# enhanced_paddle_test.py - Improved to match local implementation

import sys
import os
import json
import fitz
from paddleocr import PaddleOCR

def _parse_detection(detection):
    """Return ``(text, confidence)`` for one OCR detection.

    A detection is expected to be a sequence whose second item is either a
    ``(text, confidence)`` pair or a bare text value; in the latter case the
    confidence defaults to 1.0. Returns ``None`` when the detection has fewer
    than two items.
    """
    if len(detection) < 2:
        return None
    text_info = detection[1]
    if isinstance(text_info, (list, tuple)) and len(text_info) >= 2:
        return str(text_info[0]), float(text_info[1])
    return str(text_info), 1.0


def test_high_quality_ocr():
    """OCR every page of the PDF named in ``sys.argv[1]`` and print a JSON report.

    Each page is rendered at 300 DPI to a PNG inside a private temporary
    directory, passed to PaddleOCR, and deleted again as soon as that page
    has been processed. Recognized text with confidence above 0.1 is
    collected, roughly bucketed into "numbers" (contains a digit) and "terms"
    (longer than two characters and contains a letter), and then scanned by
    ``apply_basic_patterns`` for lab values.

    Output contract:
        * Progress and debug messages go to stderr.
        * Exactly one JSON object is written to stdout:
          - ``{"error": ...}`` when no file path argument was given,
          - ``{"success": True, "text": ..., ...}`` on success,
          - ``{"success": False, "error": ...}`` when any exception occurs.

    The function never raises for processing errors; they are reported in
    the JSON payload and the traceback is written to stderr.
    """
    # Local imports, matching this file's existing style for rarely-used modules.
    import tempfile
    import traceback

    if len(sys.argv) < 2:
        print(json.dumps({"error": "No file path provided"}))
        return

    file_path = sys.argv[1]
    min_confidence = 0.1  # Deliberately low so faint text is still kept
    doc = None

    try:
        print(f"Testing high-quality OCR on: {file_path}", file=sys.stderr)

        # Open PDF
        doc = fitz.open(file_path)
        total_pages = len(doc)
        print(f"PDF has {total_pages} pages", file=sys.stderr)

        all_text_parts = []
        all_numbers = []
        all_medical_terms = []
        total_detections = 0

        # Initialize OCR once with optimized settings for medical documents
        print("Initializing OCR with medical document settings...", file=sys.stderr)
        ocr = PaddleOCR(
            use_angle_cls=True,          # Detect text orientation
            lang='en',                   # English language
            show_log=False,              # Suppress logs
            use_gpu=False,               # CPU mode for serverless
            det_limit_side_len=2880,     # Higher detection limit for high-res images
            det_limit_type='max',        # Max side length limit
            rec_batch_num=8,             # Process more text regions at once
            max_text_length=50,          # Allow longer text detection
            use_space_char=True,         # Preserve spaces in text
            drop_score=0.1               # Much lower threshold to catch more text
        )
        print("OCR initialized with medical settings", file=sys.stderr)

        # 300 DPI render matrix (PDF user space is 72 units per inch); it is
        # the same for every page, so build it once.
        mat = fitz.Matrix(300 / 72, 300 / 72)

        # A private temporary directory avoids filename collisions between
        # concurrent runs and guarantees cleanup of every page image, no
        # matter how many pages there are or where an error occurs.
        with tempfile.TemporaryDirectory(prefix="high_quality_ocr_") as temp_dir:
            for page_num in range(total_pages):
                print(f"Processing page {page_num + 1} of {total_pages}", file=sys.stderr)

                page = doc[page_num]
                pix = page.get_pixmap(matrix=mat, alpha=False)  # No alpha for better OCR

                temp_img = os.path.join(temp_dir, f"high_quality_page_{page_num}.png")
                pix.save(temp_img)

                if not os.path.exists(temp_img):
                    print(f"Failed to create high quality image for page {page_num}", file=sys.stderr)
                    continue

                img_size = os.path.getsize(temp_img)
                print(f"High quality image: {temp_img} (size: {img_size} bytes, {pix.width}x{pix.height})", file=sys.stderr)

                # Run OCR on this page
                print(f"Running optimized OCR on page {page_num + 1}...", file=sys.stderr)
                try:
                    result = ocr.ocr(temp_img, cls=True)
                finally:
                    # Drop the page image right away so disk usage stays at
                    # one page even for long documents.
                    if os.path.exists(temp_img):
                        os.unlink(temp_img)

                if not (result and result[0]):
                    continue

                detections = result[0]
                total_detections += len(detections)
                print(f"Page {page_num + 1}: found {len(detections)} detections", file=sys.stderr)

                page_text_parts = []
                for i, detection in enumerate(detections):
                    parsed = _parse_detection(detection)
                    if parsed is None:
                        continue
                    text, conf = parsed

                    # Show some detections for debugging (first page only)
                    if page_num == 0 and i < 20:
                        print(f"  {i}: '{text}' (confidence: {conf:.2f})", file=sys.stderr)

                    if conf <= min_confidence or not text.strip():
                        continue

                    page_text_parts.append(text)
                    all_text_parts.append(text)

                    # Categorize detections: anything containing a digit is a
                    # candidate value; otherwise longer alphabetic strings are
                    # candidate medical terms.
                    if any(ch.isdigit() for ch in text):
                        all_numbers.append(text)
                    elif len(text) > 2 and any(ch.isalpha() for ch in text):
                        all_medical_terms.append(text)

                print(f"Page {page_num + 1}: extracted {len(page_text_parts)} text pieces", file=sys.stderr)

        # Combine all text
        full_text = '\n'.join(all_text_parts)

        print(f"Total extracted: {len(all_text_parts)} text pieces ({len(all_numbers)} numbers, {len(all_medical_terms)} terms)", file=sys.stderr)
        print(f"Total detections across {total_pages} pages: {total_detections}", file=sys.stderr)

        # Apply basic lab patterns similar to local implementation
        lab_values = apply_basic_patterns(full_text)

        # Return comprehensive result
        result_data = {
            "success": True,
            "text": full_text,
            "total_detections": total_detections,
            "pages_processed": total_pages,
            "numbers_found": all_numbers[:20],  # First 20 numbers
            "terms_found": all_medical_terms[:20],  # First 20 terms
            "lab_values": lab_values,
            "settings": f"High-quality 300 DPI with medical optimization, {total_pages} pages"
        }

        print(json.dumps(result_data))

    except Exception as e:
        # Top-level boundary: report the failure as JSON on stdout so the
        # caller always receives a parseable payload.
        print(f"Error: {e}", file=sys.stderr)
        traceback.print_exc(file=sys.stderr)
        print(json.dumps({"success": False, "error": str(e)}))
    finally:
        # Close the document on both the success and the error path.
        if doc is not None:
            doc.close()

def apply_basic_patterns(text):
    """Extract a fixed set of common lab values from OCR text.

    Whitespace runs (including newlines) are collapsed to single spaces, then
    each known test label is searched for case-insensitively, followed by
    optional colons/whitespace and a number. Only the first occurrence of
    each label is used.

    A label is ignored when it is directly preceded by a letter, so that
    e.g. "VLDL 25" is not reported as LDL and "nRBC 0" is not reported as RBC.

    Args:
        text: OCR output to scan. ``None`` or an empty string yields ``{}``.

    Returns:
        Dict mapping test name to
        ``{"value": float, "raw_text": str, "confidence": 0.8}``, where
        ``raw_text`` is the matched label-plus-number span. Each hit is also
        logged to stderr.
    """
    import re

    lab_values = {}

    if not text:
        return lab_values

    # Define basic patterns for common lab values
    patterns = {
        'TSH': r'TSH[:\s]*(\d+\.?\d*)',
        'Testosterone': r'Testosterone[:\s]*(\d+\.?\d*)',
        'C-Reactive Protein': r'C[-\s]*Reactive[-\s]*Protein[:\s]*(\d+\.?\d*)',
        'HDL': r'HDL[-\s]*C?[:\s]*(\d+\.?\d*)',
        'LDL': r'LDL[-\s]*C?[:\s]*(\d+\.?\d*)',
        'Triglycerides': r'Triglycerides[:\s]*(\d+\.?\d*)',
        'Glucose': r'Glucose[:\s]*(\d+\.?\d*)',
        'Creatinine': r'Creatinine[:\s]*(\d+\.?\d*)',
        'Hemoglobin': r'Hemoglobin[:\s]*(\d+\.?\d*)',
        'WBC': r'WBC[:\s]*(\d+\.?\d*)',
        'RBC': r'RBC[:\s]*(\d+\.?\d*)'
    }

    # Reject labels that are the tail of a longer word (VLDL, nRBC, ...).
    # Only a preceding letter blocks the match; digits and punctuation do not.
    not_after_letter = r'(?<![A-Za-z])'

    # Normalize text for pattern matching
    normalized_text = re.sub(r'\s+', ' ', text)

    for test_name, pattern in patterns.items():
        match = re.search(not_after_letter + pattern, normalized_text, re.IGNORECASE)
        if not match:
            continue

        try:
            value = float(match.group(1))
        except ValueError as e:
            # Defensive: the capture group should always be float-parseable.
            print(f"Error parsing {test_name}: {e}", file=sys.stderr)
            continue

        lab_values[test_name] = {
            "value": value,
            "raw_text": match.group(0),
            "confidence": 0.8
        }
        print(f"Found {test_name}: {value}", file=sys.stderr)

    return lab_values

# Script entry point: run the OCR test only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    test_high_quality_ocr()