Spaces:
Sleeping
Sleeping
File size: 8,151 Bytes
#!/usr/bin/env python3
# enhanced_paddle_test.py - Improved to match local implementation
import sys
import os
import json
import fitz
from paddleocr import PaddleOCR
def _detection_text_conf(detection):
    """Return (text, confidence) for one PaddleOCR detection entry.

    Each entry is usually [box, (text, confidence)]; older result formats may
    carry a bare string instead of a pair, which gets a default confidence
    of 1.0.
    """
    text_info = detection[1]
    if isinstance(text_info, (list, tuple)) and len(text_info) >= 2:
        return str(text_info[0]), float(text_info[1])
    return str(text_info), 1.0


def test_high_quality_ocr():
    """OCR every page of the PDF named in ``sys.argv[1]`` at 300 DPI and
    print a single JSON result object to stdout.

    All progress/diagnostic output goes to stderr so that stdout stays
    machine-readable.  Temporary page images are written under /tmp and are
    removed again even when OCR of a page raises.
    """
    if len(sys.argv) < 2:
        print(json.dumps({"error": "No file path provided"}))
        return
    file_path = sys.argv[1]
    total_pages = 0  # remembered so the error path knows how many temp files to sweep
    try:
        print(f"Testing high-quality OCR on: {file_path}", file=sys.stderr)
        # Open PDF
        doc = fitz.open(file_path)
        try:
            total_pages = len(doc)
            print(f"PDF has {total_pages} pages", file=sys.stderr)
            all_text_parts = []
            all_numbers = []
            all_medical_terms = []
            total_detections = 0
            # Initialize OCR once (model load is expensive) with settings
            # tuned for dense medical documents.
            print("Initializing OCR with medical document settings...", file=sys.stderr)
            ocr = PaddleOCR(
                use_angle_cls=True,       # Detect text orientation
                lang='en',                # English language
                show_log=False,           # Suppress logs
                use_gpu=False,            # CPU mode for serverless
                det_limit_side_len=2880,  # Higher detection limit for high-res images
                det_limit_type='max',     # Max side length limit
                rec_batch_num=8,          # Process more text regions at once
                max_text_length=50,       # Allow longer text detection
                use_space_char=True,      # Preserve spaces in text
                drop_score=0.1            # Much lower threshold to catch more text
            )
            print("OCR initialized with medical settings", file=sys.stderr)
            # Process all pages (not just the first).
            for page_num in range(total_pages):
                print(f"Processing page {page_num + 1} of {total_pages}", file=sys.stderr)
                page = doc[page_num]
                # 300 DPI render (professional-scanner quality); no alpha
                # channel, which OCR handles better.
                mat = fitz.Matrix(300 / 72, 300 / 72)
                pix = page.get_pixmap(matrix=mat, alpha=False)
                temp_img = f"/tmp/high_quality_page_{page_num}.png"
                pix.save(temp_img)
                if not os.path.exists(temp_img):
                    print(f"Failed to create high quality image for page {page_num}", file=sys.stderr)
                    continue
                img_size = os.path.getsize(temp_img)
                print(f"High quality image: {temp_img} (size: {img_size} bytes, {pix.width}x{pix.height})", file=sys.stderr)
                try:
                    print(f"Running optimized OCR on page {page_num + 1}...", file=sys.stderr)
                    result = ocr.ocr(temp_img, cls=True)
                    if result and result[0]:
                        page_detections = len(result[0])
                        total_detections += page_detections
                        print(f"Page {page_num + 1}: found {page_detections} detections", file=sys.stderr)
                        page_text_parts = []
                        for i, detection in enumerate(result[0]):
                            if len(detection) < 2:
                                continue
                            text, conf = _detection_text_conf(detection)
                            # Show some detections for debugging (first page only)
                            if page_num == 0 and i < 20:
                                print(f" {i}: '{text}' (confidence: {conf:.2f})", file=sys.stderr)
                            # Very low confidence threshold (0.1) to keep faint text.
                            if conf > 0.1 and text.strip():
                                page_text_parts.append(text)
                                all_text_parts.append(text)
                                # Categorize: any digit-bearing string counts as a
                                # number (the previous nested '.'-check was always
                                # true at this point, so behavior is unchanged);
                                # otherwise 3+ chars with a letter is a candidate
                                # medical term.
                                if any(c.isdigit() for c in text):
                                    all_numbers.append(text)
                                elif len(text) > 2 and any(c.isalpha() for c in text):
                                    all_medical_terms.append(text)
                        print(f"Page {page_num + 1}: extracted {len(page_text_parts)} text pieces", file=sys.stderr)
                finally:
                    # Always remove the page image, even if OCR raised
                    # (the original leaked it on a mid-loop exception).
                    if os.path.exists(temp_img):
                        os.unlink(temp_img)
        finally:
            doc.close()
        # Combine all text
        full_text = '\n'.join(all_text_parts)
        print(f"Total extracted: {len(all_text_parts)} text pieces ({len(all_numbers)} numbers, {len(all_medical_terms)} terms)", file=sys.stderr)
        print(f"Total detections across {total_pages} pages: {total_detections}", file=sys.stderr)
        # Apply basic lab patterns similar to local implementation
        lab_values = apply_basic_patterns(full_text)
        # Return comprehensive result
        result_data = {
            "success": True,
            "text": full_text,
            "total_detections": total_detections,
            "pages_processed": total_pages,
            "numbers_found": all_numbers[:20],      # First 20 numbers
            "terms_found": all_medical_terms[:20],  # First 20 terms
            "lab_values": lab_values,
            "settings": f"High-quality 300 DPI with medical optimization, {total_pages} pages"
        }
        print(json.dumps(result_data))
    except Exception as e:
        # Best-effort sweep of any leftover temp page images.  Cover at
        # least the pages seen so far (the old fixed range(10) missed
        # longer PDFs).
        for i in range(max(total_pages, 10)):
            temp_file = f"/tmp/high_quality_page_{i}.png"
            if os.path.exists(temp_file):
                os.unlink(temp_file)
        print(f"Error: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc(file=sys.stderr)
        print(json.dumps({"success": False, "error": str(e)}))
def apply_basic_patterns(text):
    """Scan OCR text for common lab-test readings with simple regexes.

    Parameters:
        text: raw OCR output; may be empty or None.

    Returns:
        dict mapping test name -> {"value": float, "raw_text": matched span,
        "confidence": 0.8}.  Empty dict when nothing matches.
    """
    import re

    found = {}
    if not text:
        return found

    # Test name -> pattern; each captures the numeric reading that follows
    # the test name (optionally separated by ':' / whitespace / '-').
    patterns = {
        'TSH': r'TSH[:\s]*(\d+\.?\d*)',
        'Testosterone': r'Testosterone[:\s]*(\d+\.?\d*)',
        'C-Reactive Protein': r'C[-\s]*Reactive[-\s]*Protein[:\s]*(\d+\.?\d*)',
        'HDL': r'HDL[-\s]*C?[:\s]*(\d+\.?\d*)',
        'LDL': r'LDL[-\s]*C?[:\s]*(\d+\.?\d*)',
        'Triglycerides': r'Triglycerides[:\s]*(\d+\.?\d*)',
        'Glucose': r'Glucose[:\s]*(\d+\.?\d*)',
        'Creatinine': r'Creatinine[:\s]*(\d+\.?\d*)',
        'Hemoglobin': r'Hemoglobin[:\s]*(\d+\.?\d*)',
        'WBC': r'WBC[:\s]*(\d+\.?\d*)',
        'RBC': r'RBC[:\s]*(\d+\.?\d*)',
    }

    # Collapse whitespace runs so labels and values split across OCR lines
    # still sit on one searchable line.
    flat = re.sub(r'\s+', ' ', text)

    for name, pattern in patterns.items():
        hit = re.search(pattern, flat, re.IGNORECASE)
        if hit is None:
            continue
        try:
            reading = float(hit.group(1))
        except (ValueError, IndexError) as exc:
            print(f"Error parsing {name}: {exc}", file=sys.stderr)
            continue
        found[name] = {
            "value": reading,
            "raw_text": hit.group(0),
            "confidence": 0.8,
        }
        print(f"Found {name}: {reading}", file=sys.stderr)

    return found
# Script entry point: run the OCR test only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    test_high_quality_ocr()