# paddleocr-processor / archive / enhanced_paddle_test.py
# Source: mbuckle -- "Diagnostic test" (commit 2a0cc07)
#!/usr/bin/env python3
# enhanced_paddle_test.py - Improved to match local implementation
import sys
import os
import json
import fitz
from paddleocr import PaddleOCR
def test_high_quality_ocr():
    """Run a high-quality PaddleOCR diagnostic over every page of a PDF.

    Usage: enhanced_paddle_test.py <pdf_path>

    Renders each page at 300 DPI, runs OCR with settings tuned for medical
    documents, categorizes the detected text, applies basic lab-value
    patterns, and prints a single JSON result object to stdout.  All
    progress/debug output goes to stderr so stdout stays machine-readable.
    On failure, a JSON error object is printed to stdout instead.
    """
    if len(sys.argv) < 2:
        print(json.dumps({"error": "No file path provided"}))
        return
    file_path = sys.argv[1]
    try:
        print(f"Testing high-quality OCR on: {file_path}", file=sys.stderr)
        # Open PDF
        doc = fitz.open(file_path)
        total_pages = len(doc)
        print(f"PDF has {total_pages} pages", file=sys.stderr)

        all_text_parts = []     # every accepted detection, in reading order
        all_numbers = []        # detections containing digits (candidate values)
        all_medical_terms = []  # alphabetic detections (candidate test names)
        total_detections = 0

        # Initialize OCR once for all pages -- construction is expensive.
        print("Initializing OCR with medical document settings...", file=sys.stderr)
        ocr = PaddleOCR(
            use_angle_cls=True,       # detect text orientation
            lang='en',                # English language
            show_log=False,           # suppress logs
            use_gpu=False,            # CPU mode for serverless
            det_limit_side_len=2880,  # higher detection limit for high-res images
            det_limit_type='max',     # max side length limit
            rec_batch_num=8,          # process more text regions at once
            max_text_length=50,       # allow longer text detection
            use_space_char=True,      # preserve spaces in text
            drop_score=0.1            # much lower threshold to catch more text
        )
        print("OCR initialized with medical settings", file=sys.stderr)

        # Process all pages (not just the first page).
        for page_num in range(total_pages):
            print(f"Processing page {page_num + 1} of {total_pages}", file=sys.stderr)
            page = doc[page_num]
            # Render at 300 DPI (PDF native unit is 72 DPI) without an alpha
            # channel -- comparable to a professional scanner and better for OCR.
            mat = fitz.Matrix(300 / 72, 300 / 72)
            pix = page.get_pixmap(matrix=mat, alpha=False)
            temp_img = f"/tmp/high_quality_page_{page_num}.png"
            pix.save(temp_img)
            if os.path.exists(temp_img):
                img_size = os.path.getsize(temp_img)
                print(f"High quality image: {temp_img} (size: {img_size} bytes, {pix.width}x{pix.height})", file=sys.stderr)
            else:
                print(f"Failed to create high quality image for page {page_num}", file=sys.stderr)
                continue
            try:
                # Run OCR on this page.
                print(f"Running optimized OCR on page {page_num + 1}...", file=sys.stderr)
                result = ocr.ocr(temp_img, cls=True)
                if result and result[0]:
                    page_detections = len(result[0])
                    total_detections += page_detections
                    print(f"Page {page_num + 1}: found {page_detections} detections", file=sys.stderr)
                    page_text_parts = []
                    for i, detection in enumerate(result[0]):
                        if len(detection) < 2:
                            continue
                        text_info = detection[1]
                        # Detections are normally (text, confidence) pairs, but
                        # tolerate a bare string with an assumed confidence of 1.
                        if isinstance(text_info, (list, tuple)) and len(text_info) >= 2:
                            text = str(text_info[0])
                            conf = float(text_info[1])
                        else:
                            text = str(text_info)
                            conf = 1.0
                        # Show some detections for debugging (first page only).
                        if page_num == 0 and i < 20:
                            print(f" {i}: '{text}' (confidence: {conf:.2f})", file=sys.stderr)
                        # Very low confidence threshold (0.1) to keep faint text.
                        if conf > 0.1 and len(text.strip()) > 0:
                            page_text_parts.append(text)
                            all_text_parts.append(text)
                            # Categorize: any digit makes it a candidate value;
                            # otherwise longer alphabetic text is a candidate term.
                            # (The original also re-tested "'.' in text or any
                            # digit" here, which is always true under the outer
                            # digit check -- dead condition removed.)
                            if any(char.isdigit() for char in text):
                                all_numbers.append(text)
                            elif len(text) > 2 and any(c.isalpha() for c in text):
                                all_medical_terms.append(text)
                    print(f"Page {page_num + 1}: extracted {len(page_text_parts)} text pieces", file=sys.stderr)
            finally:
                # Always remove the per-page temp image, even if OCR raised.
                if os.path.exists(temp_img):
                    os.unlink(temp_img)
        doc.close()

        # Combine all text
        full_text = '\n'.join(all_text_parts)
        print(f"Total extracted: {len(all_text_parts)} text pieces ({len(all_numbers)} numbers, {len(all_medical_terms)} terms)", file=sys.stderr)
        print(f"Total detections across {total_pages} pages: {total_detections}", file=sys.stderr)

        # Apply basic lab patterns similar to local implementation
        lab_values = apply_basic_patterns(full_text)

        # Return comprehensive result
        result_data = {
            "success": True,
            "text": full_text,
            "total_detections": total_detections,
            "pages_processed": total_pages,
            "numbers_found": all_numbers[:20],      # First 20 numbers
            "terms_found": all_medical_terms[:20],  # First 20 terms
            "lab_values": lab_values,
            "settings": f"High-quality 300 DPI with medical optimization, {total_pages} pages"
        }
        print(json.dumps(result_data))
    except Exception as e:
        # Best-effort cleanup of any temp page images left behind.  A glob
        # covers every page (the original only cleaned pages 0-9).
        import glob
        for temp_file in glob.glob("/tmp/high_quality_page_*.png"):
            if os.path.exists(temp_file):
                os.unlink(temp_file)
        print(f"Error: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc(file=sys.stderr)
        print(json.dumps({"success": False, "error": str(e)}))
def apply_basic_patterns(text):
    """Extract common lab values from OCR text using simple regex patterns.

    Args:
        text: Raw OCR text; may be None or empty.

    Returns:
        dict mapping test name -> {"value": float, "raw_text": matched span,
        "confidence": 0.8}.  Empty dict when the text is falsy or nothing
        matches.  Only the first match per test is kept.
    """
    # Import at the top of the function (original buried it mid-body).
    import re

    lab_values = {}
    if not text:
        return lab_values

    # Basic patterns for common lab values.  The leading \b keeps a test name
    # from matching inside a longer token (e.g. 'XTSH' must not match TSH).
    patterns = {
        'TSH': r'\bTSH[:\s]*(\d+\.?\d*)',
        'Testosterone': r'\bTestosterone[:\s]*(\d+\.?\d*)',
        'C-Reactive Protein': r'\bC[-\s]*Reactive[-\s]*Protein[:\s]*(\d+\.?\d*)',
        'HDL': r'\bHDL[-\s]*C?[:\s]*(\d+\.?\d*)',
        'LDL': r'\bLDL[-\s]*C?[:\s]*(\d+\.?\d*)',
        'Triglycerides': r'\bTriglycerides[:\s]*(\d+\.?\d*)',
        'Glucose': r'\bGlucose[:\s]*(\d+\.?\d*)',
        'Creatinine': r'\bCreatinine[:\s]*(\d+\.?\d*)',
        'Hemoglobin': r'\bHemoglobin[:\s]*(\d+\.?\d*)',
        'WBC': r'\bWBC[:\s]*(\d+\.?\d*)',
        'RBC': r'\bRBC[:\s]*(\d+\.?\d*)'
    }

    # Collapse whitespace so names and values split across OCR lines still match.
    normalized_text = re.sub(r'\s+', ' ', text)
    for test_name, pattern in patterns.items():
        try:
            match = re.search(pattern, normalized_text, re.IGNORECASE)
            if match:
                value = float(match.group(1))
                lab_values[test_name] = {
                    "value": value,
                    "raw_text": match.group(0),
                    "confidence": 0.8
                }
                print(f"Found {test_name}: {value}", file=sys.stderr)
        except (ValueError, IndexError) as e:
            # A malformed capture (e.g. lone '.') must not abort the scan.
            print(f"Error parsing {test_name}: {e}", file=sys.stderr)
            continue
    return lab_values
# Script entry point: run the OCR diagnostic when invoked directly.
if __name__ == "__main__":
    test_high_quality_ocr()