# paddleocr-processor / archive / enhanced_paddle_test.py
# Source: mbuckle -- "Diagnostic test" (commit 2a0cc07)
#!/usr/bin/env python3
# enhanced_paddle_test.py - Improved to match local implementation
import sys
import os
import json
import fitz
from paddleocr import PaddleOCR
def test_high_quality_ocr():
    """Run a high-quality PaddleOCR diagnostic over every page of a PDF.

    Usage: enhanced_paddle_test.py <pdf_path>

    Renders each page at 300 DPI, runs OCR with settings tuned for medical
    documents, categorizes the detected text, applies basic lab-value
    patterns, and prints a single JSON result object to stdout.  All
    progress/debug output goes to stderr so stdout stays machine-readable.
    On failure, a JSON error object is printed to stdout instead.
    """
    if len(sys.argv) < 2:
        print(json.dumps({"error": "No file path provided"}))
        return
    file_path = sys.argv[1]
    try:
        print(f"Testing high-quality OCR on: {file_path}", file=sys.stderr)
        # Open PDF
        doc = fitz.open(file_path)
        total_pages = len(doc)
        print(f"PDF has {total_pages} pages", file=sys.stderr)

        all_text_parts = []     # every accepted detection, in reading order
        all_numbers = []        # detections containing digits (candidate values)
        all_medical_terms = []  # alphabetic detections (candidate test names)
        total_detections = 0

        # Initialize OCR once for all pages -- construction is expensive.
        print("Initializing OCR with medical document settings...", file=sys.stderr)
        ocr = PaddleOCR(
            use_angle_cls=True,       # detect text orientation
            lang='en',                # English language
            show_log=False,           # suppress logs
            use_gpu=False,            # CPU mode for serverless
            det_limit_side_len=2880,  # higher detection limit for high-res images
            det_limit_type='max',     # max side length limit
            rec_batch_num=8,          # process more text regions at once
            max_text_length=50,       # allow longer text detection
            use_space_char=True,      # preserve spaces in text
            drop_score=0.1            # much lower threshold to catch more text
        )
        print("OCR initialized with medical settings", file=sys.stderr)

        # Process all pages (not just the first page).
        for page_num in range(total_pages):
            print(f"Processing page {page_num + 1} of {total_pages}", file=sys.stderr)
            page = doc[page_num]
            # Render at 300 DPI (PDF native unit is 72 DPI) without an alpha
            # channel -- comparable to a professional scanner and better for OCR.
            mat = fitz.Matrix(300 / 72, 300 / 72)
            pix = page.get_pixmap(matrix=mat, alpha=False)
            temp_img = f"/tmp/high_quality_page_{page_num}.png"
            pix.save(temp_img)
            if os.path.exists(temp_img):
                img_size = os.path.getsize(temp_img)
                print(f"High quality image: {temp_img} (size: {img_size} bytes, {pix.width}x{pix.height})", file=sys.stderr)
            else:
                print(f"Failed to create high quality image for page {page_num}", file=sys.stderr)
                continue
            try:
                # Run OCR on this page.
                print(f"Running optimized OCR on page {page_num + 1}...", file=sys.stderr)
                result = ocr.ocr(temp_img, cls=True)
                if result and result[0]:
                    page_detections = len(result[0])
                    total_detections += page_detections
                    print(f"Page {page_num + 1}: found {page_detections} detections", file=sys.stderr)
                    page_text_parts = []
                    for i, detection in enumerate(result[0]):
                        if len(detection) < 2:
                            continue
                        text_info = detection[1]
                        # Detections are normally (text, confidence) pairs, but
                        # tolerate a bare string with an assumed confidence of 1.
                        if isinstance(text_info, (list, tuple)) and len(text_info) >= 2:
                            text = str(text_info[0])
                            conf = float(text_info[1])
                        else:
                            text = str(text_info)
                            conf = 1.0
                        # Show some detections for debugging (first page only).
                        if page_num == 0 and i < 20:
                            print(f" {i}: '{text}' (confidence: {conf:.2f})", file=sys.stderr)
                        # Very low confidence threshold (0.1) to keep faint text.
                        if conf > 0.1 and len(text.strip()) > 0:
                            page_text_parts.append(text)
                            all_text_parts.append(text)
                            # Categorize: any digit makes it a candidate value;
                            # otherwise longer alphabetic text is a candidate term.
                            # (The original also re-tested "'.' in text or any
                            # digit" here, which is always true under the outer
                            # digit check -- dead condition removed.)
                            if any(char.isdigit() for char in text):
                                all_numbers.append(text)
                            elif len(text) > 2 and any(c.isalpha() for c in text):
                                all_medical_terms.append(text)
                    print(f"Page {page_num + 1}: extracted {len(page_text_parts)} text pieces", file=sys.stderr)
            finally:
                # Always remove the per-page temp image, even if OCR raised.
                if os.path.exists(temp_img):
                    os.unlink(temp_img)
        doc.close()

        # Combine all text
        full_text = '\n'.join(all_text_parts)
        print(f"Total extracted: {len(all_text_parts)} text pieces ({len(all_numbers)} numbers, {len(all_medical_terms)} terms)", file=sys.stderr)
        print(f"Total detections across {total_pages} pages: {total_detections}", file=sys.stderr)

        # Apply basic lab patterns similar to local implementation
        lab_values = apply_basic_patterns(full_text)

        # Return comprehensive result
        result_data = {
            "success": True,
            "text": full_text,
            "total_detections": total_detections,
            "pages_processed": total_pages,
            "numbers_found": all_numbers[:20],      # First 20 numbers
            "terms_found": all_medical_terms[:20],  # First 20 terms
            "lab_values": lab_values,
            "settings": f"High-quality 300 DPI with medical optimization, {total_pages} pages"
        }
        print(json.dumps(result_data))
    except Exception as e:
        # Best-effort cleanup of any temp page images left behind.  A glob
        # covers every page (the original only cleaned pages 0-9).
        import glob
        for temp_file in glob.glob("/tmp/high_quality_page_*.png"):
            if os.path.exists(temp_file):
                os.unlink(temp_file)
        print(f"Error: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc(file=sys.stderr)
        print(json.dumps({"success": False, "error": str(e)}))
def apply_basic_patterns(text):
    """Extract common lab values from OCR text using simple regex patterns.

    Args:
        text: Raw OCR text; may be None or empty.

    Returns:
        dict mapping test name -> {"value": float, "raw_text": matched span,
        "confidence": 0.8}.  Empty dict when the text is falsy or nothing
        matches.  Only the first match per test is kept.
    """
    # Import at the top of the function (original buried it mid-body).
    import re

    lab_values = {}
    if not text:
        return lab_values

    # Basic patterns for common lab values.  The leading \b keeps a test name
    # from matching inside a longer token (e.g. 'XTSH' must not match TSH).
    patterns = {
        'TSH': r'\bTSH[:\s]*(\d+\.?\d*)',
        'Testosterone': r'\bTestosterone[:\s]*(\d+\.?\d*)',
        'C-Reactive Protein': r'\bC[-\s]*Reactive[-\s]*Protein[:\s]*(\d+\.?\d*)',
        'HDL': r'\bHDL[-\s]*C?[:\s]*(\d+\.?\d*)',
        'LDL': r'\bLDL[-\s]*C?[:\s]*(\d+\.?\d*)',
        'Triglycerides': r'\bTriglycerides[:\s]*(\d+\.?\d*)',
        'Glucose': r'\bGlucose[:\s]*(\d+\.?\d*)',
        'Creatinine': r'\bCreatinine[:\s]*(\d+\.?\d*)',
        'Hemoglobin': r'\bHemoglobin[:\s]*(\d+\.?\d*)',
        'WBC': r'\bWBC[:\s]*(\d+\.?\d*)',
        'RBC': r'\bRBC[:\s]*(\d+\.?\d*)'
    }

    # Collapse whitespace so names and values split across OCR lines still match.
    normalized_text = re.sub(r'\s+', ' ', text)
    for test_name, pattern in patterns.items():
        try:
            match = re.search(pattern, normalized_text, re.IGNORECASE)
            if match:
                value = float(match.group(1))
                lab_values[test_name] = {
                    "value": value,
                    "raw_text": match.group(0),
                    "confidence": 0.8
                }
                print(f"Found {test_name}: {value}", file=sys.stderr)
        except (ValueError, IndexError) as e:
            # A malformed capture (e.g. lone '.') must not abort the scan.
            print(f"Error parsing {test_name}: {e}", file=sys.stderr)
            continue
    return lab_values
# Script entry point: run the OCR diagnostic when invoked directly.
if __name__ == "__main__":
    test_high_quality_ocr()