Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| # enhanced_paddle_test.py - Improved to match local implementation | |
import json
import os
import re
import sys

import fitz
from paddleocr import PaddleOCR
def test_high_quality_ocr():
    """Run high-quality (300 DPI) PaddleOCR over every page of a PDF.

    Reads the PDF path from ``sys.argv[1]``, renders each page at 300 DPI,
    runs OCR with settings tuned for dense medical documents, applies basic
    lab-value regex patterns, and prints a single JSON result object to
    stdout. All progress/debug output goes to stderr so stdout stays
    machine-readable.
    """
    if len(sys.argv) < 2:
        print(json.dumps({"error": "No file path provided"}))
        return

    file_path = sys.argv[1]
    doc = None
    total_pages = 0  # known page count; also drives error-path temp cleanup
    try:
        print(f"Testing high-quality OCR on: {file_path}", file=sys.stderr)
        doc = fitz.open(file_path)
        total_pages = len(doc)
        print(f"PDF has {total_pages} pages", file=sys.stderr)

        all_text_parts = []
        all_numbers = []
        all_medical_terms = []
        total_detections = 0

        # Initialize OCR once (model load is expensive) with settings tuned
        # for medical documents.
        print("Initializing OCR with medical document settings...", file=sys.stderr)
        ocr = PaddleOCR(
            use_angle_cls=True,       # detect text orientation
            lang='en',                # English language
            show_log=False,           # suppress logs
            use_gpu=False,            # CPU mode for serverless
            det_limit_side_len=2880,  # higher detection limit for high-res images
            det_limit_type='max',     # max side length limit
            rec_batch_num=8,          # process more text regions at once
            max_text_length=50,       # allow longer text detection
            use_space_char=True,      # preserve spaces in text
            drop_score=0.1            # much lower threshold to catch more text
        )
        print("OCR initialized with medical settings", file=sys.stderr)

        # Process all pages (not just the first).
        for page_num in range(total_pages):
            print(f"Processing page {page_num + 1} of {total_pages}", file=sys.stderr)
            page = doc[page_num]

            # 300 DPI render (PDF units are 72/inch); no alpha for better OCR.
            mat = fitz.Matrix(300 / 72, 300 / 72)
            pix = page.get_pixmap(matrix=mat, alpha=False)
            temp_img = f"/tmp/high_quality_page_{page_num}.png"
            pix.save(temp_img)

            if not os.path.exists(temp_img):
                print(f"Failed to create high quality image for page {page_num}", file=sys.stderr)
                continue

            img_size = os.path.getsize(temp_img)
            print(f"High quality image: {temp_img} (size: {img_size} bytes, {pix.width}x{pix.height})", file=sys.stderr)

            try:
                print(f"Running optimized OCR on page {page_num + 1}...", file=sys.stderr)
                result = ocr.ocr(temp_img, cls=True)

                if result and result[0]:
                    page_detections = len(result[0])
                    total_detections += page_detections
                    print(f"Page {page_num + 1}: found {page_detections} detections", file=sys.stderr)

                    page_text_parts = []
                    for i, detection in enumerate(result[0]):
                        if len(detection) < 2:
                            continue
                        # detection[1] is usually (text, confidence); fall back
                        # to treating it as bare text with full confidence.
                        text_info = detection[1]
                        if isinstance(text_info, (list, tuple)) and len(text_info) >= 2:
                            text = str(text_info[0])
                            conf = float(text_info[1])
                        else:
                            text = str(text_info)
                            conf = 1.0

                        # Show some detections for debugging (first page only).
                        if page_num == 0 and i < 20:
                            print(f" {i}: '{text}' (confidence: {conf:.2f})", file=sys.stderr)

                        # Very low confidence threshold so faint values survive.
                        if conf > 0.1 and text.strip():
                            page_text_parts.append(text)
                            all_text_parts.append(text)
                            # Categorize: any digit => candidate numeric value;
                            # otherwise longer alphabetic strings => candidate
                            # medical terms. (The previous nested '.' check was
                            # always true and has been removed.)
                            if any(char.isdigit() for char in text):
                                all_numbers.append(text)
                            elif len(text) > 2 and any(c.isalpha() for c in text):
                                all_medical_terms.append(text)

                    print(f"Page {page_num + 1}: extracted {len(page_text_parts)} text pieces", file=sys.stderr)
            finally:
                # Always remove the page image, even if OCR raised.
                if os.path.exists(temp_img):
                    os.unlink(temp_img)

        # Combine all text.
        full_text = '\n'.join(all_text_parts)
        print(f"Total extracted: {len(all_text_parts)} text pieces ({len(all_numbers)} numbers, {len(all_medical_terms)} terms)", file=sys.stderr)
        print(f"Total detections across {total_pages} pages: {total_detections}", file=sys.stderr)

        # Apply basic lab patterns similar to local implementation.
        lab_values = apply_basic_patterns(full_text)

        result_data = {
            "success": True,
            "text": full_text,
            "total_detections": total_detections,
            "pages_processed": total_pages,
            "numbers_found": all_numbers[:20],   # first 20 numbers
            "terms_found": all_medical_terms[:20],  # first 20 terms
            "lab_values": lab_values,
            "settings": f"High-quality 300 DPI with medical optimization, {total_pages} pages"
        }
        print(json.dumps(result_data))
    except Exception as e:
        # Defensive sweep: remove any temp page images that may remain.
        # (Per-page finally already cleans up; sweep covers all pages, not
        # just the first 10 as before.)
        for i in range(max(total_pages, 10)):
            temp_file = f"/tmp/high_quality_page_{i}.png"
            if os.path.exists(temp_file):
                os.unlink(temp_file)
        print(f"Error: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc(file=sys.stderr)
        print(json.dumps({"success": False, "error": str(e)}))
    finally:
        # Close the document even on error (previously leaked on exceptions).
        if doc is not None:
            doc.close()
def apply_basic_patterns(text):
    """Extract common lab values from OCR text with basic regex patterns.

    Args:
        text: Raw OCR text (typically newline-joined detection fragments).

    Returns:
        dict mapping test name to ``{"value": float, "raw_text": str,
        "confidence": float}``. Empty dict when *text* is falsy or nothing
        matches. Matches are case-insensitive; only the first occurrence of
        each test is captured.
    """
    lab_values = {}
    if not text:
        return lab_values

    # Each pattern captures the first numeric value following the test name.
    patterns = {
        'TSH': r'TSH[:\s]*(\d+\.?\d*)',
        'Testosterone': r'Testosterone[:\s]*(\d+\.?\d*)',
        'C-Reactive Protein': r'C[-\s]*Reactive[-\s]*Protein[:\s]*(\d+\.?\d*)',
        'HDL': r'HDL[-\s]*C?[:\s]*(\d+\.?\d*)',
        'LDL': r'LDL[-\s]*C?[:\s]*(\d+\.?\d*)',
        'Triglycerides': r'Triglycerides[:\s]*(\d+\.?\d*)',
        'Glucose': r'Glucose[:\s]*(\d+\.?\d*)',
        'Creatinine': r'Creatinine[:\s]*(\d+\.?\d*)',
        'Hemoglobin': r'Hemoglobin[:\s]*(\d+\.?\d*)',
        'WBC': r'WBC[:\s]*(\d+\.?\d*)',
        'RBC': r'RBC[:\s]*(\d+\.?\d*)'
    }

    # Collapse whitespace runs (including newlines between OCR fragments) so
    # a test name and its value detected on separate lines can still match.
    normalized_text = re.sub(r'\s+', ' ', text)

    for test_name, pattern in patterns.items():
        try:
            match = re.search(pattern, normalized_text, re.IGNORECASE)
            if match:
                value = float(match.group(1))
                # Fixed 0.8 confidence: a regex hit on OCR text is a
                # heuristic extraction, not a verified one.
                lab_values[test_name] = {
                    "value": value,
                    "raw_text": match.group(0),
                    "confidence": 0.8
                }
                print(f"Found {test_name}: {value}", file=sys.stderr)
        except (ValueError, IndexError) as e:
            print(f"Error parsing {test_name}: {e}", file=sys.stderr)
            continue
    return lab_values
# Script entry point: run the OCR test only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    test_high_quality_ocr()