Spaces:

MakPr016
/

clinical-analysis-api

Sleeping

File size: 3,870 Bytes

e158d2f

"""
Text extraction from PDFs and images using EasyOCR
Smart extraction: tries text layer first, falls back to OCR
"""

import fitz  # PyMuPDF
import easyocr
from PIL import Image
from pdf2image import convert_from_bytes
import io
import numpy as np
from typing import Tuple, Optional

# Build the shared EasyOCR reader once, at import time. The OCR functions in
# this module read the module-level `reader` and check it for falsiness
# before running any recognition.
print("Initializing EasyOCR Reader...")
try:
    # English language list, GPU disabled, verbose output off.
    reader = easyocr.Reader(['en'], gpu=False, verbose=False)
    # NOTE(review): this print is inside the try, so a failure while printing
    # (not only a failure in Reader construction) also ends with reader = None.
    print("✓ EasyOCR Reader initialized successfully")
except Exception as e:
    # Keep the module importable when initialization fails; `reader` is left
    # as None so callers can detect that OCR is unavailable.
    print(f"✗ EasyOCR initialization failed: {e}")
    reader = None

def extract_text_from_pdf(
    pdf_bytes: bytes, *, min_text_chars: int = 50
) -> Tuple[Optional[str], bool]:
    """
    Extract text from PDF with smart OCR fallback

    The embedded text layer is read first with PyMuPDF. If the stripped text
    layer is not longer than ``min_text_chars`` characters, the PDF is handed
    to ``extract_text_from_pdf_via_ocr`` instead.

    Args:
        pdf_bytes: Raw content of the PDF file.
        min_text_chars: The stripped text layer must be strictly longer than
            this many characters to be accepted without OCR (default 50).

    Returns:
        (extracted_text, ocr_used)

        * ``(text, False)`` when the text layer was long enough.
        * ``(ocr_text, True)`` when OCR was attempted; ``ocr_text`` is
          whatever the OCR helper returned, which may be ``None``.
        * ``(None, False)`` for empty input or when any exception is raised
          during extraction (including the OCR helper raising because the
          EasyOCR reader is unavailable). The error is printed, not raised.
    """
    if not pdf_bytes:
        return None, False

    try:
        # Try extracting text layer first (fast). The context manager closes
        # the document even if reading a page raises.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            full_text = "".join(page.get_text() for page in doc)

        # Check if meaningful text was extracted
        stripped_text = full_text.strip()
        if len(stripped_text) > min_text_chars:
            print(f"✓ Extracted {len(full_text)} chars from text layer")
            return stripped_text, False

        # No text layer - use OCR
        print("⚠ No text layer detected, using EasyOCR...")
        text = extract_text_from_pdf_via_ocr(pdf_bytes)
        return text, True

    except Exception as e:
        print(f"✗ Error in PDF text extraction: {e}")
        return None, False

def extract_text_from_pdf_via_ocr(pdf_bytes: bytes, dpi: int = 300) -> Optional[str]:
    """
    Extract text using EasyOCR on PDF pages converted to images

    Every page is rasterized with pdf2image, converted to a NumPy array and
    passed to the module-level EasyOCR ``reader``. The text fragments of a
    page are joined with single spaces, and each page is followed by a blank
    line before the combined text is stripped.

    Args:
        pdf_bytes: Raw content of the PDF file.
        dpi: Resolution passed to ``convert_from_bytes`` when rendering the
            pages (default 300).

    Returns:
        The stripped text of all pages, or ``None`` when conversion or
        recognition raised; the error is printed, not propagated.

    Raises:
        RuntimeError: If the module-level EasyOCR reader is not available.
    """
    if not reader:
        raise RuntimeError("EasyOCR not initialized")

    try:
        # Convert PDF to images (all pages are rendered before OCR starts)
        images = convert_from_bytes(pdf_bytes, dpi=dpi)
        page_texts = []

        for i, image in enumerate(images):
            print(f"   OCR processing page {i+1}/{len(images)}...")

            # EasyOCR is given a NumPy array rather than the PIL image
            img_array = np.array(image)

            # detail=0 yields plain strings; paragraph=True groups nearby text
            results = reader.readtext(img_array, detail=0, paragraph=True)
            page_texts.append(' '.join(results))

        # Build the result in one pass; every page keeps its trailing blank
        # line so the reported character count matches the pre-strip text.
        full_text = "".join(f"{page_text}\n\n" for page_text in page_texts)

        print(f"✓ EasyOCR extracted {len(full_text)} chars from {len(images)} pages")
        return full_text.strip()

    except Exception as e:
        print(f"✗ OCR failed: {e}")
        return None

def extract_text_from_image(image_bytes: bytes) -> Optional[str]:
    """
    Run EasyOCR over an encoded image and return the recognized text

    The bytes are decoded with Pillow, converted to RGB when the image is in
    any other mode, turned into a NumPy array and passed to the module-level
    EasyOCR ``reader``. Recognized fragments are joined with single spaces.

    Returns:
        The stripped text, or ``None`` when decoding or recognition raised;
        the error is printed, not propagated.

    Raises:
        RuntimeError: If the module-level EasyOCR reader is not available.
    """
    if not reader:
        raise RuntimeError("EasyOCR not initialized")

    try:
        print("Processing image with EasyOCR...")

        # Decode the bytes; anything that is not already RGB gets converted
        picture = Image.open(io.BytesIO(image_bytes))
        rgb_picture = picture if picture.mode == 'RGB' else picture.convert('RGB')

        # Recognize text on the pixel array (plain strings, paragraph grouping)
        fragments = reader.readtext(np.array(rgb_picture), detail=0, paragraph=True)
        recognized = ' '.join(fragments)

        print(f"✓ EasyOCR extracted {len(recognized)} chars from image")
        return recognized.strip()

    except Exception as ocr_error:
        print(f"✗ Image OCR failed: {ocr_error}")
        return None

def get_ocr_confidence(image_array: np.ndarray) -> list:
    """
    Get detailed OCR results with confidence scores

    Args:
        image_array: Image pixels as a NumPy array, passed to the
            module-level EasyOCR ``reader`` unchanged.

    Returns:
        A list with one dict per detection, holding the keys ``"text"``,
        ``"confidence"`` (rounded to 3 decimals) and ``"bbox"``. An empty
        list is returned when the reader is unavailable or recognition
        raised; in the latter case the error is printed.
    """
    if not reader:
        return []

    try:
        results = reader.readtext(image_array, detail=1)
        return [
            {
                "text": text,
                # Coerce to a built-in float in case the library hands back a
                # NumPy scalar, so the value is a plain Python number.
                "confidence": round(float(conf), 3),
                "bbox": bbox
            }
            for bbox, text, conf in results
        ]
    except Exception as e:
        # Best-effort helper: report the failure instead of hiding it, but do
        # not intercept KeyboardInterrupt/SystemExit as a bare except would.
        print(f"✗ OCR confidence extraction failed: {e}")
        return []