Spaces:

xeeshan404
/

agentic-image2word

Sleeping

File size: 4,707 Bytes

5f3b8de

"""
OCR Engine — Multi-engine OCR with docTR primary and Tesseract fallback.
"""

import numpy as np
from PIL import Image
from dataclasses import dataclass, field
from typing import List
import logging

logger = logging.getLogger(__name__)


@dataclass
class WordResult:
    """A single detected word with position and confidence.

    Instances are created by the engine wrappers in this module. The bounding
    box is stored as a top-left corner plus a size, and the meaning of
    ``block_id`` / ``line_id`` depends on the engine that produced the word.
    """
    # Recognised text of the word.
    text: str
    # Recognition confidence. The Tesseract wrapper stores conf / 100 with
    # negative values clamped to 0; the docTR wrapper stores the engine's
    # value unchanged (presumably already in [0, 1] -- confirm against docTR).
    confidence: float
    # Left edge of the bounding box. docTR's relative coordinate is scaled by
    # the image width; Tesseract's ``left`` value is stored as-is.
    x: int
    # Top edge of the bounding box (same conventions as ``x``, using height).
    y: int
    # Horizontal extent of the bounding box.
    width: int
    # Vertical extent of the bounding box.
    height: int
    # Block the word belongs to: a running block counter for docTR,
    # Tesseract's ``block_num`` otherwise.
    block_id: int = 0
    # Line the word belongs to: the line index inside its block for docTR,
    # Tesseract's ``line_num`` otherwise.
    line_id: int = 0


@dataclass
class OCRResult:
    """Complete OCR result for an image.

    Every field has a default, so an empty result can be created first and
    filled in while the engine output is parsed.
    """
    # Detected words, in the order the engine reported them.
    words: List[WordResult] = field(default_factory=list)
    # Text of all words joined with single spaces.
    raw_text: str = ""
    # Mean of the per-word confidences; stays 0.0 when no words were found.
    average_confidence: float = 0.0
    # Name of the engine that produced the result ("docTR" or "Tesseract" for
    # the wrappers in this module).
    engine_used: str = "unknown"
    # Width of the processed image.
    image_width: int = 0
    # Height of the processed image.
    image_height: int = 0


def run_ocr(image_input, engine: str = "auto") -> OCRResult:
    """Run OCR on an image with the requested engine.

    Args:
        image_input: Image to process. It is handed unchanged to the engine
            wrapper(s), which accept a file path string or a numpy array.
        engine: ``'doctr'``, ``'tesseract'``, or ``'auto'``. ``'auto'`` tries
            docTR first and falls back to Tesseract when docTR raises or
            finds no words.

    Returns:
        The OCRResult produced by the engine that was used. In ``'auto'``
        mode, if docTR ran successfully but found no words and Tesseract then
        fails, the (empty) docTR result is returned.

    Raises:
        ValueError: If ``engine`` is not one of the recognised names.
        RuntimeError: In ``'auto'`` mode, when docTR and Tesseract both
            raised. The Tesseract error is chained as the cause.
    """
    if engine == "doctr":
        return _run_doctr(image_input)
    if engine == "tesseract":
        return _run_tesseract(image_input)
    if engine != "auto":
        raise ValueError(f"Unknown engine: {engine}")

    # Auto mode: docTR is the primary engine.
    doctr_result = None
    try:
        doctr_result = _run_doctr(image_input)
    except Exception as e:
        logger.warning("docTR failed (%s), falling back to Tesseract", e)
    else:
        if doctr_result.words:
            logger.info(
                "docTR: %d words, conf=%.2f",
                len(doctr_result.words), doctr_result.average_confidence,
            )
            return doctr_result
        logger.info("docTR found no words, falling back to Tesseract")

    try:
        return _run_tesseract(image_input)
    except Exception as e:
        if doctr_result is not None:
            # docTR itself succeeded (the image simply had no detectable
            # text), so reporting "all engines failed" would be untrue.
            logger.warning("Tesseract failed (%s), returning empty docTR result", e)
            return doctr_result
        raise RuntimeError(f"All OCR engines failed: {e}") from e


def _run_doctr(image_input) -> OCRResult:
    """Run OCR using docTR."""
    from doctr.io import DocumentFile
    from doctr.models import ocr_predictor

    result = OCRResult(engine_used="docTR")

    if isinstance(image_input, str):
        doc = DocumentFile.from_images(image_input)
        pil_img = Image.open(image_input)
        result.image_width, result.image_height = pil_img.size
    elif isinstance(image_input, np.ndarray):
        doc = [image_input]
        result.image_height, result.image_width = image_input.shape[:2]
    else:
        raise ValueError("image_input must be a file path or numpy array")

    predictor = ocr_predictor(
        det_arch='db_resnet50', reco_arch='crnn_vgg16_bn',
        pretrained=True, assume_straight_pages=True
    )
    output = predictor(doc)

    block_id = 0
    all_conf = []
    for page in output.pages:
        for block in page.blocks:
            for line_idx, line in enumerate(block.lines):
                for word in line.words:
                    (x_min, y_min), (x_max, y_max) = word.geometry
                    w = WordResult(
                        text=word.value, confidence=word.confidence,
                        x=int(x_min * result.image_width), y=int(y_min * result.image_height),
                        width=int((x_max - x_min) * result.image_width),
                        height=int((y_max - y_min) * result.image_height),
                        block_id=block_id, line_id=line_idx,
                    )
                    result.words.append(w)
                    all_conf.append(word.confidence)
            block_id += 1

    result.raw_text = " ".join(w.text for w in result.words)
    result.average_confidence = sum(all_conf) / len(all_conf) if all_conf else 0.0
    return result


def _run_tesseract(image_input) -> OCRResult:
    """Run OCR using Tesseract (fallback)."""
    import pytesseract

    result = OCRResult(engine_used="Tesseract")

    if isinstance(image_input, str):
        pil_img = Image.open(image_input)
    elif isinstance(image_input, np.ndarray):
        pil_img = Image.fromarray(image_input)
    else:
        raise ValueError("image_input must be a file path or numpy array")

    result.image_width, result.image_height = pil_img.size
    data = pytesseract.image_to_data(pil_img, output_type=pytesseract.Output.DICT, config='--psm 6')

    for i in range(len(data['text'])):
        text = data['text'][i].strip()
        if not text:
            continue
        conf = max(0, int(data['conf'][i])) / 100.0
        w = WordResult(
            text=text, confidence=conf,
            x=data['left'][i], y=data['top'][i],
            width=data['width'][i], height=data['height'][i],
            block_id=data['block_num'][i], line_id=data['line_num'][i],
        )
        result.words.append(w)

    result.raw_text = " ".join(w.text for w in result.words)
    confs = [w.confidence for w in result.words]
    result.average_confidence = sum(confs) / len(confs) if confs else 0.0
    logger.info(f"Tesseract: {len(result.words)} words, conf={result.average_confidence:.2f}")
    return result