"""
OCR Engine — Multi-engine OCR with docTR primary and Tesseract fallback.
"""

import functools
import logging
from dataclasses import dataclass, field
from typing import List

import numpy as np
from PIL import Image

logger = logging.getLogger(__name__)


@dataclass
class WordResult:
    """A single detected word with position and confidence."""

    text: str
    confidence: float  # 0.0-1.0 as stored by the engine wrappers in this module
    x: int  # left edge of the bounding box, in pixels
    y: int  # top edge of the bounding box, in pixels
    width: int
    height: int
    block_id: int = 0
    line_id: int = 0


@dataclass
class OCRResult:
    """Complete OCR result for an image."""

    words: List[WordResult] = field(default_factory=list)
    raw_text: str = ""  # all word texts joined by single spaces
    average_confidence: float = 0.0  # mean of word confidences, 0.0 if no words
    engine_used: str = "unknown"
    image_width: int = 0
    image_height: int = 0


def run_ocr(image_input, engine: str = "auto") -> OCRResult:
    """Run OCR on an image.

    Args:
        image_input: Path to an image file (``str``) or an image held in a
            ``numpy.ndarray``.
        engine: ``'doctr'``, ``'tesseract'``, or ``'auto'``. In ``'auto'``
            mode docTR is tried first; Tesseract is used when docTR raises
            or returns no words.

    Returns:
        The ``OCRResult`` produced by the engine that handled the image.

    Raises:
        RuntimeError: In ``'auto'`` mode, when the Tesseract fallback fails
            too (the original exception is chained as ``__cause__``).
        ValueError: If ``engine`` is not one of the supported names, or if
            an explicitly selected engine rejects ``image_input``.
    """
    if engine == "auto":
        try:
            result = _run_doctr(image_input)
        except Exception as e:
            # Deliberately broad: a missing docTR install or a model failure
            # must not prevent the Tesseract fallback from running.
            logger.warning("docTR failed (%s), falling back to Tesseract", e)
        else:
            if result.words:
                logger.info(
                    "docTR: %d words, conf=%.2f",
                    len(result.words),
                    result.average_confidence,
                )
                return result
            logger.info("docTR found no words, falling back to Tesseract")

        try:
            return _run_tesseract(image_input)
        except Exception as e:
            raise RuntimeError(f"All OCR engines failed: {e}") from e
    elif engine == "doctr":
        return _run_doctr(image_input)
    elif engine == "tesseract":
        return _run_tesseract(image_input)

    raise ValueError(f"Unknown engine: {engine}")


@functools.lru_cache(maxsize=1)
def _get_doctr_predictor():
    """Build the docTR predictor once and reuse it on later calls.

    Constructing the predictor loads pretrained weights, so rebuilding it
    for every image is needlessly expensive.
    """
    from doctr.models import ocr_predictor

    return ocr_predictor(
        det_arch='db_resnet50',
        reco_arch='crnn_vgg16_bn',
        pretrained=True,
        assume_straight_pages=True,
    )


def _run_doctr(image_input) -> OCRResult:
    """Run OCR using docTR.

    Args:
        image_input: Path to an image file (``str``) or a ``numpy.ndarray``.

    Returns:
        An ``OCRResult`` whose word boxes are scaled from docTR's relative
        coordinates to pixels using the input image size.

    Raises:
        ValueError: If ``image_input`` is neither a ``str`` nor an ndarray.
        ImportError: If docTR is not installed.
    """
    from doctr.io import DocumentFile

    result = OCRResult(engine_used="docTR")

    if isinstance(image_input, str):
        doc = DocumentFile.from_images(image_input)
        # Only the dimensions are needed here; close the file right away.
        with Image.open(image_input) as pil_img:
            result.image_width, result.image_height = pil_img.size
    elif isinstance(image_input, np.ndarray):
        doc = [image_input]
        result.image_height, result.image_width = image_input.shape[:2]
    else:
        raise ValueError("image_input must be a file path or numpy array")

    output = _get_doctr_predictor()(doc)

    img_w = result.image_width
    img_h = result.image_height
    block_id = 0
    all_conf = []
    for page in output.pages:
        for block in page.blocks:
            for line_idx, line in enumerate(block.lines):
                for word in line.words:
                    # Geometry is unpacked as two relative corner points,
                    # which is then scaled to pixel units below.
                    (x_min, y_min), (x_max, y_max) = word.geometry
                    result.words.append(
                        WordResult(
                            text=word.value,
                            confidence=word.confidence,
                            x=int(x_min * img_w),
                            y=int(y_min * img_h),
                            width=int((x_max - x_min) * img_w),
                            height=int((y_max - y_min) * img_h),
                            block_id=block_id,
                            line_id=line_idx,
                        )
                    )
                    all_conf.append(word.confidence)
            # Block ids keep counting across pages; line ids restart per block.
            block_id += 1

    result.raw_text = " ".join(w.text for w in result.words)
    result.average_confidence = sum(all_conf) / len(all_conf) if all_conf else 0.0
    return result


def _run_tesseract(image_input) -> OCRResult:
    """Run OCR using Tesseract (fallback).

    Args:
        image_input: Path to an image file (``str``) or a ``numpy.ndarray``.

    Returns:
        An ``OCRResult`` with one ``WordResult`` per non-blank text entry
        reported by Tesseract; confidences are rescaled from 0-100 to 0-1.

    Raises:
        ValueError: If ``image_input`` is neither a ``str`` nor an ndarray.
        ImportError: If pytesseract is not installed.
    """
    import pytesseract

    result = OCRResult(engine_used="Tesseract")

    if isinstance(image_input, str):
        pil_img = Image.open(image_input)
    elif isinstance(image_input, np.ndarray):
        pil_img = Image.fromarray(image_input)
    else:
        raise ValueError("image_input must be a file path or numpy array")

    # The context manager releases the underlying file handle (if any) even
    # when Tesseract raises.
    with pil_img:
        result.image_width, result.image_height = pil_img.size
        data = pytesseract.image_to_data(
            pil_img,
            output_type=pytesseract.Output.DICT,
            config='--psm 6',
        )

    for i, raw_text in enumerate(data['text']):
        text = raw_text.strip()
        if not text:
            continue

        # Parse via float: depending on the pytesseract/Tesseract versions
        # the confidence may arrive as an int, a float, or a string such as
        # '96.06', which int() would reject. Negative values are clamped to 0.
        conf = max(0.0, float(data['conf'][i])) / 100.0
        result.words.append(
            WordResult(
                text=text,
                confidence=conf,
                x=data['left'][i],
                y=data['top'][i],
                width=data['width'][i],
                height=data['height'][i],
                block_id=data['block_num'][i],
                line_id=data['line_num'][i],
            )
        )

    result.raw_text = " ".join(w.text for w in result.words)
    confs = [w.confidence for w in result.words]
    result.average_confidence = sum(confs) / len(confs) if confs else 0.0

    logger.info(
        "Tesseract: %d words, conf=%.2f",
        len(result.words),
        result.average_confidence,
    )
    return result