# agentic-image2word / core / ocr_engine.py
# xeeshan404's picture
# Initial deployment: Agentic Image2Word Converter
# 5f3b8de verified
"""
OCR Engine — Multi-engine OCR with docTR primary and Tesseract fallback.
"""
import numpy as np
from PIL import Image
from dataclasses import dataclass, field
from typing import List
import logging
# Module-level logger, named after this module, shared by every function below.
logger = logging.getLogger(__name__)
@dataclass
class WordResult:
    """One recognised word together with its bounding box and confidence.

    The box is stored as a top-left corner (``x``, ``y``) plus an extent
    (``width``, ``height``). ``block_id`` and ``line_id`` record the layout
    grouping reported by the engine that produced the word.
    """

    # Recognised text and the engine's confidence in it.
    text: str
    confidence: float

    # Bounding box: top-left corner followed by its size.
    x: int
    y: int
    width: int
    height: int

    # Layout grouping; both default to 0 when the caller does not supply them.
    block_id: int = 0
    line_id: int = 0
@dataclass
class OCRResult:
    """Aggregated output of a single OCR pass over one image."""

    # Detected words; every instance gets its own fresh list.
    words: List[WordResult] = field(default_factory=list)

    # Text of the whole image as filled in by the engine function.
    raw_text: str = ""

    # Mean per-word confidence as filled in by the engine function.
    average_confidence: float = 0.0

    # Label of the engine that produced this result.
    engine_used: str = "unknown"

    # Size of the source image.
    image_width: int = 0
    image_height: int = 0
def run_ocr(image_input, engine: str = "auto") -> OCRResult:
    """Run OCR on an image with the requested engine.

    Args:
        image_input: Image to process. It is passed unchanged to the engine
            function(s), which accept a file path (``str``) or a numpy array.
        engine: ``'doctr'``, ``'tesseract'``, or ``'auto'``. ``'auto'`` tries
            docTR first and falls back to Tesseract when docTR raises or
            detects no words.

    Returns:
        The ``OCRResult`` produced by the engine that was used.

    Raises:
        RuntimeError: In ``'auto'`` mode, when the Tesseract fallback fails.
            The Tesseract exception is attached as ``__cause__``.
        ValueError: If ``engine`` is not one of the supported names.
    """
    if engine == "doctr":
        return _run_doctr(image_input)
    if engine == "tesseract":
        return _run_tesseract(image_input)
    if engine != "auto":
        raise ValueError(f"Unknown engine: {engine}")

    # 'auto': docTR is the primary engine; any failure is logged, not raised,
    # so that the Tesseract fallback below still gets a chance.
    try:
        result = _run_doctr(image_input)
    except Exception as e:
        logger.warning("docTR failed (%s), falling back to Tesseract", e)
    else:
        if result.words:
            logger.info(
                "docTR: %d words, conf=%.2f",
                len(result.words), result.average_confidence,
            )
            return result
        # An empty result is treated like a failure: let Tesseract try.
        logger.info("docTR detected no words, falling back to Tesseract")

    try:
        return _run_tesseract(image_input)
    except Exception as e:
        # Chain explicitly so the traceback shows the Tesseract error as the
        # direct cause of the aggregate failure.
        raise RuntimeError(f"All OCR engines failed: {e}") from e
def _run_doctr(image_input) -> OCRResult:
    """Run OCR using docTR.

    Args:
        image_input: Path to an image file (``str``) or an image as a numpy
            array whose first two dimensions are (height, width).

    Returns:
        An ``OCRResult`` with ``engine_used="docTR"``, one ``WordResult`` per
        detected word, the words joined by single spaces in ``raw_text``, and
        the mean word confidence (0.0 when no words were found).

    Raises:
        ValueError: If ``image_input`` is neither a ``str`` nor a numpy array.
    """
    # Imported lazily so the module can be loaded without docTR installed.
    from doctr.io import DocumentFile
    from doctr.models import ocr_predictor

    result = OCRResult(engine_used="docTR")
    if isinstance(image_input, str):
        doc = DocumentFile.from_images(image_input)
        # Only the dimensions are needed here; the context manager closes the
        # file handle instead of leaving it open until garbage collection.
        with Image.open(image_input) as pil_img:
            result.image_width, result.image_height = pil_img.size
    elif isinstance(image_input, np.ndarray):
        doc = [image_input]
        result.image_height, result.image_width = image_input.shape[:2]
    else:
        raise ValueError("image_input must be a file path or numpy array")

    predictor = ocr_predictor(
        det_arch='db_resnet50', reco_arch='crnn_vgg16_bn',
        pretrained=True, assume_straight_pages=True
    )
    output = predictor(doc)

    img_w, img_h = result.image_width, result.image_height
    block_id = 0
    for page in output.pages:
        for block in page.blocks:
            for line_idx, line in enumerate(block.lines):
                for word in line.words:
                    # The geometry is scaled by the image size below, i.e. it
                    # is treated as fractions of the image dimensions.
                    (x_min, y_min), (x_max, y_max) = word.geometry
                    result.words.append(
                        WordResult(
                            text=word.value,
                            confidence=word.confidence,
                            x=int(x_min * img_w),
                            y=int(y_min * img_h),
                            width=int((x_max - x_min) * img_w),
                            height=int((y_max - y_min) * img_h),
                            block_id=block_id,
                            line_id=line_idx,
                        )
                    )
            # block_id runs across all pages; line_id restarts in each block.
            block_id += 1

    result.raw_text = " ".join(w.text for w in result.words)
    confidences = [w.confidence for w in result.words]
    result.average_confidence = (
        sum(confidences) / len(confidences) if confidences else 0.0
    )
    return result
def _run_tesseract(image_input) -> OCRResult:
    """Run OCR using Tesseract (fallback).

    Args:
        image_input: Path to an image file (``str``) or an image as a numpy
            array.

    Returns:
        An ``OCRResult`` with ``engine_used="Tesseract"``, one ``WordResult``
        per non-empty text entry, the words joined by single spaces in
        ``raw_text``, and the mean word confidence (0.0 when no words were
        found). Confidences are scaled to the 0.0-1.0 range.

    Raises:
        ValueError: If ``image_input`` is neither a ``str`` nor a numpy array.
    """
    # Imported lazily so the module can be loaded without pytesseract installed.
    import pytesseract

    result = OCRResult(engine_used="Tesseract")
    if isinstance(image_input, str):
        pil_img = Image.open(image_input)
    elif isinstance(image_input, np.ndarray):
        pil_img = Image.fromarray(image_input)
    else:
        raise ValueError("image_input must be a file path or numpy array")

    # The context manager releases the file handle opened for path inputs
    # once Tesseract has consumed the image.
    with pil_img:
        result.image_width, result.image_height = pil_img.size
        data = pytesseract.image_to_data(
            pil_img, output_type=pytesseract.Output.DICT, config='--psm 6'
        )

    rows = zip(
        data['text'], data['conf'],
        data['left'], data['top'], data['width'], data['height'],
        data['block_num'], data['line_num'],
    )
    for raw_text, raw_conf, left, top, width, height, block_num, line_num in rows:
        text = raw_text.strip()
        if not text:
            # Entries without text carry no word; skip them.
            continue
        # The confidence may arrive as an int or as a decimal string such as
        # '95.000000'; going through float() accepts both. Negative values
        # are clamped to 0 before scaling to 0.0-1.0.
        conf = max(0, int(float(raw_conf))) / 100.0
        result.words.append(
            WordResult(
                text=text, confidence=conf,
                x=left, y=top,
                width=width, height=height,
                block_id=block_num, line_id=line_num,
            )
        )

    result.raw_text = " ".join(w.text for w in result.words)
    confs = [w.confidence for w in result.words]
    result.average_confidence = sum(confs) / len(confs) if confs else 0.0
    logger.info(
        "Tesseract: %d words, conf=%.2f",
        len(result.words), result.average_confidence,
    )
    return result