"""Stage 1b — image OCR channel. Real OCR via Tesseract (pytesseract) or EasyOCR when installed. When neither is present, a transparent *sidecar fallback* reads `.txt` next to the document (produced by the sample generator) so the offline demo still exercises the OCR channel end-to-end. The fallback is clearly labelled `engine="sidecar-fallback"` and sets `simulated=True` so nothing is silently faked. """ from __future__ import annotations import importlib.util import os import shutil from dataclasses import dataclass from pathlib import Path from .layout import Block, ChannelResult def _has(mod: str) -> bool: return importlib.util.find_spec(mod) is not None def tesseract_available() -> bool: # The tesseract binary is what matters; we drive it via the CLI (the pytesseract # Python binding has a stderr-decode bug on some systems), so pytesseract is optional. return shutil.which("tesseract") is not None def easyocr_available() -> bool: return _has("easyocr") @dataclass class OCRResult(ChannelResult): simulated: bool = False def run_ocr(images: list, source_path: str | Path) -> OCRResult: """OCR a list of PIL page images. Falls back to the sidecar text file.""" if images and tesseract_available(): return _ocr_tesseract(images) if images and easyocr_available(): return _ocr_easyocr(images) return _ocr_sidecar(source_path) def _ocr_tesseract(images: list) -> OCRResult: """Drive the tesseract CLI via stdin (robust + sandbox-safe — no temp files).""" import io import subprocess exe = shutil.which("tesseract") parts, blocks = [], [] for pno, img in enumerate(images): buf = io.BytesIO() img.save(buf, format="PNG") try: out = subprocess.run([exe, "stdin", "stdout"], input=buf.getvalue(), capture_output=True, timeout=120) page_text = out.stdout.decode("utf-8", errors="ignore") except Exception: page_text = "" parts.append(page_text) for w in page_text.split(): blocks.append(Block(text=w, page=pno, source="ocr", confidence=0.8)) text = "\n".join(parts) return OCRResult(text=text, blocks=blocks, pages=len(images), available=bool(text.strip()), engine="tesseract") def _ocr_easyocr(images: list) -> OCRResult: import easyocr import numpy as np reader = easyocr.Reader(["en"], gpu=False, verbose=False) parts, blocks = [], [] for pno, img in enumerate(images): results = reader.readtext(np.array(img)) words = [] for bbox, text, conf in results: words.append(text) xs = [p[0] for p in bbox] ys = [p[1] for p in bbox] blocks.append( Block(text=text, page=pno, bbox=(min(xs), min(ys), max(xs), max(ys)), source="ocr", confidence=float(conf)) ) parts.append(" ".join(words)) text = "\n".join(parts) return OCRResult(text=text, blocks=blocks, pages=len(images), available=bool(text.strip()), engine="easyocr") def _ocr_sidecar(source_path: str | Path) -> OCRResult: """Fallback: read the `.txt` sidecar (the text the page would OCR to). This keeps the demo working with zero OCR deps. It is explicitly labelled as simulated so the dashboard can show 'install Tesseract for real OCR'. """ p = Path(source_path) sidecar = p.with_suffix(".txt") if not sidecar.exists(): # try `.ocr.txt` alt = p.with_name(p.stem + ".ocr.txt") sidecar = alt if alt.exists() else sidecar if sidecar.exists(): text = sidecar.read_text(encoding="utf-8", errors="ignore") blocks = [Block(text=w, page=0, source="ocr", confidence=0.85) for w in text.split()] return OCRResult(text=text, blocks=blocks, pages=1, available=bool(text.strip()), engine="sidecar-fallback", simulated=True) return OCRResult(available=False, engine="unavailable", simulated=True)