"""Stage 1b — image OCR channel.

Real OCR via the Tesseract CLI (pytesseract optional) or EasyOCR when installed. When neither is
present, a transparent *sidecar fallback* reads `<stem>.txt` next to the document
(produced by the sample generator) so the offline demo still exercises the OCR
channel end-to-end. The fallback is clearly labelled `engine="sidecar-fallback"`
and sets `simulated=True` so nothing is silently faked.
"""
from __future__ import annotations

import importlib.util
import os
import shutil
from dataclasses import dataclass
from pathlib import Path

from .layout import Block, ChannelResult


def _has(mod: str) -> bool:
    """Return True when a module named *mod* can be located.

    The lookup goes through ``importlib.util.find_spec``, so the target
    module itself is not imported (for a dotted name, ``find_spec`` imports
    the parent package in order to search it).

    ``find_spec`` raises instead of returning ``None`` in two situations: a
    dotted name whose parent package is missing (``ModuleNotFoundError``) and
    a module without a usable ``__spec__`` (``ValueError``). For an
    availability probe both simply mean "not usable", so they map to False.
    """
    try:
        return importlib.util.find_spec(mod) is not None
    except (ImportError, ValueError):
        return False


def tesseract_available() -> bool:
    """Report whether a `tesseract` executable can be found on PATH."""
    # Only the binary is probed: OCR is driven through the CLI (the pytesseract
    # binding has a stderr-decode bug on some systems), so the Python package
    # is not required.
    located = shutil.which("tesseract")
    return located is not None


def easyocr_available() -> bool:
    """Report whether the `easyocr` package can be located for import."""
    found = _has("easyocr")
    return found


@dataclass
class OCRResult(ChannelResult):
    """Result of the OCR channel: a `ChannelResult` plus a `simulated` flag."""

    # Set to True by the sidecar fallback and by the "unavailable" result in
    # this module; the tesseract and easyocr paths leave the default False.
    simulated: bool = False


def run_ocr(images: list, source_path: str | Path) -> OCRResult:
    """Run OCR over page images, choosing the best available engine.

    Preference order is tesseract, then easyocr. With no images, or with
    neither engine present, the sidecar text file derived from
    `source_path` is used instead.
    """
    if not images:
        # Nothing to feed a real engine; go straight to the sidecar.
        return _ocr_sidecar(source_path)
    if tesseract_available():
        return _ocr_tesseract(images)
    if easyocr_available():
        return _ocr_easyocr(images)
    return _ocr_sidecar(source_path)


def _ocr_tesseract(images: list) -> OCRResult:
    """OCR each page by piping it to the tesseract CLI (no temp files).

    Every image is PNG-encoded in memory and sent to
    ``tesseract stdin stdout``; the recognised text is read back from the
    process's stdout.

    Args:
        images: page images exposing a PIL-style ``save(fp, format=...)``.

    Returns:
        An ``OCRResult`` with ``engine="tesseract"``. ``text`` is the per-page
        text joined with newlines, ``blocks`` holds one ``Block`` per
        whitespace-separated word (fixed confidence 0.8), ``pages`` is
        ``len(images)`` and ``available`` is True only when some
        non-whitespace text was recognised.

    A page that fails at any step (PNG encoding, launching the process,
    timeout) contributes an empty string rather than aborting the run.
    """
    import io
    import subprocess

    exe = shutil.which("tesseract")
    parts, blocks = [], []
    for pno, img in enumerate(images):
        try:
            # PNG encoding is inside the try on purpose: an image that cannot
            # be written as PNG (e.g. an unsupported mode) must only blank out
            # its own page, like any other per-page failure.
            buf = io.BytesIO()
            img.save(buf, format="PNG")
            out = subprocess.run([exe, "stdin", "stdout"], input=buf.getvalue(),
                                 capture_output=True, timeout=120)
            page_text = out.stdout.decode("utf-8", errors="ignore")
        except Exception:
            # Deliberate best-effort: a failing page yields no text and the
            # remaining pages are still processed.
            page_text = ""
        parts.append(page_text)
        blocks.extend(
            Block(text=w, page=pno, source="ocr", confidence=0.8)
            for w in page_text.split()
        )
    text = "\n".join(parts)
    return OCRResult(text=text, blocks=blocks, pages=len(images),
                     available=bool(text.strip()), engine="tesseract")


def _ocr_easyocr(images: list) -> OCRResult:
    """OCR page images with an English, CPU-only EasyOCR reader.

    Each detection becomes one `Block` whose bbox is the axis-aligned
    rectangle around the detection's corner points and whose confidence is
    the score EasyOCR reported. Page text is the detections joined by spaces;
    pages are joined by newlines.
    """
    import easyocr
    import numpy as np

    reader = easyocr.Reader(["en"], gpu=False, verbose=False)
    page_texts = []
    blocks = []
    for page_index, image in enumerate(images):
        detections = reader.readtext(np.array(image))
        for corners, fragment, score in detections:
            # Collapse the corner points into (left, top, right, bottom).
            left = min(pt[0] for pt in corners)
            top = min(pt[1] for pt in corners)
            right = max(pt[0] for pt in corners)
            bottom = max(pt[1] for pt in corners)
            blocks.append(
                Block(
                    text=fragment,
                    page=page_index,
                    bbox=(left, top, right, bottom),
                    source="ocr",
                    confidence=float(score),
                )
            )
        page_texts.append(" ".join(fragment for _, fragment, _ in detections))
    full_text = "\n".join(page_texts)
    return OCRResult(
        text=full_text,
        blocks=blocks,
        pages=len(images),
        available=bool(full_text.strip()),
        engine="easyocr",
    )


def _ocr_sidecar(source_path: str | Path) -> OCRResult:
    """Fallback: serve text from a sidecar file next to the document.

    Looks for `<stem>.txt` first, then `<stem>.ocr.txt`. This keeps the demo
    usable with zero OCR deps; every result from here is flagged
    `simulated=True` so the dashboard can show 'install Tesseract for real OCR'.
    """
    origin = Path(source_path)
    chosen = origin.with_suffix(".txt")
    if not chosen.exists():
        # Second chance: the `<stem>.ocr.txt` naming.
        fallback = origin.with_name(f"{origin.stem}.ocr.txt")
        if fallback.exists():
            chosen = fallback

    if not chosen.exists():
        return OCRResult(available=False, engine="unavailable", simulated=True)

    content = chosen.read_text(encoding="utf-8", errors="ignore")
    word_blocks = []
    for token in content.split():
        word_blocks.append(
            Block(text=token, page=0, source="ocr", confidence=0.85)
        )
    return OCRResult(
        text=content,
        blocks=word_blocks,
        pages=1,
        available=bool(content.strip()),
        engine="sidecar-fallback",
        simulated=True,
    )