Spaces:

achimrabus
/

polyscriptor-htr-demo

Running

File size: 92,051 Bytes

78431ff

"""
Polyscriptor Web UI — FastAPI Backend

Thin wrapper around existing HTR engine code. Provides REST API + SSE
for browser-based transcription. All heavy lifting done by the same
modules the PyQt6 GUI uses.

Usage:
    source htr_gui/bin/activate
    python -m uvicorn web.polyscriptor_server:app --host 0.0.0.0 --port 8765

Author: Claude Code
Date: 2026-02-26
"""

import asyncio
import hashlib
import importlib
import json
import logging
import os
import sys
import time
import uuid
from dataclasses import dataclass, field
from types import SimpleNamespace
from pathlib import Path
from typing import Any, Dict, List, Optional

import numpy as np
from PIL import Image, ImageOps
from fastapi import Cookie, FastAPI, File, HTTPException, Query, Request, UploadFile
from fastapi.responses import FileResponse, Response, StreamingResponse
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel

# Module-wide logger; every helper in this file logs through it.
log = logging.getLogger("polyscriptor")
# Deployment-mode flag read once at import time, normalised to lower case
# without surrounding whitespace (empty string when the variable is unset).
DEMO_MODE = os.environ.get("POLYSCRIPTOR_DEMO_MODE", "").strip().lower()

# Add project root to path so we can import existing modules
# (the project root is the parent of the directory containing this file).
PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

# Load .env from project root (same as the Qt GUI does via CommercialAPIEngine)
# python-dotenv is optional: a missing package is tolerated, a missing .env file
# is simply skipped.
try:
    from dotenv import load_dotenv
    _env_path = PROJECT_ROOT / ".env"
    if _env_path.exists():
        load_dotenv(_env_path)
        log.info(f"Loaded environment variables from {_env_path}")
except ImportError:
    pass  # python-dotenv not installed — env vars must be set externally

# Must come after the sys.path manipulation above: htr_engine_base is resolved
# through the project root that was just inserted.
from htr_engine_base import get_global_registry, HTREngine, TranscriptionResult

# PDF support via PyMuPDF
# PDF_AVAILABLE records whether the optional dependency could be imported.
try:
    import fitz as _fitz  # PyMuPDF
    PDF_AVAILABLE = True
except ImportError:
    PDF_AVAILABLE = False
    log.warning("PyMuPDF not installed — PDF upload disabled. Install with: pip install pymupdf")

# Segmentation modules are imported on first use rather than at startup.
# This flag records whether that one-time import has already happened.
_segmenters_imported = False


def _import_segmenters():
    """Import the segmentation modules into module globals, once.

    After the first successful call the names ``KrakenLineSegmenter``,
    ``LineSegmenter`` and ``PYLAIA_MODELS`` are available as module globals
    and subsequent calls return immediately. ``PYLAIA_MODELS`` falls back to
    an empty dict when ``inference_pylaia_native`` cannot be imported; a
    failure to import either segmenter module propagates to the caller and
    leaves the flag unset so the import is retried next time.
    """
    global _segmenters_imported, KrakenLineSegmenter, LineSegmenter, PYLAIA_MODELS
    if not _segmenters_imported:
        from kraken_segmenter import KrakenLineSegmenter
        from inference_page import LineSegmenter
        try:
            from inference_pylaia_native import PYLAIA_MODELS
        except ImportError:
            # Optional module: expose an empty model table instead of failing.
            PYLAIA_MODELS = {}
        _segmenters_imported = True


# ---------------------------------------------------------------------------
# App setup
# ---------------------------------------------------------------------------

# The ASGI application object; route handlers, middleware and startup hooks
# in this module are registered on it.
app = FastAPI(title="Polyscriptor HTR", version="0.1.0")

# Serve static frontend files
# (taken from the "static" directory next to this module, exposed under /static).
STATIC_DIR = Path(__file__).parent / "static"
app.mount("/static", StaticFiles(directory=str(STATIC_DIR)), name="static")

# ---------------------------------------------------------------------------
# Engine pool — Phase 2: shared pool of loaded engine instances
# ---------------------------------------------------------------------------

@dataclass
class EngineSlot:
    """One loaded engine instance in the pool."""
    engine: Any  # HTREngine instance (not the registry singleton)
    # Registry/display name of the engine; used for VRAM estimates and logging.
    engine_name: str
    # Configuration dict the engine was loaded with.
    config: dict
    # Key under which this slot is stored in engine_pool (see _make_pool_key).
    pool_key: str
    # Number of sessions currently referencing this slot; slots with a count of
    # zero are candidates for eviction.
    ref_count: int = 0
    # Timestamp (time.time()) used for LRU ordering and idle eviction.
    last_used: float = field(default_factory=time.time)
    # Per-slot lock. NOTE(review): not acquired anywhere in the visible part of
    # the file — presumably serialises inference on this engine; confirm.
    lock: asyncio.Lock = field(default_factory=asyncio.Lock)

# All currently loaded engines, keyed by the pool key from _make_pool_key().
engine_pool: Dict[str, EngineSlot] = {}
# Lock guarding structural changes to engine_pool (_maybe_evict documents that
# it must be called while this lock is held).
pool_lock = asyncio.Lock()

# VRAM budget estimates (GB) for eviction decisions
# Engines missing from this table are assumed to need 4 GB (the default used
# by _maybe_evict).
_ENGINE_VRAM_GB = {
    "CRNN-CTC (PyLaia-inspired)": 2,
    "TrOCR": 3,
    "Qwen3-VL": 18,
    "Churro VLM": 10,
    "Kraken": 2,
    "Party": 4,
    "PaddleOCR": 2,
}
# Engines excluded from VRAM accounting and from GPU-driven eviction.
_NO_GPU_ENGINES = {"Commercial APIs", "OpenWebUI", "LightOnOCR", "DeepSeek-OCR"}
_TOTAL_VRAM_GB = 92  # 2x L40S @ 46GB each


# Factory: engine name -> (module, class) for creating fresh instances
# The module is imported lazily by _create_engine_instance(), so listing an
# engine here does not import it at startup.
_ENGINE_FACTORY = {
    "TrOCR":                       ("engines.trocr_engine",        "TrOCREngine"),
    "CRNN-CTC (PyLaia-inspired)":  ("engines.pylaia_engine",       "PyLaiaEngine"),
    "Qwen3-VL":                    ("engines.qwen3_engine",        "Qwen3Engine"),
    "Churro VLM":                   ("engines.churro_engine",       "ChurroEngine"),
    "Kraken":                       ("engines.kraken_engine",       "KrakenEngine"),
    "Commercial APIs":              ("engines.commercial_api_engine", "CommercialAPIEngine"),
    "Party":                        ("engines.party_engine",        "PartyEngine"),
    "OpenWebUI":                    ("engines.openwebui_engine",    "OpenWebUIEngine"),
    "DeepSeek-OCR":                 ("engines.deepseek_ocr_engine", "DeepSeekOCREngine"),
    "LightOnOCR":                   ("engines.lighton_ocr_engine",  "LightOnOCREngine"),
    "PaddleOCR":                    ("engines.paddle_engine",       "PaddleOCREngine"),
}


def _create_engine_instance(engine_name: str):
    """Instantiate a brand-new engine object for *engine_name*.

    The engine class is looked up in ``_ENGINE_FACTORY`` and its module is
    imported on demand. A new object is built on every call (the registry
    singleton is never returned), which is what lets several pool slots hold
    different models of the same engine type at once.

    Returns:
        The new engine instance, or ``None`` when the name has no factory entry.
    """
    spec = _ENGINE_FACTORY.get(engine_name)
    if spec:
        module_path, class_name = spec
        engine_cls = getattr(importlib.import_module(module_path), class_name)
        return engine_cls()
    return None


def _make_pool_key(engine_name: str, config: dict) -> str:
    """Derive the engine-pool key for an engine/configuration pair.

    The key is ``engine_name`` followed by ``::``-separated components that
    identify the loaded model:

    * Commercial APIs: provider, model and a short hash of the API key.
    * OpenWebUI: base URL, model and a short hash of the API key.
    * TrOCR, CRNN-CTC, Kraken, LightOnOCR: the ``model_path``.
    * Qwen3-VL: base model plus adapter (``none`` when no adapter is set).
    * Churro VLM: the ``model_name``.
    * Anything else: a 12-character hash of the sorted config items.

    API keys never appear in the key itself; only the first 8 hex digits of
    their SHA-256 digest do (``nokey`` when no key was supplied).
    """
    def _short_hash(text: str, length: int) -> str:
        return hashlib.sha256(text.encode()).hexdigest()[:length]

    def _api_key_tag() -> str:
        secret = config.get("api_key", "")
        return _short_hash(secret, 8) if secret else "nokey"

    if engine_name == "Commercial APIs":
        parts = (
            config.get("provider", "unknown"),
            config.get("model", "unknown"),
            _api_key_tag(),
        )
    elif engine_name == "OpenWebUI":
        parts = (
            config.get("base_url", "unknown"),
            config.get("model", "unknown"),
            _api_key_tag(),
        )
    elif engine_name in ("TrOCR", "CRNN-CTC (PyLaia-inspired)", "Kraken", "LightOnOCR"):
        parts = (config.get("model_path", "default"),)
    elif engine_name == "Qwen3-VL":
        parts = (
            config.get("base_model", "default"),
            config.get("adapter", "") or "none",
        )
    elif engine_name == "Churro VLM":
        parts = (config.get("model_name", "default"),)
    else:
        # Fallback: hash the config
        parts = (_short_hash(str(sorted(config.items())), 12),)

    return "::".join(f"{piece}" for piece in (engine_name, *parts))


async def _maybe_evict(new_engine_name: str):
    """Free estimated VRAM for *new_engine_name* by unloading idle slots.

    Must be called while ``pool_lock`` is held. Engines listed in
    ``_NO_GPU_ENGINES`` neither need nor count towards the budget. For the
    rest, the per-engine estimates in ``_ENGINE_VRAM_GB`` (4 GB when unknown)
    are summed; if adding the new engine would exceed ``_TOTAL_VRAM_GB``,
    unreferenced GPU slots are unloaded least-recently-used first until the
    estimate fits. A warning is logged when it still does not fit afterwards.
    """
    if new_engine_name in _NO_GPU_ENGINES:
        return

    def _estimate_gb(name):
        return _ENGINE_VRAM_GB.get(name, 4)

    needed = _estimate_gb(new_engine_name)
    used = sum(
        _estimate_gb(slot.engine_name)
        for slot in engine_pool.values()
        if slot.engine_name not in _NO_GPU_ENGINES
    )
    if used + needed <= _TOTAL_VRAM_GB:
        return

    # Only slots nobody references may go; the oldest ones go first.
    idle_keys = [
        key for key, slot in engine_pool.items()
        if slot.ref_count == 0 and slot.engine_name not in _NO_GPU_ENGINES
    ]
    idle_keys.sort(key=lambda key: engine_pool[key].last_used)

    for key in idle_keys:
        if used + needed <= _TOTAL_VRAM_GB:
            break
        victim = engine_pool[key]
        log.info(f"Evicting engine slot '{key}' (last used {time.time() - victim.last_used:.0f}s ago)")
        try:
            victim.engine.unload_model()
        except Exception as e:
            # A failing unload must not keep the slot in the pool.
            log.warning(f"Error unloading evicted engine: {e}")
        del engine_pool[key]
        used -= _estimate_gb(victim.engine_name)

    if used + needed > _TOTAL_VRAM_GB:
        log.warning(f"VRAM tight: ~{used}GB used + ~{needed}GB needed > {_TOTAL_VRAM_GB}GB total")


# Compatibility shims — will be removed after full migration
# They mirror the most recently auto-loaded engine (set by startup_event).
loaded_engine: Optional[HTREngine] = None
loaded_engine_name: str = ""
loaded_config: dict = {}

# Persistent upload storage (survives server restarts)
# The directory lives next to this module and is created at import time.
UPLOAD_DIR = Path(__file__).parent / "uploads"
UPLOAD_DIR.mkdir(exist_ok=True)

# Upload TTL: 24 hours
# Files in UPLOAD_DIR whose mtime is older than this are deleted by
# _cleanup_old_uploads().
_UPLOAD_TTL_SECONDS = 86400

# Session TTL: 2 hours of inactivity
# Also used as the max_age of the session cookie.
_SESSION_TTL_SECONDS = 7200

# Cookie name for session tracking
_SESSION_COOKIE = "polyscriptor_session"


# ---------------------------------------------------------------------------
# Per-user sessions — Phase 1 of multi-user refactoring
# ---------------------------------------------------------------------------

@dataclass
class UserSession:
    # Per-browser state, looked up through the session cookie.
    # Random UUID string; doubles as the cookie value.
    session_id: str
    # Uploaded images of this session, keyed by image id. The visible cleanup
    # code reads the optional "path" and "xml_path" entries of each value.
    image_cache: Dict[str, dict] = field(default_factory=dict)
    # Cancellation events keyed by string.
    # NOTE(review): presumably one per running transcription job — confirm.
    cancel_events: Dict[str, asyncio.Event] = field(default_factory=dict)
    pool_key: Optional[str] = None  # Reference into engine_pool
    # Creation time (time.time()).
    created_at: float = field(default_factory=time.time)
    # Time of the last activity; compared against _SESSION_TTL_SECONDS when
    # expiring sessions.
    last_active: float = field(default_factory=time.time)


# All live sessions, keyed by session id (the value of the session cookie).
sessions: Dict[str, UserSession] = {}
# Image cache that is not tied to a single session.
# NOTE(review): not referenced in the visible part of the file — confirm usage.
global_image_cache: Dict[str, dict] = {}


def _get_or_create_session(session_id: Optional[str]) -> tuple[UserSession, bool]:
    """Look up the session for *session_id*, creating a fresh one if needed.

    A known id returns the stored session and refreshes its ``last_active``
    timestamp. A missing, empty or unknown id yields a new session with a
    random UUID, which is registered in ``sessions`` before being returned.

    Returns:
        ``(session, created)`` where ``created`` is True only for a new session.
    """
    known = sessions.get(session_id) if session_id else None
    if known is not None:
        known.last_active = time.time()
        return known, False

    fresh = UserSession(session_id=str(uuid.uuid4()))
    sessions[fresh.session_id] = fresh
    return fresh, True


def _cleanup_expired_sessions() -> int:
    """Remove sessions inactive for more than _SESSION_TTL_SECONDS.

    For every expired session this:

    1. drops it from ``sessions``;
    2. releases its engine-pool reference and, when that was the last
       reference, unloads the engine and removes the slot right away;
    3. deletes the image and XML files recorded in its ``image_cache``.

    File deletion is best-effort: an ``OSError`` on one file is logged and
    skipped so that it cannot abort the cleanup of the remaining files and
    sessions (the same policy ``_cleanup_old_uploads`` applies).

    Returns:
        The number of sessions removed.
    """
    cutoff = time.time() - _SESSION_TTL_SECONDS
    expired = [sid for sid, s in sessions.items() if s.last_active < cutoff]
    for sid in expired:
        session = sessions.pop(sid)

        # Release pool reference
        slot = engine_pool.get(session.pool_key) if session.pool_key else None
        if slot is not None:
            slot.ref_count = max(0, slot.ref_count - 1)
            if slot.ref_count == 0:
                log.info(f"Immediate eviction (session expiry): '{slot.engine_name}'")
                try:
                    slot.engine.unload_model()
                except Exception as e:
                    log.warning(f"unload_model() failed for '{slot.engine_name}': {e}")
                # pop() rather than del: tolerate the slot having vanished
                # while the engine was being unloaded.
                engine_pool.pop(session.pool_key, None)

        # Clean up upload files belonging to this session
        for img_data in session.image_cache.values():
            for path_key in ("path", "xml_path"):
                file_path = img_data.get(path_key)
                if not file_path:
                    continue
                try:
                    Path(file_path).unlink(missing_ok=True)
                except OSError as e:
                    log.warning(
                        f"Could not delete '{file_path}' of expired session {sid[:8]}...: {e}"
                    )
        log.info(f"Expired session {sid[:8]}... ({len(session.image_cache)} images)")
    return len(expired)


# Status/discovery routes that the browser polls in the background. Requests
# to these paths must not count as user activity.
_SESSION_PASSTHROUGH_PATHS = {"/api/gpu", "/api/engines", "/api/kraken/presets"}


@app.middleware("http")
async def session_middleware(request: Request, call_next):
    """Inject session into request.state; set session cookie on new sessions.

    Pure status/discovery routes (GPU poll, engine list) are excluded from
    last_active updates so that background browser polling cannot keep a session
    alive indefinitely and prevent engine-slot eviction.

    ``_get_or_create_session`` refreshes ``last_active`` on every lookup of a
    known session, so for polling routes the previous timestamp is captured
    beforehand and restored afterwards. A session created by a polling request
    keeps its creation time as ``last_active``.
    """
    session_id = request.cookies.get(_SESSION_COOKIE)

    # Remember the activity timestamp before the lookup touches it.
    known = sessions.get(session_id) if session_id else None
    previous_active = known.last_active if known is not None else None

    session, created = _get_or_create_session(session_id)
    request.state.session = session

    if request.url.path in _SESSION_PASSTHROUGH_PATHS:
        # Polling only: undo the refresh done by the lookup.
        if previous_active is not None:
            session.last_active = previous_active
    else:
        session.last_active = time.time()

    response = await call_next(request)

    if created or session_id != session.session_id:
        cookie_kwargs = {
            "key": _SESSION_COOKIE,
            "value": session.session_id,
            "httponly": True,
            "max_age": _SESSION_TTL_SECONDS,
        }
        if DEMO_MODE == "hf_space":
            # Different cookie policy for the "hf_space" deployment mode:
            # SameSite=None, which browsers only accept together with Secure.
            cookie_kwargs.update({"samesite": "none", "secure": True})
        else:
            cookie_kwargs.update({"samesite": "lax"})
        response.set_cookie(**cookie_kwargs)
    return response


def _get_session(request: Request) -> UserSession:
    """FastAPI dependency returning the session attached to *request*.

    The session is placed on ``request.state`` by ``session_middleware``.
    """
    state = request.state
    return state.session


def _cleanup_old_uploads() -> int:
    """Purge expired upload files and the cache entries that pointed at them.

    Regular files directly inside ``UPLOAD_DIR`` whose modification time is
    older than ``_UPLOAD_TTL_SECONDS`` are deleted; files that cannot be
    inspected or removed are skipped silently. Afterwards every session's
    ``image_cache`` is pruned of entries whose recorded ``path`` no longer
    exists on disk (entries without a path are kept).

    Returns:
        The number of files deleted.
    """
    threshold = time.time() - _UPLOAD_TTL_SECONDS
    removed = 0
    for entry in tuple(UPLOAD_DIR.iterdir()):
        if not entry.is_file():
            continue
        try:
            if entry.stat().st_mtime < threshold:
                entry.unlink(missing_ok=True)
                removed += 1
        except OSError:
            continue

    # Drop cache entries whose backing file is gone (all sessions).
    for user_session in sessions.values():
        cache = user_session.image_cache
        for image_id, meta in list(cache.items()):
            recorded = meta.get("path")
            if recorded and not Path(recorded).exists():
                del cache[image_id]
    return removed


_SLOT_IDLE_TTL_SECONDS = 6 * 3600  # evict loaded engines idle for 6h, regardless of ref_count


def _evict_idle_slots() -> int:
    """Unload GPU engine slots unused for longer than _SLOT_IDLE_TTL_SECONDS.

    Eviction looks only at ``last_used``; the slot's ``ref_count`` is ignored,
    so engine residency is capped independently of session expiry. Engines in
    ``_NO_GPU_ENGINES`` are never evicted here. Every session that still
    pointed at an evicted slot has its ``pool_key`` reset to ``None``.

    No lock is taken — this must only be called from ``_periodic_cleanup``
    (single-threaded).

    Returns:
        The number of slots evicted.
    """
    threshold = time.time() - _SLOT_IDLE_TTL_SECONDS
    stale_keys = [
        key for key, slot in engine_pool.items()
        if slot.last_used < threshold and slot.engine_name not in _NO_GPU_ENGINES
    ]

    evicted = 0
    for key in stale_keys:
        idle_slot = engine_pool.pop(key)
        log.info(f"Idle eviction: '{idle_slot.engine_name}' (idle {(time.time() - idle_slot.last_used)/3600:.1f}h)")
        try:
            idle_slot.engine.unload_model()
        except Exception as e:
            log.warning(f"unload_model() failed for '{idle_slot.engine_name}': {e}")
        # Detach every session that was still pointing at this slot.
        for user_session in sessions.values():
            if user_session.pool_key == key:
                user_session.pool_key = None
        evicted += 1
    return evicted


async def _periodic_cleanup():
    """Background task: run all housekeeping passes once per hour, forever.

    Each round sleeps first, then purges old uploads, expires inactive
    sessions and evicts idle engine slots (in that order). A summary line is
    logged only when at least one pass removed something.
    """
    interval_seconds = 3600
    while True:
        await asyncio.sleep(interval_seconds)
        uploads_removed = _cleanup_old_uploads()
        sessions_removed = _cleanup_expired_sessions()
        slots_removed = _evict_idle_slots()
        if any((uploads_removed, sessions_removed, slots_removed)):
            log.info(f"Periodic cleanup: {uploads_removed} upload(s), {sessions_removed} session(s), {slots_removed} idle engine slot(s).")


# ---------------------------------------------------------------------------
# API key resolution — keys never stored or shared server-side (Phase 3)
# Web UI users MUST provide their own keys via browser localStorage.
# Server env vars (.env) are NOT used by the web UI — they exist only for
# the PyQt GUI and CLI tools which run locally on the admin's machine.
# ---------------------------------------------------------------------------

# Names of the recognised key slots. Kept for validation purposes only;
# environment variables are never looked up for them.
_KEY_SLOTS = {"openai", "gemini", "claude", "openwebui"}


def _resolve_api_key(slot: str, request_value: str) -> str:
    """Return the API key supplied with the browser request, trimmed.

    There is deliberately no fallback to server-side environment variables:
    every web user has to provide their own key (kept in browser
    localStorage). ``slot`` is accepted for the caller's convenience but does
    not influence the result. A missing, empty or whitespace-only value
    yields the empty string.
    """
    return request_value.strip() if request_value else ""


# ---------------------------------------------------------------------------
# Startup config (web/server_config.yaml) — optional, auto-load an engine
# ---------------------------------------------------------------------------

def _load_startup_config() -> dict:
    """Read the optional ``server_config.yaml`` next to this module.

    The file is decoded as UTF-8 regardless of the platform locale. Loading is
    best-effort: a missing file, a missing PyYAML installation, a read error
    or a parse error all result in an empty dict (the latter three with a
    warning). A document whose top level is not a mapping is rejected the same
    way, so callers can always use ``.get()`` on the result.

    Returns:
        The parsed configuration mapping, or ``{}`` when nothing usable exists.
    """
    cfg_path = Path(__file__).parent / "server_config.yaml"
    if not cfg_path.exists():
        return {}
    try:
        import yaml
        with open(cfg_path, encoding="utf-8") as f:
            loaded = yaml.safe_load(f)
    except Exception as e:
        log.warning(f"Could not read server_config.yaml: {e}")
        return {}
    if loaded is None:
        # Empty file or a document containing only comments.
        return {}
    if not isinstance(loaded, dict):
        log.warning(
            "Ignoring server_config.yaml: expected a mapping at the top level, "
            f"got {type(loaded).__name__}."
        )
        return {}
    return loaded


# Strong references to fire-and-forget background tasks. The event loop only
# keeps weak references to tasks, so a task nobody else references may be
# garbage-collected before it finishes.
_background_tasks: set = set()


@app.on_event("startup")
async def startup_event():
    """Clean old uploads, start periodic cleanup, auto-load engine.

    Steps, in order:

    1. Delete uploads left over from previous server runs.
    2. Start the hourly cleanup task, keeping a reference to it.
    3. If ``server_config.yaml`` names a ``default_engine``, load it with
       ``default_config`` and register it in the engine pool with a reference
       count of zero. Any failure here is logged and never prevents the
       server from starting.
    """
    global loaded_engine, loaded_engine_name, loaded_config

    # Clean up uploads left over from previous server runs
    n = _cleanup_old_uploads()
    if n:
        log.info(f"Startup cleanup: removed {n} old upload file(s).")

    # Schedule periodic cleanup (every hour)
    cleanup_task = asyncio.create_task(_periodic_cleanup())
    _background_tasks.add(cleanup_task)
    cleanup_task.add_done_callback(_background_tasks.discard)

    # Auto-load default engine from server_config.yaml if present
    cfg = _load_startup_config()
    if not isinstance(cfg, dict) or not cfg.get("default_engine"):
        return
    engine_name = cfg["default_engine"]
    # "default_config:" with no value parses as None; treat it as empty.
    engine_config = cfg.get("default_config") or {}
    log.info(f"Auto-loading engine '{engine_name}' from server_config.yaml ...")
    try:
        registry = get_global_registry()
        reg_engine = registry.get_engine_by_name(engine_name)
        if not (reg_engine and reg_engine.is_available()):
            log.warning(f"Auto-load: engine '{engine_name}' not found or not available.")
            return

        # The registry is consulted for availability only; the pool gets its
        # own instance.
        engine = _create_engine_instance(engine_name)
        if not engine:
            log.warning(f"Auto-load: cannot create instance for '{engine_name}'.")
            return

        # load_model runs in a worker thread so the event loop stays free.
        ok = await asyncio.to_thread(engine.load_model, engine_config)
        if not ok:
            log.warning(f"Auto-load of '{engine_name}' failed (load_model returned False).")
            return

        pool_key = _make_pool_key(engine_name, engine_config)
        engine_pool[pool_key] = EngineSlot(
            engine=engine, engine_name=engine_name,
            config=engine_config, pool_key=pool_key,
            ref_count=0,  # No session owns it yet
        )
        # Update compat shims
        loaded_engine = engine
        loaded_engine_name = engine_name
        loaded_config = engine_config
        log.info(f"Auto-loaded '{engine_name}' into pool as '{pool_key}'.")
    except Exception as e:
        log.warning(f"Auto-load error: {e}")


# ---------------------------------------------------------------------------
# Config schemas — replaces Qt config widgets for the web UI
# ---------------------------------------------------------------------------

def _get_pylaia_model_options() -> list:
    """Build the select options for the CRNN-CTC (PyLaia-inspired) model field.

    Triggers the lazy segmenter import, asks ``inference_pylaia_native`` to
    scan the project's ``models`` directory, and then lists every key of
    ``PYLAIA_MODELS`` (label and value are both the key). The last option is
    always the ``__custom__`` sentinel for entering a path manually.
    """
    _import_segmenters()
    from inference_pylaia_native import _scan_pylaia_models

    models_dir = Path(__file__).resolve().parent.parent / "models"
    _scan_pylaia_models(str(models_dir))

    choices = [{"label": name, "value": name} for name in PYLAIA_MODELS]
    choices.append({"label": "Custom / local path…", "value": "__custom__"})
    return choices


def _scan_kraken_models() -> list:
    """Build the select options for the Kraken model field.

    Local options come from every ``*.mlmodel`` file found recursively below
    the project's ``models`` directory; their value is the path relative to
    the project root (e.g. ``models/kraken_cs/best.mlmodel``) and their label
    is ``<parent dir>/<file name>``. They are followed by the Zenodo presets
    declared in ``engines.kraken_engine.KRAKEN_MODELS``, whose value carries a
    ``__zenodo__`` prefix. If the presets cannot be obtained for any reason
    they are silently left out.
    """
    models_root = Path(__file__).resolve().parent.parent / "models"
    options = []

    if models_root.exists():
        project_dir = models_root.parent
        options.extend(
            {
                "label": f"{model_file.parent.name}/{model_file.name}",
                "value": str(model_file.relative_to(project_dir)),
                "source": "local",
            }
            for model_file in sorted(models_root.rglob("*.mlmodel"))
        )

    # Zenodo presets from kraken_engine (auto-download on load)
    try:
        from engines.kraken_engine import KRAKEN_MODELS
        for preset_id, info in KRAKEN_MODELS.items():
            if info.get("source") != "zenodo":
                continue
            options.append({
                "label": f"{info.get('label', preset_id)} [Zenodo, auto-download]",
                "value": f"__zenodo__{preset_id}",
                "source": "zenodo",
            })
    except Exception:
        pass
    return options


def _scan_trocr_models() -> list:
    """Scan models/ directory for TrOCR checkpoints.

    The result starts with the ``__custom__`` sentinel and a fixed list of
    HuggingFace presets, followed by one ``"source": "local"`` option per
    matching sub-directory of ``PROJECT_ROOT / "models"`` (sorted by name).

    A directory is considered a TrOCR model if it contains
    preprocessor_config.json (TrOCR/ViT-specific) AND config.json
    with model_type == 'vision-encoder-decoder'.
    This avoids picking up PyLaia/CRNN-CTC directories that also
    contain a config.json with training parameters.

    Directories whose config.json cannot be read, is not valid JSON or is not
    a JSON object are skipped.
    """
    models_dir = PROJECT_ROOT / "models"
    options = [
        {"label": "Custom HuggingFace ID or local path…", "value": "__custom__"},
        {"label": "kazars24/trocr-base-handwritten-ru (HuggingFace)",
         "value": "kazars24/trocr-base-handwritten-ru",
         "source": "huggingface"},
        {"label": "microsoft/trocr-base-printed — printed text, base",
         "value": "microsoft/trocr-base-printed",
         "source": "huggingface"},
        {"label": "microsoft/trocr-large-printed — printed text, large",
         "value": "microsoft/trocr-large-printed",
         "source": "huggingface"},
        {"label": "dh-unibe/trocr-kurrent — German Kurrent 19th c. (CER 2.66%)",
         "value": "dh-unibe/trocr-kurrent",
         "source": "huggingface"},
        {"label": "dh-unibe/trocr-kurrent-XVI-XVII — German Kurrent 16th–18th c. (CER 5.42%)",
         "value": "dh-unibe/trocr-kurrent-XVI-XVII",
         "source": "huggingface"},
    ]
    if not models_dir.exists():
        return options

    for d in sorted(models_dir.iterdir()):
        if not d.is_dir():
            continue
        # Require BOTH preprocessor_config.json AND config.json with
        # model_type == 'vision-encoder-decoder'.
        # preprocessor_config.json is ViT/TrOCR-specific (not in PyLaia).
        # config.json model_type disambiguates from Qwen3 adapters that
        # also ship a preprocessor_config but have no config.json.
        if not (d / "preprocessor_config.json").exists():
            continue
        cfg_path = d / "config.json"
        if not cfg_path.exists():
            continue
        try:
            # The context manager closes the handle even when parsing fails.
            with open(cfg_path, encoding="utf-8") as fh:
                cfg = json.load(fh)
        except (OSError, ValueError):
            # Unreadable file, bad encoding or malformed JSON.
            continue
        if not isinstance(cfg, dict):
            continue
        if cfg.get("model_type") != "vision-encoder-decoder":
            continue
        options.append({
            "label": d.name,
            "value": str(d),
            "source": "local",
        })
    return options


def _scan_vlm_models(engine_type: str = "qwen3") -> list:
    """List local LoRA adapters usable by the given VLM engine.

    Every sub-directory of ``PROJECT_ROOT / "models"`` is inspected for an
    ``adapter_config.json``, first at its top level and, only if none is
    there, inside a ``final_model`` sub-directory. The adapter's
    ``base_model_name_or_path`` and the top-level directory name decide which
    engine it belongs to:

    * ``engine_type == "qwen3"``: "qwen" appears in either name and "churro"
      in neither.
    * ``engine_type == "churro"``: "churro" appears in either name.

    Adapters whose config cannot be read or parsed are skipped silently.

    Returns options list ending with a __custom__ sentinel for manual entry.
    """
    import json as _json

    def _adapter_option(adapter_dir, owner_name):
        """Return the option dict for one adapter directory, or None."""
        try:
            with open(adapter_dir / "adapter_config.json") as fh:
                adapter_cfg = _json.load(fh)
            base = adapter_cfg.get("base_model_name_or_path", "")
            base_lc = base.lower()
            owner_lc = owner_name.lower()
            is_qwen = "qwen" in base_lc or "qwen" in owner_lc
            is_churro = "churro" in base_lc or "churro" in owner_lc
            wanted = (
                (engine_type == "qwen3" and is_qwen and not is_churro)
                or (engine_type == "churro" and is_churro)
            )
            if not wanted:
                return None
            return {
                "label": f"{owner_name} (LoRA → {base})",
                "value": str(adapter_dir),
                "base_model": base,
                "adapter": str(adapter_dir),
            }
        except Exception:
            return None

    found = []
    models_dir = PROJECT_ROOT / "models"
    if models_dir.exists():
        for d in sorted(models_dir.iterdir()):
            if not d.is_dir():
                continue
            if (d / "adapter_config.json").exists():
                # A top-level adapter wins; final_model is not consulted then.
                adapter_dir = d
            else:
                nested = d / "final_model"
                if not (nested.is_dir() and (nested / "adapter_config.json").exists()):
                    continue
                adapter_dir = nested
            option = _adapter_option(adapter_dir, d.name)
            if option is not None:
                found.append(option)

    # Always append a "Custom / HuggingFace" sentinel as the last option
    found.append({
        "label": "Custom / HuggingFace model ID...",
        "value": "__custom__",
    })
    return found


ENGINE_SCHEMAS = {
    "CRNN-CTC (PyLaia-inspired)": lambda: {
        "fields": [
            {"key": "model_path", "type": "select", "label": "Model",
             "options": _get_pylaia_model_options(),
             "custom_key": "custom_model_path",
             "custom_placeholder": "Absolute path to best_model.pt (e.g. /home/…/models/pylaia_yiddish_20260326/best_model.pt)"},
            {"key": "enable_spaces", "type": "checkbox",
             "label": "Convert <space> tokens", "default": True},
            {"key": "flip_rtl", "type": "checkbox",
             "label": "RTL manuscript (flip line images)", "default": False,
             "hint": "Flip line images horizontally for RTL scripts (Ottoman, Arabic, Hebrew)"},
        ]
    },
    "TrOCR": lambda: {
        "fields": [
            {"key": "model_path", "type": "select", "label": "Model",
             "options": _scan_trocr_models(),
             "custom_key": "custom_model_path",
             "custom_placeholder": "HuggingFace model ID (e.g. microsoft/trocr-base-handwritten) or absolute local path"},
            {"key": "num_beams", "type": "number", "label": "Beam Search",
             "min": 1, "max": 10, "default": 4},
            {"key": "normalize_background", "type": "checkbox",
             "label": "Normalize Background", "default": False},
            {"key": "flip_rtl", "type": "checkbox",
             "label": "RTL manuscript (flip line images)", "default": False,
             "hint": "Flip line images horizontally for RTL scripts (Ottoman, Arabic, Hebrew)"},
        ]
    },
    "Qwen3-VL": lambda: {
        "fields": [
            {"key": "model_preset", "type": "select", "label": "Model",
             "options": _scan_vlm_models("qwen3"),
             "custom_key": "base_model",
             "custom_placeholder": "HuggingFace model ID, e.g. Qwen/Qwen3-VL-8B-Instruct"},
            {"key": "max_image_size", "type": "number", "label": "Max Image Size (px)",
             "min": 512, "max": 4096, "default": 1536},
        ]
    },
    "Churro VLM": lambda: {
        "fields": [
            {"key": "model_preset", "type": "select", "label": "Model",
             "options": _scan_vlm_models("churro"),
             "custom_key": "model_name",
             "custom_placeholder": "HuggingFace model ID, e.g. stanford-oval/churro-3B"},
            {"key": "device", "type": "select", "label": "Device",
             "options": [{"label": "Auto", "value": "auto"},
                         {"label": "GPU 0", "value": "cuda:0"},
                         {"label": "GPU 1", "value": "cuda:1"},
                         {"label": "CPU", "value": "cpu"}]},
            {"key": "max_image_size", "type": "number", "label": "Max Image Size (px)",
             "min": 512, "max": 4096, "default": 2048},
        ]
    },
    "Kraken": lambda: {
        "fields": [
            {"key": "model_path", "type": "select", "label": "Model",
             "options": _scan_kraken_models(),
             "custom_key": "custom_model_path",
             "custom_placeholder": "Absolute path on server, e.g. /home/user/models/my.mlmodel",
             "upload": True},
        ]
    },
    "Commercial APIs": lambda: {
        "fields": [
            {"key": "provider", "type": "select", "label": "Provider",
             "options": [
                 {"label": "OpenAI (GPT-4o, o1, …)", "value": "OpenAI"},
                 {"label": "Google Gemini", "value": "Gemini"},
                 {"label": "Anthropic Claude", "value": "Claude"},
             ]},
            {"key": "model", "type": "select", "label": "Model",
             "dynamic": True,
             "dynamic_hint": "Enter API key, then ↻ to load available models",
             # No static lists — always fetch live from the provider API
             "per_provider_options": {},
             "options": [],
             "custom_key": "custom_model_id",
             "custom_placeholder": "e.g. gpt-4.5, gemini-exp-1206, claude-opus-4"},
            {"key": "api_key", "type": "password", "label": "API Key",
             "default": "", "placeholder": "Paste your API key here"},
            {"key": "temperature", "type": "number", "label": "Temperature",
             "min": 0.0, "max": 2.0, "default": 0.0,
             "placeholder": "0.0 = deterministic (recommended for transcription)"},
            {"key": "max_output_tokens", "type": "number", "label": "Max output tokens (optional)",
             "min": 512, "max": 65536, "default": None,
             "placeholder": "Leave blank = model maximum"},
            {"key": "custom_prompt", "type": "textarea", "label": "Custom Prompt (optional)",
             "default": "",
             "rows": 4,
             "placeholder": "Transcribe all handwritten text in this manuscript image. Preserve the original language (Cyrillic, Latin, etc.) and layout. Output only the transcribed text without any additional commentary.",
             "hint": "Leave blank to use the default prompt shown above"},
            {"key": "thinking_mode", "type": "select", "label": "Thinking Mode (Gemini only)",
             "options": [
                 {"label": "Auto (model decides, no cap)", "value": ""},
                 {"label": "Low (budget: 8k tokens)", "value": "low"},
                 {"label": "High (no cap, max reasoning)", "value": "high"},
             ], "default": ""},
        ]
    },
    "OpenWebUI": lambda: {
        "fields": [
            {"key": "base_url", "type": "text", "label": "Base URL",
             "default": "",
             "placeholder": "https://your-openwebui-instance/api or .../api/v1"},
            {"key": "api_key", "type": "password", "label": "API Key",
             "default": "", "placeholder": "Your OpenWebUI API key"},
            {"key": "model", "type": "select", "label": "Model",
             "dynamic": True,
             "dynamic_hint": "Enter API key & base URL, then ↻ to load available models",
             "options": [{"label": "Custom model ID…", "value": "__custom__"}],
             "default": "__custom__",
             "custom_key": "model_custom",
             "custom_placeholder": "e.g. llama3.1, qwen2.5vl, gemma3, ..."},
            {"key": "temperature", "type": "number", "label": "Temperature",
             "min": 0.0, "max": 2.0, "default": 0.1},
            {"key": "max_tokens", "type": "number", "label": "Max output tokens (optional)",
             "min": 512, "max": 65536, "default": None,
             "placeholder": "Leave blank = model maximum"},
            {"key": "custom_prompt", "type": "textarea", "label": "Custom Prompt (optional)",
             "default": "",
             "rows": 3,
             "placeholder": "Transcribe all handwritten text in this manuscript image. Preserve the original language (Cyrillic, Latin, etc.) and layout. Output only the transcribed text without any additional commentary.",
             "hint": "Leave blank to use the default prompt shown above"},
        ]
    },
    "LightOnOCR": lambda: {
        "fields": [
            {"key": "model_path", "type": "select", "label": "Model",
             "options": (lambda: [
                 {"label": f"{name} — {info.get('description','')}", "value": info["id"]}
                 for name, info in __import__('lighton_models', fromlist=['LIGHTON_MODELS']).LIGHTON_MODELS.items()
             ] + [{"label": "Custom HuggingFace ID…", "value": "__custom__"}])(),
             "custom_key": "custom_model_path",
             "custom_placeholder": "HuggingFace model ID, e.g. lightonai/LightOnOCR-2-1B-base"},
            {"key": "max_new_tokens", "type": "number", "label": "Max new tokens",
             "min": 32, "max": 512, "default": 128},
        ]
    },
    "PaddleOCR": lambda: {
        "fields": [
            {"key": "lang", "type": "select", "label": "Language / Script",
             "default": "ch",
             "options": [
                 {"label": "Chinese + English (mixed, recommended default)",  "value": "ch"},
                 {"label": "English",                                          "value": "en"},
                 {"label": "German",                                           "value": "german"},
                 {"label": "French",                                           "value": "french"},
                 {"label": "Japanese",                                         "value": "japan"},
                 {"label": "Korean",                                           "value": "korean"},
                 {"label": "Arabic",                                           "value": "arabic"},
                 {"label": "Cyrillic (Russian/Ukrainian/Bulgarian)",           "value": "cyrillic"},
                 {"label": "Latin script (generic)",                           "value": "latin"},
                 {"label": "Custom (enter code below)",                        "value": "__custom__"},
             ],
             "custom_key": "custom_lang",
             "custom_placeholder": "PaddleOCR lang code, e.g. ru, uk, fr, es, it, pt, …",
             "hint": "One language model per run. 'ch' is bilingual (Chinese+English) and PaddleOCR's strongest model. For mixed-script documents outside this list, run separate passes."},
            {"key": "use_angle_cls", "type": "checkbox",
             "label": "Text-angle classifier (correct 180° rotation)", "default": True},
            {"key": "use_gpu", "type": "checkbox",
             "label": "Use GPU (requires paddlepaddle-gpu)", "default": False},
        ]
    },
}


# ---------------------------------------------------------------------------
# Request/response models
# ---------------------------------------------------------------------------

class EngineLoadRequest(BaseModel):
    # JSON body accepted by the POST /api/engine/load route.

    # Registry name of the engine to load; load_engine looks it up with
    # get_global_registry().get_engine_by_name().
    engine_name: str
    # Raw form values from the browser. load_engine works on a shallow copy
    # (dict(req.config)) and normalises "__custom__" selections and API keys
    # on that copy before handing it to the engine.
    config: Dict[str, Any] = {}


class TranscribeRequest(BaseModel):
    # Parameters for one transcription run. The route that consumes this model
    # is outside this excerpt, so the notes below restate the field intent only.

    # Identifier of the image to transcribe — presumably an image_id returned
    # by the upload route; verify against the consuming handler.
    image_id: str
    # Line segmentation method: kraken, kraken-blla, hpp
    seg_method: str = "kraken"
    # Device string passed to the segmenter
    seg_device: str = "cpu"
    # blla: max sub-columns per region (iterative splitting)
    max_columns: int = 6
    # blla: min region width (fraction of page) to trigger sub-split
    split_width_fraction: float = 0.40
    # Use the attached PAGE XML for segmentation when available
    use_pagexml: bool = True
    # Reading order for Kraken: horizontal-lr, horizontal-rl, vertical-lr, vertical-rl
    text_direction: str = "horizontal-lr"
    # Live form values merged into the stored config at transcription time
    engine_config_overrides: Dict[str, Any] = {}


# ---------------------------------------------------------------------------
# Routes
# ---------------------------------------------------------------------------

@app.get("/")
async def index():
    """Serve the main web UI page (``index.html`` in the static directory)."""
    landing_page = STATIC_DIR / "index.html"
    return FileResponse(str(landing_page))


@app.get("/demo")
async def pwa_demo():
    """Serve the PWA demo page (``pwa/demo.html`` in the static directory)."""
    demo_page = STATIC_DIR / "pwa" / "demo.html"
    return FileResponse(str(demo_page))


@app.get("/manifest.json")
async def pwa_manifest():
    """Serve the PWA manifest from root so scope / start_url are valid."""
    manifest_file = STATIC_DIR / "pwa" / "manifest.json"
    # FileResponse is already imported at module level; no local alias needed.
    return FileResponse(str(manifest_file), media_type="application/manifest+json")


@app.get("/sw.js")
async def pwa_service_worker():
    """Serve the PWA service worker from root scope so it can control /demo."""
    worker_file = STATIC_DIR / "pwa" / "sw.js"
    response = FileResponse(str(worker_file), media_type="application/javascript")
    # Explicitly allow the worker to claim the whole origin as its scope.
    response.headers["Service-Worker-Allowed"] = "/"
    return response


@app.get("/api/engines")
async def list_engines():
    """Return one summary dict per engine known to the global registry.

    Each entry reports the engine's name, description, availability (with the
    reason when unavailable), whether it needs line segmentation, and whether
    a config schema is registered for it in ``ENGINE_SCHEMAS``.
    """

    def _summarise(engine) -> dict:
        # Availability is probed once and reused for the "reason" field.
        usable = engine.is_available()
        return {
            "name": engine.get_name(),
            "description": engine.get_description(),
            "available": usable,
            "unavailable_reason": None if usable else engine.get_unavailable_reason(),
            "requires_line_segmentation": engine.requires_line_segmentation(),
            "has_config_schema": engine.get_name() in ENGINE_SCHEMAS,
        }

    return [_summarise(engine) for engine in get_global_registry().get_all_engines()]


@app.get("/api/engine/{name}/config-schema")
async def get_config_schema(name: str):
    """Return the form schema for engine *name*, or an empty schema if none exists.

    Every field of type ``password`` is tagged with ``key_status = "missing"``:
    the server keeps no keys, so the browser (localStorage) decides whether a
    key is actually present.
    """
    try:
        build_schema = ENGINE_SCHEMAS[name]
    except KeyError:
        return {"fields": []}

    schema = build_schema()
    secret_fields = (
        entry for entry in schema.get("fields", []) if entry.get("type") == "password"
    )
    for entry in secret_fields:
        entry["key_status"] = "missing"
    return schema


def _openwebui_model_urls(base_url: str) -> list[str]:
    base = base_url.strip().rstrip("/")
    if not base:
        return []
    urls = [f"{base}/models"]
    if base.endswith("/api"):
        urls.append(f"{base}/v1/models")
        urls.append(f"{base[:-4]}/v1/models")
    elif base.endswith("/api/v1"):
        urls.append(f"{base[:-3]}/models")
        urls.append(f"{base}/models")
    elif base.endswith("/v1"):
        urls.append(f"{base[:-3]}/api/models")
    else:
        urls.append(f"{base}/api/models")
        urls.append(f"{base}/api/v1/models")
        urls.append(f"{base}/v1/models")
    return list(dict.fromkeys(urls))


def _extract_openwebui_model_ids(payload: Any) -> list[str]:
    if isinstance(payload, dict):
        for key in ("data", "models"):
            items = payload.get(key)
            if isinstance(items, list):
                return _extract_openwebui_model_ids(items)
        return [
            str(value.get("id") or value.get("name"))
            for value in payload.values()
            if isinstance(value, dict) and (value.get("id") or value.get("name"))
        ]

    if isinstance(payload, list):
        models = []
        for item in payload:
            if isinstance(item, str):
                models.append(item)
            elif isinstance(item, dict):
                model_id = item.get("id") or item.get("name") or item.get("model")
                if model_id:
                    models.append(str(model_id))
        return sorted(set(models))

    return []


def _fetch_openwebui_models(base_url: str, api_key: str) -> list[str]:
    import urllib.error
    import urllib.request

    errors = []
    for url in _openwebui_model_urls(base_url):
        req = urllib.request.Request(
            url,
            headers={
                "Authorization": f"Bearer {api_key}",
                "x-api-key": api_key,
                "Accept": "application/json",
                "Content-Type": "application/json",
                "User-Agent": "Polyscriptor-HTR-Demo/1.0",
            },
        )
        try:
            with urllib.request.urlopen(req, timeout=20) as resp:
                status = resp.status
                content_type = resp.headers.get("Content-Type", "")
                body = resp.read().decode("utf-8", errors="replace")
            try:
                payload = json.loads(body)
            except json.JSONDecodeError:
                sample = body.strip().replace("\n", " ")[:120] or "<empty response>"
                errors.append(f"{url}: HTTP {status}, non-JSON response ({content_type}): {sample}")
                continue
            models = _extract_openwebui_model_ids(payload)
            if models:
                return models
            errors.append(f"{url}: no model ids in response")
        except urllib.error.HTTPError as exc:
            body = exc.read().decode("utf-8", errors="replace")[:200]
            errors.append(f"{url}: HTTP {exc.code} {body}")
        except Exception as exc:
            errors.append(f"{url}: {exc}")
    raise RuntimeError("; ".join(errors) if errors else "No OpenWebUI model endpoint tried")


@app.get("/api/engine/status")
async def engine_status(request: Request):
    """Report which engine the calling session currently has loaded.

    If the session holds a live pool slot, that slot is reported. Otherwise
    the module-level compat shims (``loaded_engine`` etc.) are reported.

    The ``config`` in the response never contains ``api_key``: load_engine
    stores the resolved key in the slot config, pool slots can be shared, and
    the compat shim reflects whichever session loaded last — so echoing the
    config verbatim could hand one user's key to another client.

    Returns:
        dict with ``loaded`` (bool), ``engine_name`` and ``config``.
    """

    def _without_secrets(config):
        # Leave non-dict values untouched; drop only the secret entry.
        if not isinstance(config, dict):
            return config
        return {key: value for key, value in config.items() if key != "api_key"}

    session = _get_session(request)
    if session.pool_key and session.pool_key in engine_pool:
        slot = engine_pool[session.pool_key]
        return {
            "loaded": slot.engine.is_model_loaded(),
            "engine_name": slot.engine_name,
            "config": _without_secrets(slot.config),
        }
    # Fallback: compat shim for tests / startup
    return {
        "loaded": loaded_engine is not None and loaded_engine.is_model_loaded(),
        "engine_name": loaded_engine_name,
        "config": _without_secrets(loaded_config),
    }


@app.get("/api/engine/{name}/models")
async def get_engine_models(
    name: str,
    api_key: str = "",
    provider: str = "openai",
    base_url: str = "",
):
    """
    Fetch available models for engines whose model list is dynamic.

    - OpenWebUI: queries the OpenWebUI /api/models endpoint
    - Commercial APIs: uses the fetch_* helpers from inference_commercial_api

    Args:
        name: Engine name from the URL path.
        api_key: Key pasted in the form; passed through ``_resolve_api_key``.
        provider: Commercial provider (``openai``, ``gemini`` or ``claude``,
            case-insensitive). Ignored for OpenWebUI.
        base_url: OpenWebUI base URL. Ignored for Commercial APIs.

    Returns:
        ``{"models": [...]}`` on success, otherwise
        ``{"models": [], "error": "<message>"}``. Errors are reported in the
        body rather than raised.
    """
    if name == "OpenWebUI":
        resolved = _resolve_api_key("openwebui", api_key)
        if not resolved:
            return {"models": [], "error": "No API key — paste one in the form"}
        effective_url = base_url.strip().rstrip("/")
        if not effective_url:
            return {"models": [], "error": "Enter your OpenWebUI base URL"}
        try:
            models = await asyncio.to_thread(_fetch_openwebui_models, effective_url, resolved)
            return {"models": models}
        except Exception as e:
            return {"models": [], "error": str(e)}

    if name == "Commercial APIs":
        prov = provider.lower()
        resolved = _resolve_api_key(prov, api_key)
        if not resolved:
            return {"models": [], "error": "No API key — paste one in the form"}

        fetcher_name = {
            "openai": "fetch_openai_models",
            "gemini": "fetch_gemini_models",
            "claude": "fetch_claude_models",
        }.get(prov)
        if fetcher_name is None:
            return {"models": [], "error": f"Unknown provider: {provider}"}

        try:
            # Only extend sys.path when needed; inserting on every request
            # made the list grow without bound.
            project_root = str(PROJECT_ROOT)
            if project_root not in sys.path:
                sys.path.insert(0, project_root)
            # Imported lazily so a missing optional module becomes an error
            # payload instead of breaking server start-up.
            api_module = importlib.import_module("inference_commercial_api")
            fetcher = getattr(api_module, fetcher_name)
            models = await asyncio.to_thread(fetcher, resolved)
            return {"models": models}
        except Exception as e:
            return {"models": [], "error": str(e)}

    return {"models": [], "error": f"Dynamic model listing not supported for '{name}'"}


@app.post("/api/engine/load")
async def load_engine(request: Request, req: EngineLoadRequest):
    """Load (or reuse) an engine for the calling session.

    Steps:
      1. Validate that the engine exists in the registry and is available.
      2. Normalise the submitted config per engine ("__custom__" selections,
         presets, API keys, empty prompts).
      3. Under ``pool_lock``: release the session's previous slot, then reuse
         an existing slot with the same pool key if there is one.
      4. Otherwise load a fresh engine instance outside the lock and register
         it in the pool.

    Returns:
        dict with ``success``, ``load_time_s``, ``engine_name`` and ``reused``.

    Raises:
        HTTPException 404: unknown engine name.
        HTTPException 400: engine unavailable, or a required custom value,
            base URL or API key is missing.
        HTTPException 500: the engine instance could not be created or
            ``load_model`` reported failure.
    """
    global loaded_engine, loaded_engine_name, loaded_config
    session = _get_session(request)

    registry = get_global_registry()
    reg_engine = registry.get_engine_by_name(req.engine_name)
    if not reg_engine:
        raise HTTPException(404, f"Engine '{req.engine_name}' not found")
    if not reg_engine.is_available():
        raise HTTPException(400, f"Engine not available: {reg_engine.get_unavailable_reason()}")

    # --- Config resolution ---
    # Work on a copy so the request model is never mutated.
    config = dict(req.config)

    if req.engine_name == "CRNN-CTC (PyLaia-inspired)" and "model_path" in config:
        custom_val = config.pop("custom_model_path", "").strip()
        if config["model_path"] == "__custom__":
            if not custom_val:
                raise HTTPException(400, "Please enter an absolute path to a best_model.pt file")
            config["model_path"] = custom_val
        # else: named preset from PYLAIA_MODELS — engine resolves it

    elif req.engine_name == "Kraken" and "model_path" in config:
        custom_val = config.pop("custom_model_path", "").strip()
        val = config["model_path"]
        if val == "__custom__":
            if not custom_val:
                raise HTTPException(400, "Please enter a path to a local .mlmodel file")
            config["model_path"] = custom_val
        elif val.startswith("__zenodo__"):
            # Zenodo preset: pass preset_id, let engine handle download
            config["preset_id"] = val[len("__zenodo__"):]
            config["model_path"] = None
        # else: relative local path from select (e.g. "models/kraken_cs/best.mlmodel") — use as-is

    elif req.engine_name == "TrOCR" and "model_path" in config:
        custom_val = config.pop("custom_model_path", "").strip()
        if config["model_path"] == "__custom__":
            if not custom_val:
                raise HTTPException(400, "Please enter a HuggingFace model ID or local path")
            config["model_path"] = custom_val
        # A path that exists on disk is a local checkpoint; anything else is
        # treated as a HuggingFace model ID.
        if Path(config["model_path"]).exists():
            config["model_source"] = "local"
        else:
            config["model_source"] = "huggingface"

    elif req.engine_name == "Qwen3-VL" and "model_preset" in config:
        preset_val = config.pop("model_preset")
        custom_val = config.pop("base_model", "").strip()
        if preset_val == "__custom__":
            config["base_model"] = custom_val or "Qwen/Qwen3-VL-8B-Instruct"
            config["adapter"] = None
        else:
            vlm_opts = _scan_vlm_models("qwen3")
            matched = next((o for o in vlm_opts if o["value"] == preset_val), None)
            if matched:
                config["base_model"] = matched.get("base_model", preset_val)
                config["adapter"] = matched.get("adapter")
            else:
                config["base_model"] = preset_val
                config["adapter"] = None

    elif req.engine_name == "Churro VLM" and "model_preset" in config:
        preset_val = config.pop("model_preset")
        custom_val = config.pop("model_name", "").strip()
        if preset_val == "__custom__":
            config["model_name"] = custom_val or "stanford-oval/churro-3B"
            config["adapter_path"] = None
        else:
            vlm_opts = _scan_vlm_models("churro")
            matched = next((o for o in vlm_opts if o["value"] == preset_val), None)
            if matched:
                config["model_name"] = matched.get("base_model", preset_val)
                config["adapter_path"] = matched.get("adapter")
            else:
                config["model_name"] = preset_val
                config["adapter_path"] = None

    elif req.engine_name == "LightOnOCR" and "model_path" in config:
        custom_val = config.pop("custom_model_path", "").strip()
        if config["model_path"] == "__custom__":
            if not custom_val:
                raise HTTPException(400, "Please enter a HuggingFace model ID for LightOnOCR")
            config["model_path"] = custom_val

    elif req.engine_name == "PaddleOCR" and "lang" in config:
        if config["lang"] == "__custom__":
            custom_lang = config.pop("custom_lang", "").strip()
            if not custom_lang:
                raise HTTPException(400, "Please enter a PaddleOCR language code")
            config["lang"] = custom_lang
        else:
            config.pop("custom_lang", None)

    elif req.engine_name == "Commercial APIs":
        # NOTE(review): a schema fragment in this file declares
        # "custom_key": "custom_model_id" for what looks like this engine's
        # model field, while this branch reads "model_custom" — confirm which
        # key the frontend actually sends.
        if config.get("model") == "__custom__":
            config["model"] = config.pop("model_custom", "").strip() or "gpt-4o"

    elif req.engine_name == "OpenWebUI":
        if config.get("model") == "__custom__":
            custom_model = config.pop("model_custom", "").strip()
            if not custom_model:
                raise HTTPException(400, "Please enter an OpenWebUI model ID")
            config["model"] = custom_model

    # Resolve API keys
    if req.engine_name == "Commercial APIs":
        provider_slot = config.get("provider", "openai").lower()
        raw_key = config.get("api_key", "")
        resolved = _resolve_api_key(provider_slot, raw_key)
        if not resolved:
            raise HTTPException(400, f"No API key for {config.get('provider')}. "
                                     "Paste your API key in the field.")
        config["api_key"] = resolved

    elif req.engine_name == "OpenWebUI":
        base_url = config.get("base_url", "").strip().rstrip("/")
        if not base_url:
            raise HTTPException(400, "No OpenWebUI base URL. "
                                     "Enter your own OpenWebUI API base URL.")
        config["base_url"] = base_url
        raw_key = config.get("api_key", "")
        resolved = _resolve_api_key("openwebui", raw_key)
        if not resolved:
            raise HTTPException(400, "No API key for OpenWebUI. "
                                     "Paste your API key in the field.")
        config["api_key"] = resolved

    # Strip empty custom_prompt for API engines (use engine default)
    if req.engine_name in ("Commercial APIs", "OpenWebUI"):
        if not config.get("custom_prompt", "").strip():
            config["custom_prompt"] = None

    # --- Engine pool logic ---
    pool_key = _make_pool_key(req.engine_name, config)

    async with pool_lock:
        # Release previous engine reference for this session
        if session.pool_key and session.pool_key in engine_pool:
            prev_slot = engine_pool[session.pool_key]
            prev_slot.ref_count = max(0, prev_slot.ref_count - 1)
            if prev_slot.ref_count == 0:
                log.info(f"Immediate eviction (engine switch): '{prev_slot.engine_name}'")
                try:
                    prev_slot.engine.unload_model()
                except Exception as e:
                    log.warning(f"unload_model() failed for '{prev_slot.engine_name}': {e}")
                if session.pool_key in engine_pool:
                    del engine_pool[session.pool_key]
        # The session's reference has been given back, so forget the key now.
        # If the load below fails or raises, a stale key would make the next
        # load/unload decrement a (possibly shared) slot a second time and
        # evict an engine another session is still using.
        session.pool_key = None

        # Check if this exact engine+model is already loaded
        if pool_key in engine_pool:
            slot = engine_pool[pool_key]
            slot.ref_count += 1
            slot.last_used = time.time()
            session.pool_key = pool_key
            # Update compat shims
            loaded_engine = slot.engine
            loaded_engine_name = slot.engine_name
            loaded_config = slot.config
            log.info(f"Pool hit: reusing '{pool_key}' (ref_count={slot.ref_count})")
            return {"success": True, "load_time_s": 0.0,
                    "engine_name": req.engine_name, "reused": True}

        # Need new slot — evict if VRAM tight
        await _maybe_evict(req.engine_name)

    # Load model OUTSIDE pool_lock (blocking I/O)
    engine = _create_engine_instance(req.engine_name)
    if not engine:
        raise HTTPException(500, f"Cannot create engine instance for '{req.engine_name}'")

    start = time.time()
    success = await asyncio.to_thread(engine.load_model, config)
    elapsed = time.time() - start

    if not success:
        raise HTTPException(500, "Failed to load model")

    slot = EngineSlot(
        engine=engine,
        engine_name=req.engine_name,
        config=config,
        pool_key=pool_key,
        ref_count=1,
        last_used=time.time(),
    )

    async with pool_lock:
        # Double-check: another request may have loaded the same key concurrently
        if pool_key in engine_pool:
            # Discard our duplicate. A failing unload must not prevent the
            # session from attaching to the slot that is already pooled.
            try:
                engine.unload_model()
            except Exception as e:
                log.warning(f"unload_model() failed for duplicate '{req.engine_name}': {e}")
            slot = engine_pool[pool_key]
            slot.ref_count += 1
            slot.last_used = time.time()
        else:
            engine_pool[pool_key] = slot

        session.pool_key = pool_key
        # Update compat shims
        loaded_engine = slot.engine
        loaded_engine_name = slot.engine_name
        loaded_config = slot.config

    log.info(f"Pool miss: loaded '{pool_key}' in {elapsed:.1f}s (pool size={len(engine_pool)})")
    return {"success": True, "load_time_s": round(elapsed, 2),
            "engine_name": req.engine_name, "reused": False}


@app.get("/api/keys")
async def list_keys():
    """Backwards-compatibility endpoint that always reports no stored keys.

    API keys are kept in the browser's localStorage only; the server holds no
    key information, so the response is an empty dict.
    """
    no_keys: Dict[str, Any] = {}
    return no_keys


@app.post("/api/admin/evict-all")
async def admin_evict_all(request: Request):
    """Force-evict all engine slots from VRAM (localhost admin only).

    Every pooled engine is unloaded (failures are logged, the slot is removed
    regardless), every session's ``pool_key`` is cleared and the compat shims
    are reset.

    Returns:
        ``{"evicted": [<pool keys>]}``.

    Raises:
        HTTPException 403: the caller is not 127.0.0.1 / ::1, or the client
            address is unknown.
    """
    global loaded_engine, loaded_engine_name, loaded_config

    # Fail closed: when the transport gives no client address we cannot prove
    # the caller is local, so treat it like a remote caller.
    client_host = request.client.host if request.client else None
    if client_host not in ("127.0.0.1", "::1"):
        raise HTTPException(status_code=403, detail="localhost only")

    async with pool_lock:
        evicted = []
        # Iterate over a snapshot because entries are deleted inside the loop.
        for key, slot in list(engine_pool.items()):
            try:
                slot.engine.unload_model()
            except Exception as e:
                log.warning(f"admin evict failed for '{key}': {e}")
            del engine_pool[key]
            evicted.append(key)
        for session in sessions.values():
            session.pool_key = None
        loaded_engine = None
        loaded_engine_name = ""
        loaded_config = {}
    log.info(f"Admin force-evict: cleared {len(evicted)} slot(s): {evicted}")
    return {"evicted": evicted}


@app.post("/api/engine/unload")
async def unload_engine(request: Request):
    """Give back the calling session's engine reference.

    The slot's ``ref_count`` is decremented (never below zero). When it hits
    zero the engine is unloaded — failures are only logged — and the slot is
    removed from the pool. The session's ``pool_key`` and the compat shims
    are cleared in every case.
    """
    global loaded_engine, loaded_engine_name, loaded_config
    session = _get_session(request)

    async with pool_lock:
        held_key = session.pool_key
        if held_key and held_key in engine_pool:
            slot = engine_pool[held_key]
            remaining = max(0, slot.ref_count - 1)
            slot.ref_count = remaining
            if remaining == 0:
                log.info(f"Immediate eviction (explicit unload): '{slot.engine_name}'")
                try:
                    slot.engine.unload_model()
                except Exception as e:
                    log.warning(f"unload_model() failed for '{slot.engine_name}': {e}")
                # Tolerate the entry having disappeared in the meantime.
                engine_pool.pop(held_key, None)

        session.pool_key = None
        # Update compat shims
        loaded_engine, loaded_engine_name, loaded_config = None, "", {}

    return {"success": True}


def _register_image(session: UserSession, pil_image: Image.Image, filename: str, save_path: Path) -> str:
    """Record an uploaded image under a fresh UUID and return that id.

    The same record dict is stored in both the session's ``image_cache`` and
    the module-level ``global_image_cache``, so later mutations (for example
    attaching ``xml_path``) are visible through either cache.
    """
    new_id = str(uuid.uuid4())
    record = dict(
        path=save_path,
        xml_path=None,
        pil_image=pil_image,
        width=pil_image.width,
        height=pil_image.height,
        filename=filename,
        lines=None,
    )
    for cache in (session.image_cache, global_image_cache):
        cache[new_id] = record
    return new_id


def _get_image_data(session: UserSession, image_id: str) -> Optional[dict]:
    """Look up an image record, tolerating missing cookies in embedded Space contexts.

    The session cache is consulted first. On a miss the global cache is
    checked, and a hit there is copied into the session cache. Returns
    ``None`` when the id is unknown to both.
    """
    try:
        return session.image_cache[image_id]
    except KeyError:
        pass

    shared_record = global_image_cache.get(image_id)
    if shared_record is None:
        return None
    session.image_cache[image_id] = shared_record
    return shared_record


@app.post("/api/image/upload")
async def upload_image(
    request: Request,
    file: UploadFile = File(...),
    max_dim: Optional[int] = Query(default=None, ge=100, description="Resize long edge to this many pixels (mobile upload only)"),
):
    """Accept an image or PDF upload and register it in the session.

    PDFs are rendered page by page at 150 DPI; every page is saved as a PNG
    and registered as its own image. Regular images are saved under a random
    name, EXIF-rotated, converted to RGB and optionally downscaled to
    ``max_dim`` on the long edge (the downscaled version then overwrites the
    saved file).

    Returns:
        For PDFs: ``is_pdf``, ``filename``, ``num_pages`` and a ``pages`` list
        (``image_id``, ``filename``, ``width``, ``height``, ``page``).
        For images: ``image_id``, ``width``, ``height``, ``filename``.

    Raises:
        HTTPException 400: file larger than 200MB, PDF support missing, PDF
            rendering failed, unsupported file type, or undecodable image.
    """
    session = _get_session(request)
    filename = file.filename or "upload"
    content_type = file.content_type or ""
    is_pdf = (
        filename.lower().endswith(".pdf") or
        content_type.startswith("application/pdf")
    )
    image_exts = {
        ".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp", ".gif", ".webp"
    }
    is_image = (
        content_type.startswith("image/") or
        Path(filename).suffix.lower() in image_exts
    )

    content = await file.read()
    if len(content) > 200 * 1024 * 1024:
        raise HTTPException(400, "File too large (max 200MB)")

    # ── PDF: render each page as a separate image ──────────────────────────
    if is_pdf:
        if not PDF_AVAILABLE:
            raise HTTPException(400, "PDF support requires PyMuPDF. Install with: pip install pymupdf")

        def _render_pdf(data: bytes, stem: str, sess: UserSession) -> list:
            # 150 DPI relative to PDF's native 72 DPI.
            mat = _fitz.Matrix(150 / 72, 150 / 72)
            doc = _fitz.open(stream=data, filetype="pdf")
            try:
                results = []
                for i, page in enumerate(doc):
                    pix = page.get_pixmap(matrix=mat, colorspace=_fitz.csRGB)
                    pil_page = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                    page_filename = f"{stem}_page{i+1:03d}.png"
                    save_path = UPLOAD_DIR / f"{uuid.uuid4()}.png"
                    pil_page.save(save_path)
                    pid = _register_image(sess, pil_page, page_filename, save_path)
                    results.append({
                        "image_id": pid,
                        "filename": page_filename,
                        "width": pil_page.width,
                        "height": pil_page.height,
                        "page": i + 1,
                    })
                return results
            finally:
                # Release the document even when a page fails to render.
                doc.close()

        try:
            # Rendering is blocking work; run it off the event loop.
            pages_out = await asyncio.to_thread(
                _render_pdf, content, Path(filename).stem, session
            )
        except Exception as e:
            raise HTTPException(400, f"Failed to render PDF: {e}") from e
        return {
            "is_pdf": True,
            "filename": filename,
            "num_pages": len(pages_out),
            "pages": pages_out,
        }

    # ── Regular image ───────────────────────────────────────────────────────
    if not is_image:
        raise HTTPException(400, "File must be an image or PDF")

    ext = Path(filename).suffix or ".jpg"
    save_path = UPLOAD_DIR / f"{uuid.uuid4()}{ext}"
    save_path.write_bytes(content)

    try:
        # The context manager closes the source file handle; convert() has
        # already produced an independent in-memory copy by then.
        with Image.open(save_path) as raw_image:
            pil_image = ImageOps.exif_transpose(raw_image).convert("RGB")
        if max_dim and max(pil_image.width, pil_image.height) > max_dim:
            pil_image.thumbnail((max_dim, max_dim), Image.LANCZOS)
            pil_image.save(save_path)
    except Exception as e:
        # Do not leave undecodable uploads on disk.
        save_path.unlink(missing_ok=True)
        raise HTTPException(400, f"Invalid image: {e}") from e

    image_id = _register_image(session, pil_image, filename, save_path)
    return {
        "image_id": image_id,
        "width": pil_image.width,
        "height": pil_image.height,
        "filename": filename,
    }


@app.post("/api/image/{image_id}/xml")
async def upload_xml(request: Request, image_id: str, file: UploadFile = File(...)):
    """Attach a PAGE XML file to an already-uploaded image.

    The XML is written to ``UPLOAD_DIR/<image_id>.xml`` and its path is stored
    on the image record. Responds 404 when the image is unknown and 400 when
    the upload exceeds 10MB.
    """
    record = _get_image_data(_get_session(request), image_id)
    if record is None:
        raise HTTPException(404, "Image not found — upload image first")

    xml_bytes = await file.read()
    size_limit = 10 * 1024 * 1024
    if len(xml_bytes) > size_limit:
        raise HTTPException(400, "XML too large (max 10MB)")

    target = UPLOAD_DIR / f"{image_id}.xml"
    target.write_bytes(xml_bytes)
    record["xml_path"] = target
    return {"success": True, "filename": file.filename}


@app.get("/api/image/{image_id}")
async def get_image(request: Request, image_id: str):
    """Serve the stored file of an uploaded image, or 404 if the id is unknown."""
    record = _get_image_data(_get_session(request), image_id)
    if record is None:
        raise HTTPException(404, "Image not found")
    stored_file = str(record["path"])
    return FileResponse(stored_file)


@app.get("/api/image/{image_id}/info")
async def image_info(request: Request, image_id: str):
    """Return filename, dimensions and PAGE XML status of an uploaded image.

    Responds 404 when the id is unknown to both the session and global cache.
    """
    record = _get_image_data(_get_session(request), image_id)
    if record is None:
        raise HTTPException(404, "Image not found")

    info = {"image_id": image_id}
    for field_name in ("filename", "width", "height"):
        info[field_name] = record[field_name]
    info["has_xml"] = record["xml_path"] is not None
    return info


async def _run_segmentation(img_data: dict, method: str, device: str = "cpu",
                            max_columns: int = 6,
                            split_width_fraction: float = 0.40,
                            text_direction: str = "horizontal-lr") -> dict:
    """
    Shared segmentation helper.  Runs the appropriate segmenter, stores
    results in img_data, and returns a serialisable dict ready for SSE or JSON.

    Segmenter selection, in order of precedence:
      1. An attached PAGE XML (``img_data["xml_path"]``) always wins,
         whatever ``method`` says.
      2. ``"kraken-blla"`` — Kraken segmentation with regions (downgraded to
         ``"kraken"`` on CPU when DEMO_MODE is ``"hf_space"``).
      3. ``"kraken"`` — classical Kraken segmentation with regions.
      4. Anything else — the HPP ``LineSegmenter``.

    In ``hf_space`` demo mode the built-in fallback
    ``_run_demo_hpp_segmentation`` is used instead when HPP is requested
    without XML, when Kraken raises, or when ``method == "kraken"`` and no
    lines were produced.

    Side effects: writes ``lines``, ``line_regions`` (one region index per
    line, so the transcription loop can tag each line with its column),
    ``seg_source`` and ``seg_regions`` into ``img_data``.

    Returns:
        ``{"num_lines", "bboxes", "source"}`` plus ``"regions"`` when region
        data is available.
    """
    if DEMO_MODE == "hf_space" and method == "kraken-blla":
        method = "kraken"
        device = "cpu"
    pil_image = img_data["pil_image"]
    xml_path = img_data.get("xml_path")

    if DEMO_MODE == "hf_space" and xml_path is None and method == "hpp":
        return await asyncio.to_thread(_run_demo_hpp_segmentation, img_data)

    _import_segmenters()

    regions: list = []
    lines: list = []

    xml_region_data: list = []  # TextRegion bboxes from PAGE XML (for visualization)
    if xml_path is not None:
        from inference_page import PageXMLSegmenter as _PXSeg
        segmenter = _PXSeg(str(xml_path))
        lines = await asyncio.to_thread(segmenter.segment_lines, pil_image)
        source = "pagexml"
        xml_region_data = getattr(segmenter, 'region_data', []) or []

    elif method == "kraken-blla":
        segmenter = KrakenLineSegmenter(device=device)
        regions, lines = await asyncio.to_thread(
            segmenter.segment_with_regions, pil_image,
            device=device,
            max_columns=max_columns,
            split_width_fraction=split_width_fraction,
            text_direction=text_direction,
        )
        source = "kraken-blla"

    elif method == "kraken":
        try:
            segmenter = KrakenLineSegmenter()
            # Use column-aware segmentation so multi-column pages read correctly
            regions, lines = await asyncio.to_thread(
                segmenter.segment_classical_with_regions, pil_image,
                max_columns=max_columns,
            )
            source = "kraken"
        except Exception as exc:
            if DEMO_MODE == "hf_space":
                log.warning("Kraken segmentation failed in HF Space; falling back to HPP: %s", exc)
                return await asyncio.to_thread(_run_demo_hpp_segmentation, img_data, "hpp-fallback")
            raise

    else:  # hpp
        segmenter = LineSegmenter()
        lines = await asyncio.to_thread(segmenter.segment_lines, pil_image)
        source = "hpp"

    if DEMO_MODE == "hf_space" and method == "kraken" and not lines:
        log.warning("Kraken returned no lines in HF Space; falling back to HPP")
        return await asyncio.to_thread(_run_demo_hpp_segmentation, img_data, "hpp-fallback")

    # Build per-line region index (used by transcription loop for column view).
    # One entry is emitted per line id of each region, in region order.
    # NOTE(review): this presumes `lines` is ordered region by region — confirm
    # against the segmenter implementations.
    if regions:
        line_regions: list[int] = [
            region_idx
            for region_idx, region in enumerate(regions)
            for _ in region.line_ids
        ]
    else:
        line_regions = [0] * len(lines)

    img_data["lines"] = lines
    img_data["line_regions"] = line_regions
    img_data["seg_source"] = source
    # PAGE XML provides region bboxes directly; Kraken/blla provide SegRegion objects
    if xml_region_data:
        img_data["seg_regions"] = xml_region_data
    elif regions:
        img_data["seg_regions"] = [
            {"id": r.id, "bbox": list(r.bbox), "num_lines": len(r.line_ids)}
            for r in regions
        ]
    else:
        img_data["seg_regions"] = []

    result: dict = {
        "num_lines": len(lines),
        "bboxes": [list(line.bbox) for line in lines],
        "source": source,
    }
    if img_data["seg_regions"]:
        result["regions"] = img_data["seg_regions"]
    return result


def _run_demo_hpp_segmentation(img_data: dict, source: str = "hpp") -> dict:
    """Small dependency-light line segmenter for the hosted CPU demo fallback.

    Works on a horizontal projection profile: pixels darker than a clamped
    percentile threshold count as ink, the per-row ink density is smoothed
    with a 9-row box filter, and contiguous runs of rows whose smoothed
    density exceeds 13 % of the peak become text bands.  Bands shorter than
    ``max(10, 0.8 % of image height)`` rows are dropped, and at most the
    first 100 bands are turned into lines.

    Side effects: overwrites ``lines``, ``line_regions``, ``seg_source`` and
    ``seg_regions`` in ``img_data``.

    Args:
        img_data: Cached image record; only ``img_data["pil_image"]`` is read.
        source: Label stored as ``seg_source`` and returned as ``"source"``.

    Returns:
        ``{"num_lines", "bboxes", "source"}`` with bboxes as
        ``[left, top, right, bottom]`` lists.
    """
    pil_image = img_data["pil_image"]
    gray = np.array(pil_image.convert("L"))
    lines: list = []

    if gray.size:
        threshold = min(220, max(90, float(np.percentile(gray, 42))))
        ink = gray < threshold
        kernel = np.ones(9, dtype=np.float32) / 9.0
        smooth = np.convolve(ink.mean(axis=1), kernel, mode="same")
        active_threshold = max(0.01, float(smooth.max()) * 0.13)
        min_height = max(10, int(pil_image.height * 0.008))

        # Collect inclusive (first_row, last_row) runs of active rows.
        bands = []
        start = None
        for y, value in enumerate(smooth):
            if value > active_threshold:
                if start is None:
                    start = y
            elif start is not None:
                # Row y is inactive, so the band ends on the previous row —
                # this also holds when y is the final row of the image.
                if y - start >= min_height:
                    bands.append((start, y - 1))
                start = None
        # A band still open after the last row runs to the bottom edge.
        if start is not None and len(smooth) - start >= min_height:
            bands.append((start, len(smooth) - 1))

        for y1, y2 in bands[:100]:
            # Pad vertically by a quarter of the band height (at least 3 px).
            pad_y = max(3, int((y2 - y1 + 1) * 0.25))
            top = max(0, y1 - pad_y)
            bottom = min(pil_image.height, y2 + pad_y + 1)
            cols = np.where(ink[top:bottom, :].any(axis=0))[0]
            if cols.size:
                # Trim to the inked columns, with a small horizontal margin.
                left = max(0, int(cols[0]) - 8)
                right = min(pil_image.width, int(cols[-1]) + 9)
            else:
                left = 0
                right = pil_image.width
            bbox = (left, top, right, bottom)
            lines.append(SimpleNamespace(
                image=pil_image.crop(bbox),
                bbox=bbox,
                coords=None,
            ))

    img_data["lines"] = lines
    img_data["line_regions"] = [0] * len(lines)
    img_data["seg_source"] = source
    img_data["seg_regions"] = []
    return {
        "num_lines": len(lines),
        "bboxes": [list(line.bbox) for line in lines],
        "source": source,
    }


@app.delete("/api/image/{image_id}/region/{region_index}")
async def delete_region(request: Request, image_id: str, region_index: int):
    """
    Drop a single region, together with every line assigned to it, from the
    segmentation cached for an image.

    Lines of later regions have their region index shifted down by one so the
    indices stay contiguous.  The response has the same shape as /segment,
    which lets the client redraw the canvas directly.

    Raises:
        HTTPException: 404 for an unknown image; 400 when no region data is
            cached or ``region_index`` is out of range.
    """
    img_data = _get_image_data(_get_session(request), image_id)
    if img_data is None:
        raise HTTPException(404, "Image not found")

    regions_before = img_data.get("seg_regions") or []
    if not regions_before:
        raise HTTPException(400, "No segmentation data — run Segment first")
    if not 0 <= region_index < len(regions_before):
        last_valid = len(regions_before) - 1
        raise HTTPException(400, f"Region index out of range (0–{last_valid})")

    all_lines = img_data.get("lines") or []
    owners = img_data.get("line_regions") or ([0] * len(all_lines))

    # Pair each line with its region, discarding the ones being deleted.
    survivors = [
        (entry, owner)
        for entry, owner in zip(all_lines, owners)
        if owner != region_index
    ]
    kept_lines = [entry for entry, _ in survivors]
    # Regions after the removed one move up by one position.
    kept_owners = [
        owner - 1 if owner >= region_index else owner
        for _, owner in survivors
    ]
    kept_regions = [
        region for pos, region in enumerate(regions_before) if pos != region_index
    ]

    img_data["lines"] = kept_lines
    img_data["line_regions"] = kept_owners
    img_data["seg_regions"] = kept_regions

    payload: dict = {
        "num_lines": len(kept_lines),
        "bboxes": [list(entry.bbox) for entry in kept_lines],
        "source": img_data.get("seg_source", "modified"),
    }
    if kept_regions:
        payload["regions"] = kept_regions
    return payload


@app.get("/api/image/{image_id}/segment")
async def segment_image(
    request: Request,
    image_id: str,
    method: str = "kraken",
    device: str = "cpu",
    max_columns: int = 6,
    split_width_fraction: float = 0.40,
    text_direction: str = "horizontal-lr",
):
    """
    Segment an image without transcribing it and return the line bboxes as
    JSON, e.g. to preview the line layout before starting a transcription.

    Raises:
        HTTPException: 404 for an unknown image; 500 wrapping any error
            raised by the segmentation helper.
    """
    img_data = _get_image_data(_get_session(request), image_id)
    if img_data is None:
        raise HTTPException(404, "Image not found — upload first")

    try:
        layout = await _run_segmentation(
            img_data,
            method,
            device=device,
            max_columns=max_columns,
            split_width_fraction=split_width_fraction,
            text_direction=text_direction,
        )
    except Exception as e:
        raise HTTPException(500, f"Segmentation failed: {e}")
    return layout


@app.post("/api/transcribe")
async def transcribe(request: Request, req: TranscribeRequest):
    """
    Transcribe one cached image and stream the progress as Server-Sent Events.

    The engine is taken from the session's pool slot, or from the
    module-level ``loaded_engine`` when the session has no usable slot.
    ``req.engine_config_overrides`` are merged into a copy of the stored
    config, except for keys that only make sense at model-load time.

    Events emitted by the stream: ``status``, ``segmentation``, ``progress``
    (one per line), then exactly one of ``complete``, ``cancelled`` or
    ``error``.  On completion the per-line results are stored under
    ``img_data["results"]`` for the export endpoints.

    Raises:
        HTTPException: 400 when no engine is loaded, 404 when the image is
            unknown to this session.  Failures after streaming has started
            are reported as an ``error`` event instead.
    """
    session = _get_session(request)

    # Resolve engine from session's pool slot
    if not session.pool_key or session.pool_key not in engine_pool:
        # Fallback: check compat shims (e.g. auto-loaded engine, no session yet)
        if not loaded_engine or not loaded_engine.is_model_loaded():
            raise HTTPException(400, "No engine loaded")
    slot = engine_pool.get(session.pool_key) if session.pool_key else None
    # Build effective engine/config references
    eff_engine = slot.engine if slot else loaded_engine
    _base_config = slot.config if slot else loaded_config
    # Merge live form overrides into a copy of the stored config so changes to
    # runtime-only fields (custom_prompt, thinking_mode, temperature, …) take
    # effect without requiring a model reload.  Never overwrite security-sensitive
    # keys that were set during load (api_key, provider, model, model_path, …).
    _RELOAD_ONLY_KEYS = {"api_key", "provider", "model", "model_path", "model_source",
                         "base_model", "adapter", "model_name", "preset_id", "lang",
                         "use_gpu", "venv_path"}
    if req.engine_config_overrides:
        eff_config = dict(_base_config)
        for k, v in req.engine_config_overrides.items():
            if k not in _RELOAD_ONLY_KEYS:
                eff_config[k] = v
    else:
        eff_config = _base_config
    eff_engine_name = slot.engine_name if slot else loaded_engine_name

    if not eff_engine or not eff_engine.is_model_loaded():
        raise HTTPException(400, "No engine loaded")

    img_data = _get_image_data(session, req.image_id)
    if img_data is None:
        raise HTTPException(404, "Image not found — upload first")

    pil_image = img_data["pil_image"]

    # Per-request cancel event (replaces global cancel_event)
    request_id = str(uuid.uuid4())
    cancel_evt = asyncio.Event()
    session.cancel_events[request_id] = cancel_evt

    async def event_stream():
        """Yield SSE frames for segmentation, per-line progress and the final outcome."""
        try:
            # Inside the try block on purpose: if the lazy segmenter import
            # fails, the client still gets an "error" event and the cancel
            # event registered above is removed by the finally clause.
            _import_segmenters()

            # --- Segmentation ---
            xml_path = img_data.get("xml_path") if req.use_pagexml else None

            if not eff_engine.requires_line_segmentation() and not xml_path:
                # Page-level engine with no PAGE XML — send whole page as single line
                from inference_page import LineSegment
                lines = [LineSegment(
                    image=pil_image,
                    bbox=(0, 0, pil_image.width, pil_image.height),
                    coords=None,
                )]
                img_data["lines"]        = lines
                img_data["line_regions"] = [0]
                img_data["seg_source"]   = "page"
                img_data["seg_regions"]  = []
                yield _sse("segmentation", {
                    "num_lines": 1,
                    "bboxes": [[0, 0, pil_image.width, pil_image.height]],
                    "source": "page",
                })
            else:
                # Reuse cached segmentation if method matches (e.g. user clicked Segment first)
                cached_lines   = img_data.get("lines")
                cached_source  = img_data.get("seg_source")
                desired_source = "pagexml" if (xml_path and req.use_pagexml) else req.seg_method

                if cached_lines and cached_source == desired_source:
                    lines = cached_lines
                    yield _sse("status", {"message": "Using cached segmentation..."})
                    seg_event: dict = {
                        "num_lines": len(lines),
                        "bboxes":    [list(l.bbox) for l in lines],
                        "source":    cached_source,
                    }
                    if img_data.get("seg_regions"):
                        seg_event["regions"] = img_data["seg_regions"]
                    yield _sse("segmentation", seg_event)
                elif xml_path is not None:
                    yield _sse("status", {"message": "Reading line layout from PAGE XML..."})
                    seg_result = await _run_segmentation(img_data, "pagexml",
                                                         req.seg_device, req.max_columns,
                                                         req.split_width_fraction,
                                                         req.text_direction)
                    lines = img_data["lines"]
                    yield _sse("segmentation", seg_result)
                else:
                    # NOTE(review): _run_segmentation reads img_data["xml_path"]
                    # itself, so an attached PAGE XML is still used here even
                    # when req.use_pagexml is false — confirm whether intended.
                    yield _sse("status", {"message": f"Segmenting with {req.seg_method}..."})
                    seg_result = await _run_segmentation(img_data, req.seg_method,
                                                         req.seg_device, req.max_columns,
                                                         req.split_width_fraction,
                                                         req.text_direction)
                    lines = img_data["lines"]
                    yield _sse("segmentation", seg_result)

            # --- Transcription ---
            results = []
            token_usage: Dict[str, Any] = {}
            start_time = time.time()
            line_regions = img_data.get("line_regions") or ([0] * len(lines))

            for i, line in enumerate(lines):
                # Check for cancellation before each line
                if cancel_evt.is_set():
                    yield _sse("cancelled", {})
                    return

                line_img = line.image if line.image is not None else pil_image.crop(line.bbox)
                img_array = np.array(line_img.convert("RGB"))

                # Use slot lock to serialize access to this engine instance
                if slot:
                    async with slot.lock:
                        slot.last_used = time.time()
                        result = await asyncio.to_thread(
                            eff_engine.transcribe_line, img_array, eff_config
                        )
                else:
                    result = await asyncio.to_thread(
                        eff_engine.transcribe_line, img_array, eff_config
                    )

                text = str(result.text) if hasattr(result, "text") else str(result)
                confidence = None
                if hasattr(result, "confidence") and result.confidence is not None:
                    confidence = float(result.confidence)
                    # Values above 1 are treated as percentages and rescaled to 0..1
                    if confidence > 1:
                        confidence = confidence / 100.0
                # Accumulate token usage and extract thinking text from API engines (e.g. Gemini)
                thinking_text = None
                if hasattr(result, "metadata") and isinstance(result.metadata, dict):
                    tu = result.metadata.get("token_usage")
                    if tu:
                        for k, v in tu.items():
                            if v is not None:
                                token_usage[k] = token_usage.get(k, 0) + v
                    thinking_text = result.metadata.get("thinking_text")

                line_data = {
                    "index": i,
                    "text": text,
                    "confidence": confidence,
                    "bbox": list(line.bbox),
                    "region": line_regions[i] if i < len(line_regions) else 0,
                }
                if thinking_text:
                    line_data["thinking_text"] = thinking_text
                results.append(line_data)
                progress_data: Dict[str, Any] = {
                    "current": i + 1,
                    "total": len(lines),
                    "line": line_data,
                }
                if token_usage:
                    # Copy so later accumulation does not mutate an emitted snapshot
                    progress_data["token_usage"] = dict(token_usage)
                yield _sse("progress", progress_data)

                # Check for cancellation after each line's progress event
                if cancel_evt.is_set():
                    yield _sse("cancelled", {})
                    return

            # Store completed results in session image_cache for export
            img_data["results"] = results

            elapsed = time.time() - start_time
            complete_data: Dict[str, Any] = {
                "lines": results,
                "total_time_s": round(elapsed, 2),
                "engine": eff_engine_name,
            }
            if token_usage:
                complete_data["token_usage"] = token_usage
            yield _sse("complete", complete_data)

        except Exception as e:
            log.exception("Transcription error")
            yield _sse("error", {"message": str(e)})
        finally:
            # Clean up this request's cancel event
            session.cancel_events.pop(request_id, None)

    return StreamingResponse(
        event_stream(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "X-Accel-Buffering": "no",  # Disable nginx buffering if behind proxy
        },
    )


@app.post("/api/transcribe/cancel")
async def cancel_transcription(request: Request):
    """Set the cancel event of every transcription registered on this session."""
    pending = _get_session(request).cancel_events
    for request_id in pending:
        pending[request_id].set()
    return {"success": True}


@app.post("/api/image/{image_id}/export-xml")
async def export_xml(request: Request, image_id: str):
    """Export transcription results for image_id as a PAGE XML download.

    The download name is derived from the uploaded image's filename stem.
    Names that are not plain ASCII tokens (non-Latin scripts, spaces,
    quotes, …) are sent percent-encoded via the ``filename*`` parameter,
    because HTTP header values must be latin-1 encodable and a raw quote
    would terminate the quoted ``filename`` value early.

    Raises:
        HTTPException: propagated from ``_build_xml_bytes`` (404 for an
            unknown image, 400 when there are no results).
    """
    from urllib.parse import quote

    session = _get_session(request)
    pretty, stem = _build_xml_bytes(session, image_id)

    download_name = f"{stem}.xml"
    encoded_name = quote(download_name)
    if encoded_name == download_name:
        # Nothing needed escaping — keep the simple quoted form.
        disposition = f'attachment; filename="{download_name}"'
    else:
        disposition = f"attachment; filename*=UTF-8''{encoded_name}"

    return Response(
        content=pretty,
        media_type="application/xml",
        headers={"Content-Disposition": disposition},
    )


def _build_xml_bytes(session: UserSession, image_id: str) -> tuple[bytes, str]:
    """Serialise an image's cached transcription results as PAGE XML.

    All lines are written into one ``TextRegion`` whose coordinates are the
    bounding rectangle of every line bbox.  Returns ``(xml_bytes, stem)``
    where ``stem`` is the image filename without its extension.

    Raises:
        HTTPException: 404 if the image is unknown to the session,
            400 if it has no stored results.
    """
    import xml.etree.ElementTree as ET
    from xml.dom import minidom
    from page_xml_exporter import PageXMLExporter

    img_data = _get_image_data(session, image_id)
    if img_data is None:
        raise HTTPException(404, f"Image {image_id} not found")
    results = img_data.get("results")
    if not results:
        raise HTTPException(400, f"No results for {image_id}")

    filename = img_data.get("filename", img_data["path"].name)
    page_w = img_data["width"]
    page_h = img_data["height"]

    class _LineProxy:
        """Attribute-style view of one result dict, as handed to the exporter."""
        __slots__ = ("bbox", "coords", "text", "confidence")

        def __init__(self, entry):
            box = entry.get("bbox")
            # A missing or empty bbox falls back to the whole page.
            self.bbox = (0, 0, page_w, page_h) if not box else tuple(box)
            self.coords = None
            self.text = entry.get("text", "")
            self.confidence = entry.get("confidence")

    proxies = [_LineProxy(entry) for entry in results]
    exporter = PageXMLExporter(str(filename), page_w, page_h)
    root, page = exporter._make_root("Polyscriptor Web UI", None)

    # Reading order: a single ordered group that points at the one region.
    group_attrs = {'id': 'ro_1', 'caption': 'Regions reading order'}
    ref_attrs = {'index': '0', 'regionRef': 'region_1'}
    reading_order = ET.SubElement(page, 'ReadingOrder')
    ordered_group = ET.SubElement(reading_order, 'OrderedGroup', group_attrs)
    ET.SubElement(ordered_group, 'RegionRefIndexed', ref_attrs)

    region_attrs = {'id': 'region_1', 'type': 'paragraph', 'custom': 'readingOrder {index:0;}'}
    text_region = ET.SubElement(page, 'TextRegion', region_attrs)
    if proxies:
        left = min(p.bbox[0] for p in proxies)
        top = min(p.bbox[1] for p in proxies)
        right = max(p.bbox[2] for p in proxies)
        bottom = max(p.bbox[3] for p in proxies)
        outline = f'{left},{top} {right},{top} {right},{bottom} {left},{bottom}'
        ET.SubElement(text_region, 'Coords', {'points': outline})
    for number, proxy in enumerate(proxies, start=1):
        exporter._add_text_line(text_region, f'line_{number}', proxy, proxy.text, number - 1)

    raw = ET.tostring(root, encoding='utf-8', method='xml')
    document = minidom.parseString(raw)
    return document.toprettyxml(indent='  ', encoding='utf-8'), Path(filename).stem


def _build_thinking_bytes(session: UserSession, image_id: str) -> tuple[bytes, str]:
    """Collect the per-line "thinking" text stored with an image's results.

    Returns ``(utf8_bytes, stem)``.  Lines without thinking text are skipped;
    when the image has more than one result line, each block is prefixed
    with a ``=== Line N ===`` header and blocks are separated by a blank line.

    Raises:
        HTTPException: 404 if the image is unknown or no line carries
            thinking text, 400 if the image has no results.
    """
    img_data = _get_image_data(session, image_id)
    if img_data is None:
        raise HTTPException(404, f"Image {image_id} not found")
    results = img_data.get("results")
    if not results:
        raise HTTPException(400, f"No results for {image_id}")

    filename = img_data.get("filename", img_data["path"].name)
    stem = Path(filename).stem

    with_headers = len(results) > 1
    thoughts = (entry.get("thinking_text", "") for entry in results)
    blocks = [
        f"=== Line {number} ===\n{thought}" if with_headers else thought
        for number, thought in enumerate(thoughts, start=1)
        if thought
    ]
    if blocks:
        return "\n\n".join(blocks).encode("utf-8"), stem
    raise HTTPException(404, f"No thinking text for {image_id}")


def _build_txt_bytes(session: UserSession, image_id: str) -> tuple[bytes, str]:
    """Join an image's transcribed lines into UTF-8 plain text.

    Returns ``(txt_bytes, stem)`` with one result line per text line.

    Raises:
        HTTPException: 404 if the image is unknown to the session,
            400 if it has no stored results.
    """
    cached = _get_image_data(session, image_id)
    if cached is None:
        raise HTTPException(404, f"Image {image_id} not found")

    results = cached.get("results")
    if not results:
        raise HTTPException(400, f"No results for {image_id}")

    source_name = cached.get("filename", cached["path"].name)
    body = "\n".join([entry.get("text", "") for entry in results])
    return body.encode("utf-8"), Path(source_name).stem


class BatchXMLRequest(BaseModel):
    """Request body shared by the /api/batch/export-* endpoints."""
    # IDs of the cached images to include in the exported archive
    image_ids: list[str]


@app.post("/api/batch/export-thinking")
async def batch_export_thinking(request: Request, req: BatchXMLRequest):
    """Return a ZIP archive containing one thinking-text file per image.

    Images for which no thinking text can be built (unknown ID, no results,
    no thinking text) are skipped silently.  Members are named
    ``<stem>_thinking.txt``; when several images share a stem, later ones
    get a numeric suffix (``<stem>_2_thinking.txt``, …) so no member
    overwrites another on extraction.
    """
    import io
    import zipfile

    session = _get_session(request)
    buf = io.BytesIO()
    used_names: set[str] = set()
    with zipfile.ZipFile(buf, 'w', zipfile.ZIP_DEFLATED) as zf:
        for image_id in req.image_ids:
            try:
                thinking_bytes, stem = _build_thinking_bytes(session, image_id)
            except HTTPException:
                continue  # skip pages without thinking
            member = f"{stem}_thinking.txt"
            serial = 1
            while member in used_names:
                serial += 1
                member = f"{stem}_{serial}_thinking.txt"
            used_names.add(member)
            zf.writestr(member, thinking_bytes)
    return Response(
        content=buf.getvalue(),
        media_type="application/zip",
        headers={"Content-Disposition": 'attachment; filename="batch_thinking.zip"'},
    )


@app.post("/api/batch/export-txt")
async def batch_export_txt(request: Request, req: BatchXMLRequest):
    """Return a ZIP archive containing one plain-text file per image.

    Images without results (or unknown IDs) are skipped silently.  Members
    are named ``<stem>.txt``; when several images share a stem, later ones
    get a numeric suffix (``<stem>_2.txt``, …) so no member overwrites
    another on extraction.
    """
    import io
    import zipfile

    session = _get_session(request)
    buf = io.BytesIO()
    used_names: set[str] = set()
    with zipfile.ZipFile(buf, 'w', zipfile.ZIP_DEFLATED) as zf:
        for image_id in req.image_ids:
            try:
                txt_bytes, stem = _build_txt_bytes(session, image_id)
            except HTTPException:
                continue  # skip images without results
            member = f"{stem}.txt"
            serial = 1
            while member in used_names:
                serial += 1
                member = f"{stem}_{serial}.txt"
            used_names.add(member)
            zf.writestr(member, txt_bytes)
    return Response(
        content=buf.getvalue(),
        media_type="application/zip",
        headers={"Content-Disposition": 'attachment; filename="batch_export_txt.zip"'},
    )


@app.post("/api/batch/export-xml")
async def batch_export_xml(request: Request, req: BatchXMLRequest):
    """Return a ZIP archive containing one PAGE XML file per image.

    Images without results (or unknown IDs) are skipped silently.  Members
    are named ``<stem>.xml``; when several images share a stem, later ones
    get a numeric suffix (``<stem>_2.xml``, …) so no member overwrites
    another on extraction.
    """
    import io
    import zipfile

    session = _get_session(request)
    buf = io.BytesIO()
    used_names: set[str] = set()
    with zipfile.ZipFile(buf, 'w', zipfile.ZIP_DEFLATED) as zf:
        for image_id in req.image_ids:
            try:
                xml_bytes, stem = _build_xml_bytes(session, image_id)
            except HTTPException:
                continue  # skip images without results
            member = f"{stem}.xml"
            serial = 1
            while member in used_names:
                serial += 1
                member = f"{stem}_{serial}.xml"
            used_names.add(member)
            zf.writestr(member, xml_bytes)
    return Response(
        content=buf.getvalue(),
        media_type="application/zip",
        headers={"Content-Disposition": 'attachment; filename="batch_export.zip"'},
    )


@app.get("/api/session")
async def session_info(request: Request):
    """Summarise the caller's session for debugging.

    Only the first eight characters of the session ID are included.
    """
    session = _get_session(request)
    masked_id = f"{session.session_id[:8]}..."
    return dict(
        session_id=masked_id,
        images=len(session.image_cache),
        active_transcriptions=len(session.cancel_events),
        pool_key=session.pool_key,
        created_at=session.created_at,
        last_active=session.last_active,
        total_sessions=len(sessions),
    )


@app.get("/api/engine/pool")
async def pool_status():
    """Describe every slot in the engine pool (admin/debug endpoint).

    ``age_s`` is the number of seconds since the slot was last used.
    """
    slot_reports = [
        {
            "pool_key": pool_key,
            "engine_name": entry.engine_name,
            "ref_count": entry.ref_count,
            "loaded": entry.engine.is_model_loaded(),
            "last_used": entry.last_used,
            "age_s": round(time.time() - entry.last_used, 0),
        }
        for pool_key, entry in engine_pool.items()
    ]
    return {
        "pool_size": len(engine_pool),
        "slots": slot_reports,
        "total_sessions": len(sessions),
    }


@app.get("/api/kraken/presets")
async def kraken_presets():
    """List the Kraken model presets declared in ``engines.kraken_engine``.

    Returns an empty preset list when that module cannot be imported.
    """
    try:
        from engines.kraken_engine import KRAKEN_MODELS
    except ImportError:
        return {"presets": []}

    return {
        "presets": [
            {
                "id": preset_id,
                "label": meta.get("description", preset_id),
                "language": meta.get("language", ""),
                "source": meta.get("source", ""),
            }
            for preset_id, meta in KRAKEN_MODELS.items()
        ]
    }


@app.post("/api/models/upload")
async def upload_model(file: UploadFile = File(...)):
    """Save an uploaded Kraken ``.mlmodel`` file under ``models/kraken_uploads/``.

    The stored name is the upload's base name reduced to alphanumerics and
    ``._- `` characters.  The response includes the path relative to the
    project root and a refreshed model list for the frontend.

    Raises:
        HTTPException: 400 if the name does not end in ``.mlmodel`` or the
            file is larger than 500 MiB.
    """
    original_name = file.filename or "model.mlmodel"
    if not original_name.lower().endswith(".mlmodel"):
        raise HTTPException(400, "Only .mlmodel files are accepted")

    payload = await file.read()
    size = len(payload)
    if size > 500 * 1024 * 1024:
        raise HTTPException(400, "File too large (max 500 MB)")

    target_dir = PROJECT_ROOT / "models" / "kraken_uploads"
    target_dir.mkdir(parents=True, exist_ok=True)

    # Sanitize filename — drop directory parts, keep only safe characters
    allowed_extra = "._- "
    kept_chars = [
        ch for ch in Path(original_name).name
        if ch.isalnum() or ch in allowed_extra
    ]
    safe_name = "".join(kept_chars).strip() or "uploaded.mlmodel"

    dest = target_dir / safe_name
    dest.write_bytes(payload)
    log.info(f"Uploaded Kraken model: {dest} ({size} bytes)")

    return {
        "path": str(dest.relative_to(PROJECT_ROOT)),  # e.g. models/kraken_uploads/foo.mlmodel
        "filename": safe_name,
        "size": size,
        "options": _scan_kraken_models(),  # refreshed list for frontend to repopulate select
    }


@app.get("/api/gpu")
async def gpu_status():
    """Report CUDA availability and per-GPU memory figures.

    Memory values come from ``torch.cuda.mem_get_info`` and are given in
    units of 10**6 bytes.  Utilisation percentages are added when pynvml can
    be queried.  Any failure (torch missing, CUDA error, …) produces the
    "not available" response rather than an HTTP error.
    """
    try:
        import torch
        if not torch.cuda.is_available():
            return {"available": False, "gpus": []}

        # pynvml (nvidia-ml-py) for utilization %; graceful fallback if missing
        # NOTE(review): NVML indices are matched to torch device indices
        # one-to-one — confirm this holds when CUDA_VISIBLE_DEVICES is set.
        utilization: dict[int, dict] = {}
        try:
            import pynvml
            pynvml.nvmlInit()
            for device_no in range(pynvml.nvmlDeviceGetCount()):
                handle = pynvml.nvmlDeviceGetHandleByIndex(device_no)
                rates = pynvml.nvmlDeviceGetUtilizationRates(handle)
                utilization[device_no] = {"gpu_pct": rates.gpu, "mem_pct": rates.memory}
        except Exception:
            pass  # pynvml unavailable — utilization fields omitted

        gpus = []
        for idx in range(torch.cuda.device_count()):
            free_bytes, total_bytes = torch.cuda.mem_get_info(idx)
            entry: dict = {
                "index": idx,
                "name": torch.cuda.get_device_name(idx),
                "memory_total_mb": round(total_bytes / 1e6),
                "memory_used_mb": round((total_bytes - free_bytes) / 1e6),
                "memory_free_mb": round(free_bytes / 1e6),
            }
            usage = utilization.get(idx)
            if usage is not None:
                entry["utilization_gpu_pct"] = usage["gpu_pct"]
                entry["utilization_mem_pct"] = usage["mem_pct"]
            gpus.append(entry)
        return {"available": True, "gpus": gpus}
    except Exception:
        return {"available": False, "gpus": []}


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _sse(event: str, data: dict) -> str:
    """Build one Server-Sent Event frame: an event name plus a JSON data line.

    Non-ASCII characters in ``data`` are emitted as-is rather than escaped.
    """
    payload = json.dumps(data, ensure_ascii=False)
    return "event: {}\ndata: {}\n\n".format(event, payload)