Spaces:

achimrabus
/

polyscriptor-htr-demo

Running

App Files Files Community

Achim Rabus committed on 18 days ago

Commit

78431ff

1 Parent(s): c8ba8c4

Deploy Polyscriptor HTR Space demo

Browse files

Files changed (35) hide show

.dockerignore +10 -0
.gitattributes +2 -0
Dockerfile +23 -0
README.md +73 -7
engines/__init__.py +17 -0
engines/commercial_api_engine.py +768 -0
engines/kraken_engine.py +535 -0
engines/openwebui_engine.py +505 -0
engines/pylaia_engine.py +414 -0
hf-space/README.md +43 -0
hf-space/SPACE_README.md +78 -0
hf-space/requirements.txt +21 -0
htr_engine_base.py +398 -0
inference_commercial_api.py +760 -0
inference_page.py +946 -0
inference_pylaia_native.py +453 -0
kraken_segmenter.py +823 -0
page_xml_exporter.py +276 -0
web/polyscriptor_server.py +2237 -0
web/server_config.yaml +25 -0
web/static/app.css +1269 -0
web/static/app.js +298 -0
web/static/components/batch-panel.js +735 -0
web/static/components/engine-panel.js +1091 -0
web/static/components/image-viewer.js +294 -0
web/static/components/transcription-panel.js +482 -0
web/static/fonts/MonomakhUnicode-Regular.woff2 +3 -0
web/static/index.html +323 -0
web/static/pwa/demo.css +698 -0
web/static/pwa/demo.html +204 -0
web/static/pwa/demo.js +1069 -0
web/static/pwa/icons/icon-192.png +3 -0
web/static/pwa/icons/icon-512.png +3 -0
web/static/pwa/manifest.json +27 -0
web/static/pwa/sw.js +60 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,10 @@

+.git
+.pytest_cache
+__pycache__
+**/__pycache__
+*.pyc
+*.ipynb
+*.zip
+models
+htr_gui
+Documentation

.gitattributes CHANGED Viewed

@@ -19,6 +19,7 @@
 *.pb filter=lfs diff=lfs merge=lfs -text
 *.pickle filter=lfs diff=lfs merge=lfs -text
 *.pkl filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *.rar filter=lfs diff=lfs merge=lfs -text
@@ -29,6 +30,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text

 *.pb filter=lfs diff=lfs merge=lfs -text
 *.pickle filter=lfs diff=lfs merge=lfs -text
 *.pkl filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *.rar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
+*.woff2 filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text

Dockerfile ADDED Viewed

	@@ -0,0 +1,23 @@

+# Base image: slim CPython 3.11.
+FROM python:3.11-slim
+# Python runtime flags, the demo-mode switch, the Hugging Face cache directory
+# (placed under /tmp) and the port number.
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    POLYSCRIPTOR_DEMO_MODE=hf_space \
+    HF_HOME=/tmp/huggingface \
+    PORT=7860
+WORKDIR /app
+# System shared libraries (presumably needed by the imaging dependencies - confirm).
+# The apt package lists are deleted in the same layer.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libgl1 \
+    libglib2.0-0 \
+    && rm -rf /var/lib/apt/lists/*
+# Requirements are copied and installed before the application source is copied.
+COPY hf-space/requirements.txt /tmp/requirements-hf-space.txt
+RUN pip install --no-cache-dir -r /tmp/requirements-hf-space.txt
+COPY . /app
+EXPOSE 7860
+# NOTE(review): the port is hard-coded here and in EXPOSE rather than read from $PORT above.
+CMD ["python", "-m", "uvicorn", "web.polyscriptor_server:app", "--host", "0.0.0.0", "--port", "7860"]

README.md CHANGED Viewed

@@ -1,12 +1,78 @@
 ---
-title: Polyscriptor Htr Demo
-emoji: 📉
-colorFrom: gray
-colorTo: green
 sdk: docker
 pinned: false
-license: mit
-short_description: Demo of Polyscriptor HTR
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Polyscriptor HTR Demo
+emoji: 📝
+colorFrom: blue
+colorTo: gray
 sdk: docker
 pinned: false
+license: apache-2.0
 ---
+# Polyscriptor HTR Demo
+Polyscriptor is a browser-based demo for handwritten text recognition (HTR) on
+historical Slavic manuscript material. This Hugging Face Space runs a constrained
+public version of the Polyscriptor FastAPI/Web interface.
+The hosted demo is intended for quick inspection and teaching. It is not the full
+local research environment used for training, batch processing, GPU inference, or
+private manuscript collections.
+## Source Code
+The public Polyscriptor source code is available on GitHub:
+https://github.com/achimrabus/polyscriptor
+This Hugging Face Space contains the curated hosted demo deployment. The GitHub
+repository contains the broader Polyscriptor codebase, including the web UI,
+engine plugins, segmentation code, training utilities, and local workflows.
+## What This Demo Supports
+- CRNN-CTC / PyLaia-inspired HTR presets for selected public model repositories.
+- User-supplied API keys for OpenAI, Gemini, Claude, and OpenWebUI-compatible
+  endpoints.
+- Public model download from the Hugging Face Hub, primarily under
+  `achimrabus/*`.
+- CPU-only inference.
+- Kraken Classical line segmentation, with HPP as a lightweight fallback.
+- Temporary image uploads during the active session.
+## Limitations
+- No private models are bundled with this Space.
+- API-based engines require users to paste their own API key in the browser
+  form. The Space does not ship with shared provider credentials.
+- Uploaded files are treated as temporary runtime data and are not part of the
+  repository.
+- Large local GPU/VLM engines from the full Polyscriptor workflow are not
+  enabled here.
+- Accuracy depends strongly on script, language, writing style, image quality,
+  and segmentation quality.
+## Model Notes
+The demo uses publicly available model presets. For best results, choose a model
+that matches the manuscript tradition as closely as possible. The current public
+Polyscriptor model cards are available at:
+https://huggingface.co/achimrabus
+## Project Context
+Polyscriptor is developed for historical HTR workflows, with a focus on Slavic
+manuscripts and reproducible comparison of OCR/HTR engines. The full development
+repository contains additional tooling for local use, training, evaluation, and
+batch processing; this Space contains only the hosted demo configuration.
+## Privacy
+Do not upload sensitive or unpublished manuscript images unless you are
+comfortable processing them in a hosted public demo environment. The application
+uses temporary server-side files during processing, but this Space should be
+treated as a public demonstration service rather than a secure private workflow.
+For API-based engines, provider keys are entered by the user at runtime. Do not
+commit keys to this repository or add them to the Space configuration unless you
+intend to provide a shared project credential.

engines/__init__.py ADDED Viewed

	@@ -0,0 +1,17 @@

+"""HTR Engine Plugins
+This package contains plugin implementations for different HTR engines.
+Each engine module implements the HTREngine interface defined in htr_engine_base.py.
+"""
+# Names advertised as the package's public API.
+# NOTE(review): nothing visible in this module imports or defines these names, so
+# `from engines import *` would fail unless they are bound elsewhere - confirm.
+__all__ = [
+    "TrOCREngine",
+    "Qwen3Engine",
+    "PyLaiaEngine",
+    "KrakenEngine",
+    "CommercialAPIEngine",
+    "PartyEngine",
+    "DeepSeekOCREngine",
+    "LightOnOCREngine",
+    "PaddleOCREngine",
+]

engines/commercial_api_engine.py ADDED Viewed

	@@ -0,0 +1,768 @@

+"""
+Commercial API Engine Plugin
+Wraps commercial HTR APIs (OpenAI, Gemini, Claude) as a unified plugin.
+"""
+import os
+from pathlib import Path
+from typing import Dict, Any, Optional
+import numpy as np
+from htr_engine_base import HTREngine, TranscriptionResult
+# Load environment variables from .env file
+try:
+    from dotenv import load_dotenv
+    # Look for .env in the project root (parent of engines/)
+    env_path = Path(__file__).parent.parent / ".env"
+    if env_path.exists():
+        load_dotenv(env_path)
+        print(f"[CommercialAPIEngine] Loaded environment variables from {env_path}")
+except ImportError:
+    print("[CommercialAPIEngine] Warning: python-dotenv not installed. API keys will not be loaded from .env file.")
+    print("Install with: pip install python-dotenv")
+try:
+    from PyQt6.QtWidgets import (
+        QWidget, QVBoxLayout, QHBoxLayout, QLabel, QComboBox,
+        QPushButton, QCheckBox, QLineEdit, QGroupBox, QTextEdit
+    )
+    from PyQt6.QtCore import Qt
+    PYQT_AVAILABLE = True
+except ImportError:
+    PYQT_AVAILABLE = False
+    QWidget = object
+try:
+    from inference_commercial_api import (
+        OpenAIInference, GeminiInference, ClaudeInference,
+        check_api_availability,
+        OPENAI_MODELS, GEMINI_MODELS, CLAUDE_MODELS,
+        fetch_openai_models, fetch_gemini_models
+    )
+    COMMERCIAL_API_AVAILABLE = True
+    API_AVAILABILITY = check_api_availability()
+except ImportError:
+    COMMERCIAL_API_AVAILABLE = False
+    API_AVAILABILITY = {"openai": False, "gemini": False, "claude": False}
+    OPENAI_MODELS = []
+    GEMINI_MODELS = []
+    CLAUDE_MODELS = []
+    fetch_openai_models = lambda api_key=None: []
+    fetch_gemini_models = lambda api_key=None: []
+class CommercialAPIEngine(HTREngine):
+    """Commercial API HTR engine plugin."""
+    def __init__(self):
+        # Instance attributes (avoid type annotations here for broader runtime compatibility in some environments)
+        self.model = None  # Can be OpenAI, Gemini, or Claude
+        self._config_widget = None
+        self._current_provider = None
+        # Widget references
+        self._provider_combo = None
+        self._model_combo = None
+        self._custom_model_edit = None
+        self._use_custom_model_check = None
+        self._refresh_models_btn = None
+        self._api_key_edit = None
+        self._show_key_check = None
+        self._prompt_edit = None
+        self._thinking_combo = None
+        self._temperature_edit = None
+        self._max_tokens_edit = None
+        self._early_exit_check = None
+        self._auto_continue_check = None
+        self._max_continuations_edit = None
+    def get_name(self) -> str:
+        """Return the display name of this engine."""
+        return "Commercial APIs"
+    def get_description(self) -> str:
+        """Return a one-line description naming the wrapped vision APIs."""
+        return "OpenAI GPT-4V, Google Gemini, Anthropic Claude vision APIs"
+    def is_available(self) -> bool:
+        return COMMERCIAL_API_AVAILABLE and any(API_AVAILABILITY.values())
+    def get_unavailable_reason(self) -> str:
+        if not COMMERCIAL_API_AVAILABLE:
+            return "Commercial API support not available. Install with: pip install openai google-generativeai anthropic"
+        if not any(API_AVAILABILITY.values()):
+            return "No API libraries installed. Install at least one: openai, google-generativeai, or anthropic"
+        return ""
+    def get_config_widget(self):
+        """Create Commercial API configuration panel.
+        The panel is built once and cached in ``self._config_widget``; later calls
+        return the cached widget. References to the individual controls are stored
+        on ``self`` so other methods can read them.
+        NOTE(review): when PyQt6 is missing, only ``QWidget`` is stubbed at module
+        level, so this method would fail on ``QVBoxLayout``; presumably it is never
+        called in headless mode - confirm.
+        """
+        if self._config_widget is not None:
+            return self._config_widget
+        widget = QWidget()
+        layout = QVBoxLayout()
+        # Provider selection
+        provider_group = QGroupBox("API Provider")
+        provider_layout = QVBoxLayout()
+        self._provider_combo = QComboBox()
+        # Offer only the providers flagged in API_AVAILABILITY.
+        available_providers = []
+        if API_AVAILABILITY.get("openai", False):
+            available_providers.append("OpenAI")
+        if API_AVAILABILITY.get("gemini", False):
+            available_providers.append("Gemini")
+        if API_AVAILABILITY.get("claude", False):
+            available_providers.append("Claude")
+        if not available_providers:
+            available_providers = ["No APIs available"]
+        self._provider_combo.addItems(available_providers)
+        self._provider_combo.currentTextChanged.connect(self._on_provider_changed)
+        provider_layout.addWidget(self._provider_combo)
+        provider_group.setLayout(provider_layout)
+        layout.addWidget(provider_group)
+        # Model selection
+        model_group = QGroupBox("Model")
+        model_layout = QVBoxLayout()
+        # Dropdown for standard models
+        model_dropdown_layout = QHBoxLayout()
+        self._model_combo = QComboBox()
+        model_dropdown_layout.addWidget(self._model_combo)
+        # Refresh models button
+        self._refresh_models_btn = QPushButton("🔄 Refresh")
+        self._refresh_models_btn.setToolTip("Fetch latest models from API")
+        self._refresh_models_btn.setMaximumWidth(80)
+        self._refresh_models_btn.clicked.connect(self._on_refresh_models)
+        model_dropdown_layout.addWidget(self._refresh_models_btn)
+        model_layout.addLayout(model_dropdown_layout)
+        # Custom model ID checkbox and field
+        custom_model_layout = QHBoxLayout()
+        self._use_custom_model_check = QCheckBox("Use custom model ID:")
+        self._use_custom_model_check.toggled.connect(self._on_custom_model_toggled)
+        custom_model_layout.addWidget(self._use_custom_model_check)
+        self._custom_model_edit = QLineEdit()
+        self._custom_model_edit.setPlaceholderText("e.g., gpt-4.5, o1-preview-2024-12-17")
+        self._custom_model_edit.setEnabled(False)  # Disabled by default
+        custom_model_layout.addWidget(self._custom_model_edit)
+        model_layout.addLayout(custom_model_layout)
+        model_hint = QLabel("💡 Use custom model ID for bleeding-edge models not in the dropdown")
+        model_hint.setStyleSheet("color: gray; font-size: 8pt;")
+        model_hint.setWordWrap(True)
+        model_layout.addWidget(model_hint)
+        model_group.setLayout(model_layout)
+        layout.addWidget(model_group)
+        # API key
+        key_group = QGroupBox("API Key")
+        key_layout = QVBoxLayout()
+        key_input_layout = QHBoxLayout()
+        self._api_key_edit = QLineEdit()
+        # The key is masked until the "Show" checkbox is toggled.
+        self._api_key_edit.setEchoMode(QLineEdit.EchoMode.Password)
+        self._api_key_edit.setPlaceholderText("Enter your API key")
+        key_input_layout.addWidget(self._api_key_edit)
+        self._show_key_check = QCheckBox("Show")
+        self._show_key_check.toggled.connect(self._toggle_key_visibility)
+        key_input_layout.addWidget(self._show_key_check)
+        key_layout.addLayout(key_input_layout)
+        key_hint = QLabel("API keys are stored locally in .trocr_gui/")
+        key_hint.setStyleSheet("color: gray; font-size: 9pt;")
+        key_layout.addWidget(key_hint)
+        key_group.setLayout(key_layout)
+        layout.addWidget(key_group)
+        # Prompt & Sampling section
+        prompt_group = QGroupBox("Prompt & Sampling (Optional)")
+        prompt_layout = QVBoxLayout()
+        self._prompt_edit = QTextEdit()
+        self._prompt_edit.setPlaceholderText("Enter custom transcription prompt...")
+        self._prompt_edit.setMaximumHeight(100)
+        prompt_layout.addWidget(self._prompt_edit)
+        # Temperature control
+        temp_row = QHBoxLayout()
+        temp_row.addWidget(QLabel("Temperature:"))
+        self._temperature_edit = QLineEdit()
+        self._temperature_edit.setPlaceholderText("1.0 (default)")
+        self._temperature_edit.setToolTip(
+            "Sampling temperature (web default ~1.0).\n"
+            "Use 0-0.3 for deterministic; >1 can increase variability."
+        )
+        self._temperature_edit.setMaximumWidth(90)
+        temp_row.addWidget(self._temperature_edit)
+        temp_row.addStretch()
+        prompt_layout.addLayout(temp_row)
+        # Max output tokens control
+        tokens_row = QHBoxLayout()
+        tokens_row.addWidget(QLabel("Max output tokens:"))
+        self._max_tokens_edit = QLineEdit()
+        self._max_tokens_edit.setPlaceholderText("4096 preview / 2048 default")
+        self._max_tokens_edit.setToolTip(
+            "Upper limit on generated tokens. Lowering may force earlier output.\n"
+            "Raising (e.g. 8192) may help high reasoning but risks long 'thinking'."
+        )
+        self._max_tokens_edit.setMaximumWidth(130)
+        tokens_row.addWidget(self._max_tokens_edit)
+        tokens_row.addStretch()
+        prompt_layout.addLayout(tokens_row)
+        prompt_group.setLayout(prompt_layout)
+        layout.addWidget(prompt_group)
+        # Thinking Mode section (for Gemini models)
+        thinking_group = QGroupBox("Thinking Mode (Gemini only)")
+        thinking_layout = QVBoxLayout()
+        # (Removed warning banner recommending alternative models; preview model retained for Church Slavonic use)
+        thinking_row = QHBoxLayout()
+        thinking_row.addWidget(QLabel("Reasoning:"))
+        self._thinking_combo = QComboBox()
+        self._thinking_combo.addItems(["Auto (Low for preview)", "Low (Fast)", "High (More reasoning)"])
+        self._thinking_combo.setToolTip(
+            "Low: Fast, direct output\n"
+            "High: Slower, uses more tokens for reasoning\n"
+            "Auto: Uses Low for preview models to avoid token waste"
+        )
+        thinking_row.addWidget(self._thinking_combo)
+        thinking_row.addStretch()
+        thinking_layout.addLayout(thinking_row)
+        thinking_group.setLayout(thinking_layout)
+        layout.addWidget(thinking_group)
+        # Advanced Gemini controls
+        advanced_group = QGroupBox("Gemini Advanced")
+        adv_layout = QVBoxLayout()
+        # Row 1: Checkboxes
+        adv_row1 = QHBoxLayout()
+        self._early_exit_check = QCheckBox("Early exit on first chunk")
+        self._early_exit_check.setChecked(True)
+        self._early_exit_check.setToolTip("If checked, streaming returns after first non-empty text chunk. Uncheck to collect full stream.")
+        adv_row1.addWidget(self._early_exit_check)
+        self._auto_continue_check = QCheckBox("Auto continuation")
+        self._auto_continue_check.setChecked(False)  # Default: off for speed
+        self._auto_continue_check.setToolTip("If checked, performs additional continuation calls to capture missed trailing text.")
+        adv_row1.addWidget(self._auto_continue_check)
+        adv_row1.addStretch()
+        adv_layout.addLayout(adv_row1)
+        # Row 2: Continuation settings (symmetrical grid)
+        adv_row2 = QHBoxLayout()
+        adv_row2.addWidget(QLabel("Max passes:"))
+        self._max_continuations_edit = QLineEdit()
+        self._max_continuations_edit.setText("2")  # Default value
+        self._max_continuations_edit.setToolTip("Maximum number of continuation attempts (2-3 recommended)")
+        self._max_continuations_edit.setFixedWidth(60)
+        adv_row2.addWidget(self._max_continuations_edit)
+        adv_row2.addSpacing(20)
+        adv_row2.addWidget(QLabel("Min new chars:"))
+        # This attribute and the three below are first assigned here, not in __init__.
+        self._min_new_chars_edit = QLineEdit()
+        self._min_new_chars_edit.setText("50")  # Default value
+        self._min_new_chars_edit.setToolTip("Minimum number of new characters required to accept a continuation chunk.")
+        self._min_new_chars_edit.setFixedWidth(60)
+        adv_row2.addWidget(self._min_new_chars_edit)
+        adv_row2.addStretch()
+        adv_layout.addLayout(adv_row2)
+        # Row 3: Token & fallback settings (symmetrical grid)
+        adv_row3 = QHBoxLayout()
+        adv_row3.addWidget(QLabel("Low-mode tokens:"))
+        self._low_initial_tokens_edit = QLineEdit()
+        self._low_initial_tokens_edit.setText("6144")  # Default value
+        self._low_initial_tokens_edit.setToolTip("Initial max_output_tokens for LOW thinking before fallback escalation (4096-8192).")
+        self._low_initial_tokens_edit.setFixedWidth(60)
+        adv_row3.addWidget(self._low_initial_tokens_edit)
+        adv_row3.addSpacing(20)
+        adv_row3.addWidget(QLabel("Fallback %:"))
+        self._reasoning_fallback_edit = QLineEdit()
+        self._reasoning_fallback_edit.setText("0.6")  # Default value
+        self._reasoning_fallback_edit.setToolTip("Fraction of token budget consumed internally (no output) that triggers early fallback (0.5-0.8).")
+        self._reasoning_fallback_edit.setFixedWidth(60)
+        adv_row3.addWidget(self._reasoning_fallback_edit)
+        adv_row3.addSpacing(20)
+        adv_row3.addWidget(QLabel("Fallback cap:"))
+        self._fallback_cap_edit = QLineEdit()
+        self._fallback_cap_edit.setText("8192")  # Default configurable cap
+        self._fallback_cap_edit.setToolTip("Maximum tokens for fallback attempt. Increase for page-wise recognition (e.g. 12288 or 16384).")
+        self._fallback_cap_edit.setFixedWidth(70)
+        adv_row3.addWidget(self._fallback_cap_edit)
+        adv_row3.addStretch()
+        adv_layout.addLayout(adv_row3)
+        advanced_group.setLayout(adv_layout)
+        layout.addWidget(advanced_group)
+        layout.addStretch()
+        widget.setLayout(layout)
+        self._config_widget = widget
+        # Initialize model list based on default provider
+        self._on_provider_changed(self._provider_combo.currentText())
+        return widget
+    def _get_api_key_file(self) -> 'Path':
+        """Get path to API key storage file."""
+        from pathlib import Path
+        storage_dir = Path.home() / ".trocr_gui"
+        storage_dir.mkdir(exist_ok=True)
+        return storage_dir / "api_keys.json"
+    def _load_saved_api_key(self):
+        """Load saved API key for current provider."""
+        try:
+            import json
+            key_file = self._get_api_key_file()
+            if key_file.exists():
+                with open(key_file, "r") as f:
+                    keys = json.load(f)
+                provider = self._provider_combo.currentText().lower()
+                if provider in keys:
+                    self._api_key_edit.setText(keys[provider])
+        except Exception as e:
+            print(f"Warning: Could not load saved API key: {e}")
+    def _save_api_key(self):
+        """Save API key for current provider."""
+        try:
+            import json
+            key_file = self._get_api_key_file()
+            # Load existing keys
+            keys = {}
+            if key_file.exists():
+                with open(key_file, "r") as f:
+                    keys = json.load(f)
+            # Update key for current provider
+            provider = self._provider_combo.currentText().lower()
+            api_key = self._api_key_edit.text().strip()
+            if api_key:
+                keys[provider] = api_key
+                with open(key_file, "w") as f:
+                    json.dump(keys, f, indent=2)
+        except Exception as e:
+            print(f"Warning: Could not save API key: {e}")
+    def _on_provider_changed(self, provider: str):
+        """Update model list when provider changes and load API key from environment."""
+        if self._model_combo is None:
+            return
+        self._model_combo.clear()
+        if provider == "OpenAI":
+            self._model_combo.addItems(OPENAI_MODELS)
+        elif provider == "Gemini":
+            self._model_combo.addItems(GEMINI_MODELS)
+        elif provider == "Claude":
+            self._model_combo.addItems(CLAUDE_MODELS)
+        else:
+            self._model_combo.addItem("No models available")
+        # Auto-load API key from environment variables
+        if self._api_key_edit is not None:
+            env_key = self._get_api_key_from_env(provider)
+            if env_key:
+                self._api_key_edit.setText(env_key)
+                print(f"[CommercialAPIEngine] Loaded {provider} API key from environment")
+    def _get_api_key_from_env(self, provider: str) -> Optional[str]:
+        """Get API key from environment variables based on provider."""
+        env_var_map = {
+            "OpenAI": "OPENAI_API_KEY",
+            "Gemini": "GOOGLE_API_KEY",
+            "Claude": "ANTHROPIC_API_KEY"
+        }
+        env_var = env_var_map.get(provider)
+        if env_var:
+            return os.getenv(env_var, "")
+    def _toggle_key_visibility(self, checked: bool):
+        """Toggle API key visibility."""
+        if checked:
+            self._api_key_edit.setEchoMode(QLineEdit.EchoMode.Normal)
+        else:
+            self._api_key_edit.setEchoMode(QLineEdit.EchoMode.Password)
+    def _on_custom_model_toggled(self, checked: bool):
+        """Enable/disable custom model field."""
+        self._custom_model_edit.setEnabled(checked)
+        self._model_combo.setEnabled(not checked)
+    def _on_refresh_models(self):
+        """Refresh model list from API dynamically."""
+        if self._model_combo is None or self._api_key_edit is None:
+            return
+        provider = self._provider_combo.currentText()
+        api_key = self._api_key_edit.text().strip()
+        if not api_key:
+            print(f"[CommercialAPIEngine] Cannot refresh models: No API key provided")
+            return
+        print(f"[CommercialAPIEngine] Refreshing {provider} models from API...")
+        # Save current selection
+        current_model = self._model_combo.currentText()
+        # Fetch models dynamically
+        if provider == "OpenAI":
+            models = fetch_openai_models(api_key)
+        elif provider == "Gemini":
+            models = fetch_gemini_models(api_key)
+        else:
+            print(f"[CommercialAPIEngine] Dynamic refresh not supported for {provider}")
+            return
+        # Update dropdown
+        self._model_combo.clear()
+        self._model_combo.addItems(models)
+        # Restore selection if possible
+        idx = self._model_combo.findText(current_model)
+        if idx >= 0:
+            self._model_combo.setCurrentIndex(idx)
+        print(f"[CommercialAPIEngine] Refreshed {len(models)} models for {provider}")
+    def get_config(self) -> Dict[str, Any]:
+        """Extract configuration from widget controls."""
+        if self._config_widget is None:
+            return {}
+        prompt_text = self._prompt_edit.toPlainText().strip()
+        # Use custom model if checkbox is enabled, otherwise use dropdown
+        if self._use_custom_model_check.isChecked():
+            model = self._custom_model_edit.text().strip()
+        else:
+            model = self._model_combo.currentText()
+        return {
+            "provider": self._provider_combo.currentText(),
+            "model": model,
+            "api_key": self._api_key_edit.text().strip(),
+            "custom_prompt": prompt_text if prompt_text else None,
+            "use_custom_model": self._use_custom_model_check.isChecked(),
+            "custom_model_id": self._custom_model_edit.text().strip(),
+        }
+    def set_config(self, config: Dict[str, Any]):
+        """Restore configuration to widget controls."""
+        if self._config_widget is None:
+            return
+        provider = config.get("provider", "")
+        idx = self._provider_combo.findText(provider)
+        if idx >= 0:
+            self._provider_combo.setCurrentIndex(idx)
+        # Restore custom model checkbox and field
+        use_custom = config.get("use_custom_model", False)
+        self._use_custom_model_check.setChecked(use_custom)
+        if use_custom:
+            custom_model_id = config.get("custom_model_id", "")
+            self._custom_model_edit.setText(custom_model_id)
+        else:
+            model = config.get("model", "")
+            idx = self._model_combo.findText(model)
+            if idx >= 0:
+                self._model_combo.setCurrentIndex(idx)
+        self._api_key_edit.setText(config.get("api_key", ""))
+        custom_prompt = config.get("custom_prompt", "")
+        if custom_prompt:
+            self._prompt_edit.setPlainText(custom_prompt)
+    def load_model(self, config: Dict[str, Any]) -> bool:
+        """Load (initialize) API client."""
+        try:
+            provider = config.get("provider", "")
+            model_name = config.get("model", "")
+            api_key = config.get("api_key", "")
+            if not api_key:
+                print("Error: No API key provided")
+                return False
+            # Unload previous model
+            self.unload_model()
+            # Initialize appropriate client
+            if provider == "OpenAI":
+                self.model = OpenAIInference(api_key=api_key, model=model_name)
+                self._current_provider = "openai"
+            elif provider == "Gemini":
+                self.model = GeminiInference(api_key=api_key, model=model_name)
+                self._current_provider = "gemini"
+            elif provider == "Claude":
+                self.model = ClaudeInference(api_key=api_key, model=model_name)
+                self._current_provider = "claude"
+            else:
+                return False
+            return True
+        except Exception as e:
+            print(f"Error initializing API client: {e}")
+            self.model = None
+            self._current_provider = None
+            return False
+    def unload_model(self):
+        """Unload (clear) API client."""
+        if self.model is not None:
+            del self.model
+            self.model = None
+            self._current_provider = None
+    def is_model_loaded(self) -> bool:
+        """Check if API client is initialized."""
+        return self.model is not None
+    def transcribe_line(self, image: np.ndarray, config: Optional[Dict[str, Any]] = None) -> TranscriptionResult:
+        """Transcribe a line image with commercial API."""
+        if self.model is None:
+            return TranscriptionResult(text="[API client not initialized]", confidence=0.0)
+        if config is None:
+            config = self.get_config()
+        custom_prompt = config.get("custom_prompt")
+        try:
+            # Convert numpy array to PIL Image
+            from PIL import Image
+            if isinstance(image, np.ndarray):
+                pil_image = Image.fromarray(image)
+            else:
+                pil_image = image
+            # All API clients have transcribe() method
+            # It returns a string directly, not a dict
+            # Enable retry logic for Gemini to handle content blocking
+            if self._current_provider == "gemini":
+                # Get thinking mode setting
+                thinking_mode = None
+                temperature = None
+                if self._thinking_combo is not None:
+                    thinking_text = self._thinking_combo.currentText()
+                    if "Low" in thinking_text:
+                        thinking_mode = "low"
+                        fast_direct = True  # low mode: request immediate output
+                    elif "High" in thinking_text:
+                        thinking_mode = "high"
+                    # else: Auto = None (default)
+                else:
+                    # Web UI context — get thinking_mode from config dict
+                    thinking_mode = config.get("thinking_mode") or None
+                if self._temperature_edit is not None:
+                    t_text = self._temperature_edit.text().strip()
+                    if t_text:
+                        try:
+                            temperature = float(t_text)
+                        except ValueError:
+                            temperature = None
+                max_tokens = None
+                if self._max_tokens_edit is not None:
+                    mt_text = self._max_tokens_edit.text().strip()
+                    if mt_text:
+                        try:
+                            max_tokens = int(mt_text)
+                        except ValueError:
+                            max_tokens = None
+                # Fallback to config dict (web UI context — no Qt widgets)
+                if max_tokens is None:
+                    max_tokens = config.get("max_output_tokens")
+                # Treat 0 as "no limit" (HTML number fields send 0 for blank)
+                if max_tokens is not None and max_tokens <= 0:
+                    max_tokens = None
+                if temperature is None:
+                    temperature = config.get("temperature")
+                # Web UI (no Qt widgets): disable early exit for full reasoning quality
+                if self._early_exit_check is not None:
+                    fast_direct_early_exit = self._early_exit_check.isChecked()
+                else:
+                    fast_direct_early_exit = False
+                # Extract continuation settings
+                auto_continue = False
+                max_auto_continuations = 2  # Default
+                if self._auto_continue_check is not None and self._auto_continue_check.isChecked():
+                    auto_continue = True
+                    if self._max_continuations_edit is not None:
+                        mc_text = self._max_continuations_edit.text().strip()
+                        if mc_text:
+                            try:
+                                max_auto_continuations = int(mc_text)
+                            except ValueError:
+                                pass  # Keep default of 2
+                # Extract continuation settings with defaults
+                continuation_min_new_chars = 50
+                if hasattr(self, '_min_new_chars_edit') and self._min_new_chars_edit is not None:
+                    mnc_text = self._min_new_chars_edit.text().strip()
+                    if mnc_text:
+                        try:
+                            continuation_min_new_chars = int(mnc_text)
+                        except ValueError:
+                            pass  # Keep default
+                # Web UI (no Qt widgets): disable reasoning fallback (1.0 = never trigger)
+                reasoning_fallback_threshold = 1.0 if not (hasattr(self, '_reasoning_fallback_edit') and self._reasoning_fallback_edit is not None) else 0.6
+                if hasattr(self, '_reasoning_fallback_edit') and self._reasoning_fallback_edit is not None:
+                    rft_text = self._reasoning_fallback_edit.text().strip()
+                    if rft_text:
+                        try:
+                            reasoning_fallback_threshold = float(rft_text)
+                        except ValueError:
+                            pass  # Keep default
+                fallback_cap = 8192
+                if hasattr(self, '_fallback_cap_edit') and self._fallback_cap_edit is not None:
+                    fc_text = self._fallback_cap_edit.text().strip()
+                    if fc_text:
+                        try:
+                            fallback_cap = int(fc_text)
+                        except ValueError:
+                            pass  # Keep default if invalid value
+                # Override max_tokens for LOW thinking mode if specified
+                if thinking_mode == 'low' and hasattr(self, '_low_initial_tokens_edit') and self._low_initial_tokens_edit is not None:
+                    lit_text = self._low_initial_tokens_edit.text().strip()
+                    if lit_text:
+                        try:
+                            lit_val = int(lit_text)
+                            if lit_val > 0:
+                                max_tokens = lit_val
+                                print(f"🔧 LOW thinking mode: overriding max_output_tokens to {max_tokens}")
+                        except ValueError:
+                            pass  # Keep existing max_tokens
+                # Debug: show final token budget
+                print(f"📊 Final settings: thinking_mode={thinking_mode}, max_output_tokens={max_tokens or 'model default'}, temp={temperature if temperature is not None else 1.0}")
+                text = self.model.transcribe(
+                    pil_image,
+                    prompt=custom_prompt,
+                    temperature=temperature if temperature is not None else 0.0,
+                    max_output_tokens=max_tokens,  # None = no limit, model uses its own maximum
+                    auto_retry_on_block=True,
+                    safety_relax=True,
+                    verbose_block_logging=True,
+                    thinking_mode=thinking_mode,
+                    fast_direct=fast_direct if 'fast_direct' in locals() else False,
+                    fast_direct_early_exit=fast_direct_early_exit,
+                    auto_continue=auto_continue,
+                    max_auto_continuations=max_auto_continuations,
+                    continuation_min_new_chars=continuation_min_new_chars,
+                    reasoning_fallback_threshold=reasoning_fallback_threshold,
+                    fallback_max_output_tokens=fallback_cap,
+                    record_stats_csv="gemini_runs.csv",
+                    apply_restriction_prompt=False  # Let model reason freely — improves transcription quality
+                )
+            else:
+                temperature = None
+                if self._temperature_edit is not None:
+                    t_text = self._temperature_edit.text().strip()
+                    if t_text:
+                        try:
+                            temperature = float(t_text)
+                        except ValueError:
+                            temperature = None
+                max_tokens = None
+                if self._max_tokens_edit is not None:
+                    mt_text = self._max_tokens_edit.text().strip()
+                    if mt_text:
+                        try:
+                            max_tokens = int(mt_text)
+                        except ValueError:
+                            max_tokens = None
+                # Fallback to config dict (web UI context — no Qt widgets)
+                if max_tokens is None:
+                    max_tokens = config.get("max_output_tokens")
+                # Treat 0 as "no limit" (HTML number fields send 0 for blank)
+                if max_tokens is not None and max_tokens <= 0:
+                    max_tokens = None
+                if temperature is None:
+                    temperature = config.get("temperature")
+                thinking_mode = config.get("thinking_mode") or None
+                text = self.model.transcribe(
+                    pil_image,
+                    prompt=custom_prompt,
+                    temperature=temperature if temperature is not None else 0.0,
+                    max_output_tokens=max_tokens,  # None = no limit, model uses its own maximum
+                    thinking_mode=thinking_mode,
+                )
+            meta: Dict[str, Any] = {
+                "provider": self._current_provider,
+                "model": config.get("model", ""),
+            }
+            if hasattr(self.model, "last_usage") and self.model.last_usage:
+                usage = dict(self.model.last_usage)
+                thinking_text = usage.pop("thinking_text", None)
+                meta["token_usage"] = usage
+                if thinking_text:
+                    meta["thinking_text"] = thinking_text
+            return TranscriptionResult(
+                text=text if text else "",
+                confidence=1.0,  # API models don't provide confidence
+                metadata=meta,
+            )
+        except Exception as e:
+            print(f"Error in API transcription: {e}")
+            import traceback
+            traceback.print_exc()
+            return TranscriptionResult(text=f"[API Error: {e}]", confidence=0.0)
+    def get_capabilities(self) -> Dict[str, bool]:
+        """Commercial API capabilities."""
+        return {
+            "batch_processing": False,  # APIs typically process one at a time
+            "confidence_scores": False,  # Most don't provide confidence
+            "beam_search": False,  # Internal to API
+            "language_model": True,  # All are language models
+            "preprocessing": True,  # APIs handle preprocessing
+        }
+    def requires_line_segmentation(self) -> bool:
+        """Commercial APIs can process full pages without segmentation."""
+        return False

engines/kraken_engine.py ADDED Viewed

	@@ -0,0 +1,535 @@

+"""
+Kraken HTR Engine Plugin
+Wraps the Kraken OCR system as a plugin for the unified GUI.
+Kraken is specialized for historical document OCR with robust segmentation and recognition.
+"""
+import sys
+from pathlib import Path
+from typing import Dict, Any, Optional
+import numpy as np
+def _print(msg: str) -> None:
+    """Print with graceful fallback if console can't encode the message (e.g. Windows CP-1252)."""
+    try:
+        print(msg)
+    except UnicodeEncodeError:
+        print(msg.encode("ascii", errors="replace").decode("ascii"))
+from htr_engine_base import HTREngine, TranscriptionResult
+try:
+    from PyQt6.QtWidgets import (
+        QWidget, QVBoxLayout, QHBoxLayout, QLabel, QComboBox,
+        QPushButton, QLineEdit, QFileDialog, QGroupBox, QCheckBox
+    )
+    from PyQt6.QtCore import Qt
+    PYQT_AVAILABLE = True
+except ImportError:
+    PYQT_AVAILABLE = False
+    QWidget = object
+try:
+    from kraken import rpred
+    from kraken.lib import vgsl, models
+    KRAKEN_AVAILABLE = True
+except ImportError:
+    KRAKEN_AVAILABLE = False
+# Local model (included in repo). The path is relative, so it is resolved
+# against the process working directory wherever it is opened.
+LOCAL_BLLA_MODEL = "pagexml/blla.mlmodel"
+# Preset Kraken models — local + Zenodo community models (auto-download on first use).
+# Each entry is keyed by a preset id and carries:
+#   "description" / "language" — shown in the preset dropdown
+#   "source"                   — "local" (uses "path") or "zenodo" (uses "zenodo_id", a DOI)
+# NOTE(review): "blla-local" and "arabic-muharaf" are described as *segmentation*
+# models, yet load_model() wraps every preset in a TorchSeqRecognizer (a recognizer).
+# Confirm these files really are recognition models before relying on them.
+KRAKEN_MODELS = {
+    "blla-local": {
+        "path": LOCAL_BLLA_MODEL,
+        "description": "BLLA Segmentation Model (Local, Default)",
+        "language": "multi",
+        "source": "local"
+    },
+    # --- VERIFIED ZENODO MODELS ---
+    # CATMuS-Print: printed text, multilingual, verified DOI 10.5281/zenodo.10592716
+    "catmus-print": {
+        "zenodo_id": "10.5281/zenodo.10592716",
+        "description": "CATMuS-Print (Modern Printed Text, multilingual)",
+        "language": "multi",
+        "source": "zenodo"
+    },
+    # Arabic handwritten segmentation (Muharaf Corpus), verified DOI 10.5281/zenodo.14295555
+    "arabic-muharaf": {
+        "zenodo_id": "10.5281/zenodo.14295555",
+        "description": "Arabic Handwritten Segmentation (Muharaf Corpus)",
+        "language": "arabic",
+        "source": "zenodo"
+    },
+}
+class KrakenEngine(HTREngine):
+    """Kraken HTR engine plugin."""
+    def __init__(self):
+        self.model: Optional[Any] = None  # TorchSeqRecognizer
+        self._config_widget: Optional[QWidget] = None
+        # Widget references
+        self._model_source_combo: Optional[QComboBox] = None
+        self._preset_combo: Optional[QComboBox] = None
+        self._custom_model_edit: Optional[QLineEdit] = None
+        self._bidi_reorder_check: Optional[QCheckBox] = None
+    def get_name(self) -> str:
+        return "Kraken"
+    def get_description(self) -> str:
+        return "Kraken OCR - Specialized for historical documents with .mlmodel support"
+    def is_available(self) -> bool:
+        return KRAKEN_AVAILABLE
+    def get_unavailable_reason(self) -> str:
+        if not KRAKEN_AVAILABLE:
+            return "Kraken not installed. Install with: pip install kraken"
+        return ""
+    def get_config_widget(self) -> QWidget:
+        """Create Kraken configuration panel.
+        The panel is built on the first call and cached in ``self._config_widget``;
+        later calls return the cached instance.
+        Returns:
+            The QWidget holding the model-source, preset, custom-model and
+            recognition-settings groups.
+        Raises:
+            RuntimeError: If PyQt6 could not be imported.
+        """
+        if not PYQT_AVAILABLE:
+            raise RuntimeError("PyQt6 not installed. Install with: pip install PyQt6")
+        # Reuse the panel if it was already built.
+        if self._config_widget is not None:
+            return self._config_widget
+        widget = QWidget()
+        layout = QVBoxLayout()
+        # Model source selection
+        source_group = QGroupBox("Model Source")
+        source_layout = QVBoxLayout()
+        self._model_source_combo = QComboBox()
+        self._model_source_combo.addItems(["Preset Models", "Custom Model File"])
+        # The handler touches _preset_group/_custom_group, which are only created
+        # below; it runs on a text *change*, which does not happen during setup here.
+        self._model_source_combo.currentTextChanged.connect(self._on_model_source_changed)
+        source_layout.addWidget(self._model_source_combo)
+        source_group.setLayout(source_layout)
+        layout.addWidget(source_group)
+        # Preset models group
+        self._preset_group = QGroupBox("Preset Model")
+        preset_layout = QVBoxLayout()
+        self._preset_combo = QComboBox()
+        # Fill the combo before connecting the signal so populating it does not
+        # invoke _on_preset_model_changed.
+        self._populate_preset_models()
+        self._preset_combo.currentIndexChanged.connect(self._on_preset_model_changed)
+        preset_layout.addWidget(QLabel("Model:"))
+        preset_layout.addWidget(self._preset_combo)
+        preset_hint = QLabel("Note: Zenodo models (⬇️) auto-download on first use")
+        preset_hint.setStyleSheet("color: gray; font-size: 9pt;")
+        preset_layout.addWidget(preset_hint)
+        self._preset_group.setLayout(preset_layout)
+        layout.addWidget(self._preset_group)
+        # Custom model group
+        self._custom_group = QGroupBox("Custom Model")
+        custom_layout = QVBoxLayout()
+        custom_layout.addWidget(QLabel("Model File (.mlmodel):"))
+        model_layout = QHBoxLayout()
+        self._custom_model_edit = QLineEdit()
+        self._custom_model_edit.setPlaceholderText("Path to .mlmodel file")
+        model_layout.addWidget(self._custom_model_edit)
+        browse_btn = QPushButton("Browse...")
+        browse_btn.clicked.connect(self._browse_model)
+        model_layout.addWidget(browse_btn)
+        custom_layout.addLayout(model_layout)
+        self._custom_group.setLayout(custom_layout)
+        self._custom_group.setVisible(False)  # Hidden by default
+        layout.addWidget(self._custom_group)
+        # Recognition settings
+        settings_group = QGroupBox("Recognition Settings")
+        settings_layout = QVBoxLayout()
+        self._bidi_reorder_check = QCheckBox("Bidirectional Text Reordering")
+        self._bidi_reorder_check.setChecked(True)
+        self._bidi_reorder_check.setToolTip("Enable for RTL languages (Arabic, Hebrew, etc.)")
+        settings_layout.addWidget(self._bidi_reorder_check)
+        settings_group.setLayout(settings_layout)
+        layout.addWidget(settings_group)
+        # Push all groups to the top of the panel.
+        layout.addStretch()
+        widget.setLayout(layout)
+        # Cache so get_config()/set_config() can tell the panel exists.
+        self._config_widget = widget
+        return widget
+    def _populate_preset_models(self):
+        """Populate preset models dropdown with local and Zenodo models."""
+        if self._preset_combo is None:
+            return
+        self._preset_combo.clear()
+        if not KRAKEN_MODELS:
+            self._preset_combo.addItem("No presets available")
+            return
+        # Local model first
+        for model_id, info in KRAKEN_MODELS.items():
+            if info.get("source") == "local":
+                desc = info.get('description', model_id)
+                self._preset_combo.addItem(f"📁 {desc}", userData=model_id)
+                break
+        self._preset_combo.insertSeparator(self._preset_combo.count())
+        # Zenodo models
+        for model_id, info in KRAKEN_MODELS.items():
+            if info.get("source") == "zenodo":
+                desc = info.get('description', model_id)
+                lang = info.get('language', '')
+                self._preset_combo.addItem(f"⬇️  {desc} ({lang})", userData=model_id)
+        self._preset_combo.insertSeparator(self._preset_combo.count())
+        self._preset_combo.addItem("📂 Browse Custom File...", userData="__custom__")
+    def _on_model_source_changed(self, source: str):
+        """Toggle between preset and custom model selection."""
+        is_preset = (source == "Preset Models")
+        self._preset_group.setVisible(is_preset)
+        self._custom_group.setVisible(not is_preset)
+    def _on_preset_model_changed(self, index: int):
+        """Handle preset selection — open file browser for custom option."""
+        model_id = self._preset_combo.currentData()
+        if model_id == "__custom__":
+            file_path, _ = QFileDialog.getOpenFileName(
+                self._config_widget,
+                "Select Kraken Model File",
+                "",
+                "Kraken Models (*.mlmodel);;All Files (*)"
+            )
+            if file_path:
+                self._model_source_combo.setCurrentText("Custom Model File")
+                self._custom_model_edit.setText(file_path)
+            self._preset_combo.blockSignals(True)
+            self._preset_combo.setCurrentIndex(0)
+            self._preset_combo.blockSignals(False)
+    def _browse_model(self):
+        """Open file dialog to select model file."""
+        file_path, _ = QFileDialog.getOpenFileName(
+            self._config_widget,
+            "Select Kraken Model",
+            "models",
+            "Kraken Models (*.mlmodel);;All Files (*)"
+        )
+        if file_path:
+            self._custom_model_edit.setText(file_path)
+    def get_config(self) -> Dict[str, Any]:
+        """Extract configuration from widget controls."""
+        if self._config_widget is None:
+            return {}
+        is_preset = (self._model_source_combo.currentText() == "Preset Models")
+        config = {
+            "model_source": "preset" if is_preset else "custom",
+            "bidi_reordering": self._bidi_reorder_check.isChecked(),
+        }
+        if is_preset:
+            model_id = self._preset_combo.currentData()
+            if model_id and model_id in KRAKEN_MODELS:
+                config["preset_id"] = model_id
+                config["model_path"] = KRAKEN_MODELS[model_id].get("path")
+        else:
+            config["model_path"] = self._custom_model_edit.text()
+        return config
+    def set_config(self, config: Dict[str, Any]):
+        """Restore configuration to widget controls."""
+        if self._config_widget is None:
+            return
+        model_source = config.get("model_source", "preset")
+        self._model_source_combo.setCurrentText("Preset Models" if model_source == "preset" else "Custom Model File")
+        if model_source == "preset":
+            preset_id = config.get("preset_id", "")
+            for i in range(self._preset_combo.count()):
+                if self._preset_combo.itemData(i) == preset_id:
+                    self._preset_combo.setCurrentIndex(i)
+                    break
+        else:
+            self._custom_model_edit.setText(config.get("model_path", ""))
+        self._bidi_reorder_check.setChecked(config.get("bidi_reordering", True))
+    def load_model(self, config: Dict[str, Any]) -> bool:
+        """Load Kraken model (local or Zenodo auto-download)."""
+        try:
+            model_path = config.get("model_path")
+            preset_id = config.get("preset_id")
+            # Resolve Zenodo preset: download if needed
+            if preset_id and preset_id in KRAKEN_MODELS:
+                model_info = KRAKEN_MODELS[preset_id]
+                if model_info.get("source") == "zenodo":
+                    zenodo_id = model_info.get("zenodo_id")
+                    model_path = self._download_zenodo_model(zenodo_id, preset_id)
+                    if not model_path:
+                        print(f"Error: Failed to download Zenodo model '{preset_id}'")
+                        return False
+                elif model_info.get("source") == "local":
+                    model_path = model_info.get("path")
+            # Fall back to default local blla model
+            if not model_path:
+                model_path = LOCAL_BLLA_MODEL
+                print(f"No model specified, using default: {model_path}")
+            if not Path(model_path).exists():
+                print(f"Error: Model file not found: {model_path}")
+                print("For Zenodo models, run: kraken get <zenodo_id>")
+                return False
+            vgsl_model = vgsl.TorchVGSLModel.load_model(model_path)
+            from kraken.lib.models import TorchSeqRecognizer
+            self.model = TorchSeqRecognizer(vgsl_model, device='cpu')
+            print(f"Kraken model loaded from: {model_path}")
+            return True
+        except Exception as e:
+            import traceback
+            print(f"Error loading Kraken model: {e}")
+            print(traceback.format_exc())
+            self.model = None
+            return False
+    def _download_zenodo_model(self, zenodo_id: str, model_name: str) -> Optional[str]:
+        """Download a Kraken model from Zenodo via `kraken get`.
+        Models are cached in `kraken_models/` inside the repo root.
+        Returns local path on success, None on failure.
+        """
+        import subprocess
+        import shutil
+        import sys
+        import time
+        # Prefer the kraken binary from the same venv as this Python process
+        # (shutil.which only searches PATH, which may not include the venv bin/ in
+        # systemd services that invoke uvicorn directly without activating the venv).
+        venv_kraken = Path(sys.executable).parent / "kraken"
+        kraken_cmd = str(venv_kraken) if venv_kraken.exists() else shutil.which("kraken")
+        if not kraken_cmd:
+            _print("❌ 'kraken' command not found. Install with: pip install kraken")
+            _print(f"💡 Manual download: https://zenodo.org/record/{zenodo_id.split('/')[-1]}")
+            return None
+        repo_root = Path(__file__).parent.parent
+        models_dir = repo_root / "kraken_models"
+        models_dir.mkdir(exist_ok=True)
+        model_path = models_dir / f"{model_name}.mlmodel"
+        if model_path.exists():
+            _print(f"✅ Using cached Zenodo model: {model_path}")
+            return str(model_path)
+        # Check for any existing name-matched file
+        for existing in models_dir.glob("*.mlmodel"):
+            if model_name.lower() in existing.stem.lower():
+                _print(f"✅ Found existing model: {existing}")
+                return str(existing)
+        _print(f"📥 Downloading Zenodo model {zenodo_id} …")
+        _print(f"📂 Will save to: {model_path}")
+        _print("⏳ This may take a few minutes on first use …")
+        try:
+            result = subprocess.run(
+                [kraken_cmd, "get", zenodo_id],
+                capture_output=True, text=True, timeout=300
+            )
+            if result.returncode == 0:
+                # Find freshly downloaded .mlmodel (modified within last 2 min)
+                search_dirs = [
+                    Path.home() / "Library" / "Application Support" / "htrmopo",
+                    Path.home() / ".kraken",
+                ]
+                downloaded = None
+                for d in search_dirs:
+                    if not d.exists():
+                        continue
+                    for p in d.rglob("*.mlmodel"):
+                        if time.time() - p.stat().st_mtime < 120:
+                            downloaded = p
+                            break
+                    if downloaded:
+                        break
+                if downloaded and downloaded.exists():
+                    shutil.copy2(downloaded, model_path)
+                    _print(f"✅ Model saved to: {model_path}")
+                    return str(model_path)
+                else:
+                    _print("⚠️  Download succeeded but couldn't locate the file")
+            else:
+                _print(f"❌ kraken get failed (exit {result.returncode}): {result.stderr}")
+                _print(f"💡 Manual: kraken get {zenodo_id}  then copy to {models_dir}/")
+        except subprocess.TimeoutExpired:
+            _print("⏱️  Download timeout (>5 min). Try manually: kraken get " + zenodo_id)
+        except Exception as e:
+            _print(f"❌ Download error: {e}")
+        return None
+    def unload_model(self):
+        """Unload model from memory."""
+        if self.model is not None:
+            del self.model
+            self.model = None
+            # Free GPU memory
+            import torch
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+    def is_model_loaded(self) -> bool:
+        """Check if model is loaded."""
+        return self.model is not None
+    def transcribe_line(self, image: np.ndarray, config: Optional[Dict[str, Any]] = None) -> TranscriptionResult:
+        """Transcribe a line image with Kraken."""
+        if self.model is None:
+            return TranscriptionResult(text="[Model not loaded]", confidence=0.0)
+        if config is None:
+            config = self.get_config()
+        try:
+            # Import numpy at the start
+            import numpy as np
+            # Convert numpy to PIL
+            from PIL import Image as PILImage
+            if isinstance(image, np.ndarray):
+                pil_image = PILImage.fromarray(image)
+            else:
+                pil_image = image
+            # Convert to grayscale first
+            if pil_image.mode != 'L':
+                pil_image = pil_image.convert('L')
+            # IMPORTANT: Do NOT binarize! Kraken models work better with grayscale
+            # Modern Kraken models are trained on grayscale images and binarization
+            # destroys character details, especially in historical manuscripts
+            # The previous median threshold was causing poor recognition quality
+            binary_image = pil_image  # Keep original grayscale
+            # Create a simple segmentation boundary for the full line image
+            # Kraken's rpred needs a Segmentation object with line boundaries
+            from kraken.containers import BaselineLine, Segmentation
+            height, width = binary_image.height, binary_image.width
+            # Create a baseline (horizontal line through the middle)
+            # Use 0-indexed coordinates (width-1, height-1 as maximum)
+            baseline = [[0, height // 2], [width - 1, height // 2]]
+            # Create a boundary polygon (rectangle around the entire image)
+            # Use 0-indexed coordinates to avoid "outside of image bounds" error
+            boundary = [[0, 0], [width - 1, 0], [width - 1, height - 1], [0, height - 1]]
+            # Create a BaselineLine (not BBoxLine - that doesn't support baselines)
+            line = BaselineLine(
+                id='line_0',
+                baseline=baseline,
+                boundary=boundary,
+                text='',
+                tags=None,
+                split=None
+            )
+            # Create Segmentation container
+            seg = Segmentation(
+                type='baselines',
+                imagename='line',
+                text_direction='horizontal-lr',
+                script_detection=False,
+                lines=[line],
+                regions={},
+                line_orders=[]
+            )
+            # Run recognition
+            bidi = config.get("bidi_reordering", True)
+            # Model is already wrapped as TorchSeqRecognizer in load_model()
+            # rpred returns a generator
+            results = list(rpred.rpred(
+                network=self.model,
+                im=binary_image,
+                bounds=seg,
+                bidi_reordering=bidi
+            ))
+            # Extract text from first result
+            if results and len(results) > 0:
+                text = results[0].prediction
+                confidence = results[0].confidences
+                avg_confidence = sum(confidence) / len(confidence) if confidence else 1.0
+                return TranscriptionResult(
+                    text=text,
+                    confidence=avg_confidence,
+                    metadata={"model": "kraken"}
+                )
+            else:
+                return TranscriptionResult(text="", confidence=0.0)
+        except Exception as e:
+            import traceback
+            print(f"Error in Kraken transcription: {e}")
+            print(traceback.format_exc())
+            return TranscriptionResult(text=f"[Error: {e}]", confidence=0.0)
+    def get_capabilities(self) -> Dict[str, bool]:
+        """Kraken capabilities."""
+        return {
+            "batch_processing": False,  # Could be implemented
+            "confidence_scores": True,  # Kraken provides per-character confidence
+            "beam_search": False,  # Internal to Kraken
+            "language_model": False,  # Not explicitly exposed
+            "preprocessing": False,  # External binarization recommended
+        }
+def download_preset_model(preset_name: str) -> Optional[str]:
+    """Module-level helper: resolve and (if needed) download a Kraken preset model.
+    Used by batch_processing.py and the web server without instantiating KrakenEngine.
+    Returns local file path, or None on failure.
+    """
+    if preset_name not in KRAKEN_MODELS:
+        print(f"Unknown Kraken preset: '{preset_name}'. Available: {list(KRAKEN_MODELS)}")
+        return None
+    info = KRAKEN_MODELS[preset_name]
+    if info.get("source") == "local":
+        return info.get("path")
+    if info.get("source") == "zenodo":
+        engine = KrakenEngine.__new__(KrakenEngine)
+        return engine._download_zenodo_model(info["zenodo_id"], preset_name)
+    return None

engines/openwebui_engine.py ADDED Viewed

	@@ -0,0 +1,505 @@

+"""
+OpenWebUI Engine Plugin
+Wraps the OpenWebUI API (OpenAI-compatible) from uni-freiburg.de as an HTR engine.
+Supports multiple models available on the OpenWebUI platform.
+"""
+from typing import Dict, Any, Optional, List
+from pathlib import Path
+import numpy as np
+from PIL import Image
+import io
+import base64
+from htr_engine_base import HTREngine, TranscriptionResult
+try:
+    from PyQt6.QtWidgets import (
+        QWidget, QVBoxLayout, QHBoxLayout, QLabel, QComboBox,
+        QPushButton, QCheckBox, QLineEdit, QGroupBox, QTextEdit,
+        QSpinBox
+    )
+    from PyQt6.QtCore import Qt
+    PYQT_AVAILABLE = True
+except ImportError:
+    PYQT_AVAILABLE = False
+    QWidget = object
+try:
+    from openai import OpenAI
+    OPENAI_AVAILABLE = True
+except ImportError:
+    OPENAI_AVAILABLE = False
+try:
+    from dotenv import load_dotenv
+    DOTENV_AVAILABLE = True
+except ImportError:
+    DOTENV_AVAILABLE = False
+class OpenWebUIEngine(HTREngine):
+    """OpenWebUI API HTR engine plugin (OpenAI-compatible)."""
+    def __init__(self):
+        self.client: Optional[OpenAI] = None
+        self._config_widget: Optional[QWidget] = None
+        self._available_models: List[str] = []
+        # Store config from load_model for batch processing
+        self._loaded_config: Dict[str, Any] = {}
+        # Widget references
+        self._model_combo: Optional[QComboBox] = None
+        self._api_key_edit: Optional[QLineEdit] = None
+        self._show_key_check: Optional[QCheckBox] = None
+        self._prompt_edit: Optional[QTextEdit] = None
+        self._temperature_spin: Optional[QSpinBox] = None
+        self._max_tokens_spin: Optional[QSpinBox] = None
+        self._refresh_models_btn: Optional[QPushButton] = None
+        # Default API configuration
+        self.base_url = ""
+        # Load environment variables from .env file (only once when instantiated)
+        self._load_env_variables()
+    def _load_env_variables(self):
+        """Load environment variables from .env file if available."""
+        try:
+            from dotenv import load_dotenv
+            # Look for .env in the project root (parent of engines/)
+            env_path = Path(__file__).parent.parent / ".env"
+            if env_path.exists():
+                load_dotenv(env_path)
+        except ImportError:
+            # Silently skip if python-dotenv is not installed
+            # Environment variables can still be set via OS
+            pass
+        # Load environment variables from .env file (if available)
+        self._load_env_file()
+    def _load_env_file(self):
+        """Load environment variables from project root's .env file.
+        Looks for .env in the project root directory (parent of engines/).
+        Silently skips loading if python-dotenv is not installed or if .env doesn't exist.
+        If .env loading fails or is skipped, the engine will still work if the API key
+        is provided through the config dict.
+        """
+        if not DOTENV_AVAILABLE:
+            return
+        env_path = Path(__file__).parent.parent / ".env"
+        if env_path.exists():
+            load_dotenv(env_path)
+    def get_name(self) -> str:
+        return "OpenWebUI"
+    def get_description(self) -> str:
+        return "OpenWebUI API from openwebui.uni-freiburg.de (OpenAI-compatible, multiple models)"
+    def is_available(self) -> bool:
+        return OPENAI_AVAILABLE
+    def get_unavailable_reason(self) -> str:
+        if not OPENAI_AVAILABLE:
+            return "OpenAI library not installed. Install with: pip install openai"
+        return ""
+    def get_config_widget(self) -> QWidget:
+        """Create OpenWebUI configuration panel.
+        The panel is built once and cached in ``self._config_widget``; later calls
+        return the cached widget. It contains four groups: API key entry, model
+        selection with a refresh button, generation parameters (temperature and
+        max tokens) and an optional custom prompt. After building the panel, a
+        previously saved API key is loaded into the key field.
+        Returns:
+            The (possibly cached) configuration widget.
+        """
+        if self._config_widget is not None:
+            return self._config_widget
+        widget = QWidget()
+        layout = QVBoxLayout()
+        # API Key section
+        key_group = QGroupBox("API Key")
+        key_layout = QVBoxLayout()
+        key_input_layout = QHBoxLayout()
+        self._api_key_edit = QLineEdit()
+        # Mask the key by default; the "Show" checkbox below toggles this.
+        self._api_key_edit.setEchoMode(QLineEdit.EchoMode.Password)
+        self._api_key_edit.setPlaceholderText("Enter your OpenWebUI API key")
+        key_input_layout.addWidget(self._api_key_edit)
+        self._show_key_check = QCheckBox("Show")
+        self._show_key_check.toggled.connect(self._toggle_key_visibility)
+        key_input_layout.addWidget(self._show_key_check)
+        key_layout.addLayout(key_input_layout)
+        key_hint = QLabel("Get your API key from https://openwebui.uni-freiburg.de")
+        key_hint.setStyleSheet("color: gray; font-size: 9pt;")
+        key_layout.addWidget(key_hint)
+        key_group.setLayout(key_layout)
+        layout.addWidget(key_group)
+        # Model selection with refresh button
+        model_group = QGroupBox("Model Selection")
+        model_layout = QVBoxLayout()
+        model_select_layout = QHBoxLayout()
+        # The combo starts empty; it is filled by _refresh_models on button click.
+        self._model_combo = QComboBox()
+        self._model_combo.setMinimumWidth(300)
+        model_select_layout.addWidget(self._model_combo)
+        self._refresh_models_btn = QPushButton("Refresh Models")
+        self._refresh_models_btn.clicked.connect(self._refresh_models)
+        model_select_layout.addWidget(self._refresh_models_btn)
+        model_layout.addLayout(model_select_layout)
+        model_hint = QLabel("Click 'Refresh Models' to load available models from the server")
+        model_hint.setStyleSheet("color: gray; font-size: 9pt;")
+        model_layout.addWidget(model_hint)
+        model_group.setLayout(model_layout)
+        layout.addWidget(model_group)
+        # Generation parameters
+        params_group = QGroupBox("Generation Parameters")
+        params_layout = QVBoxLayout()
+        # Temperature
+        # Stored as an integer in hundredths; get_config divides the value by 100.
+        temp_layout = QHBoxLayout()
+        temp_layout.addWidget(QLabel("Temperature:"))
+        self._temperature_spin = QSpinBox()
+        self._temperature_spin.setRange(0, 100)
+        self._temperature_spin.setValue(10)  # 0.1
+        self._temperature_spin.setSuffix(" (×0.01)")
+        temp_layout.addWidget(self._temperature_spin)
+        temp_layout.addStretch()
+        params_layout.addLayout(temp_layout)
+        # Max tokens
+        tokens_layout = QHBoxLayout()
+        tokens_layout.addWidget(QLabel("Max Tokens:"))
+        self._max_tokens_spin = QSpinBox()
+        self._max_tokens_spin.setRange(100, 4096)
+        self._max_tokens_spin.setValue(500)
+        tokens_layout.addWidget(self._max_tokens_spin)
+        tokens_layout.addStretch()
+        params_layout.addLayout(tokens_layout)
+        params_group.setLayout(params_layout)
+        layout.addWidget(params_group)
+        # Custom prompt section
+        prompt_group = QGroupBox("Custom Prompt (Optional)")
+        prompt_layout = QVBoxLayout()
+        self._prompt_edit = QTextEdit()
+        # The placeholder shows the default prompt that transcribe_line uses
+        # when this field is left empty.
+        self._prompt_edit.setPlaceholderText(
+            "Enter custom transcription prompt...\n\n"
+            "Default prompt:\n"
+            "Transcribe the text in this historical manuscript line image. "
+            "Return only the transcribed text without any explanation or formatting."
+        )
+        self._prompt_edit.setMaximumHeight(120)
+        prompt_layout.addWidget(self._prompt_edit)
+        prompt_group.setLayout(prompt_layout)
+        layout.addWidget(prompt_group)
+        layout.addStretch()
+        widget.setLayout(layout)
+        # Cache before loading the key so repeated calls reuse this widget.
+        self._config_widget = widget
+        # Try to load saved API key
+        self._load_saved_api_key()
+        return widget
+    def _toggle_key_visibility(self, checked: bool):
+        """Toggle API key visibility."""
+        if checked:
+            self._api_key_edit.setEchoMode(QLineEdit.EchoMode.Normal)
+        else:
+            self._api_key_edit.setEchoMode(QLineEdit.EchoMode.Password)
+    def _get_api_key_file(self) -> 'Path':
+        """Get path to API key storage file."""
+        from pathlib import Path
+        storage_dir = Path.home() / ".trocr_gui"
+        storage_dir.mkdir(exist_ok=True)
+        return storage_dir / "api_keys.json"
+    def _load_saved_api_key(self):
+        """Load saved API key."""
+        try:
+            import json
+            key_file = self._get_api_key_file()
+            if key_file.exists():
+                with open(key_file, "r") as f:
+                    keys = json.load(f)
+                if "openwebui" in keys:
+                    self._api_key_edit.setText(keys["openwebui"])
+        except Exception as e:
+            print(f"Warning: Could not load saved API key: {e}")
+    def _save_api_key(self):
+        """Save API key."""
+        try:
+            import json
+            key_file = self._get_api_key_file()
+            # Load existing keys
+            keys = {}
+            if key_file.exists():
+                with open(key_file, "r") as f:
+                    keys = json.load(f)
+            # Update key for OpenWebUI
+            api_key = self._api_key_edit.text().strip()
+            if api_key:
+                keys["openwebui"] = api_key
+                with open(key_file, "w") as f:
+                    json.dump(keys, f, indent=2)
+        except Exception as e:
+            print(f"Warning: Could not save API key: {e}")
+    def _refresh_models(self):
+        """Fetch available models from OpenWebUI API."""
+        api_key = self._api_key_edit.text().strip()
+        if not api_key:
+            self._model_combo.clear()
+            self._model_combo.addItem("Please enter API key first")
+            return
+        try:
+            # Create temporary client to fetch models
+            client = OpenAI(
+                base_url=self.base_url,
+                api_key=api_key
+            )
+            # Fetch models
+            models = client.models.list()
+            self._available_models = []
+            for model in models.data:
+                self._available_models.append(model.id)
+            # Update combo box
+            self._model_combo.clear()
+            if self._available_models:
+                self._model_combo.addItems(sorted(self._available_models))
+                print(f"[OpenWebUI] Loaded {len(self._available_models)} models")
+            else:
+                self._model_combo.addItem("No models found")
+        except Exception as e:
+            print(f"Error fetching models: {e}")
+            self._model_combo.clear()
+            self._model_combo.addItem(f"Error: {str(e)[:50]}")
+    def get_config(self) -> Dict[str, Any]:
+        """Extract configuration from widget controls."""
+        if self._config_widget is None:
+            return {}
+        prompt_text = self._prompt_edit.toPlainText().strip()
+        return {
+            "api_key": self._api_key_edit.text().strip(),
+            "model": self._model_combo.currentText(),
+            "temperature": self._temperature_spin.value() / 100.0,
+            "max_tokens": self._max_tokens_spin.value(),
+            "custom_prompt": prompt_text if prompt_text else None,
+        }
+    def set_config(self, config: Dict[str, Any]):
+        """Restore configuration to widget controls."""
+        if self._config_widget is None:
+            return
+        self._api_key_edit.setText(config.get("api_key", ""))
+        model = config.get("model", "")
+        idx = self._model_combo.findText(model)
+        if idx >= 0:
+            self._model_combo.setCurrentIndex(idx)
+        temp = int(config.get("temperature", 0.1) * 100)
+        self._temperature_spin.setValue(temp)
+        self._max_tokens_spin.setValue(config.get("max_tokens", 500))
+        custom_prompt = config.get("custom_prompt", "")
+        if custom_prompt:
+            self._prompt_edit.setPlainText(custom_prompt)
+    def load_model(self, config: Dict[str, Any]) -> bool:
+        """Initialize OpenWebUI client."""
+        try:
+            api_key = config.get("api_key", "")
+            if not api_key:
+                print("Error: No API key provided. Paste your key in the field.")
+                return False
+            base_url = config.get("base_url", "").strip().rstrip("/")
+            if not base_url:
+                print("Error: No OpenWebUI base URL provided.")
+                return False
+            # Store config for batch processing (model, temperature, etc.)
+            self._loaded_config = config.copy()
+            # Save API key for future use
+            if self._api_key_edit and self._api_key_edit.text().strip():
+                self._save_api_key()
+            self.base_url = base_url
+            # Initialize client
+            self.client = OpenAI(
+                base_url=self.base_url,
+                api_key=api_key
+            )
+            model = config.get("model", config.get("model_id", "unknown"))
+            print(f"[OpenWebUI] Client initialized with base URL: {self.base_url}, model: {model}")
+            return True
+        except Exception as e:
+            print(f"Error initializing OpenWebUI client: {e}")
+            self.client = None
+            return False
+    def unload_model(self):
+        """Unload OpenWebUI client."""
+        if self.client is not None:
+            self.client = None
+        self._loaded_config = {}
+    def is_model_loaded(self) -> bool:
+        """Check if client is initialized."""
+        return self.client is not None
+    def transcribe_line(self, image: np.ndarray, config: Optional[Dict[str, Any]] = None) -> TranscriptionResult:
+        """Transcribe a line image with OpenWebUI API."""
+        if self.client is None:
+            return TranscriptionResult(text="[OpenWebUI client not initialized]", confidence=0.0)
+        if config is None:
+            # First try loaded config (from batch processing), then GUI config
+            if self._loaded_config:
+                config = self._loaded_config
+            else:
+                config = self.get_config()
+        try:
+            # Convert numpy array to PIL Image
+            if isinstance(image, np.ndarray):
+                pil_image = Image.fromarray(image)
+            else:
+                pil_image = image
+            # Convert to RGB if needed
+            if pil_image.mode != 'RGB':
+                pil_image = pil_image.convert('RGB')
+            # Encode image to base64
+            buffered = io.BytesIO()
+            pil_image.save(buffered, format="PNG")
+            img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
+            # Prepare prompt
+            custom_prompt = config.get("custom_prompt")
+            if custom_prompt:
+                prompt = custom_prompt
+            else:
+                prompt = (
+                    "Transcribe the text in this historical manuscript line image. "
+                    "Return only the transcribed text without any explanation or formatting."
+                )
+            # Get model and parameters
+            model = config.get("model", "gpt-4-vision-preview")
+            temperature = config.get("temperature", 0.1)
+            max_tokens = config.get("max_tokens")
+            # Treat 0 as "no limit" (HTML number fields send 0 for blank)
+            if max_tokens is not None and max_tokens <= 0:
+                max_tokens = None
+            # Call OpenWebUI API (OpenAI-compatible)
+            api_kwargs = dict(
+                model=model,
+                messages=[
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": prompt
+                            },
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                    "url": f"data:image/png;base64,{img_base64}"
+                                }
+                            }
+                        ]
+                    }
+                ],
+                temperature=temperature,
+            )
+            if max_tokens is not None:
+                api_kwargs["max_tokens"] = max_tokens
+            response = self.client.chat.completions.create(**api_kwargs)
+            # Extract transcription
+            text = response.choices[0].message.content.strip()
+            # Extract usage info
+            usage = {}
+            if hasattr(response, 'usage') and response.usage:
+                usage = {
+                    "prompt_tokens": response.usage.prompt_tokens,
+                    "completion_tokens": response.usage.completion_tokens,
+                    "total_tokens": response.usage.total_tokens
+                }
+            return TranscriptionResult(
+                text=text,
+                confidence=1.0,  # OpenWebUI doesn't provide confidence
+                metadata={
+                    "provider": "openwebui",
+                    "model": model,
+                    "usage": usage
+                }
+            )
+        except Exception as e:
+            print(f"Error in OpenWebUI transcription: {e}")
+            import traceback
+            traceback.print_exc()
+            return TranscriptionResult(text=f"[OpenWebUI Error: {e}]", confidence=0.0)
+    def get_capabilities(self) -> Dict[str, bool]:
+        """OpenWebUI capabilities."""
+        return {
+            "batch_processing": False,
+            "confidence_scores": False,
+            "beam_search": False,
+            "language_model": True,
+            "preprocessing": True,
+        }
+    def requires_line_segmentation(self) -> bool:
+        """OpenWebUI VLMs can process full pages directly without segmentation."""
+        return False  # VLMs process full page images

engines/pylaia_engine.py ADDED Viewed

	@@ -0,0 +1,414 @@

+"""
+PyLaia Engine Plugin
+Wraps the PyLaia CTC-based HTR inference system as a plugin.
+"""
+from pathlib import Path
+from typing import Dict, Any, List, Optional
+import numpy as np
+from htr_engine_base import HTREngine, TranscriptionResult
+try:
+    from PyQt6.QtWidgets import (
+        QWidget, QVBoxLayout, QHBoxLayout, QLabel, QComboBox,
+        QPushButton, QCheckBox, QLineEdit, QFileDialog,
+        QGroupBox, QDoubleSpinBox
+    )
+    from PyQt6.QtCore import Qt
+    PYQT_AVAILABLE = True
+except ImportError:
+    PYQT_AVAILABLE = False
+    QWidget = object
+try:
+    # Use native Linux implementation (no WSL dependency)
+    from inference_pylaia_native import PyLaiaInference, PYLAIA_MODELS
+    PYLAIA_AVAILABLE = True
+    PYLAIA_LM_AVAILABLE = False  # Language model not yet implemented
+except ImportError:
+    PYLAIA_AVAILABLE = False
+    PYLAIA_MODELS = {}
+    PYLAIA_LM_AVAILABLE = False
+class PyLaiaEngine(HTREngine):
+    """PyLaia CTC-based HTR engine plugin."""
+    def __init__(self):
+        self.model: Optional[PyLaiaInference] = None
+        self.model_lm: Optional[PyLaiaInferenceLM] = None
+        self._config_widget: Optional[QWidget] = None
+        # Widget references
+        self._model_combo: Optional[QComboBox] = None
+        self._use_lm_check: Optional[QCheckBox] = None
+        self._lm_weight_spin: Optional[QDoubleSpinBox] = None
+        self._custom_model_edit: Optional[QLineEdit] = None
+        self._custom_lm_edit: Optional[QLineEdit] = None
+        self._enable_spaces_check: Optional[QCheckBox] = None
+    def get_name(self) -> str:
+        return "CRNN-CTC (PyLaia-inspired)"
+    def get_description(self) -> str:
+        return "Puigcerver CRNN-CTC: clean-room PyTorch reimplementation of the PyLaia architecture"
+    def get_aliases(self) -> List[str]:
+        return ["crnn-ctc", "CRNN-CTC", "PyLaia"]  # "PyLaia" kept for backward compatibility
+    def is_available(self) -> bool:
+        return PYLAIA_AVAILABLE
+    def get_unavailable_reason(self) -> str:
+        if not PYLAIA_AVAILABLE:
+            return "CRNN-CTC engine not available. Check that inference_pylaia_native.py exists and dependencies are installed."
+        return ""
+    def get_config_widget(self) -> QWidget:
+        """Create PyLaia configuration panel."""
+        if self._config_widget is not None:
+            return self._config_widget
+        widget = QWidget()
+        layout = QVBoxLayout()
+        # Model selection
+        model_group = QGroupBox("Model Selection")
+        model_layout = QVBoxLayout()
+        # Preset models
+        model_layout.addWidget(QLabel("Preset Model:"))
+        self._model_combo = QComboBox()
+        self._populate_preset_models()
+        self._model_combo.currentTextChanged.connect(self._on_preset_changed)
+        model_layout.addWidget(self._model_combo)
+        # Custom model path
+        model_layout.addWidget(QLabel("Custom Model Path:"))
+        custom_layout = QHBoxLayout()
+        self._custom_model_edit = QLineEdit()
+        self._custom_model_edit.setPlaceholderText("Leave empty to use preset model")
+        custom_layout.addWidget(self._custom_model_edit)
+        browse_model_btn = QPushButton("Browse...")
+        browse_model_btn.clicked.connect(self._browse_model)
+        custom_layout.addWidget(browse_model_btn)
+        model_layout.addLayout(custom_layout)
+        model_group.setLayout(model_layout)
+        layout.addWidget(model_group)
+        # Language model settings
+        lm_group = QGroupBox("Language Model (Optional)")
+        lm_layout = QVBoxLayout()
+        self._use_lm_check = QCheckBox("Use Language Model")
+        self._use_lm_check.setChecked(False)
+        self._use_lm_check.toggled.connect(self._on_lm_toggled)
+        if not PYLAIA_LM_AVAILABLE:
+            self._use_lm_check.setEnabled(False)
+            self._use_lm_check.setToolTip("KenLM not available. Install with: pip install kenlm")
+        lm_layout.addWidget(self._use_lm_check)
+        # LM weight
+        weight_layout = QHBoxLayout()
+        weight_layout.addWidget(QLabel("LM Weight:"))
+        self._lm_weight_spin = QDoubleSpinBox()
+        self._lm_weight_spin.setRange(0.0, 10.0)
+        self._lm_weight_spin.setValue(1.5)
+        self._lm_weight_spin.setSingleStep(0.1)
+        self._lm_weight_spin.setToolTip("Higher = more influence from language model")
+        self._lm_weight_spin.setEnabled(False)
+        weight_layout.addWidget(self._lm_weight_spin)
+        weight_layout.addStretch()
+        lm_layout.addLayout(weight_layout)
+        # Custom LM path
+        lm_layout.addWidget(QLabel("Custom LM Path:"))
+        custom_lm_layout = QHBoxLayout()
+        self._custom_lm_edit = QLineEdit()
+        self._custom_lm_edit.setPlaceholderText("Leave empty for auto-detection")
+        self._custom_lm_edit.setEnabled(False)
+        custom_lm_layout.addWidget(self._custom_lm_edit)
+        browse_lm_btn = QPushButton("Browse...")
+        browse_lm_btn.clicked.connect(self._browse_lm)
+        browse_lm_btn.setEnabled(False)
+        self._browse_lm_btn = browse_lm_btn
+        custom_lm_layout.addWidget(browse_lm_btn)
+        lm_layout.addLayout(custom_lm_layout)
+        lm_group.setLayout(lm_layout)
+        layout.addWidget(lm_group)
+        # Output options
+        output_group = QGroupBox("Output Options")
+        output_layout = QVBoxLayout()
+        self._enable_spaces_check = QCheckBox("Convert <space> tokens to spaces")
+        self._enable_spaces_check.setChecked(True)
+        self._enable_spaces_check.setToolTip(
+            "When enabled, <space> or <SPACE> tokens in the vocabulary are converted to actual spaces.\n"
+            "Disable to keep them as literal <space> text."
+        )
+        output_layout.addWidget(self._enable_spaces_check)
+        self._flip_rtl_check = QCheckBox("RTL manuscript (flip line images)")
+        self._flip_rtl_check.setChecked(False)
+        self._flip_rtl_check.setToolTip(
+            "Flip line images horizontally for right-to-left scripts.\n"
+            "Required for models trained on RTL manuscripts (Ottoman, Arabic, Hebrew, etc.)\n"
+            "with left-to-right transcriptions (Latin transliteration)."
+        )
+        output_layout.addWidget(self._flip_rtl_check)
+        output_group.setLayout(output_layout)
+        layout.addWidget(output_group)
+        layout.addStretch()
+        widget.setLayout(layout)
+        self._config_widget = widget
+        return widget
+    def _populate_preset_models(self):
+        """Populate preset models dropdown."""
+        if self._model_combo is None:
+            return
+        self._model_combo.clear()
+        if not PYLAIA_MODELS:
+            self._model_combo.addItem("No preset models found")
+            return
+        for model_id in PYLAIA_MODELS.keys():
+            self._model_combo.addItem(model_id)
+    def _on_preset_changed(self, preset_name: str):
+        """Update when preset changes."""
+        # Could add description display here
+        pass
+    def _on_lm_toggled(self, checked: bool):
+        """Enable/disable LM controls."""
+        self._lm_weight_spin.setEnabled(checked)
+        self._custom_lm_edit.setEnabled(checked)
+        self._browse_lm_btn.setEnabled(checked)
+    def _browse_model(self):
+        """Open file dialog to select model file."""
+        file_path, _ = QFileDialog.getOpenFileName(
+            self._config_widget,
+            "Select CRNN-CTC Model",
+            "models",
+            "CRNN-CTC Models (*.ckpt *.pth *.pt);;All Files (*)"
+        )
+        if file_path:
+            self._custom_model_edit.setText(file_path)
+    def _browse_lm(self):
+        """Open file dialog to select LM file."""
+        file_path, _ = QFileDialog.getOpenFileName(
+            self._config_widget,
+            "Select KenLM Model",
+            "models",
+            "KenLM Models (*.arpa *.klm *.bin);;All Files (*)"
+        )
+        if file_path:
+            self._custom_lm_edit.setText(file_path)
+    def get_config(self) -> Dict[str, Any]:
+        """Extract configuration from widget controls."""
+        if self._config_widget is None:
+            return {}
+        custom_model = self._custom_model_edit.text().strip()
+        preset_model = self._model_combo.currentText()
+        config = {
+            "model_path": custom_model if custom_model else preset_model,
+            "use_lm": self._use_lm_check.isChecked(),
+            "lm_weight": self._lm_weight_spin.value(),
+            "enable_spaces": self._enable_spaces_check.isChecked(),
+            "flip_rtl": self._flip_rtl_check.isChecked(),
+        }
+        if config["use_lm"]:
+            custom_lm = self._custom_lm_edit.text().strip()
+            if custom_lm:
+                config["lm_path"] = custom_lm
+        return config
+    def set_config(self, config: Dict[str, Any]):
+        """Restore configuration to widget controls."""
+        if self._config_widget is None:
+            return
+        model_path = config.get("model_path", "")
+        # Try to find in presets
+        idx = self._model_combo.findText(model_path)
+        if idx >= 0:
+            self._model_combo.setCurrentIndex(idx)
+            self._custom_model_edit.clear()
+        else:
+            self._custom_model_edit.setText(model_path)
+        self._use_lm_check.setChecked(config.get("use_lm", False))
+        self._lm_weight_spin.setValue(config.get("lm_weight", 1.5))
+        self._enable_spaces_check.setChecked(config.get("enable_spaces", True))
+        if hasattr(self, '_flip_rtl_check'):
+            self._flip_rtl_check.setChecked(config.get("flip_rtl", False))
+        if "lm_path" in config:
+            self._custom_lm_edit.setText(config["lm_path"])
+    def load_model(self, config: Dict[str, Any]) -> bool:
+        """Load PyLaia model."""
+        try:
+            model_path = config.get("model_path", "")
+            if not model_path or model_path == "No preset models found":
+                return False
+            # If it's a preset name, resolve to actual path and syms
+            syms_path = None
+            if model_path in PYLAIA_MODELS:
+                preset_info = PYLAIA_MODELS[model_path]
+                if isinstance(preset_info, dict):
+                    if preset_info.get("repo_id"):
+                        try:
+                            from huggingface_hub import hf_hub_download
+                        except ImportError as exc:
+                            raise RuntimeError(
+                                "huggingface_hub is required for Hugging Face model presets"
+                            ) from exc
+                        repo_id = preset_info["repo_id"]
+                        model_path = hf_hub_download(
+                            repo_id=repo_id,
+                            filename=preset_info.get("checkpoint", "best_model.pt"),
+                        )
+                        syms_path = hf_hub_download(
+                            repo_id=repo_id,
+                            filename=preset_info.get("syms", "symbols.txt"),
+                        )
+                    else:
+                        model_path = preset_info.get("checkpoint", preset_info.get("path", model_path))
+                        syms_path = preset_info.get("syms")
+                # If preset_info is just a string, use it as the path
+                elif isinstance(preset_info, str):
+                    model_path = preset_info
+            use_lm = config.get("use_lm", False)
+            # Unload previous model
+            self.unload_model()
+            if use_lm and PYLAIA_LM_AVAILABLE:
+                # Load with language model
+                lm_weight = config.get("lm_weight", 1.5)
+                lm_path = config.get("lm_path")
+                self.model_lm = PyLaiaInferenceLM(
+                    model_path=model_path,
+                    lm_path=lm_path,
+                    lm_weight=lm_weight
+                )
+                self.model = None
+            else:
+                # Load without language model
+                # PyLaiaInference expects checkpoint_path, syms_path, and enable_spaces
+                enable_spaces = config.get("enable_spaces", True)
+                self.model = PyLaiaInference(
+                    checkpoint_path=model_path,
+                    syms_path=syms_path,
+                    enable_spaces=enable_spaces
+                )
+                self.model_lm = None
+            return True
+        except Exception as e:
+            import traceback
+            print(f"Error loading PyLaia model: {e}")
+            print(traceback.format_exc())
+            self.model = None
+            self.model_lm = None
+            return False
+    def unload_model(self):
+        """Unload model from memory."""
+        if self.model is not None:
+            del self.model
+            self.model = None
+        if self.model_lm is not None:
+            del self.model_lm
+            self.model_lm = None
+        # Free GPU memory
+        import torch
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+    def is_model_loaded(self) -> bool:
+        """Check if model is loaded."""
+        return self.model is not None or self.model_lm is not None
+    def transcribe_line(self, image: np.ndarray, config: Optional[Dict[str, Any]] = None) -> TranscriptionResult:
+        """Transcribe a line image with PyLaia."""
+        if not self.is_model_loaded():
+            return TranscriptionResult(text="[Model not loaded]", confidence=0.0)
+        try:
+            # Convert numpy to PIL
+            from PIL import Image as PILImage
+            if isinstance(image, np.ndarray):
+                pil_image = PILImage.fromarray(image)
+            else:
+                pil_image = image
+            # Flip horizontally for RTL scripts
+            if config and config.get("flip_rtl", False):
+                pil_image = pil_image.transpose(PILImage.FLIP_LEFT_RIGHT)
+            # PyLaiaInferenceWSL uses transcribe() which returns (text, confidence) tuple
+            # Use LM version if available (not yet implemented for WSL)
+            if self.model_lm is not None:
+                # PyLaiaInferenceLM might have different method
+                result = self.model_lm.transcribe(pil_image)
+            else:
+                result = self.model.transcribe(pil_image)
+            # Result is a tuple: (text, confidence)
+            if isinstance(result, tuple):
+                text, confidence = result
+            else:
+                # Fallback for dict-style results
+                text = result.get("text", "")
+                confidence = result.get("confidence", 1.0)
+            return TranscriptionResult(
+                text=text,
+                confidence=confidence,
+                metadata={"model": "pylaia"}
+            )
+        except Exception as e:
+            import traceback
+            print(f"Error in PyLaia transcription: {e}")
+            print(traceback.format_exc())
+            return TranscriptionResult(text=f"[Error: {e}]", confidence=0.0)
+    def get_capabilities(self) -> Dict[str, bool]:
+        """PyLaia capabilities."""
+        return {
+            "batch_processing": False,  # Could be implemented
+            "confidence_scores": True,  # CTC provides confidence
+            "beam_search": False,  # CTC uses greedy/beam decoding
+            "language_model": PYLAIA_LM_AVAILABLE,  # Optional KenLM
+            "preprocessing": False,  # External preprocessing recommended
+        }

hf-space/README.md ADDED Viewed

	@@ -0,0 +1,43 @@

+---
+title: Polyscriptor HTR Demo
+emoji: 📝
+colorFrom: blue
+colorTo: gray
+sdk: docker
+pinned: false
+license: apache-2.0
+---
+# Polyscriptor HTR Demo
+This is the hosted Hugging Face Spaces demo for Polyscriptor. It runs the
+existing FastAPI/Web UI with a constrained demo mode:
+- CRNN-CTC (PyLaia-inspired) engines only.
+- Public model presets are downloaded from `achimrabus/*` Hugging Face model repos.
+- CPU inference only.
+- Kraken Classical line segmentation, with HPP as a lightweight fallback.
+- Temporary uploads only.
+The normal Polyscriptor server, local GPU workflow, and the existing mobile PWA
+demo under `web/static/pwa/` are not changed by this Space configuration.
+## Source Code
+Public source repository:
+https://github.com/achimrabus/polyscriptor
+The Space repository is a curated deployment snapshot for the hosted demo. The
+GitHub repository contains the broader Polyscriptor codebase and local workflows.
+## Deployment Note
+Hugging Face Docker Spaces expect the `Dockerfile` at the root of the Space
+repository. This branch includes a root `Dockerfile` for direct Space builds and
+keeps the Space-specific notes and dependency set in `hf-space/`.
+When publishing into a dedicated Space repository under
+`https://huggingface.co/spaces/achimrabus/...`, use `hf-space/SPACE_README.md`
+as the Space repository root `README.md`. The Polyscriptor project README is
+left untouched in this branch.

hf-space/SPACE_README.md ADDED Viewed

	@@ -0,0 +1,78 @@

+---
+title: Polyscriptor HTR Demo
+emoji: 📝
+colorFrom: blue
+colorTo: gray
+sdk: docker
+pinned: false
+license: apache-2.0
+---
+# Polyscriptor HTR Demo
+Polyscriptor is a browser-based demo for handwritten text recognition (HTR) on
+historical Slavic manuscript material. This Hugging Face Space runs a constrained
+public version of the Polyscriptor FastAPI/Web interface.
+The hosted demo is intended for quick inspection and teaching. It is not the full
+local research environment used for training, batch processing, GPU inference, or
+private manuscript collections.
+## Source Code
+The public Polyscriptor source code is available on GitHub:
+https://github.com/achimrabus/polyscriptor
+This Hugging Face Space contains the curated hosted demo deployment. The GitHub
+repository contains the broader Polyscriptor codebase, including the web UI,
+engine plugins, segmentation code, training utilities, and local workflows.
+## What This Demo Supports
+- CRNN-CTC / PyLaia-inspired HTR presets for selected public model repositories.
+- User-supplied API keys for OpenAI, Gemini, Claude, and OpenWebUI-compatible
+  endpoints.
+- Public model download from the Hugging Face Hub, primarily under
+  `achimrabus/*`.
+- CPU-only inference.
+- Kraken Classical line segmentation, with HPP as a lightweight fallback.
+- Temporary image uploads during the active session.
+## Limitations
+- No private models are bundled with this Space.
+- API-based engines require users to paste their own API key in the browser
+  form. The Space does not ship with shared provider credentials.
+- Uploaded files are treated as temporary runtime data and are not part of the
+  repository.
+- Large local GPU/VLM engines from the full Polyscriptor workflow are not
+  enabled here.
+- Accuracy depends strongly on script, language, writing style, image quality,
+  and segmentation quality.
+## Model Notes
+The demo uses publicly available model presets. For best results, choose a model
+that matches the manuscript tradition as closely as possible. The current public
+Polyscriptor model cards are available at:
+https://huggingface.co/achimrabus
+## Project Context
+Polyscriptor is developed for historical HTR workflows, with a focus on Slavic
+manuscripts and reproducible comparison of OCR/HTR engines. The full development
+repository contains additional tooling for local use, training, evaluation, and
+batch processing; this Space contains only the hosted demo configuration.
+## Privacy
+Do not upload sensitive or unpublished manuscript images unless you are
+comfortable processing them in a hosted public demo environment. The application
+uses temporary server-side files during processing, but this Space should be
+treated as a public demonstration service rather than a secure private workflow.
+For API-based engines, provider keys are entered by the user at runtime. Do not
+commit keys to this repository or add them to the Space configuration unless you
+intend to provide a shared project credential.

hf-space/requirements.txt ADDED Viewed

	@@ -0,0 +1,21 @@

+--extra-index-url https://download.pytorch.org/whl/cpu
+torch>=2.5.1,<2.10
+torchvision>=0.20.1,<0.25
+numpy>=2.0,<2.1
+pillow==11.1.0
+opencv-python-headless==4.11.0.86
+scikit-learn>=1.5,<1.6
+scipy>=1.13,<1.14
+kraken>=6.0.0,<7.0.0
+fastapi>=0.111.0
+uvicorn[standard]>=0.29.0
+python-multipart>=0.0.9
+pymupdf>=1.24.0
+pyyaml==6.0.2
+huggingface_hub>=0.23.0
+python-Levenshtein>=0.23.0
+openai>=1.50.0
+anthropic>=0.34.0
+google-genai>=1.0.0
+google-generativeai>=0.8.0

htr_engine_base.py ADDED Viewed

	@@ -0,0 +1,398 @@

+"""
+HTR Engine Plugin System - Base Classes and Registry
+This module defines the plugin architecture for HTR (Handwritten Text Recognition) engines.
+All HTR engines (TrOCR, Qwen3, CRNN-CTC, Kraken, etc.) implement the HTREngine interface.
+Design principles:
+- Abstraction: Each engine is self-contained and interchangeable
+- Scalability: New engines can be added without modifying existing code
+- Consistency: All engines expose the same interface to the GUI
+- Flexibility: Each engine can have custom configuration widgets
+"""
+from abc import ABC, abstractmethod
+from typing import Dict, Any, Optional, List
+from dataclasses import dataclass
+import os
+import numpy as np
+try:
+    from PyQt6.QtWidgets import QWidget, QVBoxLayout, QLabel
+    PYQT_AVAILABLE = True
+except ImportError:
+    PYQT_AVAILABLE = False
+    QWidget = object
+@dataclass
+class TranscriptionResult:
+    """Result from HTR engine transcription.
+    Attributes:
+        text: Transcribed text.
+        confidence: Confidence value attached to the transcription (defaults to 1.0).
+        metadata: Free-form extra data. ``None`` is replaced by a fresh empty
+            dict in ``__post_init__``.
+    """
+    text: str
+    confidence: float = 1.0
+    metadata: Optional[Dict[str, Any]] = None
+    def __post_init__(self):
+        # None is the "not supplied" sentinel; each instance then gets its own
+        # dict instead of sharing one mutable default.
+        if self.metadata is None:
+            self.metadata = {}
+class HTREngine(ABC):
+    """Abstract base class for HTR engines.
+    All HTR engines must implement this interface to be compatible
+    with the GUI and batch processing systems.
+    """
+    @abstractmethod
+    def get_name(self) -> str:
+        """Get display name for the engine.
+        Returns:
+            str: Human-readable engine name (e.g., "TrOCR", "Qwen3 VLM")
+        """
+        pass
+    @abstractmethod
+    def get_description(self) -> str:
+        """Get brief description of the engine.
+        Returns:
+            str: One-line description (e.g., "Transformer-based OCR for manuscripts")
+        """
+        pass
+    @abstractmethod
+    def is_available(self) -> bool:
+        """Check if engine dependencies are installed and functional.
+        Returns:
+            bool: True if engine can be used, False otherwise
+        """
+        pass
+    @abstractmethod
+    def get_unavailable_reason(self) -> str:
+        """Get reason why engine is unavailable (if is_available() == False).
+        Returns:
+            str: Explanation and installation instructions
+        """
+        pass
+    @abstractmethod
+    def get_config_widget(self) -> QWidget:
+        """Create and return configuration widget for this engine.
+        The widget should contain all engine-specific controls (model selection,
+        beam search, preprocessing options, etc.). The GUI will embed this widget
+        in the configuration panel.
+        Returns:
+            QWidget: Qt widget with engine configuration controls
+        """
+        pass
+    @abstractmethod
+    def get_config(self) -> Dict[str, Any]:
+        """Get current configuration from the config widget.
+        This method extracts values from the widget controls and returns
+        them as a dictionary that can be passed to transcribe_line().
+        Returns:
+            Dict[str, Any]: Configuration parameters
+        """
+        pass
+    @abstractmethod
+    def set_config(self, config: Dict[str, Any]):
+        """Set configuration values in the config widget.
+        Used to restore saved settings when switching engines.
+        Args:
+            config: Configuration parameters
+        """
+        pass
+    @abstractmethod
+    def load_model(self, config: Dict[str, Any]) -> bool:
+        """Load the HTR model with given configuration.
+        Args:
+            config: Configuration parameters (from get_config())
+        Returns:
+            bool: True if model loaded successfully, False otherwise
+        """
+        pass
+    @abstractmethod
+    def unload_model(self):
+        """Unload model from memory to free resources.
+        Called when switching to a different engine or closing the application.
+        """
+        pass
+    @abstractmethod
+    def is_model_loaded(self) -> bool:
+        """Check if model is currently loaded.
+        Returns:
+            bool: True if model is ready for inference
+        """
+        pass
+    @abstractmethod
+    def transcribe_line(self, image: np.ndarray, config: Optional[Dict[str, Any]] = None) -> TranscriptionResult:
+        """Transcribe a single line image.
+        Args:
+            image: Line image as numpy array (RGB, shape: H x W x 3)
+            config: Optional configuration overrides
+        Returns:
+            TranscriptionResult: Transcription text and metadata
+        """
+        pass
+    def requires_line_segmentation(self) -> bool:
+        """Check if engine requires pre-segmented lines or can process full pages.
+        Returns:
+            bool: True if lines must be segmented first (TrOCR, CRNN-CTC),
+                  False if engine handles full pages (Qwen3, Commercial APIs)
+        """
+        return True  # Default: most engines need line segmentation
+    def transcribe_lines(self, images: List[np.ndarray], config: Optional[Dict[str, Any]] = None) -> List[TranscriptionResult]:
+        """Transcribe multiple line images (batch processing).
+        Default implementation calls transcribe_line() for each image.
+        Engines can override this for optimized batch processing.
+        Args:
+            images: List of line images
+            config: Optional configuration overrides
+        Returns:
+            List[TranscriptionResult]: Transcriptions for each image
+        """
+        return [self.transcribe_line(img, config) for img in images]
+    def supports_batch(self) -> bool:
+        """Check if engine supports optimized batch processing.
+        Returns:
+            bool: True if transcribe_lines() is optimized, False if it just loops
+        """
+        return False
+    def get_aliases(self) -> List[str]:
+        """Get alternative names for this engine (e.g., short CLI aliases).
+        Returns:
+            List[str]: Alternative names accepted by the registry (default: none)
+        """
+        return []
+    def get_capabilities(self) -> Dict[str, bool]:
+        """Get engine capabilities.
+        Returns:
+            Dict with capability flags:
+            - batch_processing: Supports batch inference
+            - confidence_scores: Returns confidence scores
+            - beam_search: Supports beam search decoding
+            - language_model: Uses language model for post-processing
+            - preprocessing: Has built-in preprocessing
+        """
+        return {
+            "batch_processing": self.supports_batch(),
+            "confidence_scores": False,
+            "beam_search": False,
+            "language_model": False,
+            "preprocessing": False,
+        }
+class HTREngineRegistry:
+    """Registry of available HTR engines.
+    Manages discovery, registration, and instantiation of HTR engines.
+    """
+    def __init__(self):
+        self.engines: List[HTREngine] = []
+        self._engine_cache: Dict[str, HTREngine] = {}
+    def register(self, engine: HTREngine):
+        """Register an HTR engine.
+        Args:
+            engine: HTREngine instance to register
+        """
+        self.engines.append(engine)
+        self._engine_cache[engine.get_name()] = engine
+        for alias in engine.get_aliases():
+            self._engine_cache[alias] = engine
+    def discover_engines(self):
+        """Automatically discover and register all available engines.
+        Tries to import each engine module and registers it if available.
+        """
+        if os.environ.get("POLYSCRIPTOR_DEMO_MODE") == "hf_space":
+            demo_engines = [
+                ("CRNN-CTC", "engines.pylaia_engine", "PyLaiaEngine"),
+                ("Commercial APIs", "engines.commercial_api_engine", "CommercialAPIEngine"),
+                ("OpenWebUI", "engines.openwebui_engine", "OpenWebUIEngine"),
+            ]
+            for label, module_name, class_name in demo_engines:
+                try:
+                    module = __import__(module_name, fromlist=[class_name])
+                    self.register(getattr(module, class_name)())
+                except ImportError as e:
+                    print(f"Warning: Failed to load {label} engine: {e}")
+            return
+        # Import and register TrOCR engine
+        try:
+            from engines.trocr_engine import TrOCREngine
+            self.register(TrOCREngine())
+        except ImportError as e:
+            print(f"Warning: Failed to load TrOCR engine: {e}")
+        # Import and register Qwen3 engine
+        try:
+            from engines.qwen3_engine import Qwen3Engine
+            self.register(Qwen3Engine())
+        except ImportError as e:
+            print(f"Warning: Failed to load Qwen3 engine: {e}")
+        # Import and register Churro engine
+        try:
+            from engines.churro_engine import ChurroEngine
+            self.register(ChurroEngine())
+        except ImportError as e:
+            print(f"Warning: Failed to load Churro engine: {e}")
+        # Import and register CRNN-CTC engine
+        try:
+            from engines.pylaia_engine import PyLaiaEngine
+            self.register(PyLaiaEngine())
+        except ImportError as e:
+            print(f"Warning: Failed to load CRNN-CTC engine: {e}")
+        # Import and register Kraken engine
+        try:
+            from engines.kraken_engine import KrakenEngine
+            self.register(KrakenEngine())
+        except ImportError as e:
+            print(f"Warning: Failed to load Kraken engine: {e}")
+        # Import and register Commercial API engine
+        try:
+            from engines.commercial_api_engine import CommercialAPIEngine
+            self.register(CommercialAPIEngine())
+        except ImportError as e:
+            print(f"Warning: Failed to load Commercial API engine: {e}")
+        # Import and register Party engine
+        try:
+            from engines.party_engine import PartyEngine
+            self.register(PartyEngine())
+        except ImportError as e:
+            print(f"Warning: Failed to load Party engine: {e}")
+        # Import and register OpenWebUI engine
+        try:
+            from engines.openwebui_engine import OpenWebUIEngine
+            self.register(OpenWebUIEngine())
+        except ImportError as e:
+            print(f"Warning: Failed to load OpenWebUI engine: {e}")
+        # Import and register DeepSeek-OCR engine
+        try:
+            from engines.deepseek_ocr_engine import DeepSeekOCREngine
+            self.register(DeepSeekOCREngine())
+        except ImportError as e:
+            print(f"Warning: Failed to load DeepSeek-OCR engine: {e}")
+        # Import and register LightOnOCR engine
+        try:
+            from engines.lighton_ocr_engine import LightOnOCREngine
+            self.register(LightOnOCREngine())
+        except ImportError as e:
+            print(f"Warning: Failed to load LightOnOCR engine: {e}")
+        # Import and register PaddleOCR engine
+        try:
+            from engines.paddle_engine import PaddleOCREngine
+            self.register(PaddleOCREngine())
+        except ImportError as e:
+            print(f"Warning: Failed to load PaddleOCR engine: {e}")
+    def get_available_engines(self) -> List[HTREngine]:
+        """Get list of engines with satisfied dependencies.
+        Returns:
+            List[HTREngine]: Engines that can be used
+        """
+        return [e for e in self.engines if e.is_available()]
+    def get_all_engines(self) -> List[HTREngine]:
+        """Get all registered engines (including unavailable ones).
+        Returns:
+            List[HTREngine]: All registered engines
+        """
+        return self.engines
+    def get_engine_by_name(self, name: str) -> Optional[HTREngine]:
+        """Get engine by display name.
+        Args:
+            name: Engine display name
+        Returns:
+            Optional[HTREngine]: Engine instance or None if not found
+        """
+        return self._engine_cache.get(name)
+    def get_engine_names(self) -> List[str]:
+        """Get list of available engine names.
+        Returns:
+            List[str]: Engine display names
+        """
+        return [e.get_name() for e in self.get_available_engines()]
+# Global registry instance (singleton pattern)
+_global_registry: Optional[HTREngineRegistry] = None
+def get_global_registry() -> HTREngineRegistry:
+    """Get global HTR engine registry (singleton).
+    Returns:
+        HTREngineRegistry: Global registry instance
+    """
+    global _global_registry
+    if _global_registry is None:
+        _global_registry = HTREngineRegistry()
+        _global_registry.discover_engines()
+    return _global_registry
+# Convenience function for GUI
+def get_available_engine_names() -> List[str]:
+    """Get list of available engine names (convenience function).
+    Returns:
+        List[str]: Engine display names
+    """
+    return get_global_registry().get_engine_names()

inference_commercial_api.py ADDED Viewed

	@@ -0,0 +1,760 @@

+"""
+Commercial VLM/LLM API inference for manuscript transcription.
+Supports:
+- OpenAI GPT-4 Vision / GPT-4o
+- Google Gemini Pro Vision / Gemini Flash
+- Anthropic Claude 3 (Opus, Sonnet, Haiku)
+Usage:
+    # OpenAI
+    api = OpenAIInference(api_key="YOUR_OPENAI_API_KEY")
+    text = api.transcribe(image)
+    # Gemini
+    api = GeminiInference(api_key="YOUR_GEMINI_API_KEY")
+    text = api.transcribe(image)
+    # Claude
+    api = ClaudeInference(api_key="YOUR_ANTHROPIC_API_KEY")
+    text = api.transcribe(image)
+"""
+import base64
+import io
+import time
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Optional, Dict, Any
+from PIL import Image
+# API clients (install with: pip install openai google-genai anthropic;
+# the legacy google-generativeai package is used only as a fallback)
+try:
+    from openai import OpenAI
+    OPENAI_AVAILABLE = True
+except ImportError:
+    OPENAI_AVAILABLE = False
+try:
+    from google import genai as _google_genai_new
+    from google.genai import types as _google_genai_types
+    GEMINI_AVAILABLE = True
+    GEMINI_NEW_SDK = True
+except ImportError:
+    GEMINI_NEW_SDK = False
+    try:
+        import google.generativeai as genai  # legacy fallback
+        GEMINI_AVAILABLE = True
+    except ImportError:
+        GEMINI_AVAILABLE = False
+try:
+    from anthropic import Anthropic
+    CLAUDE_AVAILABLE = True
+except ImportError:
+    CLAUDE_AVAILABLE = False
+class BaseAPIInference(ABC):
+    """Base class for commercial API inference."""
+    def __init__(self, api_key: str, default_prompt: Optional[str] = None):
+        """
+        Initialize API client.
+        Args:
+            api_key: API key for the service
+            default_prompt: Default transcription prompt
+        """
+        self.api_key = api_key
+        self.default_prompt = default_prompt or self._get_default_prompt()
+    @abstractmethod
+    def _get_default_prompt(self) -> str:
+        """Get default transcription prompt."""
+        pass
+    @abstractmethod
+    def transcribe(
+        self,
+        image: Image.Image,
+        prompt: Optional[str] = None,
+        **kwargs
+    ) -> str:
+        """
+        Transcribe a manuscript line image.
+        Args:
+            image: PIL Image
+            prompt: Custom prompt (uses default if None)
+            **kwargs: Provider-specific parameters
+        Returns:
+            Transcribed text
+        """
+        pass
+    @staticmethod
+    def encode_image_base64(image: Image.Image, format: str = "PNG") -> str:
+        """
+        Encode PIL Image to base64 string.
+        Args:
+            image: PIL Image
+            format: Image format (PNG, JPEG, etc.)
+        Returns:
+            Base64-encoded image string
+        """
+        buffered = io.BytesIO()
+        image.save(buffered, format=format)
+        return base64.b64encode(buffered.getvalue()).decode("utf-8")
+    @staticmethod
+    def resize_image_if_needed(
+        image: Image.Image,
+        max_dimension: int = 2048
+    ) -> Image.Image:
+        """
+        Resize image if larger than max dimension while preserving aspect ratio.
+        Args:
+            image: PIL Image
+            max_dimension: Maximum width or height
+        Returns:
+            Resized image (or original if already small enough)
+        """
+        width, height = image.size
+        if width <= max_dimension and height <= max_dimension:
+            return image
+        # Calculate new size preserving aspect ratio
+        if width > height:
+            new_width = max_dimension
+            new_height = int(height * (max_dimension / width))
+        else:
+            new_height = max_dimension
+            new_width = int(width * (max_dimension / height))
+        return image.resize((new_width, new_height), Image.Resampling.LANCZOS)
+class OpenAIInference(BaseAPIInference):
+    """OpenAI GPT-4 Vision / GPT-4o inference."""
+    def __init__(
+        self,
+        api_key: str,
+        model: str = "gpt-4o",  # gpt-4o, gpt-4-vision-preview, gpt-4-turbo
+        default_prompt: Optional[str] = None
+    ):
+        """
+        Initialize OpenAI inference.
+        Args:
+            api_key: OpenAI API key
+            model: Model name
+            default_prompt: Default transcription prompt
+        """
+        if not OPENAI_AVAILABLE:
+            raise ImportError("OpenAI library not installed. Install with: pip install openai")
+        super().__init__(api_key, default_prompt)
+        self.model = model
+        self.client = OpenAI(api_key=api_key)
+    def _get_default_prompt(self) -> str:
+        return (
+            "Transcribe all handwritten text in this manuscript image. "
+            "Preserve the original language (Cyrillic, Latin, etc.) and layout. "
+            "Output only the transcribed text without any additional commentary."
+        )
+    def transcribe(
+        self,
+        image: Image.Image,
+        prompt: Optional[str] = None,
+        max_tokens: int = 500,
+    temperature: float = 1.0,
+        **kwargs
+    ) -> str:
+        """
+        Transcribe with OpenAI GPT-4 Vision.
+        Args:
+            image: PIL Image
+            prompt: Custom prompt
+            max_tokens: Maximum tokens to generate
+            temperature: Sampling temperature (web default ~1.0). Lower (0-0.3) = deterministic; higher = more variation.
+            **kwargs: Additional OpenAI parameters
+        Returns:
+            Transcribed text
+        """
+        prompt = prompt or self.default_prompt
+        # Resize if needed (GPT-4V supports up to 2048x2048)
+        image = self.resize_image_if_needed(image, max_dimension=2048)
+        # Encode image
+        base64_image = self.encode_image_base64(image, format="PNG")
+        # API call
+        response = self.client.chat.completions.create(
+            model=self.model,
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": prompt},
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/png;base64,{base64_image}"
+                            }
+                        }
+                    ]
+                }
+            ],
+            max_tokens=max_tokens,
+            temperature=temperature,
+            **kwargs
+        )
+        return response.choices[0].message.content.strip()
+class GeminiInference(BaseAPIInference):
+    """Google Gemini inference via google-genai SDK (with legacy google-generativeai fallback)."""
+    # thinking_mode string -> thinking_budget token count (max tokens for internal reasoning)
+    # "low":  8000  — moderate budget; fast enough for most lines
+    # "high": None  — no thinking_budget is set on the ThinkingConfig; model decides dynamically (no cap)
+    _THINKING_BUDGETS = {"low": 8000, "high": None}
+    def __init__(
+        self,
+        api_key: str,
+        model: str = "gemini-2.0-flash",
+        default_prompt: Optional[str] = None,
+    ):
+        if not GEMINI_AVAILABLE:
+            raise ImportError(
+                "Google AI library not installed. Install with: pip install google-genai"
+            )
+        super().__init__(api_key, default_prompt)
+        self.model_name = model
+        # Populated after each transcribe() call — for UI token display
+        self.last_usage: Dict[str, Any] = {}
+        self._last_call_usage: Dict[str, Any] = {}
+        if GEMINI_NEW_SDK:
+            self._client = _google_genai_new.Client(api_key=api_key)
+        else:
+            # Legacy fallback
+            genai.configure(api_key=api_key)
+            self._legacy_model = genai.GenerativeModel(model)
+    def _get_default_prompt(self) -> str:
+        return (
+            "Transcribe all handwritten text in this manuscript image. "
+            "Preserve the original language (Cyrillic, Latin, etc.) and layout. "
+            "Output only the transcribed text without any additional commentary."
+        )
+    def _build_config(self, temperature, max_output_tokens, thinking_budget, safety_settings,
+                      request_thoughts: bool = True):
+        """Build GenerateContentConfig for google-genai SDK.
+        request_thoughts=True (default): always sets include_thoughts=True so thought parts
+        appear in candidates[].content.parts[] and can be exported.  Pass False when retrying
+        against a model that rejects ThinkingConfig entirely.
+        """
+        kw: Dict[str, Any] = {"temperature": temperature}
+        if max_output_tokens:
+            kw["max_output_tokens"] = max_output_tokens
+        if safety_settings:
+            kw["safety_settings"] = safety_settings
+        if request_thoughts:
+            # Always request thought text back; only cap thinking_budget when explicitly set
+            tc_kw: Dict[str, Any] = {"include_thoughts": True}
+            if thinking_budget is not None:
+                tc_kw["thinking_budget"] = thinking_budget
+            kw["thinking_config"] = _google_genai_types.ThinkingConfig(**tc_kw)
+        return _google_genai_types.GenerateContentConfig(**kw)
+    def _generate(self, prompt, image, temperature, thinking_budget, safety_settings, verbose):
+        """Single generate call. Handles thinking-not-supported gracefully."""
+        if not GEMINI_NEW_SDK:
+            # Legacy google-generativeai path
+            gen_cfg = genai.GenerationConfig(temperature=temperature or 0.0)
+            resp = self._legacy_model.generate_content(
+                [prompt, image], generation_config=gen_cfg, safety_settings=safety_settings
+            )
+            self._last_call_usage = {}
+            return resp.text.strip()
+        config = self._build_config(temperature or 0.0, None, thinking_budget, safety_settings,
+                                    request_thoughts=True)
+        try:
+            resp = self._client.models.generate_content(
+                model=self.model_name, contents=[prompt, image], config=config
+            )
+        except Exception as e:
+            err = str(e)
+            # Non-thinking models reject ThinkingConfig with a 400/invalid error — retry without it
+            if "thinking" in err.lower() or ("400" in err and "invalid" in err.lower()):
+                if verbose:
+                    print(f"Model does not support ThinkingConfig, retrying without.")
+                config = self._build_config(temperature or 0.0, None, thinking_budget,
+                                            safety_settings, request_thoughts=False)
+                resp = self._client.models.generate_content(
+                    model=self.model_name, contents=[prompt, image], config=config
+                )
+            else:
+                raise
+        usage = getattr(resp, "usage_metadata", None)
+        self._last_call_usage = {
+            "prompt_tokens": getattr(usage, "prompt_token_count", None) if usage else None,
+            "output_tokens": getattr(usage, "candidates_token_count", None) if usage else None,
+            "thinking_tokens": getattr(usage, "thoughts_token_count", None) if usage else None,
+            "total_tokens": getattr(usage, "total_token_count", None) if usage else None,
+        }
+        # Extract thinking text from thought parts (present when include_thoughts=True was sent)
+        thinking_parts = []
+        try:
+            for cand in (getattr(resp, "candidates", None) or []):
+                for part in (getattr(getattr(cand, "content", None), "parts", None) or []):
+                    if getattr(part, "thought", False) and getattr(part, "text", None):
+                        thinking_parts.append(part.text)
+        except Exception:
+            pass
+        self._last_call_usage["thinking_text"] = "\n\n".join(thinking_parts) if thinking_parts else None
+        return resp.text.strip()
+    def _maybe_continue(
+        self,
+        current_text: str,
+        prompt: str,
+        image,
+        thinking_budget,
+        safety_settings,
+        auto_continue: bool,
+        max_auto_continuations: int,
+        continuation_min_new_chars: int,
+        verbose_block_logging: bool,
+    ) -> str:
+        if not auto_continue:
+            return current_text
+        accumulated = current_text
+        for pass_idx in range(1, max_auto_continuations + 1):
+            continuation_prompt = (
+                f"{prompt}\n\nPartial transcription so far (DO NOT repeat it):\n"
+                f"{accumulated}\n\nContinue transcribing remaining, previously UNTRANSCRIBED text. "
+                "Output ONLY the new continuation without repeating prior characters."
+            )
+            try:
+                new_chunk = self._generate(
+                    continuation_prompt, image, None, thinking_budget,
+                    safety_settings, verbose_block_logging
+                )
+            except Exception as e:
+                if verbose_block_logging:
+                    print(f"Continuation {pass_idx} failed: {e}")
+                break
+            if not new_chunk:
+                if verbose_block_logging:
+                    print(f"Continuation {pass_idx}: no new text, stopping.")
+                break
+            # Guard against repetition
+            if accumulated and new_chunk.startswith(accumulated[:200]):
+                overlap_pos = new_chunk.find(accumulated[-50:])
+                if overlap_pos > 0:
+                    new_chunk = new_chunk[overlap_pos + len(accumulated[-50:]):]
+            delta = len(new_chunk)
+            if delta < continuation_min_new_chars:
+                if verbose_block_logging:
+                    print(f"Continuation {pass_idx}: only {delta} chars, stopping.")
+                break
+            accumulated += ("\n" if not accumulated.endswith("\n") else "") + new_chunk
+            if verbose_block_logging:
+                print(f"Continuation {pass_idx}: +{delta} chars (total {len(accumulated)})")
+        return accumulated
+    def transcribe(
+        self,
+        image,
+        prompt: Optional[str] = None,
+        temperature: float = 0.0,
+        max_output_tokens: Optional[int] = None,
+        auto_retry_on_block: bool = True,
+        safety_relax: bool = True,
+        verbose_block_logging: bool = True,
+        thinking_mode: Optional[str] = None,
+        fast_direct: bool = False,
+        fast_direct_early_exit: bool = True,
+        auto_continue: bool = False,
+        max_auto_continuations: int = 2,
+        continuation_min_new_chars: int = 50,
+        reasoning_fallback_threshold: float = 1.0,
+        record_stats_csv: Optional[str] = None,
+        apply_restriction_prompt: bool = False,
+        fallback_max_output_tokens: int = 8192,
+        **kwargs,
+    ) -> str:
+        """Transcribe a manuscript image with Google Gemini.
+        Args:
+            image: PIL Image or numpy array
+            prompt: Transcription prompt (uses default if None)
+            temperature: Sampling temperature (0.0 = deterministic)
+            max_output_tokens: Output token cap (None = model default)
+            thinking_mode: None | "low" | "high" -- maps to thinking_budget
+            record_stats_csv: Path to append usage CSV row (None to skip)
+            auto_continue: Request continuation calls if output seems truncated
+        """
+        from PIL import Image as _PIL_Image
+        import numpy as np
+        if isinstance(image, np.ndarray):
+            image = _PIL_Image.fromarray(image)
+        image = self.resize_image_if_needed(image, max_dimension=3072)
+        prompt = prompt or self.default_prompt
+        # Map thinking_mode to thinking_budget
+        thinking_budget = self._THINKING_BUDGETS.get(thinking_mode)  # None if mode is None/unknown
+        # Safety settings
+        safety_settings = None
+        if safety_relax and GEMINI_NEW_SDK:
+            safety_settings = [
+                _google_genai_types.SafetySetting(category=cat, threshold="BLOCK_NONE")
+                for cat in (
+                    "HARM_CATEGORY_HARASSMENT",
+                    "HARM_CATEGORY_HATE_SPEECH",
+                    "HARM_CATEGORY_SEXUALLY_EXPLICIT",
+                    "HARM_CATEGORY_DANGEROUS_CONTENT",
+                )
+            ]
+        self._last_call_usage = {}
+        try:
+            result_text = self._generate(
+                prompt, image, temperature, thinking_budget, safety_settings, verbose_block_logging
+            )
+        except Exception as e:
+            raise ValueError(f"Gemini transcription failed: {e}") from e
+        # Persist usage for callers (e.g. statistics panel, CSV logging)
+        self.last_usage = dict(self._last_call_usage)
+        u = self.last_usage
+        if verbose_block_logging and u.get("total_tokens"):
+            print(
+                f"[tokens] prompt={u.get('prompt_tokens')} "
+                f"output={u.get('output_tokens')} "
+                f"thinking={u.get('thinking_tokens')} "
+                f"total={u.get('total_tokens')}"
+            )
+        if record_stats_csv:
+            try:
+                from datetime import datetime
+                with open(record_stats_csv, "a") as f:
+                    f.write(
+                        f"{datetime.utcnow().isoformat()},"
+                        f"{self.model_name},"
+                        f"{thinking_mode or 'default'},"
+                        f"final_success,"
+                        f"{u.get('prompt_tokens')},"
+                        f"{u.get('output_tokens')},"
+                        f"{u.get('thinking_tokens')},"
+                        f"{u.get('total_tokens')},"
+                        f"{len(result_text)}\n"
+                    )
+            except Exception as csv_e:
+                if verbose_block_logging:
+                    print(f"Stats logging failed: {csv_e}")
+        return self._maybe_continue(
+            result_text, prompt, image, thinking_budget, safety_settings,
+            auto_continue, max_auto_continuations, continuation_min_new_chars,
+            verbose_block_logging,
+        )
+class ClaudeInference(BaseAPIInference):
+    """Anthropic Claude 3 inference (Opus, Sonnet, Haiku)."""
+    def __init__(
+        self,
+        api_key: str,
+        model: str = "claude-3-5-sonnet-20241022",  # claude-3-5-sonnet-20241022, claude-3-opus-20240229, claude-3-haiku-20240307
+        default_prompt: Optional[str] = None
+    ):
+        """
+        Initialize Claude inference.
+        Args:
+            api_key: Anthropic API key
+            model: Model name
+            default_prompt: Default transcription prompt
+        """
+        if not CLAUDE_AVAILABLE:
+            raise ImportError("Anthropic library not installed. Install with: pip install anthropic")
+        super().__init__(api_key, default_prompt)
+        self.model = model
+        self.client = Anthropic(api_key=api_key)
+    def _get_default_prompt(self) -> str:
+        return (
+            "Transcribe all handwritten text in this manuscript image. "
+            "Preserve the original language (Cyrillic, Latin, etc.) and layout. "
+            "Output only the transcribed text without any additional commentary."
+        )
+    def transcribe(
+        self,
+        image: Image.Image,
+        prompt: Optional[str] = None,
+        max_tokens: int = 500,
+        temperature: float = 0.0,
+        **kwargs
+    ) -> str:
+        """
+        Transcribe with Anthropic Claude.
+        Args:
+            image: PIL Image
+            prompt: Custom prompt
+            max_tokens: Maximum tokens to generate
+            temperature: Sampling temperature (0.0 = deterministic)
+            **kwargs: Additional Claude parameters
+        Returns:
+            Transcribed text
+        """
+        prompt = prompt or self.default_prompt
+        # Resize if needed (Claude supports up to 1568px on longest side)
+        image = self.resize_image_if_needed(image, max_dimension=1568)
+        # Encode image
+        base64_image = self.encode_image_base64(image, format="PNG")
+        # API call
+        response = self.client.messages.create(
+            model=self.model,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image",
+                            "source": {
+                                "type": "base64",
+                                "media_type": "image/png",
+                                "data": base64_image
+                            }
+                        },
+                        {
+                            "type": "text",
+                            "text": prompt
+                        }
+                    ]
+                }
+            ],
+            **kwargs
+        )
+        return response.content[0].text.strip()
+# Model availability checks
+def check_api_availability() -> Dict[str, bool]:
+    """Check which API libraries are installed."""
+    return {
+        "openai": OPENAI_AVAILABLE,
+        "gemini": GEMINI_AVAILABLE,
+        "claude": CLAUDE_AVAILABLE,
+    }
+# Fallback API model lists (used if dynamic fetching fails)
+# Each list is returned as-is by the matching fetch_*_models() function when
+# the provider library is missing, no API key is given, the live listing is
+# empty, or the listing call raises.
+# OpenAI model IDs offered when the live model listing is unavailable.
+OPENAI_MODELS_FALLBACK = [
+    "gpt-4o",
+    "gpt-4o-mini",
+    "gpt-4o-2024-11-20",
+    "chatgpt-4o-latest",
+    "gpt-4-turbo",
+    "gpt-4-vision-preview",
+    "o1-preview",
+    "o1-mini",
+]
+# Gemini model IDs offered when the live model listing is unavailable.
+GEMINI_MODELS_FALLBACK = [
+    # Free tier models (generally available)
+    "gemini-1.5-flash",
+    "gemini-1.5-flash-002",
+    "gemini-1.5-flash-8b",
+    "gemini-2.0-flash-exp",
+    # Paid/preview models (may require upgrade)
+    "gemini-1.5-pro",
+    "gemini-1.5-pro-002",
+    "gemini-1.5-pro-exp-0827",
+    # Experimental (may not be available to all users)
+    "gemini-exp-1206",
+    "gemini-exp-1121",
+    # Gemini 3 preview models (latest, may have restrictions)
+    "gemini-3-pro-preview",
+]
+# Claude model IDs offered when the live model listing is unavailable.
+CLAUDE_MODELS_FALLBACK = [
+    "claude-opus-4-6",
+    "claude-sonnet-4-6",
+    "claude-haiku-4-5-20251001",
+    "claude-3-5-sonnet-20241022",
+    "claude-3-5-haiku-20241022",
+    "claude-3-opus-20240229",
+    "claude-3-haiku-20240307",
+]
+def fetch_openai_models(api_key: str = None) -> list:
+    """
+    Dynamically fetch available OpenAI models from API.
+    Args:
+        api_key: OpenAI API key (uses env var if not provided)
+    Returns:
+        List of vision-capable model IDs, or fallback list if fetch fails
+    """
+    if not OPENAI_AVAILABLE:
+        return OPENAI_MODELS_FALLBACK
+    try:
+        if not api_key:
+            return OPENAI_MODELS_FALLBACK
+        client = OpenAI(api_key=api_key)
+        models = client.models.list()
+        # Filter for vision-capable models (GPT-4 family + o1)
+        vision_models = []
+        for model in models.data:
+            model_id = model.id
+            # Include GPT-4 vision models and o1 models
+            if any(prefix in model_id for prefix in [
+                "gpt-4o", "gpt-4-turbo", "gpt-4-vision",
+                "chatgpt-4o", "o1-", "gpt-4.5"  # Include potential GPT-4.5
+            ]):
+                vision_models.append(model_id)
+        # Sort with newest/best models first
+        vision_models.sort(reverse=True)
+        # Return dynamic list if we found models, otherwise fallback
+        return vision_models if vision_models else OPENAI_MODELS_FALLBACK
+    except Exception as e:
+        print(f"[OpenAI] Could not fetch models dynamically: {e}")
+        print(f"[OpenAI] Using fallback model list")
+        return OPENAI_MODELS_FALLBACK
+def fetch_gemini_models(api_key: str = None) -> list:
+    """Dynamically fetch available Gemini models; returns fallback list on failure."""
+    if not GEMINI_AVAILABLE:
+        return GEMINI_MODELS_FALLBACK
+    try:
+        if not api_key:
+            return GEMINI_MODELS_FALLBACK
+        if GEMINI_NEW_SDK:
+            client = _google_genai_new.Client(api_key=api_key)
+            models = [
+                m.name.replace("models/", "")
+                for m in client.models.list()
+                if "generateContent" in (getattr(m, "supported_actions", None) or [])
+            ]
+        else:
+            genai.configure(api_key=api_key)
+            models = [
+                m.name.replace("models/", "")
+                for m in genai.list_models()
+                if "generateContent" in m.supported_generation_methods
+            ]
+        models = [m for m in models if m.startswith("gemini")]
+        models.sort(reverse=True)
+        return models if models else GEMINI_MODELS_FALLBACK
+    except Exception as e:
+        print(f"[Gemini] Could not fetch models: {e}")
+        return GEMINI_MODELS_FALLBACK
+def fetch_claude_models(api_key: str = None) -> list:
+    """
+    Dynamically fetch available Claude models via Anthropic API.
+    Returns:
+        List of Claude model IDs (newest first), or fallback list if fetch fails.
+    """
+    if not CLAUDE_AVAILABLE:
+        return CLAUDE_MODELS_FALLBACK
+    try:
+        if not api_key:
+            return CLAUDE_MODELS_FALLBACK
+        client = Anthropic(api_key=api_key)
+        models_page = client.models.list()
+        model_ids = [m.id for m in models_page.data]
+        # Sort newest first (IDs contain dates like -20241022 or version numbers)
+        model_ids.sort(reverse=True)
+        return model_ids if model_ids else CLAUDE_MODELS_FALLBACK
+    except Exception as e:
+        print(f"[Claude] Could not fetch models dynamically: {e}")
+        return CLAUDE_MODELS_FALLBACK
+# Initialize model lists (will be updated when API keys are provided)
+OPENAI_MODELS = OPENAI_MODELS_FALLBACK.copy()
+GEMINI_MODELS = GEMINI_MODELS_FALLBACK.copy()
+CLAUDE_MODELS = CLAUDE_MODELS_FALLBACK.copy()
+if __name__ == "__main__":
+    # Example usage
+    import sys
+    if len(sys.argv) < 4:
+        print("Usage: python inference_commercial_api.py <provider> <api_key> <image_path>")
+        print("Providers: openai, gemini, claude")
+        sys.exit(1)
+    provider = sys.argv[1].lower()
+    api_key = sys.argv[2]
+    image_path = sys.argv[3]
+    # Load image
+    image = Image.open(image_path).convert("RGB")
+    # Initialize appropriate inference client
+    if provider == "openai":
+        api = OpenAIInference(api_key)
+    elif provider == "gemini":
+        api = GeminiInference(api_key)
+    elif provider == "claude":
+        api = ClaudeInference(api_key)
+    else:
+        print(f"Unknown provider: {provider}")
+        sys.exit(1)
+    # Transcribe
+    print(f"Transcribing with {provider}...")
+    text = api.transcribe(image)
+    print(f"\nResult: {text}")

inference_page.py ADDED Viewed

	@@ -0,0 +1,946 @@

+"""
+Whole-page OCR inference for Ukrainian handwritten text using TrOCR.
+This script performs line segmentation and transcription on unsegmented page images.
+Usage:
+    # Basic usage with checkpoint
+    python inference_page.py --image path/to/page.jpg --checkpoint models/ukrainian_model/checkpoint-3000
+    # With custom settings
+    python inference_page.py --image page.jpg --checkpoint checkpoint-3000 --num_beams 4 --output output.txt
+    # With Transkribus PAGE XML (uses existing segmentation)
+    python inference_page.py --image page.jpg --xml page.xml --checkpoint checkpoint-3000
+Future: Can be extended with a GUI using tkinter or PyQt.
+"""
+import argparse
+import torch
+from pathlib import Path
+from PIL import Image, ImageDraw
+import numpy as np
+from typing import List, Tuple, Optional
+import xml.etree.ElementTree as ET
+from dataclasses import dataclass
+import cv2
+# Disable PIL DecompressionBomb protection for large manuscript images
+Image.MAX_IMAGE_PIXELS = None
+# Optional: the hosted Hugging Face Space uses this module for segmentation, but
+# does not enable TrOCR inference. Avoid making transformers a startup dependency.
+try:
+    from transformers import VisionEncoderDecoderModel, TrOCRProcessor
+except ImportError:
+    VisionEncoderDecoderModel = None
+    TrOCRProcessor = None
+@dataclass
+class LineSegment:
+    """One text line cut out of a page image.
+    Only ``image`` and ``bbox`` are required; the remaining fields default to
+    ``None`` and are filled in by whichever stage produces them.
+    """
+    # Cropped image of the line
+    image: Image.Image
+    # Crop rectangle in page-image pixels
+    bbox: Tuple[int, int, int, int]  # x1, y1, x2, y2
+    coords: Optional[List[Tuple[int, int]]] = None  # polygon coordinates if available
+    text: Optional[str] = None  # transcription result
+    confidence: Optional[float] = None  # average confidence score (0-1)
+    char_confidences: Optional[List[float]] = None  # per-character confidence scores
+def sort_lines_by_region(regions, lines):
+    """
+    Sort lines in reading order: regions left-to-right, lines top-to-bottom
+    within each region.
+    Works with SegRegion objects from kraken_segmenter (which carry bbox and
+    line_ids) and any list of line-like objects that have a ``.bbox`` attribute
+    with (x1, y1, x2, y2) format.
+    Args:
+        regions: List of SegRegion (from kraken_segmenter) with .bbox and .line_ids.
+                 If empty/None, lines are returned sorted top-to-bottom.
+        lines:   List of LineSegment (or kraken LineSegment).
+    Returns:
+        List of lines re-ordered by region reading order.
+    """
+    if not regions or not lines:
+        # No region info — simple top-to-bottom sort
+        return sorted(lines, key=lambda l: l.bbox[1])
+    # Sort regions left-to-right by mean x-center
+    sorted_regions = sorted(
+        regions,
+        key=lambda r: (r.bbox[0] + r.bbox[2]) / 2,
+    )
+    # Assign each line to the region whose bbox contains the line's center
+    region_buckets = {r.id: [] for r in sorted_regions}
+    unassigned = []
+    for line in lines:
+        cx = (line.bbox[0] + line.bbox[2]) / 2
+        cy = (line.bbox[1] + line.bbox[3]) / 2
+        assigned = False
+        for r in sorted_regions:
+            rx1, ry1, rx2, ry2 = r.bbox
+            if rx1 <= cx <= rx2 and ry1 <= cy <= ry2:
+                region_buckets[r.id].append(line)
+                assigned = True
+                break
+        if not assigned:
+            unassigned.append(line)
+    # Build ordered list: per-region top-to-bottom, then unassigned at the end
+    ordered = []
+    for r in sorted_regions:
+        bucket = region_buckets[r.id]
+        bucket.sort(key=lambda l: l.bbox[1])
+        ordered.extend(bucket)
+    unassigned.sort(key=lambda l: l.bbox[1])
+    ordered.extend(unassigned)
+    return ordered
+def normalize_background(image: Image.Image) -> Image.Image:
+    """
+    Normalize background to light gray (similar to Efendiev dataset).
+    CRITICAL for Ukrainian dataset: Models trained on data with background
+    normalization MUST have normalization applied at inference time as well.
+    Args:
+        image: PIL Image with potentially aged/colored background
+    Returns:
+        PIL Image with normalized background
+    """
+    # Convert PIL to OpenCV format
+    img_array = np.array(image)
+    # Convert to LAB color space for better lighting normalization
+    lab = cv2.cvtColor(img_array, cv2.COLOR_RGB2LAB)
+    l, a, b = cv2.split(lab)
+    # Apply CLAHE (Contrast Limited Adaptive Histogram Equalization) to L channel
+    # This normalizes lighting variations across the image
+    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
+    l_normalized = clahe.apply(l)
+    # Merge back and convert to RGB
+    lab_normalized = cv2.merge([l_normalized, a, b])
+    rgb_normalized = cv2.cvtColor(lab_normalized, cv2.COLOR_LAB2RGB)
+    # Convert to grayscale to remove color variations (aged paper tones)
+    gray = cv2.cvtColor(rgb_normalized, cv2.COLOR_RGB2GRAY)
+    # Convert back to RGB with uniform background
+    # This creates a light gray background similar to Efendiev dataset
+    normalized_rgb = cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB)
+    return Image.fromarray(normalized_rgb)
+class LineSegmenter:
+    """Improved line segmentation using horizontal projection with multiple strategies."""
+    def __init__(self, min_line_height: int = 15, min_gap: int = 5,
+                 sensitivity: float = 0.02, use_morph: bool = True):
+        """
+        Initialize LineSegmenter.
+        Args:
+            min_line_height: Minimum height of a line in pixels (default: 15, lowered for tighter spacing)
+            min_gap: Minimum gap between lines in pixels (default: 5, lowered for tight spacing)
+            sensitivity: Threshold for detecting text (0.01-0.1, lower = more sensitive, default: 0.02)
+            use_morph: Apply morphological operations to clean up detection (default: True)
+        """
+        self.min_line_height = min_line_height
+        self.min_gap = min_gap
+        self.sensitivity = sensitivity
+        self.use_morph = use_morph
+    def segment_lines(self, image: Image.Image, debug: bool = False) -> List[LineSegment]:
+        """
+        Segment page image into text lines using horizontal projection.
+        Improved algorithm:
+        1. Multiple binarization strategies (Otsu + Sauvola for different scripts)
+        2. Morphological operations to connect broken text
+        3. Lower sensitivity threshold for tight line spacing
+        4. Smart gap detection based on local context
+        Args:
+            image: Input page image (PIL Image)
+            debug: If True, visualize segmentation
+        Returns:
+            List of LineSegment objects
+        """
+        # Convert to grayscale
+        gray = np.array(image.convert('L'))
+        # Try multiple binarization strategies and combine
+        from scipy.ndimage import gaussian_filter
+        blurred = gaussian_filter(gray, sigma=1.0)
+        # Strategy 1: Otsu's method (global threshold)
+        threshold_otsu = self._otsu_threshold(blurred)
+        binary_otsu = blurred < threshold_otsu
+        # Strategy 2: Adaptive threshold (local threshold, better for varying contrast)
+        binary_adaptive = self._adaptive_threshold(gray)
+        # Combine both strategies (logical OR to catch text in both)
+        binary = np.logical_or(binary_otsu, binary_adaptive)
+        # Apply morphological closing to connect broken characters
+        if self.use_morph:
+            from scipy.ndimage import binary_closing
+            # Horizontal structuring element to connect characters on same line
+            struct = np.ones((3, 5))  # 3 pixels tall, 5 pixels wide
+            binary = binary_closing(binary, structure=struct, iterations=2)
+        # Horizontal projection (sum of black pixels per row)
+        h_projection = binary.sum(axis=1)
+        # Adaptive threshold based on image statistics
+        # Use lower threshold for better sensitivity
+        if h_projection.max() > 0:
+            threshold = h_projection.max() * self.sensitivity
+        else:
+            # Fallback if no text detected
+            threshold = 1
+        is_text = h_projection > threshold
+        # Apply median filter to smooth out noise in projection
+        from scipy.ndimage import median_filter
+        is_text_smoothed = median_filter(is_text.astype(float), size=3) > 0.5
+        # Find continuous text regions with improved gap detection
+        lines = []
+        in_line = False
+        start_y = 0
+        gap_count = 0
+        for y in range(len(is_text_smoothed)):
+            if is_text_smoothed[y]:
+                if not in_line:
+                    # Start of new line
+                    start_y = y
+                    in_line = True
+                    gap_count = 0
+                else:
+                    # Continue line, reset gap counter
+                    gap_count = 0
+            else:
+                if in_line:
+                    # Potential gap - count consecutive gap pixels
+                    gap_count += 1
+                    if gap_count >= self.min_gap:
+                        # End of line (gap is large enough)
+                        end_y = y - gap_count
+                        if end_y - start_y >= self.min_line_height:
+                            lines.append((start_y, end_y))
+                        in_line = False
+                        gap_count = 0
+        # Don't forget last line if image ends with text
+        if in_line and len(is_text_smoothed) - start_y >= self.min_line_height:
+            lines.append((start_y, len(is_text_smoothed)))
+        # Post-process: Merge lines that are too close (likely one line split incorrectly)
+        merged_lines = self._merge_close_lines(lines, max_gap=self.min_gap * 2)
+        # Create LineSegment objects
+        segments = []
+        width = image.width
+        for y1, y2 in merged_lines:
+            # Add padding (larger padding for better context)
+            padding = 8
+            y1_pad = max(0, y1 - padding)
+            y2_pad = min(image.height, y2 + padding)
+            # Crop line (full width for now, could be refined with vertical projection)
+            bbox = (0, y1_pad, width, y2_pad)
+            line_img = image.crop(bbox)
+            segments.append(LineSegment(
+                image=line_img,
+                bbox=bbox
+            ))
+        if debug:
+            self._visualize_segmentation(image, segments, h_projection)
+        print(f"[LineSegmenter] Detected {len(segments)} lines (sensitivity={self.sensitivity}, min_height={self.min_line_height})")
+        return segments
+    @staticmethod
+    def _adaptive_threshold(gray: np.ndarray, block_size: int = 35) -> np.ndarray:
+        """
+        Apply adaptive thresholding using a local window.
+        Better for images with varying illumination or contrast.
+        """
+        # Use cv2 if available, otherwise fallback to simple method
+        try:
+            import cv2
+            # Adaptive Gaussian thresholding
+            binary = cv2.adaptiveThreshold(
+                gray.astype(np.uint8),
+                255,
+                cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+                cv2.THRESH_BINARY_INV,
+                block_size,
+                10
+            )
+            return binary > 0
+        except:
+            # Fallback: simple global threshold
+            threshold = np.mean(gray) - np.std(gray) * 0.5
+            return gray < threshold
+    @staticmethod
+    def _merge_close_lines(lines: List[Tuple[int, int]], max_gap: int = 10) -> List[Tuple[int, int]]:
+        """Merge lines that are very close together (likely one line split incorrectly)."""
+        if not lines:
+            return lines
+        merged = [lines[0]]
+        for y1, y2 in lines[1:]:
+            prev_y1, prev_y2 = merged[-1]
+            gap = y1 - prev_y2
+            if gap <= max_gap:
+                # Merge with previous line
+                merged[-1] = (prev_y1, y2)
+            else:
+                # Add as new line
+                merged.append((y1, y2))
+        return merged
+    @staticmethod
+    def _otsu_threshold(gray_array: np.ndarray) -> float:
+        """Compute Otsu's threshold."""
+        hist, bin_edges = np.histogram(gray_array, bins=256, range=(0, 256))
+        hist = hist.astype(float)
+        # Normalize
+        hist /= hist.sum()
+        # Cumulative sums
+        weight1 = np.cumsum(hist)
+        weight2 = np.cumsum(hist[::-1])[::-1]
+        # Cumulative means
+        mean1 = np.cumsum(hist * np.arange(256))
+        mean2 = (np.cumsum((hist * np.arange(256))[::-1])[::-1])
+        # Avoid division by zero
+        weight1 = np.clip(weight1, 1e-10, 1)
+        weight2 = np.clip(weight2, 1e-10, 1)
+        # Between-class variance
+        variance = weight1 * weight2 * ((mean1 / weight1) - (mean2 / weight2)) ** 2
+        return np.argmax(variance)
+    @staticmethod
+    def _visualize_segmentation(image: Image.Image, segments: List[LineSegment],
+                                h_projection: Optional[np.ndarray] = None):
+        """Visualize line segmentation for debugging."""
+        vis = image.copy()
+        draw = ImageDraw.Draw(vis)
+        for i, seg in enumerate(segments):
+            x1, y1, x2, y2 = seg.bbox
+            # Alternate colors for visibility
+            color = 'red' if i % 2 == 0 else 'blue'
+            draw.rectangle([x1, y1, x2, y2], outline=color, width=2)
+            draw.text((x1 + 5, y1 + 5), f"Line {i+1}", fill=color)
+        vis.show()
+        # Optionally show projection profile
+        if h_projection is not None:
+            import matplotlib.pyplot as plt
+            plt.figure(figsize=(12, 4))
+            plt.plot(h_projection)
+            plt.title("Horizontal Projection Profile")
+            plt.xlabel("Y Position")
+            plt.ylabel("Text Density")
+            plt.grid(True)
+            plt.show()
+class PageXMLSegmenter:
+    """Segment using existing Transkribus PAGE XML annotations."""
+    NS = {'page': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'}
+    def __init__(self, xml_path: str):
+        self.xml_path = Path(xml_path)
+    def segment_lines(self, image: Image.Image) -> List[LineSegment]:
+        """Extract lines using PAGE XML coordinates with correct reading order."""
+        tree = ET.parse(self.xml_path)
+        root = tree.getroot()
+        # Determine scale factors: PAGE XML stores absolute pixel coords for the
+        # original scan.  If the uploaded image was resized, we must scale coords.
+        ns = self.NS
+        # Try both common PAGE XML namespaces (2013 and 2019 Transkribus variants)
+        page_elem = root.find('.//page:Page', ns)
+        if page_elem is None:
+            ns_2019 = {'page': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15'}
+            page_elem = root.find('.//page:Page', ns_2019)
+            if page_elem is not None:
+                ns = ns_2019
+        xml_w = int(page_elem.get('imageWidth',  image.width))  if page_elem is not None else image.width
+        xml_h = int(page_elem.get('imageHeight', image.height)) if page_elem is not None else image.height
+        scale_x = image.width  / xml_w if xml_w > 0 else 1.0
+        scale_y = image.height / xml_h if xml_h > 0 else 1.0
+        # Will be populated below for visualization in the viewer
+        self.region_data: list = []
+        # Store regions with their reading order
+        regions_with_order = []
+        for region in root.findall('.//page:TextRegion', ns):
+            # Extract region reading order from custom attribute
+            region_order = self._extract_reading_order(region.get('custom', ''))
+            # Get region Y coordinate as fallback (from first TextLine or Coords)
+            region_y = self._get_region_y_position(region, ns)
+            # Store lines for this region with their reading order
+            lines_with_order = []
+            for text_line in region.findall('.//page:TextLine', ns):
+                # Get coordinates
+                coords_elem = text_line.find('page:Coords', ns)
+                if coords_elem is None:
+                    continue
+                coords_str = coords_elem.get('points')
+                if not coords_str:
+                    continue
+                # Parse coordinates and scale to uploaded image dimensions
+                coords = self._parse_coords(coords_str)
+                if scale_x != 1.0 or scale_y != 1.0:
+                    coords = [(int(x * scale_x), int(y * scale_y)) for x, y in coords]
+                x1, y1, x2, y2 = self._get_bounding_box(coords)
+                # Crop line with padding
+                padding = 5
+                x1_pad = max(0, x1 - padding)
+                y1_pad = max(0, y1 - padding)
+                x2_pad = min(image.width, x2 + padding)
+                y2_pad = min(image.height, y2 + padding)
+                bbox = (x1_pad, y1_pad, x2_pad, y2_pad)
+                line_img = image.crop(bbox)
+                segment = LineSegment(
+                    image=line_img,
+                    bbox=bbox,
+                    coords=coords
+                )
+                # Extract line reading order from custom attribute
+                line_order = self._extract_reading_order(text_line.get('custom', ''))
+                # Use line reading order if available, otherwise Y coordinate
+                sort_key = line_order if line_order is not None else y1
+                lines_with_order.append((sort_key, segment))
+            # Sort lines within this region
+            lines_with_order.sort(key=lambda x: x[0])
+            sorted_lines = [seg for _, seg in lines_with_order]
+            # Collect TextRegion bbox for viewer visualization
+            region_id = region.get('id', f'region_{len(regions_with_order)}')
+            region_coords_elem = region.find('page:Coords', ns)
+            if region_coords_elem is not None:
+                rc_str = region_coords_elem.get('points', '')
+                if rc_str:
+                    rc = self._parse_coords(rc_str)
+                    if scale_x != 1.0 or scale_y != 1.0:
+                        rc = [(int(x * scale_x), int(y * scale_y)) for x, y in rc]
+                    rx1, ry1, rx2, ry2 = self._get_bounding_box(rc)
+                    self.region_data.append({
+                        "id": region_id,
+                        "bbox": [rx1, ry1, rx2, ry2],
+                        "num_lines": len(sorted_lines),
+                    })
+            # Use region reading order if available, otherwise region Y position
+            region_sort_key = region_order if region_order is not None else region_y
+            regions_with_order.append((region_sort_key, sorted_lines))
+        # Sort regions by reading order (or Y position fallback)
+        regions_with_order.sort(key=lambda x: x[0])
+        # Flatten: concatenate all lines from all regions in order
+        segments = []
+        for _, region_lines in regions_with_order:
+            segments.extend(region_lines)
+        return segments
+    @staticmethod
+    def _extract_reading_order(custom_attr: str) -> Optional[int]:
+        """Extract reading order index from custom attribute.
+        Format: custom="readingOrder {index:5;}"
+        Returns: 5 (or None if not found/parseable)
+        """
+        if not custom_attr or 'readingOrder' not in custom_attr:
+            return None
+        try:
+            # Find "index:X;" pattern
+            start = custom_attr.index('index:') + 6
+            end = custom_attr.index(';', start)
+            return int(custom_attr[start:end])
+        except (ValueError, IndexError):
+            return None
+    def _get_region_y_position(self, region, ns=None) -> int:
+        """Get Y position of region for fallback sorting.
+        Uses the Y coordinate of the region's Coords or first TextLine.
+        """
+        if ns is None:
+            ns = self.NS
+        # Try region Coords first
+        coords_elem = region.find('page:Coords', ns)
+        if coords_elem is not None:
+            coords_str = coords_elem.get('points')
+            if coords_str:
+                coords = self._parse_coords(coords_str)
+                _, y1, _, _ = self._get_bounding_box(coords)
+                return y1
+        # Fallback: use first TextLine Y position
+        text_line = region.find('.//page:TextLine', ns)
+        if text_line is not None:
+            coords_elem = text_line.find('page:Coords', ns)
+            if coords_elem is not None:
+                coords_str = coords_elem.get('points')
+                if coords_str:
+                    coords = self._parse_coords(coords_str)
+                    _, y1, _, _ = self._get_bounding_box(coords)
+                    return y1
+        # Default fallback
+        return 0
+    @staticmethod
+    def _parse_coords(coords_str: str) -> List[Tuple[int, int]]:
+        """Parse coordinate string from PAGE XML."""
+        points = coords_str.split()
+        return [(int(p.split(',')[0]), int(p.split(',')[1])) for p in points]
+    @staticmethod
+    def _get_bounding_box(coords: List[Tuple[int, int]]) -> Tuple[int, int, int, int]:
+        """Get bounding box from polygon coordinates."""
+        xs = [p[0] for p in coords]
+        ys = [p[1] for p in coords]
+        return min(xs), min(ys), max(xs), max(ys)
+class TrOCRInference:
+    """Line-level TrOCR inference wrapper.
+    Loads a TrOCRProcessor and a VisionEncoderDecoderModel, either from the
+    HuggingFace Hub or from a local checkpoint directory, moves the model to
+    the selected device and offers helpers to transcribe one line image or a
+    list of line segments.
+    """
+    def __init__(self, model_path: str, device: Optional[str] = None,
+                 base_model: str = "kazars24/trocr-base-handwritten-ru",
+                 normalize_bg: bool = False,
+                 flip_rtl: bool = False,
+                 is_huggingface: bool = False):
+        """
+        Initialize TrOCR inference.
+        Args:
+            model_path: Path to local checkpoint or HuggingFace model ID
+            device: 'cuda', 'cpu', or None for auto-detect
+            base_model: Base model for processor (used with local checkpoints,
+                and as fallback when the processor cannot be loaded from model_path)
+            normalize_bg: Apply background normalization
+            flip_rtl: Flip line images horizontally for RTL scripts
+            is_huggingface: If True, load from HuggingFace Hub instead of local path
+        Raises:
+            ImportError: If the transformers classes are unavailable (module-level
+                names VisionEncoderDecoderModel / TrOCRProcessor are None).
+        """
+        self.model_path = model_path
+        self.base_model = base_model
+        self.normalize_bg = normalize_bg
+        self.flip_rtl = flip_rtl
+        self.is_huggingface = is_huggingface
+        # Auto-detect the device only when the caller did not choose one.
+        if device is None:
+            self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        else:
+            self.device = device
+        print(f"Loading model from {'HuggingFace Hub' if is_huggingface else 'local checkpoint'}: {model_path}...")
+        print(f"Using device: {self.device}")
+        print(f"Background normalization: {'Enabled' if self.normalize_bg else 'Disabled'}")
+        if VisionEncoderDecoderModel is None or TrOCRProcessor is None:
+            raise ImportError("TrOCR inference requires transformers to be installed")
+        if is_huggingface:
+            # Load both processor and model from HuggingFace Hub
+            print(f"Downloading from HuggingFace Hub (if not cached): {model_path}")
+            # Try to load processor from model first, fallback to base_model if it fails
+            try:
+                print(f"Attempting to load processor from {model_path}...")
+                self.processor = TrOCRProcessor.from_pretrained(model_path)
+                # Some models (e.g. dh-unibe/trocr-kurrent) ship a truncated tokenizer
+                # with only special tokens (vocab_size=5).  The model itself uses the full
+                # microsoft/trocr-base-handwritten vocabulary (50265 tokens).  Detect this
+                # by checking vocab_size and replace only the tokenizer – keep the image
+                # processor from the model so preprocessing stays correct.
+                if self.processor.tokenizer.vocab_size < 100:
+                    print(f"WARNING: tokenizer from '{model_path}' has vocab_size="
+                          f"{self.processor.tokenizer.vocab_size} (looks broken). "
+                          f"Replacing tokenizer with microsoft/trocr-base-handwritten.")
+                    _fallback = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
+                    self.processor.tokenizer = _fallback.tokenizer
+            except Exception as e:
+                # Any failure above (including loading the replacement tokenizer)
+                # ends up here and switches to the base model's processor.
+                print(f"Failed to load processor from model: {e}")
+                print(f"Falling back to base model processor: {self.base_model}")
+                self.processor = TrOCRProcessor.from_pretrained(self.base_model)
+            self.model = VisionEncoderDecoderModel.from_pretrained(
+                model_path, low_cpu_mem_usage=False)
+            # For backwards compatibility
+            # (here checkpoint_path is the model ID string, not a Path as in the local branch)
+            self.checkpoint_path = model_path
+        else:
+            # Load processor from base model, model from local checkpoint
+            self.checkpoint_path = Path(model_path)
+            # If model_path points to a specific file (e.g., model.safetensors),
+            # use the parent directory for from_pretrained()
+            if self.checkpoint_path.is_file():
+                model_dir = self.checkpoint_path.parent
+                print(f"Model path is a file, using directory: {model_dir}")
+            else:
+                model_dir = self.checkpoint_path
+            # Try to load processor from the local model first (correct tokenizer),
+            # fall back to base_model for old checkpoints that lack processor files.
+            try:
+                print(f"Attempting to load processor from local model: {model_dir}")
+                self.processor = TrOCRProcessor.from_pretrained(model_dir)
+            except Exception as e:
+                print(f"Local processor not found ({e}), falling back to base model: {self.base_model}")
+                self.processor = TrOCRProcessor.from_pretrained(self.base_model)
+            self.model = VisionEncoderDecoderModel.from_pretrained(
+                model_dir, low_cpu_mem_usage=False)
+        self.model.to(self.device)
+        # mBART decoder creates _float_tensor lazily on CPU; force it to the right device now.
+        for m in self.model.modules():
+            if hasattr(m, '_float_tensor'):
+                m._float_tensor = m._float_tensor.to(self.device)
+        # Inference only: switch off dropout and other training-time behaviour.
+        self.model.eval()
+        print("Model loaded successfully!")
+    def transcribe_line(self, line_image: Image.Image, num_beams: int = 4,
+                       max_length: int = 128, return_confidence: bool = False):
+        """
+        Transcribe a single line image.
+        Args:
+            line_image: PIL Image of text line
+            num_beams: Number of beams for beam search (higher = better quality, slower)
+            max_length: Maximum sequence length
+            return_confidence: If True, return a (text, confidence, token confidences) tuple
+        Returns:
+            If return_confidence=False: Transcribed text string
+            If return_confidence=True: Tuple of (text, confidence_score, char_confidences),
+            where confidence_score is the mean of char_confidences (0.0 if empty) and
+            char_confidences holds one probability per generated token, not per character
+        """
+        # Apply background normalization if enabled
+        if self.normalize_bg:
+            line_image = normalize_background(line_image)
+        # Flip horizontally for RTL scripts (model trained on flipped images)
+        if self.flip_rtl:
+            line_image = line_image.transpose(Image.FLIP_LEFT_RIGHT)
+        # Ensure image is in RGB mode (TrOCR requires 3 channels)
+        if line_image.mode != 'RGB':
+            line_image = line_image.convert('RGB')
+        # Prepare image
+        pixel_values = self.processor(
+            images=line_image,
+            return_tensors="pt"
+        ).pixel_values.to(self.device)
+        # Generate text with scores
+        with torch.no_grad():
+            if return_confidence:
+                # Generate with output scores for confidence
+                outputs = self.model.generate(
+                    pixel_values,
+                    num_beams=num_beams,
+                    max_length=max_length,
+                    early_stopping=True,
+                    output_scores=True,
+                    return_dict_in_generate=True
+                )
+                generated_ids = outputs.sequences
+                # Calculate confidence from scores
+                # scores is a tuple of tensors, one per generation step
+                # generated_ids shape: (batch_size, sequence_length)
+                if hasattr(outputs, 'scores') and outputs.scores and len(outputs.scores) > 0:
+                    import torch.nn.functional as F
+                    # Get the actual generated tokens (excluding special tokens like BOS)
+                    # generated_ids[0] is the first (and only) sequence in the batch
+                    generated_tokens = generated_ids[0].cpu().numpy()
+                    # scores is a tuple with one tensor per generation step
+                    # Each tensor has shape (batch_size * num_beams, vocab_size)
+                    token_confidences = []
+                    for step_idx, score_tensor in enumerate(outputs.scores):
+                        # Get probabilities for this generation step
+                        # score_tensor shape: (num_beams, vocab_size) for batch_size=1
+                        probs = F.softmax(score_tensor, dim=-1)
+                        # The actual generated token at this step
+                        # Skip BOS token (index 0), so generated token index is step_idx + 1
+                        if step_idx + 1 < len(generated_tokens):
+                            actual_token_id = generated_tokens[step_idx + 1]
+                            # Get probability of the actual selected token (from best beam, index 0)
+                            # NOTE(review): this always reads row 0 of the step scores. With
+                            # num_beams > 1 the returned sequence may have come from another
+                            # beam row at this step - confirm against the transformers
+                            # generate() documentation (beam_indices / compute_transition_scores).
+                            token_prob = probs[0, actual_token_id].item()
+                            token_confidences.append(token_prob)
+                    # Calculate average confidence
+                    avg_confidence = sum(token_confidences) / len(token_confidences) if token_confidences else 0.0
+                    char_confidences = token_confidences
+                else:
+                    # No scores returned: report zero confidence rather than failing.
+                    avg_confidence = 0.0
+                    char_confidences = []
+            else:
+                generated_ids = self.model.generate(
+                    pixel_values,
+                    num_beams=num_beams,
+                    max_length=max_length,
+                    early_stopping=True
+                )
+                avg_confidence = None
+                char_confidences = None
+        # Decode
+        text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        if return_confidence:
+            return text, avg_confidence, char_confidences
+        else:
+            return text
+    def transcribe_segments(self, segments: List[LineSegment],
+                          num_beams: int = 4, max_length: int = 128,
+                          show_progress: bool = True) -> List[LineSegment]:
+        """
+        Transcribe multiple line segments.
+        Each segment is transcribed one at a time via transcribe_line() without
+        confidence scores; the result is stored on the segment itself.
+        Args:
+            segments: List of LineSegment objects
+            num_beams: Beam search parameter
+            max_length: Max sequence length
+            show_progress: Show progress bar
+        Returns:
+            Updated segments with text field filled (the same list, modified in place)
+        """
+        if show_progress:
+            # Imported lazily so tqdm is only needed when a progress bar is requested.
+            from tqdm import tqdm
+            iterator = tqdm(segments, desc="Transcribing lines")
+        else:
+            iterator = segments
+        for segment in iterator:
+            segment.text = self.transcribe_line(
+                segment.image,
+                num_beams=num_beams,
+                max_length=max_length
+            )
+        return segments
+def main():
+    parser = argparse.ArgumentParser(
+        description="Whole-page OCR inference for Ukrainian handwritten text"
+    )
+    parser.add_argument(
+        '--image',
+        type=str,
+        required=True,
+        help='Path to input page image'
+    )
+    parser.add_argument(
+        '--checkpoint',
+        type=str,
+        required=True,
+        help='Path to TrOCR checkpoint directory'
+    )
+    parser.add_argument(
+        '--xml',
+        type=str,
+        default=None,
+        help='Optional: PAGE XML file for line segmentation (if not provided, automatic segmentation is used)'
+    )
+    parser.add_argument(
+        '--output',
+        type=str,
+        default=None,
+        help='Output text file (default: <image_name>_transcription.txt)'
+    )
+    parser.add_argument(
+        '--num_beams',
+        type=int,
+        default=4,
+        help='Number of beams for beam search (default: 4, higher=better quality but slower)'
+    )
+    parser.add_argument(
+        '--max_length',
+        type=int,
+        default=128,
+        help='Maximum sequence length (default: 128)'
+    )
+    parser.add_argument(
+        '--min_line_height',
+        type=int,
+        default=20,
+        help='Minimum line height for automatic segmentation (default: 20)'
+    )
+    parser.add_argument(
+        '--debug',
+        action='store_true',
+        help='Visualize line segmentation'
+    )
+    parser.add_argument(
+        '--device',
+        type=str,
+        default=None,
+        choices=['cuda', 'cpu'],
+        help='Device to use for inference (default: auto-detect)'
+    )
+    parser.add_argument(
+        '--base_model',
+        type=str,
+        default='kazars24/trocr-base-handwritten-ru',
+        help='Base model for processor (default: kazars24/trocr-base-handwritten-ru)'
+    )
+    parser.add_argument(
+        '--normalize-background',
+        action='store_true',
+        help='Apply background normalization (REQUIRED if model was trained with --normalize-background)'
+    )
+    parser.add_argument(
+        '--flip-rtl',
+        action='store_true',
+        help='Flip line images horizontally for RTL scripts (REQUIRED if model was trained with --flip-rtl)'
+    )
+    args = parser.parse_args()
+    print("=" * 80)
+    print("TrOCR Whole-Page Inference")
+    print("=" * 80)
+    print(f"Input image:  {args.image}")
+    print(f"Checkpoint:   {args.checkpoint}")
+    print(f"Segmentation: {'PAGE XML' if args.xml else 'Automatic'}")
+    print(f"Beam search:  {args.num_beams}")
+    print("=" * 80)
+    # Load image
+    print("\nLoading image...")
+    Image.MAX_IMAGE_PIXELS = None  # Allow large images
+    from PIL import ImageOps
+    image = Image.open(args.image)
+    image = ImageOps.exif_transpose(image)  # Fix EXIF orientation
+    image = image.convert('RGB')
+    print(f"Image size: {image.width}x{image.height}")
+    # Segment lines
+    print("\nSegmenting lines...")
+    if args.xml:
+        segmenter = PageXMLSegmenter(args.xml)
+        segments = segmenter.segment_lines(image)
+        print(f"Found {len(segments)} lines in PAGE XML")
+    else:
+        segmenter = LineSegmenter(
+            min_line_height=args.min_line_height
+        )
+        segments = segmenter.segment_lines(image, debug=args.debug)
+        print(f"Detected {len(segments)} lines")
+    if not segments:
+        print("ERROR: No lines detected!")
+        return
+    # Initialize TrOCR
+    print("\nInitializing TrOCR model...")
+    ocr = TrOCRInference(
+        args.checkpoint,
+        device=args.device,
+        base_model=args.base_model,
+        normalize_bg=args.normalize_background,  # NEW: pass normalization flag
+        flip_rtl=args.flip_rtl
+    )
+    # Transcribe
+    print(f"\nTranscribing {len(segments)} lines...")
+    segments = ocr.transcribe_segments(
+        segments,
+        num_beams=args.num_beams,
+        max_length=args.max_length
+    )
+    # Prepare output
+    transcription = "\n".join(seg.text for seg in segments if seg.text)
+    # Determine output path
+    if args.output:
+        output_path = Path(args.output)
+    else:
+        image_path = Path(args.image)
+        output_path = image_path.parent / f"{image_path.stem}_transcription.txt"
+    # Save
+    print(f"\nSaving transcription to {output_path}...")
+    with open(output_path, 'w', encoding='utf-8') as f:
+        f.write(transcription)
+    # Print results
+    print("\n" + "=" * 80)
+    print("TRANSCRIPTION RESULT")
+    print("=" * 80)
+    print(transcription)
+    print("=" * 80)
+    print(f"\nTranscription saved to: {output_path}")
+    print(f"Total lines: {len(segments)}")
+    print(f"Average confidence: N/A (not implemented yet)")
+# Run the command-line interface only when this file is executed as a script.
+if __name__ == '__main__':
+    main()

inference_pylaia_native.py ADDED Viewed

	@@ -0,0 +1,453 @@

+"""
+Native PyLaia Inference (No WSL)
+This module provides inference for PyLaia CRNN models trained with train_pylaia.py.
+It loads the PyTorch checkpoint directly and runs inference natively on Linux.
+"""
+import torch
+import torch.nn as nn
+from pathlib import Path
+from typing import Tuple, Optional, List
+from PIL import Image
+import torchvision.transforms as transforms
+import logging
+import json
+import os
+logger = logging.getLogger(__name__)
+class CRNN(nn.Module):
+    """
+    CRNN architecture (same as train_pylaia.py).
+    """
+    def __init__(
+        self,
+        img_height: int = 128,
+        num_channels: int = 1,
+        num_classes: int = 100,
+        cnn_filters: List[int] = [12, 24, 48, 48],
+        cnn_poolsize: List[int] = [2, 2, 0, 2],
+        rnn_hidden: int = 256,
+        rnn_layers: int = 3,
+        dropout: float = 0.5
+    ):
+        super(CRNN, self).__init__()
+        self.img_height = img_height
+        self.num_classes = num_classes
+        self.cnn_poolsize = cnn_poolsize
+        # CNN layers
+        cnn_layers = []
+        in_channels = num_channels
+        for i, out_channels in enumerate(cnn_filters):
+            cnn_layers.extend([
+                nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, dilation=1),
+                nn.BatchNorm2d(out_channels),
+                nn.LeakyReLU(0.2, inplace=True)
+            ])
+            if cnn_poolsize[i] > 0:
+                cnn_layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
+            in_channels = out_channels
+        self.cnn = nn.Sequential(*cnn_layers)
+        # Calculate RNN input size
+        num_pools = sum(1 for p in cnn_poolsize if p > 0)
+        cnn_output_height = img_height // (2 ** num_pools)
+        rnn_input_size = cnn_filters[-1] * cnn_output_height
+        # Bidirectional LSTM
+        self.rnn = nn.LSTM(
+            input_size=rnn_input_size,
+            hidden_size=rnn_hidden,
+            num_layers=rnn_layers,
+            dropout=dropout if rnn_layers > 1 else 0,
+            bidirectional=True,
+            batch_first=False
+        )
+        self.lin_dropout = nn.Dropout(dropout)
+        self.fc = nn.Linear(rnn_hidden * 2, num_classes)
+    def forward(self, x):
+        """
+        Args:
+            x: [batch, channels, height, width]
+        Returns:
+            log_probs: [width, batch, num_classes]
+        """
+        # CNN
+        conv = self.cnn(x)
+        # Reshape for RNN
+        batch, channels, height, width = conv.size()
+        conv = conv.permute(3, 0, 1, 2)  # [width, batch, channels, height]
+        conv = conv.reshape(width, batch, channels * height)
+        # RNN
+        rnn_out, _ = self.rnn(conv)
+        rnn_out = self.lin_dropout(rnn_out)
+        # Output projection
+        output = self.fc(rnn_out)
+        # Log softmax for CTC
+        log_probs = torch.nn.functional.log_softmax(output, dim=2)
+        return log_probs
+class PyLaiaInference:
+    """
+    Native PyLaia inference (no WSL dependency).
+    Loads PyTorch checkpoint directly and runs inference on Linux.
+    Builds the vocabulary from a symbols file (or from the checkpoint when it
+    carries its own idx2char), restores the CRNN weights and decodes line
+    images with greedy CTC decoding.
+    """
+    def __init__(self, checkpoint_path: str, syms_path: str = None, enable_spaces: bool = True):
+        """
+        Initialize PyLaia inference.
+        Args:
+            checkpoint_path: Path to .ckpt checkpoint file
+            syms_path: Path to symbols file. If None, symbols.txt / syms.txt next to
+                the checkpoint is used, then data/pylaia_glagolitic/syms.txt.
+            enable_spaces: If True, convert <space> tokens to actual spaces. If False, keep as <space>.
+        Raises:
+            FileNotFoundError: If the checkpoint or the symbols file does not exist.
+        """
+        self.enable_spaces = enable_spaces
+        self.checkpoint_path = Path(checkpoint_path)
+        if not self.checkpoint_path.exists():
+            raise FileNotFoundError(f"Checkpoint not found: {checkpoint_path}")
+        # Find symbols file
+        if syms_path is None:
+            # First: look alongside the checkpoint for symbols.txt or syms.txt
+            model_dir = self.checkpoint_path.parent
+            for _candidate in ("symbols.txt", "syms.txt"):
+                _candidate_path = model_dir / _candidate
+                if _candidate_path.exists():
+                    syms_path = _candidate_path
+                    logger.info(f"Found symbols file alongside checkpoint: {syms_path}")
+                    break
+        if syms_path is None:
+            # Last-resort fallback
+            syms_path = Path("data/pylaia_glagolitic/syms.txt")
+        self.syms_path = Path(syms_path)
+        if not self.syms_path.exists():
+            raise FileNotFoundError(f"Symbols file not found: {syms_path}")
+        # Load symbols (handle both list and KALDI formats)
+        # CRITICAL: Use rstrip('\n\r') not strip() to preserve leading/trailing whitespace in symbols (e.g., TAB)
+        with open(self.syms_path, 'r', encoding='utf-8') as f:
+            symbols_raw = [line.rstrip('\n\r') for line in f if line.rstrip('\n\r')]
+        # Auto-detect format: KALDI format has "symbol index" pairs
+        # (only the first line is inspected to decide the format)
+        if symbols_raw and ' ' in symbols_raw[0]:
+            parts = symbols_raw[0].split()
+            if len(parts) == 2 and parts[1].isdigit():
+                # KALDI format: "symbol index"
+                # Parse carefully to handle whitespace symbols (e.g., TAB at index 131)
+                self.symbols = []
+                for line in symbols_raw:
+                    # Get the last token (index)
+                    idx_str = line.split()[-1]
+                    if not idx_str.isdigit():
+                        continue
+                    # Symbol is everything before the last space + index
+                    symbol = line[:line.rfind(' ' + idx_str)]
+                    # The numeric index itself is not used: symbols are numbered by file order below.
+                    self.symbols.append(symbol)
+                logger.info(f"Detected KALDI format vocabulary")
+            else:
+                # List format (one symbol per line)
+                self.symbols = symbols_raw
+        else:
+            # List format (one symbol per line)
+            self.symbols = symbols_raw
+        # Remove <ctc> token if present (CTC blank is handled separately as index 0)
+        if self.symbols and self.symbols[0] == '<ctc>':
+            self.symbols = self.symbols[1:]
+            logger.info(f"Removed <ctc> token from vocabulary (using index 0 for CTC blank)")
+        # Create char-to-index mapping (0 reserved for CTC blank)
+        self.char2idx = {char: idx + 1 for idx, char in enumerate(self.symbols)}
+        self.idx2char = {idx: char for char, idx in self.char2idx.items()}
+        self.idx2char[0] = ''  # CTC blank
+        # Map <SPACE> or <space> to actual space (if enabled)
+        if self.enable_spaces:
+            if '<SPACE>' in self.char2idx:
+                space_idx = self.char2idx['<SPACE>']
+                self.idx2char[space_idx] = ' '
+            elif '<space>' in self.char2idx:
+                space_idx = self.char2idx['<space>']
+                self.idx2char[space_idx] = ' '
+        # Load checkpoint
+        logger.info(f"Loading PyLaia checkpoint: {checkpoint_path}")
+        # NOTE(review): weights_only=False lets torch.load unpickle arbitrary objects;
+        # only checkpoints from trusted sources should be loaded this way.
+        checkpoint = torch.load(self.checkpoint_path, map_location='cpu', weights_only=False)
+        # CRITICAL: If checkpoint has idx2char, use it instead of vocabulary file
+        # This handles models trained with different vocabulary parsing (strip vs rstrip)
+        if 'idx2char' in checkpoint:
+            logger.info(f"Using idx2char from checkpoint ({len(checkpoint['idx2char'])} characters)")
+            self.idx2char = checkpoint['idx2char']
+            self.char2idx = checkpoint.get('char2idx', {char: idx for idx, char in self.idx2char.items()})
+            # Still apply enable_spaces setting
+            if self.enable_spaces:
+                for idx, char in list(self.idx2char.items()):
+                    if char == '<SPACE>' or char == '<space>':
+                        self.idx2char[idx] = ' '
+        # Extract model state dict from checkpoint
+        # train_pylaia.py saves checkpoints with 'model_state_dict' key
+        # (falls back to 'state_dict', then to the checkpoint object itself)
+        state_dict = checkpoint.get('model_state_dict', checkpoint.get('state_dict', checkpoint))
+        # Infer number of classes from checkpoint (fc.weight shape is [num_classes, rnn_hidden*2])
+        fc_weight_shape = state_dict['fc.weight'].shape
+        num_classes = fc_weight_shape[0]
+        logger.info(f"Inferred {num_classes} output classes from checkpoint")
+        # self.symbols still reflects the symbols file here, even when the
+        # checkpoint's idx2char replaced the mappings above.
+        logger.info(f"Vocabulary has {len(self.symbols)} symbols (+ 1 blank = {len(self.symbols)+1} expected)")
+        # Initialize model
+        # Only num_classes comes from the checkpoint; the remaining hyperparameters
+        # are fixed here and must match the trained model because loading is strict.
+        self.model = CRNN(
+            img_height=128,
+            num_channels=1,
+            num_classes=num_classes,
+            cnn_filters=[12, 24, 48, 48],
+            cnn_poolsize=[2, 2, 0, 2],
+            rnn_hidden=256,
+            rnn_layers=3,
+            dropout=0.5
+        )
+        # Load weights
+        self.model.load_state_dict(state_dict, strict=True)
+        # Set device
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.model = self.model.to(self.device)
+        self.model.eval()
+        # Image preprocessing (same as training)
+        self.transform = transforms.Compose([
+            transforms.ToTensor(),
+            transforms.Normalize(mean=[0.5], std=[0.5])
+        ])
+        logger.info(f"Loaded PyLaia model with {num_classes} output classes")
+        logger.info(f"Using device: {self.device}")
+    def preprocess_image(self, image: Image.Image) -> torch.Tensor:
+        """
+        Preprocess image for inference.
+        Converts to grayscale, rescales to a height of 128 pixels keeping the
+        aspect ratio, then applies the tensor conversion and normalization.
+        Args:
+            image: PIL Image (RGB or grayscale)
+        Returns:
+            Preprocessed tensor [1, 1, height, width]
+        """
+        # Convert to grayscale
+        if image.mode != 'L':
+            image = image.convert('L')
+        # Resize to target height (128) while preserving aspect ratio
+        target_height = 128
+        aspect_ratio = image.width / image.height
+        # NOTE(review): int() truncates, so a very narrow image can yield new_width == 0;
+        # transcribe() would then end up in its exception handler - confirm this is intended.
+        new_width = int(target_height * aspect_ratio)
+        image = image.resize((new_width, target_height), Image.LANCZOS)
+        # Apply transforms
+        img_tensor = self.transform(image)  # [1, height, width]
+        img_tensor = img_tensor.unsqueeze(0)  # [1, 1, height, width]
+        return img_tensor
+    def decode_ctc(self, log_probs: torch.Tensor) -> Tuple[str, float]:
+        """
+        Decode CTC output using greedy decoding.
+        Args:
+            log_probs: [seq_len, 1, num_classes]
+        Returns:
+            Tuple of (decoded_text, confidence), where confidence is the mean
+            probability of the emitted characters (0.0 if nothing was emitted)
+        """
+        # Get most likely class at each time step
+        probs = torch.exp(log_probs)
+        _, pred_indices = torch.max(probs, dim=2)  # [seq_len, 1]
+        pred_indices = pred_indices.squeeze(1).cpu().numpy()  # [seq_len]
+        # CTC greedy decoding: remove consecutive duplicates and blanks
+        decoded_chars = []
+        prev_idx = -1
+        confidences = []
+        for t, idx in enumerate(pred_indices):
+            if idx != 0 and idx != prev_idx:  # Not blank and not duplicate
+                # Indices missing from idx2char (or mapped to '') are dropped silently.
+                char = self.idx2char.get(idx, '')
+                if char:
+                    decoded_chars.append(char)
+                    # Get confidence for this character
+                    # (probability at the first time step of the run)
+                    char_conf = probs[t, 0, idx].item()
+                    confidences.append(char_conf)
+            # Updated for blanks too, so a repeated character separated by a blank is kept.
+            prev_idx = idx
+        # Join characters
+        text = ''.join(decoded_chars)
+        # Average confidence
+        confidence = sum(confidences) / len(confidences) if confidences else 0.0
+        return text, confidence
+    def transcribe(self, image: Image.Image) -> Tuple[str, float]:
+        """
+        Transcribe a single line image.
+        Args:
+            image: PIL Image of text line
+        Returns:
+            Tuple of (transcription_text, confidence_score); ("", 0.0) if any
+            exception occurs during preprocessing, inference or decoding
+        """
+        try:
+            # Preprocess
+            img_tensor = self.preprocess_image(image).to(self.device)
+            # Forward pass
+            with torch.no_grad():
+                log_probs = self.model(img_tensor)  # [width, 1, num_classes]
+            # Decode
+            text, confidence = self.decode_ctc(log_probs)
+            return text, confidence
+        except Exception as e:
+            # Best effort: log the failure and return an empty result instead of raising.
+            logger.error(f"Error during PyLaia inference: {e}")
+            import traceback
+            traceback.print_exc()
+            return "", 0.0
+# Model registry (updated for trained models)
+# Maps a display name to a dict with:
+#   "checkpoint":  path to the model's best_model.pt
+#   "syms":        path to the symbols file used to build the vocabulary
+#   "description": human-readable summary
+# The dict is modified at import time by _register_hf_space_demo_models() and
+# _scan_pylaia_models() below.
+PYLAIA_MODELS = {
+    "Church Slavonic (2.89% CER)": {
+        "checkpoint": "models/pylaia_church_slavonic_20251103_222215/best_model.pt",
+        "syms": "models/pylaia_church_slavonic_20251103_222215/symbols.txt",
+        "description": "PyLaia CRNN - Church Slavonic manuscript (2.89% CER)"
+    },
+    "Prosta Mova (3.77% CER)": {
+        "checkpoint": "models/pylaia_prosta_mova_v4_20251121_155322/best_model.pt",
+        "syms": "models/pylaia_prosta_mova_v4_20251121_155322/symbols.txt",
+        "description": "PyLaia CRNN - Prosta Mova (3.77% CER)"
+    },
+    "Glagolitic (5.33% CER)": {
+        "checkpoint": "models/pylaia_glagolitic_with_spaces_20251102_182103/best_model.pt",
+        # Unlike the other entries, the symbols file lives under data/, not next to the checkpoint.
+        "syms": "data/pylaia_glagolitic/syms.txt",
+        "description": "PyLaia CRNN - Glagolitic manuscript (76 symbols, 5.33% CER)"
+    },
+    "Ukrainian (4.76% CER)": {
+        "checkpoint": "models/pylaia_ukrainian_v2c_20251124_180634/best_model.pt",
+        "syms": "models/pylaia_ukrainian_v2c_20251124_180634/symbols.txt",
+        "description": "PyLaia CRNN - Ukrainian manuscript (4.76% CER)"
+    },
+    "Ukrainian (13.53% CER - OLD)": {
+        "checkpoint": "models/pylaia_ukrainian_retrain_20251102_213431/best_model.pt",
+        "syms": "models/pylaia_ukrainian_retrain_20251102_213431/symbols.txt",
+        "description": "PyLaia CRNN - Ukrainian manuscript (180 symbols, 13.53% CER)"
+    },
+    "Glagolitic (old)": {
+        "checkpoint": "models/pylaia_glagolitic_single_gpu/best_model.pt",
+        "syms": "models/pylaia_glagolitic_single_gpu/symbols.txt",
+        "description": "PyLaia model - old Glagolitic training (no spaces)"
+    }
+}
+def _register_hf_space_demo_models() -> None:
+    """Add public Hugging Face CRNN-CTC presets for the hosted demo mode."""
+    if os.environ.get("POLYSCRIPTOR_DEMO_MODE") != "hf_space":
+        return
+    PYLAIA_MODELS.clear()
+    PYLAIA_MODELS.update({
+        "Ukrainian (HF, 4.76% CER)": {
+            "repo_id": "achimrabus/crnn-ctc-ukrainian",
+            "checkpoint": "best_model.pt",
+            "syms": "symbols.txt",
+            "description": "Public Hugging Face CRNN-CTC model for Ukrainian HTR",
+        },
+        "Prosta Mova (HF, 3.77% CER)": {
+            "repo_id": "achimrabus/crnn-ctc-prosta-mova",
+            "checkpoint": "best_model.pt",
+            "syms": "symbols.txt",
+            "description": "Public Hugging Face CRNN-CTC model for Prosta Mova HTR",
+        },
+        "Church Slavonic (HF, 2.89% CER)": {
+            "repo_id": "achimrabus/crnn-ctc-church-slavonic",
+            "checkpoint": "best_model.pt",
+            "syms": "symbols.txt",
+            "description": "Public Hugging Face CRNN-CTC model for Church Slavonic HTR",
+        },
+        "Glagolitic (HF, 5.33% CER)": {
+            "repo_id": "achimrabus/crnn-ctc-glagolitic",
+            "checkpoint": "best_model.pt",
+            "syms": "symbols.txt",
+            "description": "Public Hugging Face CRNN-CTC model for Glagolitic HTR",
+        },
+    })
+def _scan_pylaia_models(models_dir: str = "models") -> None:
+    """Scan models/ for CRNN-CTC checkpoints not already in PYLAIA_MODELS.
+    Any subdirectory containing best_model.pt that isn't already registered
+    is added automatically, using its folder name as the display key.
+    A co-located symbols.txt or syms.txt is used as the symbols file.
+    This lets users drop a trained model into models/ without editing the registry.
+    """
+    models_path = Path(models_dir)
+    if not models_path.is_dir():
+        return
+    registered = {
+        str(Path(info["checkpoint"])) if isinstance(info, dict) else str(Path(info))
+        for info in PYLAIA_MODELS.values()
+    }
+    for checkpoint in sorted(models_path.glob("*/best_model.pt")):
+        checkpoint_str = str(checkpoint)
+        if checkpoint_str in registered:
+            continue
+        model_dir = checkpoint.parent
+        folder_name = model_dir.name
+        if folder_name in PYLAIA_MODELS:
+            continue
+        syms_path = None
+        for candidate in ("symbols.txt", "syms.txt"):
+            candidate_path = model_dir / candidate
+            if candidate_path.exists():
+                syms_path = str(candidate_path)
+                break
+        PYLAIA_MODELS[folder_name] = {
+            "checkpoint": checkpoint_str,
+            "syms": syms_path,
+            "description": f"CRNN-CTC model (auto-discovered): {folder_name}",
+        }
+        logger.debug(f"Auto-discovered CRNN-CTC model: {folder_name}")
+# Finalise the registry at import time.  Order matters: in hosted demo mode the
+# first call replaces the hard-coded entries with the Hugging Face presets; the
+# scan then adds any checkpoints under models/ that are still unregistered.
+_register_hf_space_demo_models()
+_scan_pylaia_models()

kraken_segmenter.py ADDED Viewed

	@@ -0,0 +1,823 @@

+"""
+Kraken-based line segmentation for historical document OCR.
+This module provides an alternative to the classical HPP (Horizontal Projection Profile)
+segmentation using Kraken's pre-trained neural models.
+Supports two modes:
+- Classical: pageseg.segment() — fast, lines only, no regions
+- Neural (blla): blla.segment() — GPU-accelerated, returns regions AND baselines,
+  handles multi-column layouts
+"""
+import os
+import time
+from dataclasses import dataclass, field
+from typing import Any, List, Optional, NamedTuple, Tuple, Dict
+from PIL import Image
+import numpy as np
+# Module-level cache of loaded blla segmentation models.
+# Keys are (model_path, device) tuples; values are the loaded TorchVGSLModel
+# whose network has already been moved to that device.  Shared across all
+# KrakenLineSegmenter instances so that a model is loaded from disk only once
+# per process and device, even in batch processing loops.
+_MODEL_CACHE: Dict[Tuple[str, str], Any] = {}
+class LineSegment(NamedTuple):
+    """Represents a segmented text line.
+    The segmenters in this module build it as ``image = page.crop(bbox)``, so
+    ``bbox`` is expressed in page pixel coordinates.
+    """
+    image: Image.Image  # Crop of the page image covering ``bbox``
+    bbox: tuple  # (x1, y1, x2, y2)
+    baseline: Optional[List[tuple]] = None  # List of (x, y) points
+@dataclass
+class SegRegion:
+    """Represents a detected text region (column, marginalia, etc.)."""
+    id: str  # "r_<n>" (n starting at 1) when created by _build_regions
+    bbox: Tuple[int, int, int, int]  # (x1, y1, x2, y2)
+    line_ids: List[str] = field(default_factory=list)  # "l_<n>" ids of the lines owned by this region
+    polygon: Optional[List[Tuple[int, int]]] = None  # Convex hull or neural polygon
+    mode: str = "neural"  # "neural" or "classical"
+class KrakenLineSegmenter:
+    """
+    Line segmentation using Kraken with pre-trained models.
+    Kraken is specifically designed for historical document OCR and provides:
+    - Pre-trained models that work out-of-the-box
+    - Baseline detection (not just bounding boxes)
+    - Robust handling of degraded/faded text
+    - Support for rotated and multi-column layouts
+    Performance: ~3-8s per page (CPU), ~1-3s (GPU)
+    Accuracy: 90-95% on historical documents
+    """
+    def __init__(self, model_path: Optional[str] = None, device: str = "cpu"):
+        """
+        Initialize Kraken segmenter.
+        Args:
+            model_path: Path to custom segmentation model (.mlmodel file).
+                       Note: Kraken 5.x uses classical segmentation by default.
+                       Neural baseline segmentation requires additional setup.
+            device: 'cpu' or 'cuda' for GPU acceleration (not used by classical segmenter)
+        """
+        self.model_path = model_path
+        self.device = device
+        # Import kraken components
+        try:
+            from kraken import binarization, pageseg
+            self.binarization = binarization
+            self.pageseg = pageseg
+        except ImportError as e:
+            raise ImportError(
+                "Kraken is not installed. Install it with: pip install kraken\n"
+                f"Original error: {e}"
+            )
+        # Note: model_path is currently not used as pageseg.segment() doesn't accept models
+        # The classical segmentation algorithm is robust and works well for most documents
+        if model_path:
+            print(f"[KrakenSegmenter] Warning: Custom model path provided but not used.")
+            print(f"[KrakenSegmenter] Kraken 5.x pageseg.segment() uses classical algorithm.")
+            print(f"[KrakenSegmenter] Neural baseline segmentation requires kraken.lib.models workflow.")
+    def segment_lines(
+        self,
+        image: Image.Image,
+        text_direction: str = 'horizontal-lr',
+        use_binarization: bool = True
+    ) -> List[LineSegment]:
+        """
+        Segment image into text lines using Kraken.
+        Args:
+            image: PIL Image to segment
+            text_direction: Text direction - 'horizontal-lr' (left-to-right),
+                          'horizontal-rl', 'vertical-lr', 'vertical-rl'
+            use_binarization: Whether to apply neural binarization preprocessing
+                            (recommended for degraded documents)
+        Returns:
+            List of LineSegment objects sorted top to bottom
+        """
+        print(f"[KrakenSegmenter] Segmenting image (size={image.size}, mode={image.mode}, "
+              f"direction={text_direction}, binarize={use_binarization})")
+        try:
+            # Step 0: Convert to grayscale if needed (Kraken works better with grayscale)
+            if image.mode not in ('L', '1'):
+                print(f"[KrakenSegmenter] Converting from {image.mode} to grayscale...")
+                image = image.convert('L')
+            # Step 1: Binarize (required by pageseg.segment)
+            # pageseg.segment REQUIRES binary images
+            if use_binarization:
+                print(f"[KrakenSegmenter] Applying neural binarization...")
+                processed_img = self.binarization.nlbin(image)
+            else:
+                # Simple Otsu binarization as fallback
+                print(f"[KrakenSegmenter] Applying Otsu binarization...")
+                import numpy as np
+                from PIL import ImageOps
+                # Otsu's method
+                img_array = np.array(image)
+                threshold = np.median(img_array)  # Simple threshold
+                binary = img_array > threshold
+                processed_img = Image.fromarray((binary * 255).astype(np.uint8), mode='L')
+            # Step 2: Line segmentation using Kraken's classical algorithm
+            # This is more robust than basic HPP and works well on historical documents
+            print(f"[KrakenSegmenter] Running line segmentation...")
+            seg_result = self.pageseg.segment(
+                processed_img,
+                text_direction=text_direction
+            )
+            # Handle both dict (old Kraken) and Segmentation object (new Kraken)
+            if isinstance(seg_result, dict):
+                print(f"[KrakenSegmenter] pageseg.segment returned dict (old Kraken API)")
+                # Old API: seg_result is a dict with 'boxes' key
+                seg_lines = seg_result.get('boxes', seg_result.get('lines', []))
+            else:
+                print(f"[KrakenSegmenter] pageseg.segment returned Segmentation object")
+                seg_lines = seg_result.lines
+            print(f"[KrakenSegmenter] Processing {len(seg_lines)} lines...")
+            # Step 3: Extract line information
+            lines = []
+            for idx, line in enumerate(seg_lines):
+                # Extract bounding box
+                bbox = line.bbox  # (x_min, y_min, x_max, y_max)
+                # Extract baseline (list of (x, y) points)
+                baseline = line.baseline if hasattr(line, 'baseline') else None
+                # Crop line image from original (not binarized)
+                line_img = image.crop(bbox)
+                lines.append(LineSegment(
+                    image=line_img,
+                    bbox=bbox,
+                    baseline=baseline
+                ))
+            # Sort lines top to bottom by Y coordinate
+            lines = sorted(lines, key=lambda x: x.bbox[1])
+            print(f"[KrakenSegmenter] Detected {len(lines)} lines")
+            return lines
+        except Exception as e:
+            print(f"[KrakenSegmenter] ERROR: Segmentation failed: {e}")
+            import traceback
+            traceback.print_exc()
+            return []
+    def segment_with_regions(
+        self,
+        image: Image.Image,
+        model_path: Optional[str] = None,
+        device: Optional[str] = None,
+        min_line_height: int = 8,
+        max_columns: int = 4,
+        split_width_fraction: float = 0.40,
+        min_lines_to_split: int = 10,
+        text_direction: str = 'horizontal-lr',
+    ) -> Tuple[List[SegRegion], List[LineSegment]]:
+        """
+        Neural baseline segmentation using blla.segment().
+        Returns regions AND lines with baselines.  Handles multi-column layouts
+        by using blla's region detection, with a column-clustering fallback when
+        blla returns a single region with many lines (≥30).
+        Falls back to classical pageseg.segment() + column clustering if blla
+        fails or the model file is missing.
+        Args:
+            image: PIL Image to segment (RGB or grayscale)
+            model_path: Path to blla .mlmodel file.  Defaults to
+                        ``pagexml/blla.mlmodel`` relative to this script.
+            device: 'cpu' or 'cuda' / 'cuda:0'.  Defaults to self.device.
+            min_line_height: Discard lines shorter than this (pixels).
+            max_columns: Maximum number of columns to detect per region (1-8).
+            split_width_fraction: Minimum region width as fraction of page width
+                        to trigger sub-column splitting (0.0-1.0).  Lower values
+                        split narrower regions.  Default 0.40 (40%).
+                        For landscape double-page spreads, try 0.20 (20%).
+            min_lines_to_split: Minimum number of lines in a region before
+                        attempting to split it into sub-columns.
+        Returns:
+            (regions, lines) where *lines* carry a ``region_id`` attribute via
+            the companion ``SegRegion`` that owns them.
+        """
+        device = device or self.device
+        if model_path is None:
+            model_path = os.path.join(os.path.dirname(__file__), 'pagexml', 'blla.mlmodel')
+        print(f"[KrakenSegmenter] Neural segmentation (blla) on {image.size}, device={device}")
+        # ── Try neural (blla) first ──────────────────────────────────
+        if os.path.isfile(model_path):
+            try:
+                regions, lines = self._segment_neural(
+                    image, model_path, device, min_line_height,
+                    max_columns=max_columns,
+                    split_width_fraction=split_width_fraction,
+                    min_lines_to_split=min_lines_to_split,
+                    text_direction=text_direction,
+                )
+                if regions:
+                    print(f"[KrakenSegmenter] blla: {len(regions)} regions, {len(lines)} lines")
+                    return regions, lines
+                print("[KrakenSegmenter] blla returned no regions; falling back to classical + clustering")
+            except Exception as e:
+                print(f"[KrakenSegmenter] blla failed ({e}); falling back to classical + clustering")
+                import traceback
+                traceback.print_exc()
+        else:
+            print(f"[KrakenSegmenter] blla model not found at {model_path}; using classical fallback")
+        # ── Fallback: classical pageseg + column clustering ──────────
+        return self._segment_classical_with_regions(image, min_line_height)
+    # ── internal: neural blla ────────────────────────────────────────
+    def _segment_neural(
+        self,
+        image: Image.Image,
+        model_path: str,
+        device: str,
+        min_line_height: int,
+        max_columns: int = 4,
+        split_width_fraction: float = 0.40,
+        min_lines_to_split: int = 10,
+        text_direction: str = 'horizontal-lr',
+    ) -> Tuple[List[SegRegion], List[LineSegment]]:
+        """Run blla.segment() and build SegRegion / LineSegment lists.
+        Steps: load (or reuse) the blla model, run blla on an RGB copy of the
+        page, clip every detected line to the blla regions it overlaps, drop
+        pieces lower than *min_line_height*, repeatedly split wide regions into
+        sub-columns, then build the final regions and reading order.
+        Args:
+            image: Page image; line crops are taken from this image.
+            model_path: Path of the blla model file to load.
+            device: Torch device string; downgraded to 'cpu' when a CUDA device
+                is requested but CUDA is unavailable.
+            min_line_height: Minimum height (pixels) of a line piece to keep.
+            max_columns: Passed to the region splitter; also bounds the number
+                of splitting rounds.
+            split_width_fraction: Passed to the region splitter.
+            min_lines_to_split: Passed to the region splitter.
+            text_direction: Passed to blla, to region-box extraction and to
+                region ordering.
+        Returns:
+            (regions, ordered_lines) as produced by ``_build_regions``.
+        Exceptions from Kraken / torch are not caught here.
+        """
+        from kraken import blla
+        from kraken.lib import vgsl
+        import torch
+        start = time.time()
+        # Validate device
+        if device.startswith('cuda') and not torch.cuda.is_available():
+            print(f"[KrakenSegmenter] WARNING: device={device} but CUDA not available, falling back to cpu")
+            device = 'cpu'
+        # Load model once and cache keyed by (path, device) — repeated calls
+        # reuse the already-loaded, already-placed model. Keying by device means
+        # a CPU and a CUDA instance don't share the same cached object.
+        cache_key = (model_path, device)
+        if cache_key not in _MODEL_CACHE:
+            print(f"[KrakenSegmenter] Loading blla model: {model_path}")
+            m = vgsl.TorchVGSLModel.load_model(model_path)
+            # blla.segment()'s device= parameter does NOT move the model —
+            # it must be placed on the target device explicitly before the call.
+            m.nn.to(device)
+            _MODEL_CACHE[cache_key] = m
+        model = _MODEL_CACHE[cache_key]
+        # Diagnostic: confirm model parameters are on the expected device.
+        try:
+            actual_device = next(model.nn.parameters()).device
+            print(f"[KrakenSegmenter] blla model on: {actual_device} (requested: {device})")
+            if device.startswith('cuda') and actual_device.type != 'cuda':
+                print(f"[KrakenSegmenter] WARNING: model is on {actual_device}, not GPU")
+        except Exception:
+            # Purely informational; never let the diagnostic abort segmentation.
+            print(f"[KrakenSegmenter] blla running on device={device}")
+        # blla wants RGB
+        img = image.convert('RGB') if image.mode != 'RGB' else image
+        # blla has built-in autocast support (disabled by default). Enable it
+        # on CUDA for faster fp16 forward pass.
+        baseline_seg = blla.segment(img, model=model, device=device,
+                                    autocast=device.startswith('cuda'),
+                                    text_direction=text_direction)
+        # Only the page width is used below; h is currently unused.
+        w, h = image.size
+        seg_lines: List[LineSegment] = []
+        # region_id -> {'lines': [...], 'blla_region': ...}
+        regions_dict: Dict[str, dict] = {}
+        # Extract blla region bounding boxes for cross-column line splitting.
+        # blla sometimes draws baselines that span multiple columns at the same
+        # vertical position.  Using region boundaries we can clip or split such
+        # lines so that each crop stays within one column.
+        blla_boxes = self._extract_blla_region_boxes(baseline_seg, text_direction)
+        if blla_boxes:
+            print(f"[KrakenSegmenter] blla detected {len(blla_boxes)} text regions "
+                  f"— will clip lines to region boundaries")
+        # idx is not used in the loop body.
+        for idx, line in enumerate(baseline_seg.lines):
+            bbox = self._extract_bbox(line)
+            if bbox is None:
+                continue
+            baseline = (
+                [(int(p[0]), int(p[1])) for p in line.baseline]
+                if hasattr(line, 'baseline') and line.baseline
+                else None
+            )
+            if blla_boxes:
+                # Find which detected regions this line's bbox overlaps.
+                overlapping = self._overlapping_blla_boxes(bbox, blla_boxes)
+            else:
+                overlapping = []
+            if not overlapping:
+                # No region overlap or no regions at all — fall back to
+                # centre-based assignment and keep the original bbox.
+                region_id, blla_region = self._find_region_for_line(
+                    bbox, line, baseline_seg
+                )
+                sub_bboxes = [(bbox, region_id, blla_region)]
+            else:
+                # Clip / split the line at each overlapping region boundary.
+                # One piece is produced per overlapping region.
+                sub_bboxes = []
+                for rx1, ry1, rx2, ry2, region_obj, region_key in overlapping:
+                    clipped = (
+                        max(bbox[0], rx1), max(bbox[1], ry1),
+                        min(bbox[2], rx2), min(bbox[3], ry2),
+                    )
+                    sub_bboxes.append((clipped, region_key, region_obj))
+            for clipped_bbox, region_key, region_obj in sub_bboxes:
+                cx1, cy1, cx2, cy2 = clipped_bbox
+                # Skip degenerate (empty or inverted) boxes.
+                if cx2 <= cx1 or cy2 <= cy1:
+                    continue
+                # Filter tiny lines (after possible clamping)
+                if (cy2 - cy1) < min_line_height:
+                    continue
+                line_img = image.crop(clipped_bbox)
+                # NOTE: the baseline is stored unclipped, even when the bbox
+                # was clipped to a region, so every piece of a split line
+                # carries the full original baseline.
+                seg_line = LineSegment(image=line_img, bbox=clipped_bbox, baseline=baseline)
+                seg_lines.append(seg_line)
+                if region_key not in regions_dict:
+                    regions_dict[region_key] = {'lines': [], 'blla_region': region_obj}
+                # Store the line together with its index in seg_lines.
+                regions_dict[region_key]['lines'].append((len(seg_lines) - 1, seg_line))
+        # Sub-split wide regions that likely contain multiple columns.
+        # blla often detects "left page" and "right page" as two regions on a
+        # double-page spread, but each page may have 2 columns internally.
+        # Loop until convergence: a single pass may leave wide sub-regions that
+        # need further splitting (e.g. a 3-column area assigned as one bucket).
+        for _round in range(max_columns):
+            prev_size = len(regions_dict)
+            regions_dict = self._split_wide_regions(
+                regions_dict, w,
+                min_lines_to_split=min_lines_to_split,
+                split_width_fraction=split_width_fraction,
+                max_columns=max_columns,
+            )
+            if len(regions_dict) == prev_size:
+                break  # no new splits — converged
+        # Build SegRegion objects
+        regions, ordered_lines = self._build_regions(regions_dict, seg_lines, w,
+                                                      text_direction=text_direction)
+        elapsed = time.time() - start
+        print(f"[KrakenSegmenter] blla completed in {elapsed:.2f}s")
+        return regions, ordered_lines
+    # ── internal: classical fallback with column clustering ──────────
+    def segment_classical_with_regions(
+        self,
+        image: Image.Image,
+        min_line_height: int = 15,
+        max_columns: int = 4,
+    ) -> Tuple[List[SegRegion], List[LineSegment]]:
+        """Public wrapper: classical pageseg + heuristic column clustering."""
+        return self._segment_classical_with_regions(image, min_line_height, max_columns)
+    def _segment_classical_with_regions(
+        self,
+        image: Image.Image,
+        min_line_height: int = 15,
+        max_columns: int = 4,
+    ) -> Tuple[List[SegRegion], List[LineSegment]]:
+        """Classical pageseg + heuristic column clustering."""
+        raw_lines = self.segment_lines(image)
+        if not raw_lines:
+            return [], []
+        # Filter small lines
+        raw_lines = [l for l in raw_lines if (l.bbox[3] - l.bbox[1]) >= min_line_height]
+        w = image.size[0]
+        # Cluster into columns (pass max_columns so 4-column spreads are handled)
+        regions_dict = self._cluster_into_columns(raw_lines, w, max_columns=max_columns)
+        regions, ordered_lines = self._build_regions(regions_dict, raw_lines, w)
+        for r in regions:
+            r.mode = "classical"
+        return regions, ordered_lines
+    # ── helpers ───────────────────────────────────────────────────────
+    @staticmethod
+    def _extract_bbox(line) -> Optional[Tuple[int, int, int, int]]:
+        """Extract (x1,y1,x2,y2) bbox from a blla line object."""
+        if hasattr(line, 'bbox'):
+            return tuple(int(v) for v in line.bbox)
+        if hasattr(line, 'baseline') and line.baseline:
+            xs = [p[0] for p in line.baseline]
+            ys = [p[1] for p in line.baseline]
+            avg_h = 30
+            return (int(min(xs)), int(min(ys) - avg_h // 2),
+                    int(max(xs)), int(max(ys) + avg_h // 2))
+        return None
+    @staticmethod
+    def _find_region_for_line(bbox, line, baseline_seg) -> Tuple[str, object]:
+        """Determine which blla region a line belongs to."""
+        # Check tags first
+        if hasattr(line, 'tags') and isinstance(line.tags, dict):
+            rtype = line.tags.get('type')
+            if rtype and isinstance(rtype, str):
+                return rtype, None
+        # Check region boundaries
+        if hasattr(baseline_seg, 'regions') and baseline_seg.regions:
+            cx = (bbox[0] + bbox[2]) // 2
+            cy = (bbox[1] + bbox[3]) // 2
+            for rtype, region_list in baseline_seg.regions.items():
+                for ri, region in enumerate(region_list):
+                    if hasattr(region, 'boundary') and region.boundary:
+                        bxs = [p[0] for p in region.boundary]
+                        bys = [p[1] for p in region.boundary]
+                        if (min(bxs) <= cx <= max(bxs) and
+                                min(bys) <= cy <= max(bys)):
+                            return f"{rtype}_{ri}", region
+        return 'r_1', None
+    @staticmethod
+    def _extract_blla_region_boxes(
+        baseline_seg,
+        text_direction: str = 'horizontal-lr',
+    ) -> List[Tuple[int, int, int, int, object, str]]:
+        """
+        Build a sorted list of (x1, y1, x2, y2, region_obj, region_key) tuples
+        from blla's detected regions.  Used to clip / split lines that cross
+        column boundaries.  Returns an empty list when no region boundaries are
+        available.
+        """
+        boxes: List[Tuple[int, int, int, int, object, str]] = []
+        if not (hasattr(baseline_seg, 'regions') and baseline_seg.regions):
+            return boxes
+        for rtype, region_list in baseline_seg.regions.items():
+            for ri, region in enumerate(region_list):
+                if not (hasattr(region, 'boundary') and region.boundary):
+                    continue
+                bxs = [p[0] for p in region.boundary]
+                bys = [p[1] for p in region.boundary]
+                boxes.append((
+                    int(min(bxs)), int(min(bys)),
+                    int(max(bxs)), int(max(bys)),
+                    region, f"{rtype}_{ri}",
+                ))
+        rtl = text_direction.endswith('-rl')
+        boxes.sort(key=lambda t: t[0], reverse=rtl)
+        return boxes
+    @staticmethod
+    def _overlapping_blla_boxes(
+        bbox: Tuple[int, int, int, int],
+        blla_boxes: List[Tuple[int, int, int, int, object, str]],
+    ) -> List[Tuple[int, int, int, int, object, str]]:
+        """
+        Return the blla region boxes whose bbox overlaps with *bbox*.
+        Overlap requires intersection in both x and y.
+        """
+        x1, y1, x2, y2 = bbox
+        result = []
+        for rb in blla_boxes:
+            rx1, ry1, rx2, ry2 = rb[0], rb[1], rb[2], rb[3]
+            if rx1 < x2 and rx2 > x1 and ry1 < y2 and ry2 > y1:
+                result.append(rb)
+        return result
+    @staticmethod
+    def _estimate_columns(
+        lines: list,
+        page_w: int,
+        max_columns: int = 4,
+        min_gap_fraction: float = 0.03,
+    ) -> List[int]:
+        """
+        Gap-based column clustering.
+        Finds natural breaks in the x-center distribution by looking for the
+        largest gaps in the sorted sequence of line x-centers.  This is more
+        robust than histogram peak-finding for closely spaced columns, because
+        a column gap is a region with *no* line centers — it shows up as a large
+        jump in the sorted sequence regardless of how close the columns are.
+        Args:
+            lines:             List of LineSegment objects.
+            page_w:            Width of the region being analysed (pixels).
+            max_columns:       Maximum number of columns to return (≥1).
+            min_gap_fraction:  Minimum gap size as a fraction of *page_w* to be
+                               considered a column boundary.  Default 0.03 (3%).
+                               Increase if spurious splits occur within a column.
+        """
+        if not lines:
+            return []
+        # Lines wider than 60% of the region are likely headers/footers that
+        # span columns — exclude them from clustering to avoid false splits.
+        orig_centers = [((l.bbox[0] + l.bbox[2]) // 2) for l in lines]
+        line_widths = [(l.bbox[2] - l.bbox[0]) for l in lines]
+        clustering_centers = [
+            cx for cx, w in zip(orig_centers, line_widths)
+            if w < 0.60 * page_w
+        ]
+        if not clustering_centers:
+            # All lines are wide (e.g. single full-width text block)
+            return [0] * len(lines)
+        min_gap_px = max(10, int(min_gap_fraction * page_w))
+        sorted_cx = sorted(clustering_centers)
+        # Compute gaps between consecutive sorted x-centers
+        gaps = [
+            (sorted_cx[i + 1] - sorted_cx[i], (sorted_cx[i] + sorted_cx[i + 1]) // 2)
+            for i in range(len(sorted_cx) - 1)
+            if sorted_cx[i + 1] - sorted_cx[i] >= min_gap_px
+        ]
+        if not gaps:
+            return [0] * len(lines)
+        # Take the largest max_columns-1 gaps as column boundaries
+        split_midpoints = sorted(
+            mid for _, mid in sorted(gaps, reverse=True)[: max_columns - 1]
+        )
+        # Assign each line (using original center) to a column
+        assignments = []
+        for cx in orig_centers:
+            col = sum(1 for sp in split_midpoints if cx > sp)
+            assignments.append(col)
+        return assignments
+    def _split_wide_regions(
+        self,
+        regions_dict: Dict[str, dict],
+        page_w: int,
+        min_lines_to_split: int = 10,
+        split_width_fraction: float = 0.40,
+        max_columns: int = 4,
+    ) -> Dict[str, dict]:
+        """
+        Split blla regions that are wide enough to contain multiple columns.
+        A region whose width exceeds *split_width_fraction* of the page width
+        and has enough lines is run through column clustering internally.
+        For landscape double-page spreads, lower split_width_fraction (e.g. 0.20)
+        to trigger splitting on narrower regions.
+        """
+        new_dict: Dict[str, dict] = {}
+        split_counter = 0
+        for key, rdata in regions_dict.items():
+            region_lines = rdata['lines']  # list of (idx, LineSegment)
+            if len(region_lines) < min_lines_to_split:
+                new_dict[key] = rdata
+                continue
+            # Compute region width from line bboxes
+            bboxes = [l.bbox for _, l in region_lines]
+            rx1 = min(b[0] for b in bboxes)
+            rx2 = max(b[2] for b in bboxes)
+            region_w = rx2 - rx1
+            if region_w < split_width_fraction * page_w:
+                # Narrow enough to be a single column
+                new_dict[key] = rdata
+                continue
+            # Wide region — try column clustering within it.
+            # _estimate_columns bins x-centers into [0, page_w), so we need to
+            # shift line coordinates so that rx1 maps to 0.
+            just_lines = [l for _, l in region_lines]
+            shifted_lines = []
+            for l in just_lines:
+                shifted_bbox = (l.bbox[0] - rx1, l.bbox[1],
+                                l.bbox[2] - rx1, l.bbox[3])
+                shifted_lines.append(LineSegment(l.image, shifted_bbox, l.baseline))
+            assignments = self._estimate_columns(shifted_lines, page_w=region_w,
+                                                  max_columns=max_columns)
+            n_cols = len(set(assignments))
+            if n_cols <= 1:
+                # Clustering didn't find multiple columns
+                new_dict[key] = rdata
+                continue
+            print(f"[KrakenSegmenter] Splitting region '{key}' ({len(region_lines)} lines, "
+                  f"width={region_w}px) into {n_cols} sub-columns")
+            # Re-compute x-centers relative to region left edge for clustering
+            # (already done inside _estimate_columns via absolute coords, which
+            # works fine since columns are spatially separated)
+            for col_id in sorted(set(assignments)):
+                sub_key = f"{key}_col{split_counter}"
+                split_counter += 1
+                sub_lines = [
+                    region_lines[i]
+                    for i, a in enumerate(assignments)
+                    if a == col_id
+                ]
+                new_dict[sub_key] = {'lines': sub_lines, 'blla_region': None}
+        return new_dict
+    def _cluster_into_columns(
+        self,
+        lines: list,
+        page_w: int,
+        max_columns: int = 4,
+    ) -> Dict[str, dict]:
+        """Cluster lines into columns and return regions_dict."""
+        assignments = self._estimate_columns(lines, page_w, max_columns=max_columns)
+        regions_dict: Dict[str, dict] = {}
+        for idx, (col, line) in enumerate(zip(assignments, lines)):
+            key = f"col_{col}"
+            if key not in regions_dict:
+                regions_dict[key] = {'lines': [], 'blla_region': None}
+            regions_dict[key]['lines'].append((idx, line))
+        return regions_dict
+    @staticmethod
+    def _convex_hull(points: List[Tuple[int, int]]) -> List[Tuple[int, int]]:
+        """Monotonic chain convex hull."""
+        pts = sorted(set(points))
+        if len(pts) <= 2:
+            return pts
+        def cross(o, a, b):
+            return (a[0] - o[0]) * (b[1] - o[1]) - (a[1] - o[1]) * (b[0] - o[0])
+        lower = []
+        for p in pts:
+            while len(lower) >= 2 and cross(lower[-2], lower[-1], p) <= 0:
+                lower.pop()
+            lower.append(p)
+        upper = []
+        for p in reversed(pts):
+            while len(upper) >= 2 and cross(upper[-2], upper[-1], p) <= 0:
+                upper.pop()
+            upper.append(p)
+        return lower[:-1] + upper[:-1]
+    def _build_regions(
+        self,
+        regions_dict: Dict[str, dict],
+        all_lines: list,
+        page_w: int,
+        text_direction: str = 'horizontal-lr',
+    ) -> Tuple[List[SegRegion], List[LineSegment]]:
+        """
+        Build SegRegion objects from regions_dict.
+        Returns (regions, ordered_lines) where ordered_lines is sorted by
+        region (left-to-right for LTR, right-to-left for RTL) then
+        top-to-bottom within each region.
+        """
+        rtl = text_direction.endswith('-rl')
+        # Sort regions by mean x-center: LTR = ascending, RTL = descending
+        def _region_mean_x(item):
+            lines = item[1]['lines']
+            if not lines:
+                return 0
+            return sum((l.bbox[0] + l.bbox[2]) / 2 for _, l in lines) / len(lines)
+        sorted_regions = sorted(regions_dict.items(), key=_region_mean_x, reverse=rtl)
+        regions: List[SegRegion] = []
+        ordered_lines: List[LineSegment] = []
+        for ri, (region_key, rdata) in enumerate(sorted_regions, start=1):
+            region_lines = rdata['lines']
+            blla_region = rdata['blla_region']
+            # Sort lines top-to-bottom within region
+            region_lines.sort(key=lambda item: item[1].bbox[1])
+            region_id = f"r_{ri}"
+            line_ids = [f"l_{i + 1}" for i, _ in region_lines]
+            bboxes = [l.bbox for _, l in region_lines]
+            rbbox = (
+                min(b[0] for b in bboxes),
+                min(b[1] for b in bboxes),
+                max(b[2] for b in bboxes),
+                max(b[3] for b in bboxes),
+            )
+            # Polygon: prefer blla boundary, else convex hull
+            polygon = None
+            if blla_region and hasattr(blla_region, 'boundary') and blla_region.boundary:
+                polygon = [(int(p[0]), int(p[1])) for p in blla_region.boundary]
+            else:
+                pts = []
+                for _, l in region_lines:
+                    x1, y1, x2, y2 = l.bbox
+                    pts.extend([(x1, y1), (x2, y1), (x2, y2), (x1, y2)])
+                hull = self._convex_hull(pts)
+                polygon = hull if len(hull) >= 3 else None
+            regions.append(SegRegion(
+                id=region_id,
+                bbox=rbbox,
+                line_ids=line_ids,
+                polygon=polygon,
+            ))
+            for _, line in region_lines:
+                ordered_lines.append(line)
+        return regions, ordered_lines
+    def segment_lines_to_dict(
+        self,
+        image: Image.Image,
+        text_direction: str = 'horizontal-lr',
+        use_binarization: bool = True
+    ) -> List[dict]:
+        """
+        Segment image and return results as dictionaries (for compatibility).
+        Returns:
+            List of dicts with 'image', 'bbox', and 'baseline' keys
+        """
+        segments = self.segment_lines(image, text_direction, use_binarization)
+        return [
+            {
+                'image': seg.image,
+                'bbox': seg.bbox,
+                'baseline': seg.baseline
+            }
+            for seg in segments
+        ]
+def test_kraken_segmenter():
+    """Test Kraken segmenter on a sample image."""
+    import sys
+    if len(sys.argv) < 2:
+        print("Usage: python kraken_segmenter.py <image_path>")
+        sys.exit(1)
+    image_path = sys.argv[1]
+    print(f"Testing Kraken segmenter on: {image_path}")
+    # Load image
+    image = Image.open(image_path)
+    print(f"Image size: {image.size}")
+    # Create segmenter
+    segmenter = KrakenLineSegmenter()
+    # Segment lines
+    lines = segmenter.segment_lines(image, use_binarization=True)
+    # Print results
+    print(f"\nDetected {len(lines)} lines:")
+    for i, line in enumerate(lines):
+        print(f"  Line {i+1}: bbox={line.bbox}, "
+              f"baseline_points={len(line.baseline) if line.baseline else 0}")
+    # Save line images
+    import os
+    output_dir = "kraken_test_output"
+    os.makedirs(output_dir, exist_ok=True)
+    for i, line in enumerate(lines):
+        output_path = os.path.join(output_dir, f"line_{i+1:03d}.png")
+        line.image.save(output_path)
+    print(f"\nLine images saved to: {output_dir}/")
+# Run the manual smoke test when this file is executed as a script.
+if __name__ == "__main__":
+    test_kraken_segmenter()

page_xml_exporter.py ADDED Viewed

	@@ -0,0 +1,276 @@

+"""
+PAGE XML Exporter
+Exports line segmentation and transcription data to PAGE XML format.
+Compatible with party and other PAGE XML processors.
+"""
+import xml.etree.ElementTree as ET
+from xml.dom import minidom
+from pathlib import Path
+from typing import List, Optional
+from datetime import datetime
+from inference_page import LineSegment
+class PageXMLExporter:
+    """Export line segmentation data to PAGE XML format."""
+    # PAGE XML namespace
+    NAMESPACE = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"
+    def __init__(self, image_path: str, image_width: int, image_height: int):
+        """
+        Initialize PAGE XML exporter.
+        Args:
+            image_path: Path to the page image file
+            image_width: Width of the page image in pixels
+            image_height: Height of the page image in pixels
+        """
+        self.image_path = Path(image_path)
+        self.image_width = image_width
+        self.image_height = image_height
+    def _make_root(self, creator: str, comments: Optional[str]) -> tuple:
+        """Build root PcGts element with Metadata and Page. Returns (root, page)."""
+        ET.register_namespace('', self.NAMESPACE)
+        root = ET.Element('PcGts', {
+            'xmlns': self.NAMESPACE,
+            'xmlns:xsi': 'http://www.w3.org/2001/XMLSchema-instance',
+            'xsi:schemaLocation': (
+                f'{self.NAMESPACE} '
+                'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15/pagecontent.xsd'
+            ),
+            'pcGtsId': f'pc-{self.image_path.stem}'
+        })
+        metadata = ET.SubElement(root, 'Metadata')
+        ET.SubElement(metadata, 'Creator').text = creator
+        ET.SubElement(metadata, 'Created').text = datetime.now().isoformat()
+        ET.SubElement(metadata, 'LastChange').text = datetime.now().isoformat()
+        if comments:
+            ET.SubElement(metadata, 'Comments').text = comments
+        page = ET.SubElement(root, 'Page', {
+            'imageFilename': str(self.image_path.name),
+            'imageWidth': str(self.image_width),
+            'imageHeight': str(self.image_height)
+        })
+        return root, page
+    @staticmethod
+    def _write_xml(root: ET.Element, output_path: str) -> None:
+        xml_str = ET.tostring(root, encoding='utf-8', method='xml')
+        dom = minidom.parseString(xml_str)
+        pretty_xml = dom.toprettyxml(indent='  ', encoding='utf-8')
+        with open(output_path, 'wb') as f:
+            f.write(pretty_xml)
+    @staticmethod
+    def _baseline_points(segment) -> str:
+        """Return PAGE XML baseline points string for a segment."""
+        if hasattr(segment, 'baseline') and segment.baseline:
+            return ' '.join(f'{x},{y}' for x, y in segment.baseline)
+        x1, y1, x2, y2 = segment.bbox
+        bl_y = y2 - 5
+        return f'{x1},{bl_y} {x2},{bl_y}'
+    @staticmethod
+    def _coords_points(segment) -> str:
+        """Return PAGE XML coords points string for a segment."""
+        if hasattr(segment, 'coords') and segment.coords:
+            return ' '.join(f'{x},{y}' for x, y in segment.coords)
+        x1, y1, x2, y2 = segment.bbox
+        return f'{x1},{y1} {x2},{y1} {x2},{y2} {x1},{y2}'
+    def _add_text_line(self, parent: ET.Element, line_id: str, segment,
+                       text: Optional[str], line_idx: int) -> None:
+        """Add a TextLine element to parent with coords, baseline and optional text."""
+        line_elem = ET.SubElement(parent, 'TextLine', {
+            'id': line_id,
+            'custom': f'readingOrder {{index:{line_idx};}}'
+        })
+        ET.SubElement(line_elem, 'Coords').set('points', self._coords_points(segment))
+        ET.SubElement(line_elem, 'Baseline').set('points', self._baseline_points(segment))
+        if text:
+            conf = '1.0'
+            if hasattr(segment, 'confidence') and segment.confidence is not None:
+                conf = str(segment.confidence)
+            text_equiv = ET.SubElement(line_elem, 'TextEquiv', {'conf': conf})
+            ET.SubElement(text_equiv, 'Unicode').text = text
+    def export(self, segments: List[LineSegment], output_path: str,
+               creator: str = "TrOCR-GUI", comments: Optional[str] = None) -> None:
+        """
+        Export line segments to PAGE XML (single TextRegion, no region info).
+        Args:
+            segments: List of LineSegment objects (may carry .text attribute)
+            output_path: Path where to save the PAGE XML file
+            creator: Software/tool that created this PAGE XML
+            comments: Optional comments about the document
+        """
+        root, page = self._make_root(creator, comments)
+        # Reading order
+        reading_order = ET.SubElement(page, 'ReadingOrder')
+        ordered_group = ET.SubElement(reading_order, 'OrderedGroup', {
+            'id': 'ro_1',
+            'caption': 'Regions reading order'
+        })
+        ET.SubElement(ordered_group, 'RegionRefIndexed', {
+            'index': '0',
+            'regionRef': 'region_1'
+        })
+        # Single text region spanning all lines
+        text_region = ET.SubElement(page, 'TextRegion', {
+            'id': 'region_1',
+            'type': 'paragraph',
+            'custom': 'readingOrder {index:0;}'
+        })
+        if segments:
+            x1 = min(seg.bbox[0] for seg in segments)
+            y1 = min(seg.bbox[1] for seg in segments)
+            x2 = max(seg.bbox[2] for seg in segments)
+            y2 = max(seg.bbox[3] for seg in segments)
+            ET.SubElement(text_region, 'Coords').set(
+                'points', f'{x1},{y1} {x2},{y1} {x2},{y2} {x1},{y2}'
+            )
+        for idx, segment in enumerate(segments):
+            text = getattr(segment, 'text', None) or None
+            self._add_text_line(text_region, f'line_{idx + 1}', segment, text, idx)
+        self._write_xml(root, output_path)
+    def export_with_regions(
+        self,
+        regions,
+        lines,
+        output_path: str,
+        transcriptions: Optional[List[str]] = None,
+        creator: str = "TrOCR-GUI",
+        comments: Optional[str] = None,
+    ) -> None:
+        """
+        Export with proper multi-region PAGE XML structure.
+        Creates one TextRegion per detected region (e.g. columns, marginalia),
+        with TextLines nested inside their region and actual baseline polylines.
+        ReadingOrder lists regions left-to-right and lines top-to-bottom within
+        each region, matching how blla / column clustering ordered them.
+        Args:
+            regions:         List of SegRegion objects (duck-typed: .id, .line_ids,
+                             .bbox, optional .polygon).
+            lines:           Flat list of LineSegment objects, already ordered by
+                             region (region[0]'s lines first, then region[1]'s, …).
+                             The count of lines per region is len(region.line_ids).
+            output_path:     Where to write the PAGE XML file.
+            transcriptions:  Optional list of text strings, parallel to *lines*.
+                             Pass self.transcriptions from the GUI when available.
+            creator:         Creator string for Metadata.
+            comments:        Optional comments string for Metadata.
+        """
+        root, page = self._make_root(creator, comments)
+        # ReadingOrder — one RegionRefIndexed per region
+        reading_order = ET.SubElement(page, 'ReadingOrder')
+        ordered_group = ET.SubElement(reading_order, 'OrderedGroup', {
+            'id': 'ro_1',
+            'caption': 'Regions reading order'
+        })
+        for ri, region in enumerate(regions):
+            ET.SubElement(ordered_group, 'RegionRefIndexed', {
+                'index': str(ri),
+                'regionRef': region.id
+            })
+        # TextRegions — one per region, lines nested inside
+        line_offset = 0
+        for ri, region in enumerate(regions):
+            n = len(region.line_ids) if hasattr(region, 'line_ids') else 0
+            region_lines = lines[line_offset:line_offset + n]
+            line_offset += n
+            text_region = ET.SubElement(page, 'TextRegion', {
+                'id': region.id,
+                'type': 'paragraph',
+                'custom': f'readingOrder {{index:{ri};}}'
+            })
+            # Region polygon (prefer neural boundary over convex hull over bbox)
+            if hasattr(region, 'polygon') and region.polygon and len(region.polygon) >= 3:
+                pts = ' '.join(f'{x},{y}' for x, y in region.polygon)
+            else:
+                x1, y1, x2, y2 = region.bbox
+                pts = f'{x1},{y1} {x2},{y1} {x2},{y2} {x1},{y2}'
+            ET.SubElement(text_region, 'Coords').set('points', pts)
+            for li, segment in enumerate(region_lines):
+                global_line_idx = line_offset - n + li  # index in the flat lines list
+                text = None
+                if transcriptions and global_line_idx < len(transcriptions):
+                    text = transcriptions[global_line_idx] or None
+                elif hasattr(segment, 'text'):
+                    text = getattr(segment, 'text', None) or None
+                self._add_text_line(
+                    text_region,
+                    f'line_{ri + 1}_{li + 1}',
+                    segment,
+                    text,
+                    li,
+                )
+        self._write_xml(root, output_path)
+    @staticmethod
+    def quick_export(image_path: str, segments: List[LineSegment],
+                     output_path: Optional[str] = None) -> str:
+        """
+        Quick export helper that automatically determines output path and image dimensions.
+        Args:
+            image_path: Path to the page image
+            segments: List of LineSegment objects
+            output_path: Optional output path (default: same as image with .xml extension)
+        Returns:
+            Path to the exported PAGE XML file
+        """
+        from PIL import Image
+        # Load image to get dimensions
+        img = Image.open(image_path)
+        width, height = img.size
+        # Determine output path
+        if output_path is None:
+            output_path = Path(image_path).with_suffix('.xml')
+        # Export
+        exporter = PageXMLExporter(image_path, width, height)
+        exporter.export(segments, str(output_path))
+        return str(output_path)
+# Demo: when run as a script, export one dummy line to test_output.xml.
+if __name__ == "__main__":
+    # Example usage
+    from PIL import Image
+    # Create a dummy segment for testing
+    dummy_img = Image.new('L', (100, 30))
+    dummy_segment = LineSegment(
+        image=dummy_img,
+        bbox=(10, 10, 200, 40),
+        text="Example text",
+        confidence=0.95
+    )
+    # The image file itself is never opened here; only its name and the given size are recorded
+    exporter = PageXMLExporter("test_page.jpg", 800, 1200)
+    exporter.export([dummy_segment], "test_output.xml",
+                   creator="PAGE XML Exporter Test",
+                   comments="This is a test export")
+    print("Test PAGE XML created: test_output.xml")

web/polyscriptor_server.py ADDED Viewed

	@@ -0,0 +1,2237 @@

+"""
+Polyscriptor Web UI — FastAPI Backend
+Thin wrapper around existing HTR engine code. Provides REST API + SSE
+for browser-based transcription. All heavy lifting done by the same
+modules the PyQt6 GUI uses.
+Usage:
+    source htr_gui/bin/activate
+    python -m uvicorn web.polyscriptor_server:app --host 0.0.0.0 --port 8765
+Author: Claude Code
+Date: 2026-02-26
+"""
+import asyncio
+import hashlib
+import importlib
+import json
+import logging
+import os
+import sys
+import time
+import uuid
+from dataclasses import dataclass, field
+from types import SimpleNamespace
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+import numpy as np
+from PIL import Image, ImageOps
+from fastapi import Cookie, FastAPI, File, HTTPException, Query, Request, UploadFile
+from fastapi.responses import FileResponse, Response, StreamingResponse
+from fastapi.staticfiles import StaticFiles
+from pydantic import BaseModel
+# Module-level logger used throughout this server.
+log = logging.getLogger("polyscriptor")
+# Deployment flavour from the environment, trimmed and lower-cased; empty string when unset.
+DEMO_MODE = os.environ.get("POLYSCRIPTOR_DEMO_MODE", "").strip().lower()
+# Add project root to path so we can import existing modules
+PROJECT_ROOT = Path(__file__).resolve().parent.parent
+sys.path.insert(0, str(PROJECT_ROOT))
+# Load .env from project root (same as the Qt GUI does via CommercialAPIEngine)
+try:
+    from dotenv import load_dotenv
+    _env_path = PROJECT_ROOT / ".env"
+    if _env_path.exists():
+        load_dotenv(_env_path)
+        log.info(f"Loaded environment variables from {_env_path}")
+except ImportError:
+    pass  # python-dotenv not installed — env vars must be set externally
+# Imported after the sys.path insert above (presumably this module lives in the project root).
+from htr_engine_base import get_global_registry, HTREngine, TranscriptionResult
+# PDF support via PyMuPDF
+try:
+    import fitz as _fitz  # PyMuPDF
+    PDF_AVAILABLE = True
+except ImportError:
+    # PDF_AVAILABLE records the missing dependency; the server itself keeps starting
+    PDF_AVAILABLE = False
+    log.warning("PyMuPDF not installed — PDF upload disabled. Install with: pip install pymupdf")
+# Lazy imports for segmentation (avoid slow startup)
+# Flag set once the imports below have succeeded, so later calls return immediately.
+_segmenters_imported = False
+def _import_segmenters():
+    """Import the segmentation modules on first use and publish them as module globals.
+    Binds KrakenLineSegmenter, LineSegmenter and PYLAIA_MODELS at module level.
+    PYLAIA_MODELS falls back to an empty dict when inference_pylaia_native
+    cannot be imported; failures of the other two imports propagate.
+    """
+    global _segmenters_imported
+    if _segmenters_imported:
+        return
+    global KrakenLineSegmenter, LineSegmenter, PYLAIA_MODELS
+    from kraken_segmenter import KrakenLineSegmenter
+    from inference_page import LineSegmenter
+    try:
+        from inference_pylaia_native import PYLAIA_MODELS
+    except ImportError:
+        PYLAIA_MODELS = {}
+    # Only reached when the two mandatory imports above succeeded
+    _segmenters_imported = True
+# ---------------------------------------------------------------------------
+# App setup
+# ---------------------------------------------------------------------------
+# The FastAPI application object that the route and middleware decorators attach to.
+app = FastAPI(title="Polyscriptor HTR", version="0.1.0")
+# Serve static frontend files
+# The directory is resolved relative to this file and served under /static.
+STATIC_DIR = Path(__file__).parent / "static"
+app.mount("/static", StaticFiles(directory=str(STATIC_DIR)), name="static")
+# ---------------------------------------------------------------------------
+# Engine pool — Phase 2: shared pool of loaded engine instances
+# ---------------------------------------------------------------------------
+@dataclass
+class EngineSlot:
+    """One loaded engine instance in the pool."""
+    engine: Any  # HTREngine instance (not the registry singleton)
+    engine_name: str  # Display name; also the lookup key for the VRAM estimate tables
+    config: dict  # Configuration the engine was loaded with
+    pool_key: str  # Key under which this slot is stored in engine_pool
+    ref_count: int = 0  # Sessions referencing this slot; LRU eviction only considers slots at 0
+    last_used: float = field(default_factory=time.time)  # Epoch seconds; drives LRU order and idle eviction
+    lock: asyncio.Lock = field(default_factory=asyncio.Lock)  # Per-slot lock (usage not visible in this section)
+# Loaded engine slots, keyed by the string produced by _make_pool_key().
+engine_pool: Dict[str, EngineSlot] = {}
+# Lock that _maybe_evict() expects its caller to hold while the pool is modified.
+pool_lock = asyncio.Lock()
+# VRAM budget estimates (GB) for eviction decisions
+# Engines missing from this table are budgeted at 4 GB by _maybe_evict().
+_ENGINE_VRAM_GB = {
+    "CRNN-CTC (PyLaia-inspired)": 2,
+    "TrOCR": 3,
+    "Qwen3-VL": 18,
+    "Churro VLM": 10,
+    "Kraken": 2,
+    "Party": 4,
+    "PaddleOCR": 2,
+}
+# Engines excluded from VRAM accounting and from idle eviction.
+_NO_GPU_ENGINES = {"Commercial APIs", "OpenWebUI", "LightOnOCR", "DeepSeek-OCR"}
+_TOTAL_VRAM_GB = 92  # 2x L40S @ 46GB each
+# Factory: engine name -> (module, class) for creating fresh instances
+_ENGINE_FACTORY = {
+    "TrOCR":                       ("engines.trocr_engine",        "TrOCREngine"),
+    "CRNN-CTC (PyLaia-inspired)":  ("engines.pylaia_engine",       "PyLaiaEngine"),
+    "Qwen3-VL":                    ("engines.qwen3_engine",        "Qwen3Engine"),
+    "Churro VLM":                   ("engines.churro_engine",       "ChurroEngine"),
+    "Kraken":                       ("engines.kraken_engine",       "KrakenEngine"),
+    "Commercial APIs":              ("engines.commercial_api_engine", "CommercialAPIEngine"),
+    "Party":                        ("engines.party_engine",        "PartyEngine"),
+    "OpenWebUI":                    ("engines.openwebui_engine",    "OpenWebUIEngine"),
+    "DeepSeek-OCR":                 ("engines.deepseek_ocr_engine", "DeepSeekOCREngine"),
+    "LightOnOCR":                   ("engines.lighton_ocr_engine",  "LightOnOCREngine"),
+    "PaddleOCR":                    ("engines.paddle_engine",       "PaddleOCREngine"),
+}
+def _create_engine_instance(engine_name: str):
+    """Create a fresh engine instance (not the registry singleton).
+    The registry is used for discovery/availability only.
+    Pool slots get their own instances so multiple models can coexist.
+    """
+    entry = _ENGINE_FACTORY.get(engine_name)
+    if not entry:
+        return None
+    module_name, class_name = entry
+    mod = importlib.import_module(module_name)
+    cls = getattr(mod, class_name)
+    return cls()
+def _make_pool_key(engine_name: str, config: dict) -> str:
+    """Build a key that uniquely identifies an engine+model combination."""
+    if engine_name == "Commercial APIs":
+        provider = config.get("provider", "unknown")
+        model = config.get("model", "unknown")
+        api_key = config.get("api_key", "")
+        key_hash = hashlib.sha256(api_key.encode()).hexdigest()[:8] if api_key else "nokey"
+        return f"{engine_name}::{provider}::{model}::{key_hash}"
+    if engine_name == "OpenWebUI":
+        model = config.get("model", "unknown")
+        base_url = config.get("base_url", "unknown")
+        api_key = config.get("api_key", "")
+        key_hash = hashlib.sha256(api_key.encode()).hexdigest()[:8] if api_key else "nokey"
+        return f"{engine_name}::{base_url}::{model}::{key_hash}"
+    if engine_name == "TrOCR":
+        return f"{engine_name}::{config.get('model_path', 'default')}"
+    if engine_name in ("CRNN-CTC (PyLaia-inspired)", "Kraken"):
+        return f"{engine_name}::{config.get('model_path', 'default')}"
+    if engine_name == "Qwen3-VL":
+        base = config.get("base_model", "default")
+        adapter = config.get("adapter", "")
+        return f"{engine_name}::{base}::{adapter or 'none'}"
+    if engine_name == "Churro VLM":
+        return f"{engine_name}::{config.get('model_name', 'default')}"
+    if engine_name == "LightOnOCR":
+        return f"{engine_name}::{config.get('model_path', 'default')}"
+    # Fallback: hash the config
+    config_hash = hashlib.sha256(str(sorted(config.items())).encode()).hexdigest()[:12]
+    return f"{engine_name}::{config_hash}"
+async def _maybe_evict(new_engine_name: str):
+    """Evict LRU slots with ref_count==0 if VRAM is tight. Called UNDER pool_lock.
+    The budget is estimate-based: each pooled engine counts with its
+    _ENGINE_VRAM_GB entry (4 GB when unlisted) against _TOTAL_VRAM_GB.
+    Engines in _NO_GPU_ENGINES are never counted and never evicted here.
+    If unreferenced slots cannot free enough room, a warning is logged and
+    the function returns anyway.
+    """
+    if new_engine_name in _NO_GPU_ENGINES:
+        return
+    needed = _ENGINE_VRAM_GB.get(new_engine_name, 4)
+    # Estimated VRAM currently held by all GPU-backed slots
+    used = sum(_ENGINE_VRAM_GB.get(s.engine_name, 4)
+               for s in engine_pool.values()
+               if s.engine_name not in _NO_GPU_ENGINES)
+    if used + needed <= _TOTAL_VRAM_GB:
+        return
+    # Evict: ref_count==0, oldest first
+    candidates = sorted(
+        [(k, s) for k, s in engine_pool.items()
+         if s.ref_count == 0 and s.engine_name not in _NO_GPU_ENGINES],
+        key=lambda x: x[1].last_used
+    )
+    for key, slot in candidates:
+        # Stop as soon as the new engine fits
+        if used + needed <= _TOTAL_VRAM_GB:
+            break
+        log.info(f"Evicting engine slot '{key}' (last used {time.time() - slot.last_used:.0f}s ago)")
+        try:
+            slot.engine.unload_model()
+        except Exception as e:
+            # Best effort: the slot is dropped from the pool even if unloading failed
+            log.warning(f"Error unloading evicted engine: {e}")
+        del engine_pool[key]
+        used -= _ENGINE_VRAM_GB.get(slot.engine_name, 4)
+    if used + needed > _TOTAL_VRAM_GB:
+        log.warning(f"VRAM tight: ~{used}GB used + ~{needed}GB needed > {_TOTAL_VRAM_GB}GB total")
+# Compatibility shims — will be removed after full migration
+# Filled in by the startup auto-load with the engine it loaded, its name and its config.
+loaded_engine: Optional[HTREngine] = None
+loaded_engine_name: str = ""
+loaded_config: dict = {}
+# Persistent upload storage (survives server restarts)
+# Created next to this file at import time if it does not exist yet.
+UPLOAD_DIR = Path(__file__).parent / "uploads"
+UPLOAD_DIR.mkdir(exist_ok=True)
+# Upload TTL: 24 hours
+_UPLOAD_TTL_SECONDS = 86400
+# Session TTL: 2 hours of inactivity
+# Also used as the max_age of the session cookie.
+_SESSION_TTL_SECONDS = 7200
+# Cookie name for session tracking
+_SESSION_COOKIE = "polyscriptor_session"
+# ---------------------------------------------------------------------------
+# Per-user sessions — Phase 1 of multi-user refactoring
+# ---------------------------------------------------------------------------
+@dataclass
+class UserSession:
+    """Per-browser state, identified by the session cookie value."""
+    session_id: str  # UUID string; also the key in the module-level sessions dict
+    image_cache: Dict[str, dict] = field(default_factory=dict)  # image id -> metadata; "path" and "xml_path" entries are deleted on expiry
+    cancel_events: Dict[str, asyncio.Event] = field(default_factory=dict)  # presumably one event per running job — usage not visible here
+    pool_key: Optional[str] = None  # Reference into engine_pool
+    created_at: float = field(default_factory=time.time)  # Epoch seconds at creation
+    last_active: float = field(default_factory=time.time)  # Epoch seconds; compared against _SESSION_TTL_SECONDS
+# All live sessions, keyed by session id (the session cookie value).
+sessions: Dict[str, UserSession] = {}
+# NOTE(review): not referenced anywhere in this section — presumably a cache shared across sessions; confirm.
+global_image_cache: Dict[str, dict] = {}
+def _get_or_create_session(session_id: Optional[str]) -> tuple[UserSession, bool]:
+    """Return (session, created). If session_id is missing/unknown, create a new one."""
+    if session_id and session_id in sessions:
+        session = sessions[session_id]
+        session.last_active = time.time()
+        return session, False
+    new_id = str(uuid.uuid4())
+    session = UserSession(session_id=new_id)
+    sessions[new_id] = session
+    return session, True
+def _cleanup_expired_sessions() -> int:
+    """Remove sessions inactive for more than _SESSION_TTL_SECONDS. Returns count removed.
+    For each expired session this also releases its engine-pool reference
+    (unloading and dropping the slot once no references remain) and deletes
+    the files recorded under "path" and "xml_path" in its image cache.
+    """
+    cutoff = time.time() - _SESSION_TTL_SECONDS
+    # Collect the ids first so the dict is not mutated while it is iterated
+    expired = [sid for sid, s in sessions.items() if s.last_active < cutoff]
+    for sid in expired:
+        session = sessions.pop(sid)
+        # Release pool reference
+        if session.pool_key and session.pool_key in engine_pool:
+            slot = engine_pool[session.pool_key]
+            # Clamp at zero so the counter cannot go negative
+            slot.ref_count = max(0, slot.ref_count - 1)
+            if slot.ref_count == 0:
+                log.info(f"Immediate eviction (session expiry): '{slot.engine_name}'")
+                try:
+                    slot.engine.unload_model()
+                except Exception as e:
+                    # Best effort: the slot is still removed from the pool below
+                    log.warning(f"unload_model() failed for '{slot.engine_name}': {e}")
+                if session.pool_key in engine_pool:
+                    del engine_pool[session.pool_key]
+        # Clean up upload files belonging to this session
+        for iid, img_data in session.image_cache.items():
+            p = img_data.get("path")
+            if p:
+                Path(p).unlink(missing_ok=True)
+            xp = img_data.get("xml_path")
+            if xp:
+                Path(xp).unlink(missing_ok=True)
+        log.info(f"Expired session {sid[:8]}... ({len(session.image_cache)} images)")
+    return len(expired)
+_SESSION_PASSTHROUGH_PATHS = {"/api/gpu", "/api/engines", "/api/kraken/presets"}
+@app.middleware("http")
+async def session_middleware(request: Request, call_next):
+    """Inject session into request.state; set session cookie on new sessions.
+    Pure status/discovery routes (GPU poll, engine list) are excluded from
+    last_active updates so that background browser polling cannot keep a session
+    alive indefinitely and prevent engine-slot eviction.
+    """
+    session_id = request.cookies.get(_SESSION_COOKIE)
+    session, created = _get_or_create_session(session_id)
+    request.state.session = session
+    # Don't update last_active for polling-only routes
+    if request.url.path in _SESSION_PASSTHROUGH_PATHS:
+        session.last_active  # read only — no write
+    else:
+        session.last_active = time.time()
+    response = await call_next(request)
+    if created or session_id != session.session_id:
+        cookie_kwargs = {
+            "key": _SESSION_COOKIE,
+            "value": session.session_id,
+            "httponly": True,
+            "max_age": _SESSION_TTL_SECONDS,
+        }
+        if DEMO_MODE == "hf_space":
+            cookie_kwargs.update({"samesite": "none", "secure": True})
+        else:
+            cookie_kwargs.update({"samesite": "lax"})
+        response.set_cookie(
+            **cookie_kwargs
+        )
+    return response
+def _get_session(request: Request) -> UserSession:
+    """FastAPI dependency: extract session set by middleware.
+    Reads request.state.session, which session_middleware assigns before
+    the request is passed on to the route.
+    """
+    return request.state.session
+def _cleanup_old_uploads() -> int:
+    """Delete uploads older than TTL and evict image_cache entries across all sessions."""
+    cutoff = time.time() - _UPLOAD_TTL_SECONDS
+    deleted = 0
+    for f in list(UPLOAD_DIR.iterdir()):
+        if f.is_file():
+            try:
+                if f.stat().st_mtime < cutoff:
+                    f.unlink(missing_ok=True)
+                    deleted += 1
+            except OSError:
+                pass
+    # Evict stale image_cache entries whose file no longer exists (all sessions)
+    for session in sessions.values():
+        for iid in list(session.image_cache.keys()):
+            p = session.image_cache[iid].get("path")
+            if p and not Path(p).exists():
+                del session.image_cache[iid]
+    return deleted
+_SLOT_IDLE_TTL_SECONDS = 6 * 3600  # evict loaded engines idle for 6h, regardless of ref_count
+def _evict_idle_slots() -> int:
+    """Evict engine slots that have not been used for _SLOT_IDLE_TTL_SECONDS.
+    Called under no lock — must only be called from _periodic_cleanup (single-threaded).
+    The GPU-status poll (/api/gpu) keeps sessions alive indefinitely, so we cannot rely
+    on session expiry alone to release VRAM. This independently caps engine residency.
+    Engines listed in _NO_GPU_ENGINES are never evicted here.
+    Returns the number of slots evicted.
+    """
+    cutoff = time.time() - _SLOT_IDLE_TTL_SECONDS
+    # Collect the keys first so the pool is not mutated while it is iterated
+    stale = [k for k, s in engine_pool.items() if s.last_used < cutoff
+             and s.engine_name not in _NO_GPU_ENGINES]
+    for key in stale:
+        slot = engine_pool.pop(key)
+        log.info(f"Idle eviction: '{slot.engine_name}' (idle {(time.time() - slot.last_used)/3600:.1f}h)")
+        try:
+            slot.engine.unload_model()
+        except Exception as e:
+            # Best effort: the slot has already been removed from the pool
+            log.warning(f"unload_model() failed for '{slot.engine_name}': {e}")
+        # Invalidate all sessions pointing at this slot
+        for session in sessions.values():
+            if session.pool_key == key:
+                session.pool_key = None
+    return len(stale)
+async def _periodic_cleanup():
+    """Background task: clean up uploads + expired sessions + idle engine slots every hour."""
+    while True:
+        await asyncio.sleep(3600)
+        n = _cleanup_old_uploads()
+        m = _cleanup_expired_sessions()
+        p = _evict_idle_slots()
+        if n or m or p:
+            log.info(f"Periodic cleanup: {n} upload(s), {m} session(s), {p} idle engine slot(s).")
+# ---------------------------------------------------------------------------
+# API key resolution — keys never stored or shared server-side (Phase 3)
+# Web UI users MUST provide their own keys via browser localStorage.
+# Server env vars (.env) are NOT used by the web UI — they exist only for
+# the PyQt GUI and CLI tools which run locally on the admin's machine.
+# ---------------------------------------------------------------------------
+# Known key slots (for validation only — env vars are NOT consulted)
+_KEY_SLOTS = {"openai", "gemini", "claude", "openwebui"}
+def _resolve_api_key(slot: str, request_value: str) -> str:
+    """
+    Return the API key from the browser request, or empty string.
+    Server env vars are deliberately NOT used as fallback — each web user
+    must supply their own key via browser localStorage.
+    """
+    if request_value and request_value.strip():
+        return request_value.strip()
+    return ""
+# ---------------------------------------------------------------------------
+# Startup config (web/server_config.yaml) — optional, auto-load an engine
+# ---------------------------------------------------------------------------
+def _load_startup_config() -> dict:
+    cfg_path = Path(__file__).parent / "server_config.yaml"
+    if not cfg_path.exists():
+        return {}
+    try:
+        import yaml
+        with open(cfg_path) as f:
+            return yaml.safe_load(f) or {}
+    except Exception as e:
+        log.warning(f"Could not read server_config.yaml: {e}")
+        return {}
+@app.on_event("startup")
+async def startup_event():
+    """Clean old uploads, start periodic cleanup, auto-load engine."""
+    # Clean up uploads left over from previous server runs
+    n = _cleanup_old_uploads()
+    if n:
+        log.info(f"Startup cleanup: removed {n} old upload file(s).")
+    # Schedule periodic cleanup (every hour)
+    asyncio.create_task(_periodic_cleanup())
+    # Auto-load default engine from server_config.yaml if present
+    cfg = _load_startup_config()
+    if not cfg.get("default_engine"):
+        return
+    engine_name = cfg["default_engine"]
+    engine_config = cfg.get("default_config", {})
+    log.info(f"Auto-loading engine '{engine_name}' from server_config.yaml ...")
+    try:
+        registry = get_global_registry()
+        reg_engine = registry.get_engine_by_name(engine_name)
+        if reg_engine and reg_engine.is_available():
+            engine = _create_engine_instance(engine_name)
+            if not engine:
+                log.warning(f"Auto-load: cannot create instance for '{engine_name}'.")
+                return
+            ok = await asyncio.to_thread(engine.load_model, engine_config)
+            if ok:
+                pool_key = _make_pool_key(engine_name, engine_config)
+                slot = EngineSlot(
+                    engine=engine, engine_name=engine_name,
+                    config=engine_config, pool_key=pool_key,
+                    ref_count=0,  # No session owns it yet
+                )
+                engine_pool[pool_key] = slot
+                # Update compat shims
+                global loaded_engine, loaded_engine_name, loaded_config
+                loaded_engine = engine
+                loaded_engine_name = engine_name
+                loaded_config = engine_config
+                log.info(f"Auto-loaded '{engine_name}' into pool as '{pool_key}'.")
+            else:
+                log.warning(f"Auto-load of '{engine_name}' failed (load_model returned False).")
+        else:
+            log.warning(f"Auto-load: engine '{engine_name}' not found or not available.")
+    except Exception as e:
+        log.warning(f"Auto-load error: {e}")
+# ---------------------------------------------------------------------------
+# Config schemas — replaces Qt config widgets for the web UI
+# ---------------------------------------------------------------------------
+def _get_pylaia_model_options() -> list:
+    _import_segmenters()
+    from inference_pylaia_native import _scan_pylaia_models
+    _scan_pylaia_models(str(Path(__file__).resolve().parents[1] / "models"))
+    options = [{"label": k, "value": k} for k in PYLAIA_MODELS.keys()]
+    options.append({"label": "Custom / local path…", "value": "__custom__"})
+    return options
+def _scan_kraken_models() -> list:
+    """Scan models/ directory for local Kraken .mlmodel files and build select options."""
+    options = []
+    models_root = Path(__file__).resolve().parents[1] / "models"
+    if models_root.exists():
+        for p in sorted(models_root.rglob("*.mlmodel")):
+            rel = str(p.relative_to(models_root.parent))  # e.g. models/kraken_cs/best.mlmodel
+            label = f"{p.parent.name}/{p.name}"
+            options.append({"label": label, "value": rel, "source": "local"})
+    # Zenodo presets from kraken_engine (auto-download on load)
+    try:
+        from engines.kraken_engine import KRAKEN_MODELS
+        for preset_id, info in KRAKEN_MODELS.items():
+            if info.get("source") == "zenodo":
+                options.append({
+                    "label": f"{info.get('label', preset_id)} [Zenodo, auto-download]",
+                    "value": f"__zenodo__{preset_id}",
+                    "source": "zenodo",
+                })
+    except Exception:
+        pass
+    return options
+def _scan_trocr_models() -> list:
+    """Scan models/ directory for TrOCR checkpoints.
+    A directory is considered a TrOCR model if it contains
+    preprocessor_config.json (TrOCR/ViT-specific) AND config.json
+    with model_type == 'vision-encoder-decoder'.
+    This avoids picking up PyLaia/CRNN-CTC directories that also
+    contain a config.json with training parameters.
+    """
+    import json as _json
+    models_dir = PROJECT_ROOT / "models"
+    options = [
+        {"label": "Custom HuggingFace ID or local path…", "value": "__custom__"},
+        {"label": "kazars24/trocr-base-handwritten-ru (HuggingFace)",
+         "value": "kazars24/trocr-base-handwritten-ru",
+         "source": "huggingface"},
+        {"label": "microsoft/trocr-base-printed — printed text, base",
+         "value": "microsoft/trocr-base-printed",
+         "source": "huggingface"},
+        {"label": "microsoft/trocr-large-printed — printed text, large",
+         "value": "microsoft/trocr-large-printed",
+         "source": "huggingface"},
+        {"label": "dh-unibe/trocr-kurrent — German Kurrent 19th c. (CER 2.66%)",
+         "value": "dh-unibe/trocr-kurrent",
+         "source": "huggingface"},
+        {"label": "dh-unibe/trocr-kurrent-XVI-XVII — German Kurrent 16th–18th c. (CER 5.42%)",
+         "value": "dh-unibe/trocr-kurrent-XVI-XVII",
+         "source": "huggingface"},
+    ]
+    if models_dir.exists():
+        for d in sorted(models_dir.iterdir()):
+            if not d.is_dir():
+                continue
+            # Require BOTH preprocessor_config.json AND config.json with
+            # model_type == 'vision-encoder-decoder'.
+            # preprocessor_config.json is ViT/TrOCR-specific (not in PyLaia).
+            # config.json model_type disambiguates from Qwen3 adapters that
+            # also ship a preprocessor_config but have no config.json.
+            if not (d / "preprocessor_config.json").exists():
+                continue
+            cfg_path = d / "config.json"
+            if not cfg_path.exists():
+                continue
+            try:
+                cfg = _json.load(open(cfg_path))
+                if cfg.get("model_type") != "vision-encoder-decoder":
+                    continue
+            except Exception:
+                continue
+            options.append({
+                "label": d.name,
+                "value": str(d),
+                "source": "local",
+            })
+    return options
+def _scan_vlm_models(engine_type: str = "qwen3") -> list:
+    """Scan models/ directory for local VLM checkpoints (LoRA adapters and full models).
+    Looks for directories containing adapter_config.json (LoRA fine-tunes) or
+    config.json mentioning Qwen/VLM/vision architectures.
+    Returns options list ending with a __custom__ sentinel for manual entry.
+    """
+    models_dir = PROJECT_ROOT / "models"
+    options = []
+    if models_dir.exists():
+        for d in sorted(models_dir.iterdir()):
+            if not d.is_dir():
+                continue
+            # Check for LoRA adapter at top-level
+            if (d / "adapter_config.json").exists():
+                try:
+                    import json as _json
+                    with open(d / "adapter_config.json") as f:
+                        adapter_cfg = _json.load(f)
+                    base = adapter_cfg.get("base_model_name_or_path", "")
+                    is_qwen = "qwen" in base.lower() or "qwen" in d.name.lower()
+                    is_churro = "churro" in base.lower() or "churro" in d.name.lower()
+                    if engine_type == "qwen3" and is_qwen and not is_churro:
+                        options.append({
+                            "label": f"{d.name} (LoRA → {base})",
+                            "value": str(d),
+                            "base_model": base,
+                            "adapter": str(d),
+                        })
+                    elif engine_type == "churro" and (is_churro or ("churro" in d.name.lower())):
+                        options.append({
+                            "label": f"{d.name} (LoRA → {base})",
+                            "value": str(d),
+                            "base_model": base,
+                            "adapter": str(d),
+                        })
+                except Exception:
+                    pass
+                continue  # Don't also check final_model subdirs
+            # Check for final_model subdirectory with adapter
+            final = d / "final_model"
+            if final.is_dir() and (final / "adapter_config.json").exists():
+                try:
+                    import json as _json
+                    with open(final / "adapter_config.json") as f:
+                        adapter_cfg = _json.load(f)
+                    base = adapter_cfg.get("base_model_name_or_path", "")
+                    is_qwen = "qwen" in base.lower() or "qwen" in d.name.lower()
+                    is_churro = "churro" in base.lower() or "churro" in d.name.lower()
+                    if engine_type == "qwen3" and is_qwen and not is_churro:
+                        options.append({
+                            "label": f"{d.name} (LoRA → {base})",
+                            "value": str(final),
+                            "base_model": base,
+                            "adapter": str(final),
+                        })
+                    elif engine_type == "churro" and (is_churro or ("churro" in d.name.lower())):
+                        options.append({
+                            "label": f"{d.name} (LoRA → {base})",
+                            "value": str(final),
+                            "base_model": base,
+                            "adapter": str(final),
+                        })
+                except Exception:
+                    pass
+    # Always append a "Custom / HuggingFace" sentinel as the last option
+    options.append({
+        "label": "Custom / HuggingFace model ID...",
+        "value": "__custom__",
+    })
+    return options
+# Maps an engine name to a zero-argument callable that returns the engine's
+# config-form schema: {"fields": [...]}, one dict per form field.
+# The values are callables rather than plain dicts so that the model scans
+# (_get_pylaia_model_options, _scan_trocr_models, _scan_vlm_models,
+# _scan_kraken_models) run each time a schema is requested, not once at import.
+# On "select" fields, "custom_key" names the companion text input whose value
+# load_engine reads when the "__custom__" option is chosen.
+ENGINE_SCHEMAS = {
+    "CRNN-CTC (PyLaia-inspired)": lambda: {
+        "fields": [
+            {"key": "model_path", "type": "select", "label": "Model",
+             "options": _get_pylaia_model_options(),
+             "custom_key": "custom_model_path",
+             "custom_placeholder": "Absolute path to best_model.pt (e.g. /home/…/models/pylaia_yiddish_20260326/best_model.pt)"},
+            {"key": "enable_spaces", "type": "checkbox",
+             "label": "Convert <space> tokens", "default": True},
+            {"key": "flip_rtl", "type": "checkbox",
+             "label": "RTL manuscript (flip line images)", "default": False,
+             "hint": "Flip line images horizontally for RTL scripts (Ottoman, Arabic, Hebrew)"},
+        ]
+    },
+    "TrOCR": lambda: {
+        "fields": [
+            {"key": "model_path", "type": "select", "label": "Model",
+             "options": _scan_trocr_models(),
+             "custom_key": "custom_model_path",
+             "custom_placeholder": "HuggingFace model ID (e.g. microsoft/trocr-base-handwritten) or absolute local path"},
+            {"key": "num_beams", "type": "number", "label": "Beam Search",
+             "min": 1, "max": 10, "default": 4},
+            {"key": "normalize_background", "type": "checkbox",
+             "label": "Normalize Background", "default": False},
+            {"key": "flip_rtl", "type": "checkbox",
+             "label": "RTL manuscript (flip line images)", "default": False,
+             "hint": "Flip line images horizontally for RTL scripts (Ottoman, Arabic, Hebrew)"},
+        ]
+    },
+    "Qwen3-VL": lambda: {
+        "fields": [
+            {"key": "model_preset", "type": "select", "label": "Model",
+             "options": _scan_vlm_models("qwen3"),
+             "custom_key": "base_model",
+             "custom_placeholder": "HuggingFace model ID, e.g. Qwen/Qwen3-VL-8B-Instruct"},
+            {"key": "max_image_size", "type": "number", "label": "Max Image Size (px)",
+             "min": 512, "max": 4096, "default": 1536},
+        ]
+    },
+    "Churro VLM": lambda: {
+        "fields": [
+            {"key": "model_preset", "type": "select", "label": "Model",
+             "options": _scan_vlm_models("churro"),
+             "custom_key": "model_name",
+             "custom_placeholder": "HuggingFace model ID, e.g. stanford-oval/churro-3B"},
+            {"key": "device", "type": "select", "label": "Device",
+             "options": [{"label": "Auto", "value": "auto"},
+                         {"label": "GPU 0", "value": "cuda:0"},
+                         {"label": "GPU 1", "value": "cuda:1"},
+                         {"label": "CPU", "value": "cpu"}]},
+            {"key": "max_image_size", "type": "number", "label": "Max Image Size (px)",
+             "min": 512, "max": 4096, "default": 2048},
+        ]
+    },
+    "Kraken": lambda: {
+        "fields": [
+            {"key": "model_path", "type": "select", "label": "Model",
+             "options": _scan_kraken_models(),
+             "custom_key": "custom_model_path",
+             "custom_placeholder": "Absolute path on server, e.g. /home/user/models/my.mlmodel",
+             "upload": True},
+        ]
+    },
+    "Commercial APIs": lambda: {
+        "fields": [
+            {"key": "provider", "type": "select", "label": "Provider",
+             "options": [
+                 {"label": "OpenAI (GPT-4o, o1, …)", "value": "OpenAI"},
+                 {"label": "Google Gemini", "value": "Gemini"},
+                 {"label": "Anthropic Claude", "value": "Claude"},
+             ]},
+            # NOTE(review): custom_key below is "custom_model_id", but load_engine
+            # pops "model_custom" for this engine — confirm which key the
+            # frontend actually sends, otherwise a custom ID falls back to "gpt-4o".
+            {"key": "model", "type": "select", "label": "Model",
+             "dynamic": True,
+             "dynamic_hint": "Enter API key, then ↻ to load available models",
+             # No static lists — always fetch live from the provider API
+             "per_provider_options": {},
+             "options": [],
+             "custom_key": "custom_model_id",
+             "custom_placeholder": "e.g. gpt-4.5, gemini-exp-1206, claude-opus-4"},
+            {"key": "api_key", "type": "password", "label": "API Key",
+             "default": "", "placeholder": "Paste your API key here"},
+            {"key": "temperature", "type": "number", "label": "Temperature",
+             "min": 0.0, "max": 2.0, "default": 0.0,
+             "placeholder": "0.0 = deterministic (recommended for transcription)"},
+            {"key": "max_output_tokens", "type": "number", "label": "Max output tokens (optional)",
+             "min": 512, "max": 65536, "default": None,
+             "placeholder": "Leave blank = model maximum"},
+            {"key": "custom_prompt", "type": "textarea", "label": "Custom Prompt (optional)",
+             "default": "",
+             "rows": 4,
+             "placeholder": "Transcribe all handwritten text in this manuscript image. Preserve the original language (Cyrillic, Latin, etc.) and layout. Output only the transcribed text without any additional commentary.",
+             "hint": "Leave blank to use the default prompt shown above"},
+            {"key": "thinking_mode", "type": "select", "label": "Thinking Mode (Gemini only)",
+             "options": [
+                 {"label": "Auto (model decides, no cap)", "value": ""},
+                 {"label": "Low (budget: 8k tokens)", "value": "low"},
+                 {"label": "High (no cap, max reasoning)", "value": "high"},
+             ], "default": ""},
+        ]
+    },
+    "OpenWebUI": lambda: {
+        "fields": [
+            {"key": "base_url", "type": "text", "label": "Base URL",
+             "default": "",
+             "placeholder": "https://your-openwebui-instance/api or .../api/v1"},
+            {"key": "api_key", "type": "password", "label": "API Key",
+             "default": "", "placeholder": "Your OpenWebUI API key"},
+            {"key": "model", "type": "select", "label": "Model",
+             "dynamic": True,
+             "dynamic_hint": "Enter API key & base URL, then ↻ to load available models",
+             "options": [{"label": "Custom model ID…", "value": "__custom__"}],
+             "default": "__custom__",
+             "custom_key": "model_custom",
+             "custom_placeholder": "e.g. llama3.1, qwen2.5vl, gemma3, ..."},
+            {"key": "temperature", "type": "number", "label": "Temperature",
+             "min": 0.0, "max": 2.0, "default": 0.1},
+            {"key": "max_tokens", "type": "number", "label": "Max output tokens (optional)",
+             "min": 512, "max": 65536, "default": None,
+             "placeholder": "Leave blank = model maximum"},
+            {"key": "custom_prompt", "type": "textarea", "label": "Custom Prompt (optional)",
+             "default": "",
+             "rows": 3,
+             "placeholder": "Transcribe all handwritten text in this manuscript image. Preserve the original language (Cyrillic, Latin, etc.) and layout. Output only the transcribed text without any additional commentary.",
+             "hint": "Leave blank to use the default prompt shown above"},
+        ]
+    },
+    "LightOnOCR": lambda: {
+        "fields": [
+            # The inner lambda is called immediately: it imports lighton_models
+            # when the schema is built and turns LIGHTON_MODELS into options,
+            # followed by the __custom__ entry.
+            {"key": "model_path", "type": "select", "label": "Model",
+             "options": (lambda: [
+                 {"label": f"{name} — {info.get('description','')}", "value": info["id"]}
+                 for name, info in __import__('lighton_models', fromlist=['LIGHTON_MODELS']).LIGHTON_MODELS.items()
+             ] + [{"label": "Custom HuggingFace ID…", "value": "__custom__"}])(),
+             "custom_key": "custom_model_path",
+             "custom_placeholder": "HuggingFace model ID, e.g. lightonai/LightOnOCR-2-1B-base"},
+            {"key": "max_new_tokens", "type": "number", "label": "Max new tokens",
+             "min": 32, "max": 512, "default": 128},
+        ]
+    },
+    "PaddleOCR": lambda: {
+        "fields": [
+            {"key": "lang", "type": "select", "label": "Language / Script",
+             "default": "ch",
+             "options": [
+                 {"label": "Chinese + English (mixed, recommended default)",  "value": "ch"},
+                 {"label": "English",                                          "value": "en"},
+                 {"label": "German",                                           "value": "german"},
+                 {"label": "French",                                           "value": "french"},
+                 {"label": "Japanese",                                         "value": "japan"},
+                 {"label": "Korean",                                           "value": "korean"},
+                 {"label": "Arabic",                                           "value": "arabic"},
+                 {"label": "Cyrillic (Russian/Ukrainian/Bulgarian)",           "value": "cyrillic"},
+                 {"label": "Latin script (generic)",                           "value": "latin"},
+                 {"label": "Custom (enter code below)",                        "value": "__custom__"},
+             ],
+             "custom_key": "custom_lang",
+             "custom_placeholder": "PaddleOCR lang code, e.g. ru, uk, fr, es, it, pt, …",
+             "hint": "One language model per run. 'ch' is bilingual (Chinese+English) and PaddleOCR's strongest model. For mixed-script documents outside this list, run separate passes."},
+            {"key": "use_angle_cls", "type": "checkbox",
+             "label": "Text-angle classifier (correct 180° rotation)", "default": True},
+            {"key": "use_gpu", "type": "checkbox",
+             "label": "Use GPU (requires paddlepaddle-gpu)", "default": False},
+        ]
+    },
+}
+# ---------------------------------------------------------------------------
+# Request/response models
+# ---------------------------------------------------------------------------
+class EngineLoadRequest(BaseModel):
+    """Request body of POST /api/engine/load."""
+    # Engine name as known to the global engine registry.
+    engine_name: str
+    # Raw form values for that engine; load_engine copies and normalises them.
+    config: Dict[str, Any] = {}
+class TranscribeRequest(BaseModel):
+    """Request body describing one transcription job: which image, how to segment it, and live engine overrides."""
+    # Identifier of the image to transcribe (NOTE(review): presumably the ID returned by the upload route — confirm).
+    image_id: str
+    seg_method: str = "kraken"  # kraken, kraken-blla, hpp
+    # Device used for segmentation.
+    seg_device: str = "cpu"
+    max_columns: int = 6          # blla: max sub-columns per region (iterative splitting)
+    split_width_fraction: float = 0.40  # blla: min region width (fraction of page) to trigger sub-split
+    use_pagexml: bool = True      # use attached PAGE XML for segmentation when available
+    text_direction: str = "horizontal-lr"  # reading order for Kraken: horizontal-lr, horizontal-rl, vertical-lr, vertical-rl
+    engine_config_overrides: Dict[str, Any] = {}  # live form values merged into stored config at transcription time
+# ---------------------------------------------------------------------------
+# Routes
+# ---------------------------------------------------------------------------
+@app.get("/")
+async def index():
+    return FileResponse(str(STATIC_DIR / "index.html"))
+@app.get("/demo")
+async def pwa_demo():
+    return FileResponse(str(STATIC_DIR / "pwa" / "demo.html"))
+@app.get("/manifest.json")
+async def pwa_manifest():
+    """Serve the PWA manifest from root so scope / start_url are valid."""
+    from fastapi.responses import FileResponse as _FR
+    return _FR(str(STATIC_DIR / "pwa" / "manifest.json"), media_type="application/manifest+json")
+@app.get("/sw.js")
+async def pwa_service_worker():
+    """Serve the PWA service worker from root scope so it can control /demo."""
+    from fastapi.responses import FileResponse as _FR
+    resp = _FR(str(STATIC_DIR / "pwa" / "sw.js"), media_type="application/javascript")
+    resp.headers["Service-Worker-Allowed"] = "/"
+    return resp
+@app.get("/api/engines")
+async def list_engines():
+    registry = get_global_registry()
+    engines = []
+    for engine in registry.get_all_engines():
+        available = engine.is_available()
+        engines.append({
+            "name": engine.get_name(),
+            "description": engine.get_description(),
+            "available": available,
+            "unavailable_reason": engine.get_unavailable_reason() if not available else None,
+            "requires_line_segmentation": engine.requires_line_segmentation(),
+            "has_config_schema": engine.get_name() in ENGINE_SCHEMAS,
+        })
+    return engines
+@app.get("/api/engine/{name}/config-schema")
+async def get_config_schema(name: str):
+    if name not in ENGINE_SCHEMAS:
+        return {"fields": []}
+    schema = ENGINE_SCHEMAS[name]()
+    # Key status: always "missing" from server perspective — browser localStorage
+    # is the only key store. The frontend checks localStorage client-side.
+    for field in schema.get("fields", []):
+        if field.get("type") == "password":
+            field["key_status"] = "missing"
+    return schema
+def _openwebui_model_urls(base_url: str) -> list[str]:
+    base = base_url.strip().rstrip("/")
+    if not base:
+        return []
+    urls = [f"{base}/models"]
+    if base.endswith("/api"):
+        urls.append(f"{base}/v1/models")
+        urls.append(f"{base[:-4]}/v1/models")
+    elif base.endswith("/api/v1"):
+        urls.append(f"{base[:-3]}/models")
+        urls.append(f"{base}/models")
+    elif base.endswith("/v1"):
+        urls.append(f"{base[:-3]}/api/models")
+    else:
+        urls.append(f"{base}/api/models")
+        urls.append(f"{base}/api/v1/models")
+        urls.append(f"{base}/v1/models")
+    return list(dict.fromkeys(urls))
+def _extract_openwebui_model_ids(payload: Any) -> list[str]:
+    if isinstance(payload, dict):
+        for key in ("data", "models"):
+            items = payload.get(key)
+            if isinstance(items, list):
+                return _extract_openwebui_model_ids(items)
+        return [
+            str(value.get("id") or value.get("name"))
+            for value in payload.values()
+            if isinstance(value, dict) and (value.get("id") or value.get("name"))
+        ]
+    if isinstance(payload, list):
+        models = []
+        for item in payload:
+            if isinstance(item, str):
+                models.append(item)
+            elif isinstance(item, dict):
+                model_id = item.get("id") or item.get("name") or item.get("model")
+                if model_id:
+                    models.append(str(model_id))
+        return sorted(set(models))
+    return []
+def _fetch_openwebui_models(base_url: str, api_key: str) -> list[str]:
+    import urllib.error
+    import urllib.request
+    errors = []
+    for url in _openwebui_model_urls(base_url):
+        req = urllib.request.Request(
+            url,
+            headers={
+                "Authorization": f"Bearer {api_key}",
+                "x-api-key": api_key,
+                "Accept": "application/json",
+                "Content-Type": "application/json",
+                "User-Agent": "Polyscriptor-HTR-Demo/1.0",
+            },
+        )
+        try:
+            with urllib.request.urlopen(req, timeout=20) as resp:
+                status = resp.status
+                content_type = resp.headers.get("Content-Type", "")
+                body = resp.read().decode("utf-8", errors="replace")
+            try:
+                payload = json.loads(body)
+            except json.JSONDecodeError:
+                sample = body.strip().replace("\n", " ")[:120] or "<empty response>"
+                errors.append(f"{url}: HTTP {status}, non-JSON response ({content_type}): {sample}")
+                continue
+            models = _extract_openwebui_model_ids(payload)
+            if models:
+                return models
+            errors.append(f"{url}: no model ids in response")
+        except urllib.error.HTTPError as exc:
+            body = exc.read().decode("utf-8", errors="replace")[:200]
+            errors.append(f"{url}: HTTP {exc.code} {body}")
+        except Exception as exc:
+            errors.append(f"{url}: {exc}")
+    raise RuntimeError("; ".join(errors) if errors else "No OpenWebUI model endpoint tried")
+@app.get("/api/engine/status")
+async def engine_status(request: Request):
+    session = _get_session(request)
+    if session.pool_key and session.pool_key in engine_pool:
+        slot = engine_pool[session.pool_key]
+        return {
+            "loaded": slot.engine.is_model_loaded(),
+            "engine_name": slot.engine_name,
+            "config": slot.config,
+        }
+    # Fallback: compat shim for tests / startup
+    return {
+        "loaded": loaded_engine is not None and loaded_engine.is_model_loaded(),
+        "engine_name": loaded_engine_name,
+        "config": loaded_config,
+    }
+@app.get("/api/engine/{name}/models")
+async def get_engine_models(
+    name: str,
+    api_key: str = "",
+    provider: str = "openai",
+    base_url: str = "",
+):
+    """
+    Fetch available models for engines whose model list is dynamic.
+    - OpenWebUI: queries the OpenWebUI /api/models endpoint
+    - Commercial APIs: uses existing fetch_* helpers with fallback lists
+    """
+    if name == "OpenWebUI":
+        resolved = _resolve_api_key("openwebui", api_key)
+        if not resolved:
+            return {"models": [], "error": "No API key — paste one in the form"}
+        effective_url = base_url.strip().rstrip("/")
+        if not effective_url:
+            return {"models": [], "error": "Enter your OpenWebUI base URL"}
+        try:
+            models = await asyncio.to_thread(_fetch_openwebui_models, effective_url, resolved)
+            return {"models": models}
+        except Exception as e:
+            return {"models": [], "error": str(e)}
+    elif name == "Commercial APIs":
+        prov = provider.lower()
+        resolved = _resolve_api_key(prov, api_key)
+        if not resolved:
+            return {"models": [], "error": "No API key — paste one in the form"}
+        try:
+            sys.path.insert(0, str(PROJECT_ROOT))
+            if prov == "openai":
+                from inference_commercial_api import fetch_openai_models
+                models = await asyncio.to_thread(fetch_openai_models, resolved)
+                return {"models": models}
+            elif prov == "gemini":
+                from inference_commercial_api import fetch_gemini_models
+                models = await asyncio.to_thread(fetch_gemini_models, resolved)
+                return {"models": models}
+            elif prov == "claude":
+                from inference_commercial_api import fetch_claude_models
+                models = await asyncio.to_thread(fetch_claude_models, resolved)
+                return {"models": models}
+            else:
+                return {"models": [], "error": f"Unknown provider: {provider}"}
+        except Exception as e:
+            return {"models": [], "error": str(e)}
+    return {"models": [], "error": f"Dynamic model listing not supported for '{name}'"}
+@app.post("/api/engine/load")
+async def load_engine(request: Request, req: EngineLoadRequest):
+    global loaded_engine, loaded_engine_name, loaded_config
+    session = _get_session(request)
+    registry = get_global_registry()
+    reg_engine = registry.get_engine_by_name(req.engine_name)
+    if not reg_engine:
+        raise HTTPException(404, f"Engine '{req.engine_name}' not found")
+    if not reg_engine.is_available():
+        raise HTTPException(400, f"Engine not available: {reg_engine.get_unavailable_reason()}")
+    # --- Config resolution (unchanged logic) ---
+    config = dict(req.config)
+    if req.engine_name == "CRNN-CTC (PyLaia-inspired)" and "model_path" in config:
+        custom_val = config.pop("custom_model_path", "").strip()
+        if config["model_path"] == "__custom__":
+            if not custom_val:
+                raise HTTPException(400, "Please enter an absolute path to a best_model.pt file")
+            config["model_path"] = custom_val
+        # else: named preset from PYLAIA_MODELS — engine resolves it
+    elif req.engine_name == "Kraken" and "model_path" in config:
+        custom_val = config.pop("custom_model_path", "").strip()
+        val = config["model_path"]
+        if val == "__custom__":
+            if not custom_val:
+                raise HTTPException(400, "Please enter a path to a local .mlmodel file")
+            config["model_path"] = custom_val
+        elif val.startswith("__zenodo__"):
+            # Zenodo preset: pass preset_id, let engine handle download
+            config["preset_id"] = val[len("__zenodo__"):]
+            config["model_path"] = None
+        # else: relative local path from select (e.g. "models/kraken_cs/best.mlmodel") — use as-is
+    elif req.engine_name == "TrOCR" and "model_path" in config:
+        custom_val = config.pop("custom_model_path", "").strip()
+        if config["model_path"] == "__custom__":
+            if not custom_val:
+                raise HTTPException(400, "Please enter a HuggingFace model ID or local path")
+            config["model_path"] = custom_val
+        from pathlib import Path as _P
+        if _P(config["model_path"]).exists():
+            config["model_source"] = "local"
+        else:
+            config["model_source"] = "huggingface"
+    elif req.engine_name == "Qwen3-VL" and "model_preset" in config:
+        preset_val = config.pop("model_preset")
+        custom_val = config.pop("base_model", "").strip()
+        if preset_val == "__custom__":
+            config["base_model"] = custom_val or "Qwen/Qwen3-VL-8B-Instruct"
+            config["adapter"] = None
+        else:
+            vlm_opts = _scan_vlm_models("qwen3")
+            matched = next((o for o in vlm_opts if o["value"] == preset_val), None)
+            if matched:
+                config["base_model"] = matched.get("base_model", preset_val)
+                config["adapter"] = matched.get("adapter")
+            else:
+                config["base_model"] = preset_val
+                config["adapter"] = None
+    elif req.engine_name == "Churro VLM" and "model_preset" in config:
+        preset_val = config.pop("model_preset")
+        custom_val = config.pop("model_name", "").strip()
+        if preset_val == "__custom__":
+            config["model_name"] = custom_val or "stanford-oval/churro-3B"
+            config["adapter_path"] = None
+        else:
+            vlm_opts = _scan_vlm_models("churro")
+            matched = next((o for o in vlm_opts if o["value"] == preset_val), None)
+            if matched:
+                config["model_name"] = matched.get("base_model", preset_val)
+                config["adapter_path"] = matched.get("adapter")
+            else:
+                config["model_name"] = preset_val
+                config["adapter_path"] = None
+    elif req.engine_name == "LightOnOCR" and "model_path" in config:
+        custom_val = config.pop("custom_model_path", "").strip()
+        if config["model_path"] == "__custom__":
+            if not custom_val:
+                raise HTTPException(400, "Please enter a HuggingFace model ID for LightOnOCR")
+            config["model_path"] = custom_val
+    elif req.engine_name == "PaddleOCR" and "lang" in config:
+        if config["lang"] == "__custom__":
+            custom_lang = config.pop("custom_lang", "").strip()
+            if not custom_lang:
+                raise HTTPException(400, "Please enter a PaddleOCR language code")
+            config["lang"] = custom_lang
+        else:
+            config.pop("custom_lang", None)
+    elif req.engine_name == "Commercial APIs":
+        if config.get("model") == "__custom__":
+            config["model"] = config.pop("model_custom", "").strip() or "gpt-4o"
+    elif req.engine_name == "OpenWebUI":
+        if config.get("model") == "__custom__":
+            custom_model = config.pop("model_custom", "").strip()
+            if not custom_model:
+                raise HTTPException(400, "Please enter an OpenWebUI model ID")
+            config["model"] = custom_model
+    # Resolve API keys
+    if req.engine_name == "Commercial APIs":
+        provider_slot = config.get("provider", "openai").lower()
+        raw_key = config.get("api_key", "")
+        resolved = _resolve_api_key(provider_slot, raw_key)
+        if not resolved:
+            raise HTTPException(400, f"No API key for {config.get('provider')}. "
+                                     "Paste your API key in the field.")
+        config["api_key"] = resolved
+    elif req.engine_name == "OpenWebUI":
+        base_url = config.get("base_url", "").strip().rstrip("/")
+        if not base_url:
+            raise HTTPException(400, "No OpenWebUI base URL. "
+                                     "Enter your own OpenWebUI API base URL.")
+        config["base_url"] = base_url
+        raw_key = config.get("api_key", "")
+        resolved = _resolve_api_key("openwebui", raw_key)
+        if not resolved:
+            raise HTTPException(400, "No API key for OpenWebUI. "
+                                     "Paste your API key in the field.")
+        config["api_key"] = resolved
+    # Strip empty custom_prompt for API engines (use engine default)
+    if req.engine_name in ("Commercial APIs", "OpenWebUI"):
+        if not config.get("custom_prompt", "").strip():
+            config["custom_prompt"] = None
+    # --- Engine pool logic ---
+    pool_key = _make_pool_key(req.engine_name, config)
+    async with pool_lock:
+        # Release previous engine reference for this session
+        if session.pool_key and session.pool_key in engine_pool:
+            prev_slot = engine_pool[session.pool_key]
+            prev_slot.ref_count = max(0, prev_slot.ref_count - 1)
+            if prev_slot.ref_count == 0:
+                log.info(f"Immediate eviction (engine switch): '{prev_slot.engine_name}'")
+                try:
+                    prev_slot.engine.unload_model()
+                except Exception as e:
+                    log.warning(f"unload_model() failed for '{prev_slot.engine_name}': {e}")
+                if session.pool_key in engine_pool:
+                    del engine_pool[session.pool_key]
+        # Check if this exact engine+model is already loaded
+        if pool_key in engine_pool:
+            slot = engine_pool[pool_key]
+            slot.ref_count += 1
+            slot.last_used = time.time()
+            session.pool_key = pool_key
+            # Update compat shims
+            loaded_engine = slot.engine
+            loaded_engine_name = slot.engine_name
+            loaded_config = slot.config
+            log.info(f"Pool hit: reusing '{pool_key}' (ref_count={slot.ref_count})")
+            return {"success": True, "load_time_s": 0.0,
+                    "engine_name": req.engine_name, "reused": True}
+        # Need new slot — evict if VRAM tight
+        await _maybe_evict(req.engine_name)
+    # Load model OUTSIDE pool_lock (blocking I/O)
+    engine = _create_engine_instance(req.engine_name)
+    if not engine:
+        raise HTTPException(500, f"Cannot create engine instance for '{req.engine_name}'")
+    start = time.time()
+    success = await asyncio.to_thread(engine.load_model, config)
+    elapsed = time.time() - start
+    if not success:
+        raise HTTPException(500, "Failed to load model")
+    slot = EngineSlot(
+        engine=engine,
+        engine_name=req.engine_name,
+        config=config,
+        pool_key=pool_key,
+        ref_count=1,
+        last_used=time.time(),
+    )
+    async with pool_lock:
+        # Double-check: another request may have loaded the same key concurrently
+        if pool_key in engine_pool:
+            engine.unload_model()
+            slot = engine_pool[pool_key]
+            slot.ref_count += 1
+            slot.last_used = time.time()
+        else:
+            engine_pool[pool_key] = slot
+        session.pool_key = pool_key
+        # Update compat shims
+        loaded_engine = slot.engine
+        loaded_engine_name = slot.engine_name
+        loaded_config = slot.config
+    log.info(f"Pool miss: loaded '{pool_key}' in {elapsed:.1f}s (pool size={len(engine_pool)})")
+    return {"success": True, "load_time_s": round(elapsed, 2),
+            "engine_name": req.engine_name, "reused": False}
+@app.get("/api/keys")
+async def list_keys():
+    """Keys are stored in browser localStorage only. Server has no key info.
+    This endpoint returns an empty dict — it exists for backwards compatibility.
+    """
+    return {}
+@app.post("/api/admin/evict-all")
+async def admin_evict_all(request: Request):
+    """Force-evict all engine slots from VRAM (localhost admin only)."""
+    if request.client and request.client.host not in ("127.0.0.1", "::1"):
+        from fastapi import HTTPException
+        raise HTTPException(status_code=403, detail="localhost only")
+    async with pool_lock:
+        evicted = []
+        for key, slot in list(engine_pool.items()):
+            try:
+                slot.engine.unload_model()
+            except Exception as e:
+                log.warning(f"admin evict failed for '{key}': {e}")
+            del engine_pool[key]
+            evicted.append(key)
+        for session in sessions.values():
+            session.pool_key = None
+        global loaded_engine, loaded_engine_name, loaded_config
+        loaded_engine = None
+        loaded_engine_name = ""
+        loaded_config = {}
+    log.info(f"Admin force-evict: cleared {len(evicted)} slot(s): {evicted}")
+    return {"evicted": evicted}
+@app.post("/api/engine/unload")
+async def unload_engine(request: Request):
+    global loaded_engine, loaded_engine_name, loaded_config
+    session = _get_session(request)
+    async with pool_lock:
+        if session.pool_key and session.pool_key in engine_pool:
+            slot = engine_pool[session.pool_key]
+            slot.ref_count = max(0, slot.ref_count - 1)
+            if slot.ref_count == 0:
+                log.info(f"Immediate eviction (explicit unload): '{slot.engine_name}'")
+                try:
+                    slot.engine.unload_model()
+                except Exception as e:
+                    log.warning(f"unload_model() failed for '{slot.engine_name}': {e}")
+                if session.pool_key in engine_pool:
+                    del engine_pool[session.pool_key]
+        session.pool_key = None
+        # Update compat shims
+        loaded_engine = None
+        loaded_engine_name = ""
+        loaded_config = {}
+    return {"success": True}
+def _register_image(session: UserSession, pil_image: Image.Image, filename: str, save_path: Path) -> str:
+    """Store a PIL image in the session's cache and return its image_id."""
+    image_id = str(uuid.uuid4())
+    image_data = {
+        "path": save_path,
+        "xml_path": None,
+        "pil_image": pil_image,
+        "width": pil_image.width,
+        "height": pil_image.height,
+        "filename": filename,
+        "lines": None,
+    }
+    session.image_cache[image_id] = image_data
+    global_image_cache[image_id] = image_data
+    return image_id
+def _get_image_data(session: UserSession, image_id: str) -> Optional[dict]:
+    """Return image data, tolerating missing cookies in embedded Space contexts."""
+    if image_id in session.image_cache:
+        return session.image_cache[image_id]
+    img_data = global_image_cache.get(image_id)
+    if img_data is not None:
+        session.image_cache[image_id] = img_data
+    return img_data
+@app.post("/api/image/upload")
+async def upload_image(
+    request: Request,
+    file: UploadFile = File(...),
+    max_dim: Optional[int] = Query(default=None, ge=100, description="Resize long edge to this many pixels (mobile upload only)"),
+):
+    session = _get_session(request)
+    filename = file.filename or "upload"
+    is_pdf = (
+        filename.lower().endswith(".pdf") or
+        (file.content_type or "").startswith("application/pdf")
+    )
+    image_exts = {
+        ".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp", ".gif", ".webp"
+    }
+    is_image = (
+        (file.content_type or "").startswith("image/") or
+        Path(filename).suffix.lower() in image_exts
+    )
+    content = await file.read()
+    if len(content) > 200 * 1024 * 1024:
+        raise HTTPException(400, "File too large (max 200MB)")
+    # ── PDF: render each page as a separate image ──────────────────────────
+    if is_pdf:
+        if not PDF_AVAILABLE:
+            raise HTTPException(400, "PDF support requires PyMuPDF. Install with: pip install pymupdf")
+        try:
+            import asyncio
+            from concurrent.futures import ThreadPoolExecutor
+            def _render_pdf(data: bytes, stem: str, sess: UserSession) -> list:
+                mat = _fitz.Matrix(150 / 72, 150 / 72)
+                doc = _fitz.open(stream=data, filetype="pdf")
+                results = []
+                for i, page in enumerate(doc):
+                    pix = page.get_pixmap(matrix=mat, colorspace=_fitz.csRGB)
+                    pil_page = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+                    page_filename = f"{stem}_page{i+1:03d}.png"
+                    save_path = UPLOAD_DIR / f"{uuid.uuid4()}.png"
+                    pil_page.save(save_path)
+                    pid = _register_image(sess, pil_page, page_filename, save_path)
+                    results.append({
+                        "image_id": pid,
+                        "filename": page_filename,
+                        "width": pil_page.width,
+                        "height": pil_page.height,
+                        "page": i + 1,
+                    })
+                doc.close()
+                return results
+            stem = Path(filename).stem
+            loop = asyncio.get_event_loop()
+            with ThreadPoolExecutor(max_workers=1) as pool:
+                pages_out = await loop.run_in_executor(pool, _render_pdf, content, stem, session)
+            return {
+                "is_pdf": True,
+                "filename": filename,
+                "num_pages": len(pages_out),
+                "pages": pages_out,
+            }
+        except Exception as e:
+            raise HTTPException(400, f"Failed to render PDF: {e}")
+    # ── Regular image ───────────────────────────────────────────────────────
+    if not is_image:
+        raise HTTPException(400, "File must be an image or PDF")
+    ext = Path(filename).suffix or ".jpg"
+    save_path = UPLOAD_DIR / f"{uuid.uuid4()}{ext}"
+    save_path.write_bytes(content)
+    try:
+        pil_image = Image.open(save_path)
+        pil_image = ImageOps.exif_transpose(pil_image)
+        pil_image = pil_image.convert("RGB")
+        if max_dim and max(pil_image.width, pil_image.height) > max_dim:
+            pil_image.thumbnail((max_dim, max_dim), Image.LANCZOS)
+            pil_image.save(save_path)
+    except Exception as e:
+        save_path.unlink(missing_ok=True)
+        raise HTTPException(400, f"Invalid image: {e}")
+    image_id = _register_image(session, pil_image, filename, save_path)
+    return {
+        "image_id": image_id,
+        "width": pil_image.width,
+        "height": pil_image.height,
+        "filename": filename,
+    }
+@app.post("/api/image/{image_id}/xml")
+async def upload_xml(request: Request, image_id: str, file: UploadFile = File(...)):
+    """Attach a PAGE XML file to an already-uploaded image."""
+    session = _get_session(request)
+    img_data = _get_image_data(session, image_id)
+    if img_data is None:
+        raise HTTPException(404, "Image not found — upload image first")
+    content = await file.read()
+    if len(content) > 10 * 1024 * 1024:
+        raise HTTPException(400, "XML too large (max 10MB)")
+    xml_path = UPLOAD_DIR / f"{image_id}.xml"
+    xml_path.write_bytes(content)
+    img_data["xml_path"] = xml_path
+    return {"success": True, "filename": file.filename}
+@app.get("/api/image/{image_id}")
+async def get_image(request: Request, image_id: str):
+    session = _get_session(request)
+    img_data = _get_image_data(session, image_id)
+    if img_data is None:
+        raise HTTPException(404, "Image not found")
+    return FileResponse(str(img_data["path"]))
+@app.get("/api/image/{image_id}/info")
+async def image_info(request: Request, image_id: str):
+    session = _get_session(request)
+    d = _get_image_data(session, image_id)
+    if d is None:
+        raise HTTPException(404, "Image not found")
+    return {
+        "image_id": image_id,
+        "filename": d["filename"],
+        "width": d["width"],
+        "height": d["height"],
+        "has_xml": d["xml_path"] is not None,
+    }
+async def _run_segmentation(img_data: dict, method: str, device: str = "cpu",
+                            max_columns: int = 6,
+                            split_width_fraction: float = 0.40,
+                            text_direction: str = "horizontal-lr") -> dict:
+    """
+    Shared segmentation helper.  Runs the appropriate segmenter, stores
+    results in img_data, and returns a serialisable dict ready for SSE or JSON.
+    Also populates img_data["line_regions"] with a per-line region index list
+    so the transcription loop can tag each line with its column.
+    """
+    if DEMO_MODE == "hf_space" and method == "kraken-blla":
+        method = "kraken"
+        device = "cpu"
+    pil_image = img_data["pil_image"]
+    xml_path  = img_data.get("xml_path")
+    if DEMO_MODE == "hf_space" and xml_path is None and method == "hpp":
+        return await asyncio.to_thread(_run_demo_hpp_segmentation, img_data)
+    _import_segmenters()
+    regions: list = []
+    lines: list   = []
+    xml_region_data: list = []  # TextRegion bboxes from PAGE XML (for visualization)
+    if xml_path is not None:
+        from inference_page import PageXMLSegmenter as _PXSeg
+        segmenter = _PXSeg(str(xml_path))
+        lines = await asyncio.to_thread(segmenter.segment_lines, pil_image)
+        source = "pagexml"
+        xml_region_data = getattr(segmenter, 'region_data', []) or []
+    elif method == "kraken-blla":
+        segmenter = KrakenLineSegmenter(device=device)
+        regions, lines = await asyncio.to_thread(
+            segmenter.segment_with_regions, pil_image,
+            device=device,
+            max_columns=max_columns,
+            split_width_fraction=split_width_fraction,
+            text_direction=text_direction,
+        )
+        source = "kraken-blla"
+    elif method == "kraken":
+        try:
+            segmenter = KrakenLineSegmenter()
+            # Use column-aware segmentation so multi-column pages read correctly
+            regions, lines = await asyncio.to_thread(
+                segmenter.segment_classical_with_regions, pil_image,
+                max_columns=max_columns,
+            )
+            source = "kraken"
+        except Exception as exc:
+            if DEMO_MODE == "hf_space":
+                log.warning("Kraken segmentation failed in HF Space; falling back to HPP: %s", exc)
+                return await asyncio.to_thread(_run_demo_hpp_segmentation, img_data, "hpp-fallback")
+            raise
+    else:  # hpp
+        segmenter = LineSegmenter()
+        lines = await asyncio.to_thread(segmenter.segment_lines, pil_image)
+        source = "hpp"
+    if DEMO_MODE == "hf_space" and method == "kraken" and not lines:
+        log.warning("Kraken returned no lines in HF Space; falling back to HPP")
+        return await asyncio.to_thread(_run_demo_hpp_segmentation, img_data, "hpp-fallback")
+    # Build per-line region index (used by transcription loop for column view)
+    line_regions: list[int] = []
+    if regions:
+        offset = 0
+        for ri, r in enumerate(regions):
+            for _ in r.line_ids:
+                line_regions.append(ri)
+            offset += len(r.line_ids)
+    else:
+        line_regions = [0] * len(lines)
+    img_data["lines"]        = lines
+    img_data["line_regions"] = line_regions
+    img_data["seg_source"]   = source
+    # PAGE XML provides region bboxes directly; Kraken/blla provide SegRegion objects
+    if xml_region_data:
+        img_data["seg_regions"] = xml_region_data
+    elif regions:
+        img_data["seg_regions"] = [
+            {"id": r.id, "bbox": list(r.bbox), "num_lines": len(r.line_ids)}
+            for r in regions
+        ]
+    else:
+        img_data["seg_regions"] = []
+    result: dict = {
+        "num_lines": len(lines),
+        "bboxes":    [list(l.bbox) for l in lines],
+        "source":    source,
+    }
+    if img_data["seg_regions"]:
+        result["regions"] = img_data["seg_regions"]
+    return result
+def _run_demo_hpp_segmentation(img_data: dict, source: str = "hpp") -> dict:
+    """Small dependency-light line segmenter for the hosted CPU demo fallback."""
+    pil_image = img_data["pil_image"]
+    gray = np.array(pil_image.convert("L"))
+    if gray.size == 0:
+        lines = []
+    else:
+        threshold = min(220, max(90, float(np.percentile(gray, 42))))
+        ink = gray < threshold
+        row_density = ink.mean(axis=1)
+        kernel = np.ones(9, dtype=np.float32) / 9.0
+        smooth = np.convolve(row_density, kernel, mode="same")
+        active_threshold = max(0.01, float(smooth.max()) * 0.13)
+        min_height = max(10, int(pil_image.height * 0.008))
+        bands = []
+        start = None
+        for y, value in enumerate(smooth):
+            if value > active_threshold and start is None:
+                start = y
+            elif (value <= active_threshold or y == len(smooth) - 1) and start is not None:
+                end = y if y == len(smooth) - 1 else y - 1
+                if end - start + 1 >= min_height:
+                    bands.append((start, end))
+                start = None
+        lines = []
+        for y1, y2 in bands[:100]:
+            pad_y = max(3, int((y2 - y1 + 1) * 0.25))
+            top = max(0, y1 - pad_y)
+            bottom = min(pil_image.height, y2 + pad_y + 1)
+            band_ink = ink[top:bottom, :]
+            cols = np.where(band_ink.any(axis=0))[0]
+            if cols.size:
+                left = max(0, int(cols[0]) - 8)
+                right = min(pil_image.width, int(cols[-1]) + 9)
+            else:
+                left = 0
+                right = pil_image.width
+            bbox = (left, top, right, bottom)
+            lines.append(SimpleNamespace(
+                image=pil_image.crop(bbox),
+                bbox=bbox,
+                coords=None,
+            ))
+    img_data["lines"] = lines
+    img_data["line_regions"] = [0] * len(lines)
+    img_data["seg_source"] = source
+    img_data["seg_regions"] = []
+    return {
+        "num_lines": len(lines),
+        "bboxes": [list(line.bbox) for line in lines],
+        "source": source,
+    }
+@app.delete("/api/image/{image_id}/region/{region_index}")
+async def delete_region(request: Request, image_id: str, region_index: int):
+    """
+    Remove one detected region and its lines from the cached segmentation.
+    Returns updated segmentation data in the same format as /segment,
+    so the client can redraw the canvas.
+    """
+    session = _get_session(request)
+    img_data = _get_image_data(session, image_id)
+    if img_data is None:
+        raise HTTPException(404, "Image not found")
+    seg_regions = img_data.get("seg_regions") or []
+    if not seg_regions:
+        raise HTTPException(400, "No segmentation data — run Segment first")
+    if region_index < 0 or region_index >= len(seg_regions):
+        raise HTTPException(400, f"Region index out of range (0–{len(seg_regions)-1})")
+    lines        = img_data.get("lines") or []
+    line_regions = img_data.get("line_regions") or ([0] * len(lines))
+    # Keep lines that are NOT in the deleted region; re-index later regions
+    new_lines: list = []
+    new_line_regions: list = []
+    for line, lr in zip(lines, line_regions):
+        if lr == region_index:
+            continue
+        new_lines.append(line)
+        new_line_regions.append(lr if lr < region_index else lr - 1)
+    new_regions = [r for i, r in enumerate(seg_regions) if i != region_index]
+    img_data["lines"]        = new_lines
+    img_data["line_regions"] = new_line_regions
+    img_data["seg_regions"]  = new_regions
+    result: dict = {
+        "num_lines": len(new_lines),
+        "bboxes":    [list(l.bbox) for l in new_lines],
+        "source":    img_data.get("seg_source", "modified"),
+    }
+    if new_regions:
+        result["regions"] = new_regions
+    return result
+@app.get("/api/image/{image_id}/segment")
+async def segment_image(
+    request: Request,
+    image_id: str,
+    method: str = "kraken",
+    device: str = "cpu",
+    max_columns: int = 6,
+    split_width_fraction: float = 0.40,
+    text_direction: str = "horizontal-lr",
+):
+    """
+    Run segmentation only (no transcription) and return line bboxes as JSON.
+    Useful for previewing line layout before transcribing.
+    """
+    session = _get_session(request)
+    img_data = _get_image_data(session, image_id)
+    if img_data is None:
+        raise HTTPException(404, "Image not found — upload first")
+    try:
+        return await _run_segmentation(img_data, method, device,
+                                       max_columns, split_width_fraction, text_direction)
+    except Exception as e:
+        raise HTTPException(500, f"Segmentation failed: {e}")
+@app.post("/api/transcribe")
+async def transcribe(request: Request, req: TranscribeRequest):
+    """Transcribe an uploaded image and stream progress as Server-Sent Events.
+    The engine is taken from the session's pool slot, or from the module-level
+    compat shims when the session holds no slot.  The stream emits "status",
+    "segmentation", one "progress" event per line, and finally "complete";
+    "cancelled" is emitted when the session requested cancellation and
+    "error" when an exception occurs.
+    Raises:
+        HTTPException(400): no engine is loaded.
+        HTTPException(404): the image id is unknown.
+    """
+    session = _get_session(request)
+    # Resolve engine from session's pool slot
+    if not session.pool_key or session.pool_key not in engine_pool:
+        # Fallback: check compat shims (e.g. auto-loaded engine, no session yet)
+        if not loaded_engine or not loaded_engine.is_model_loaded():
+            raise HTTPException(400, "No engine loaded")
+    slot = engine_pool.get(session.pool_key) if session.pool_key else None
+    # Build effective engine/config references
+    eff_engine = slot.engine if slot else loaded_engine
+    _base_config = slot.config if slot else loaded_config
+    # Merge live form overrides into a copy of the stored config so changes to
+    # runtime-only fields (custom_prompt, thinking_mode, temperature, …) take
+    # effect without requiring a model reload.  Never overwrite security-sensitive
+    # keys that were set during load (api_key, provider, model, model_path, …).
+    _RELOAD_ONLY_KEYS = {"api_key", "provider", "model", "model_path", "model_source",
+                         "base_model", "adapter", "model_name", "preset_id", "lang",
+                         "use_gpu", "venv_path"}
+    if req.engine_config_overrides:
+        eff_config = dict(_base_config)
+        for k, v in req.engine_config_overrides.items():
+            if k not in _RELOAD_ONLY_KEYS:
+                eff_config[k] = v
+    else:
+        # No overrides: the stored config object is used directly (not copied).
+        eff_config = _base_config
+    eff_engine_name = slot.engine_name if slot else loaded_engine_name
+    if not eff_engine or not eff_engine.is_model_loaded():
+        raise HTTPException(400, "No engine loaded")
+    img_data = _get_image_data(session, req.image_id)
+    if img_data is None:
+        raise HTTPException(404, "Image not found — upload first")
+    pil_image = img_data["pil_image"]
+    # Per-request cancel event (replaces global cancel_event)
+    request_id = str(uuid.uuid4())
+    cancel_evt = asyncio.Event()
+    session.cancel_events[request_id] = cancel_evt
+    async def event_stream():
+        # Async generator producing the SSE payloads for this request.
+        _import_segmenters()
+        try:
+            # --- Segmentation ---
+            # The attached PAGE XML is only considered when the request asks for it.
+            xml_path = img_data.get("xml_path") if req.use_pagexml else None
+            if not eff_engine.requires_line_segmentation() and not xml_path:
+                # Page-level engine with no PAGE XML — send whole page as single line
+                from inference_page import LineSegment
+                lines = [LineSegment(
+                    image=pil_image,
+                    bbox=(0, 0, pil_image.width, pil_image.height),
+                    coords=None,
+                )]
+                img_data["lines"]        = lines
+                img_data["line_regions"] = [0]
+                img_data["seg_source"]   = "page"
+                img_data["seg_regions"]  = []
+                yield _sse("segmentation", {
+                    "num_lines": 1,
+                    "bboxes": [[0, 0, pil_image.width, pil_image.height]],
+                    "source": "page",
+                })
+            else:
+                # Reuse cached segmentation if method matches (e.g. user clicked Segment first)
+                cached_lines   = img_data.get("lines")
+                cached_source  = img_data.get("seg_source")
+                desired_source = "pagexml" if (xml_path and req.use_pagexml) else req.seg_method
+                if cached_lines and cached_source == desired_source:
+                    lines = cached_lines
+                    yield _sse("status", {"message": "Using cached segmentation..."})
+                    seg_event: dict = {
+                        "num_lines": len(lines),
+                        "bboxes":    [list(l.bbox) for l in lines],
+                        "source":    cached_source,
+                    }
+                    if img_data.get("seg_regions"):
+                        seg_event["regions"] = img_data["seg_regions"]
+                    yield _sse("segmentation", seg_event)
+                elif xml_path is not None:
+                    yield _sse("status", {"message": "Reading line layout from PAGE XML..."})
+                    seg_result = await _run_segmentation(img_data, "pagexml",
+                                                         req.seg_device, req.max_columns,
+                                                         req.split_width_fraction,
+                                                         req.text_direction)
+                    lines = img_data["lines"]
+                    yield _sse("segmentation", seg_result)
+                else:
+                    yield _sse("status", {"message": f"Segmenting with {req.seg_method}..."})
+                    seg_result = await _run_segmentation(img_data, req.seg_method,
+                                                         req.seg_device, req.max_columns,
+                                                         req.split_width_fraction,
+                                                         req.text_direction)
+                    lines = img_data["lines"]
+                    yield _sse("segmentation", seg_result)
+            # --- Transcription ---
+            results = []
+            token_usage: Dict[str, Any] = {}
+            start_time = time.time()
+            line_regions = img_data.get("line_regions") or ([0] * len(lines))
+            for i, line in enumerate(lines):
+                # Check for cancellation before each line
+                if cancel_evt.is_set():
+                    yield _sse("cancelled", {})
+                    return
+                # Lines without a pre-cut image are cropped from the page by bbox.
+                line_img = line.image if line.image is not None else pil_image.crop(line.bbox)
+                img_array = np.array(line_img.convert("RGB"))
+                # Use slot lock to serialize access to this engine instance
+                if slot:
+                    async with slot.lock:
+                        slot.last_used = time.time()
+                        result = await asyncio.to_thread(
+                            eff_engine.transcribe_line, img_array, eff_config
+                        )
+                else:
+                    result = await asyncio.to_thread(
+                        eff_engine.transcribe_line, img_array, eff_config
+                    )
+                text = str(result.text) if hasattr(result, "text") else str(result)
+                confidence = None
+                if hasattr(result, "confidence") and result.confidence is not None:
+                    confidence = float(result.confidence)
+                    # Values above 1 are treated as percentages and scaled to 0..1.
+                    if confidence > 1:
+                        confidence = confidence / 100.0
+                # Accumulate token usage and extract thinking text from API engines (e.g. Gemini)
+                thinking_text = None
+                if hasattr(result, "metadata") and isinstance(result.metadata, dict):
+                    tu = result.metadata.get("token_usage")
+                    if tu:
+                        for k, v in tu.items():
+                            if v is not None:
+                                token_usage[k] = token_usage.get(k, 0) + v
+                    thinking_text = result.metadata.get("thinking_text")
+                line_data = {
+                    "index": i,
+                    "text": text,
+                    "confidence": confidence,
+                    "bbox": list(line.bbox),
+                    "region": line_regions[i] if i < len(line_regions) else 0,
+                }
+                if thinking_text:
+                    line_data["thinking_text"] = thinking_text
+                results.append(line_data)
+                progress_data: Dict[str, Any] = {
+                    "current": i + 1,
+                    "total": len(lines),
+                    "line": line_data,
+                }
+                if token_usage:
+                    # Send a snapshot copy; the running totals keep changing.
+                    progress_data["token_usage"] = dict(token_usage)
+                yield _sse("progress", progress_data)
+                # Check for cancellation after each line's progress event
+                if cancel_evt.is_set():
+                    yield _sse("cancelled", {})
+                    return
+            # Store completed results in session image_cache for export
+            img_data["results"] = results
+            elapsed = time.time() - start_time
+            complete_data: Dict[str, Any] = {
+                "lines": results,
+                "total_time_s": round(elapsed, 2),
+                "engine": eff_engine_name,
+            }
+            if token_usage:
+                complete_data["token_usage"] = token_usage
+            yield _sse("complete", complete_data)
+        except Exception as e:
+            # Failures are logged and reported in-band as an "error" event.
+            log.exception("Transcription error")
+            yield _sse("error", {"message": str(e)})
+        finally:
+            # Clean up this request's cancel event
+            session.cancel_events.pop(request_id, None)
+    return StreamingResponse(
+        event_stream(),
+        media_type="text/event-stream",
+        headers={
+            "Cache-Control": "no-cache",
+            "X-Accel-Buffering": "no",  # Disable nginx buffering if behind proxy
+        },
+    )
+@app.post("/api/transcribe/cancel")
+async def cancel_transcription(request: Request):
+    """Signal all running transcriptions for this session to stop."""
+    session = _get_session(request)
+    for evt in session.cancel_events.values():
+        evt.set()
+    return {"success": True}
+@app.post("/api/image/{image_id}/export-xml")
+async def export_xml(request: Request, image_id: str):
+    """Export transcription results for image_id as PAGE XML."""
+    session = _get_session(request)
+    pretty, stem = _build_xml_bytes(session, image_id)
+    return Response(
+        content=pretty,
+        media_type="application/xml",
+        headers={"Content-Disposition": f'attachment; filename="{stem}.xml"'},
+    )
+def _build_xml_bytes(session: UserSession, image_id: str) -> tuple[bytes, str]:
+    """Return (xml_bytes, stem) for a cached image, or raise HTTPException."""
+    import xml.etree.ElementTree as ET
+    from xml.dom import minidom
+    from page_xml_exporter import PageXMLExporter
+    img_data = _get_image_data(session, image_id)
+    if img_data is None:
+        raise HTTPException(404, f"Image {image_id} not found")
+    results = img_data.get("results")
+    if not results:
+        raise HTTPException(400, f"No results for {image_id}")
+    filename = img_data.get("filename", img_data["path"].name)
+    width = img_data["width"]
+    height = img_data["height"]
+    class _SegProxy:
+        __slots__ = ("bbox", "coords", "text", "confidence")
+        def __init__(self, r):
+            bbox = r.get("bbox")
+            self.bbox = tuple(bbox) if bbox else (0, 0, width, height)
+            self.coords = None
+            self.text = r.get("text", "")
+            self.confidence = r.get("confidence")
+    segments = [_SegProxy(r) for r in results]
+    exporter = PageXMLExporter(str(filename), width, height)
+    root, page = exporter._make_root("Polyscriptor Web UI", None)
+    reading_order = ET.SubElement(page, 'ReadingOrder')
+    ordered_group = ET.SubElement(reading_order, 'OrderedGroup',
+                                  {'id': 'ro_1', 'caption': 'Regions reading order'})
+    ET.SubElement(ordered_group, 'RegionRefIndexed', {'index': '0', 'regionRef': 'region_1'})
+    text_region = ET.SubElement(page, 'TextRegion',
+                                 {'id': 'region_1', 'type': 'paragraph', 'custom': 'readingOrder {index:0;}'})
+    if segments:
+        x1 = min(s.bbox[0] for s in segments)
+        y1 = min(s.bbox[1] for s in segments)
+        x2 = max(s.bbox[2] for s in segments)
+        y2 = max(s.bbox[3] for s in segments)
+        ET.SubElement(text_region, 'Coords').set('points', f'{x1},{y1} {x2},{y1} {x2},{y2} {x1},{y2}')
+    for idx, seg in enumerate(segments):
+        exporter._add_text_line(text_region, f'line_{idx + 1}', seg, seg.text, idx)
+    xml_bytes = ET.tostring(root, encoding='utf-8', method='xml')
+    pretty = minidom.parseString(xml_bytes).toprettyxml(indent='  ', encoding='utf-8')
+    return pretty, Path(filename).stem
+def _build_thinking_bytes(session: UserSession, image_id: str) -> tuple[bytes, str]:
+    """Return (thinking_bytes, stem) for a cached image, or raise HTTPException(404) if no thinking."""
+    img_data = _get_image_data(session, image_id)
+    if img_data is None:
+        raise HTTPException(404, f"Image {image_id} not found")
+    results = img_data.get("results")
+    if not results:
+        raise HTTPException(400, f"No results for {image_id}")
+    filename = img_data.get("filename", img_data["path"].name)
+    stem = Path(filename).stem
+    blocks = []
+    for i, r in enumerate(results):
+        t = r.get("thinking_text", "")
+        if t:
+            if len(results) > 1:
+                blocks.append(f"=== Line {i + 1} ===\n{t}")
+            else:
+                blocks.append(t)
+    if not blocks:
+        raise HTTPException(404, f"No thinking text for {image_id}")
+    return "\n\n".join(blocks).encode("utf-8"), stem
+def _build_txt_bytes(session: UserSession, image_id: str) -> tuple[bytes, str]:
+    """Return (txt_bytes, stem) for a cached image, or raise HTTPException."""
+    img_data = _get_image_data(session, image_id)
+    if img_data is None:
+        raise HTTPException(404, f"Image {image_id} not found")
+    results = img_data.get("results")
+    if not results:
+        raise HTTPException(400, f"No results for {image_id}")
+    filename = img_data.get("filename", img_data["path"].name)
+    text = "\n".join(r.get("text", "") for r in results)
+    return text.encode("utf-8"), Path(filename).stem
+class BatchXMLRequest(BaseModel):
+    # Request body shared by the batch export endpoints (XML, TXT and thinking
+    # text): the ids of the cached images to include in the ZIP archive.
+    image_ids: list[str]
+@app.post("/api/batch/export-thinking")
+async def batch_export_thinking(request: Request, req: BatchXMLRequest):
+    """Return a ZIP archive containing one thinking-text file per image (skips pages without thinking)."""
+    session = _get_session(request)
+    import zipfile, io
+    buf = io.BytesIO()
+    with zipfile.ZipFile(buf, 'w', zipfile.ZIP_DEFLATED) as zf:
+        for image_id in req.image_ids:
+            try:
+                thinking_bytes, stem = _build_thinking_bytes(session, image_id)
+                zf.writestr(f"{stem}_thinking.txt", thinking_bytes)
+            except HTTPException:
+                pass  # skip pages without thinking
+    buf.seek(0)
+    return Response(
+        content=buf.read(),
+        media_type="application/zip",
+        headers={"Content-Disposition": 'attachment; filename="batch_thinking.zip"'},
+    )
+@app.post("/api/batch/export-txt")
+async def batch_export_txt(request: Request, req: BatchXMLRequest):
+    """Return a ZIP archive containing one plain-text file per image."""
+    session = _get_session(request)
+    import zipfile, io
+    buf = io.BytesIO()
+    with zipfile.ZipFile(buf, 'w', zipfile.ZIP_DEFLATED) as zf:
+        for image_id in req.image_ids:
+            try:
+                txt_bytes, stem = _build_txt_bytes(session, image_id)
+                zf.writestr(f"{stem}.txt", txt_bytes)
+            except HTTPException:
+                pass  # skip images without results
+    buf.seek(0)
+    return Response(
+        content=buf.read(),
+        media_type="application/zip",
+        headers={"Content-Disposition": 'attachment; filename="batch_export_txt.zip"'},
+    )
+@app.post("/api/batch/export-xml")
+async def batch_export_xml(request: Request, req: BatchXMLRequest):
+    """Return a ZIP archive containing one PAGE XML file per image."""
+    session = _get_session(request)
+    import zipfile, io
+    buf = io.BytesIO()
+    with zipfile.ZipFile(buf, 'w', zipfile.ZIP_DEFLATED) as zf:
+        for image_id in req.image_ids:
+            try:
+                xml_bytes, stem = _build_xml_bytes(session, image_id)
+                zf.writestr(f"{stem}.xml", xml_bytes)
+            except HTTPException:
+                pass  # skip images without results
+    buf.seek(0)
+    return Response(
+        content=buf.read(),
+        media_type="application/zip",
+        headers={"Content-Disposition": 'attachment; filename="batch_export.zip"'},
+    )
+@app.get("/api/session")
+async def session_info(request: Request):
+    """Return info about the current session (useful for debugging)."""
+    session = _get_session(request)
+    return {
+        "session_id": session.session_id[:8] + "...",
+        "images": len(session.image_cache),
+        "active_transcriptions": len(session.cancel_events),
+        "pool_key": session.pool_key,
+        "created_at": session.created_at,
+        "last_active": session.last_active,
+        "total_sessions": len(sessions),
+    }
+@app.get("/api/engine/pool")
+async def pool_status():
+    """Return current engine pool state (admin/debug endpoint)."""
+    slots = []
+    for key, slot in engine_pool.items():
+        slots.append({
+            "pool_key": key,
+            "engine_name": slot.engine_name,
+            "ref_count": slot.ref_count,
+            "loaded": slot.engine.is_model_loaded(),
+            "last_used": slot.last_used,
+            "age_s": round(time.time() - slot.last_used, 0),
+        })
+    return {
+        "pool_size": len(engine_pool),
+        "slots": slots,
+        "total_sessions": len(sessions),
+    }
+@app.get("/api/kraken/presets")
+async def kraken_presets():
+    """Return list of available Kraken model presets (local + Zenodo)."""
+    try:
+        from engines.kraken_engine import KRAKEN_MODELS
+    except ImportError:
+        return {"presets": []}
+    presets = []
+    for model_id, info in KRAKEN_MODELS.items():
+        presets.append({
+            "id": model_id,
+            "label": info.get("description", model_id),
+            "language": info.get("language", ""),
+            "source": info.get("source", ""),
+        })
+    return {"presets": presets}
+@app.post("/api/models/upload")
+async def upload_model(file: UploadFile = File(...)):
+    """Upload a Kraken .mlmodel file to the server's models/kraken_uploads/ directory."""
+    filename = file.filename or "model.mlmodel"
+    if not filename.lower().endswith(".mlmodel"):
+        raise HTTPException(400, "Only .mlmodel files are accepted")
+    content = await file.read()
+    if len(content) > 500 * 1024 * 1024:
+        raise HTTPException(400, "File too large (max 500 MB)")
+    upload_dir = PROJECT_ROOT / "models" / "kraken_uploads"
+    upload_dir.mkdir(parents=True, exist_ok=True)
+    # Sanitize filename — keep only safe characters
+    safe_name = Path(filename).name
+    safe_name = "".join(c for c in safe_name if c.isalnum() or c in "._- ")
+    safe_name = safe_name.strip() or "uploaded.mlmodel"
+    dest = upload_dir / safe_name
+    dest.write_bytes(content)
+    log.info(f"Uploaded Kraken model: {dest} ({len(content)} bytes)")
+    rel_path = str(dest.relative_to(PROJECT_ROOT))  # e.g. models/kraken_uploads/foo.mlmodel
+    return {
+        "path": rel_path,
+        "filename": safe_name,
+        "size": len(content),
+        "options": _scan_kraken_models(),  # refreshed list for frontend to repopulate select
+    }
+@app.get("/api/gpu")
+async def gpu_status():
+    try:
+        import torch
+        if not torch.cuda.is_available():
+            return {"available": False, "gpus": []}
+        # pynvml (nvidia-ml-py) for utilization %; graceful fallback if missing
+        nvml_utils: dict[int, dict] = {}
+        try:
+            import pynvml
+            pynvml.nvmlInit()
+            for _i in range(pynvml.nvmlDeviceGetCount()):
+                h = pynvml.nvmlDeviceGetHandleByIndex(_i)
+                u = pynvml.nvmlDeviceGetUtilizationRates(h)
+                nvml_utils[_i] = {"gpu_pct": u.gpu, "mem_pct": u.memory}
+        except Exception:
+            pass  # pynvml unavailable — utilization fields omitted
+        gpus = []
+        for i in range(torch.cuda.device_count()):
+            free, total = torch.cuda.mem_get_info(i)
+            entry: dict = {
+                "index": i,
+                "name": torch.cuda.get_device_name(i),
+                "memory_total_mb": round(total / 1e6),
+                "memory_used_mb": round((total - free) / 1e6),
+                "memory_free_mb": round(free / 1e6),
+            }
+            if i in nvml_utils:
+                entry["utilization_gpu_pct"] = nvml_utils[i]["gpu_pct"]
+                entry["utilization_mem_pct"] = nvml_utils[i]["mem_pct"]
+            gpus.append(entry)
+        return {"available": True, "gpus": gpus}
+    except Exception:
+        return {"available": False, "gpus": []}
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+def _sse(event: str, data: dict) -> str:
+    """Format a Server-Sent Event."""
+    return f"event: {event}\ndata: {json.dumps(data, ensure_ascii=False)}\n\n"

web/server_config.yaml ADDED Viewed

	@@ -0,0 +1,25 @@

+# Polyscriptor Web UI — server startup configuration
+# Uncomment and adjust to auto-load an engine on server start.
+#
+# Usage:
+#   python -m uvicorn web.polyscriptor_server:app --host 0.0.0.0 --port 8765
+#
+# The server will load the specified engine at startup so the first
+# transcription request doesn't need to wait for model loading.
+# --- Auto-load (optional) ---
+# Set default_engine to the engine name shown in the UI dropdown.
+# Leave blank or comment out to start without a loaded model.
+# Example: auto-load Church Slavonic CRNN-CTC model
+# default_engine: "CRNN-CTC (PyLaia-inspired)"
+# default_config:
+#   model_path: "Church Slavonic (2.89% CER)"
+#   enable_spaces: true
+# Example: auto-load TrOCR from HuggingFace
+# default_engine: "TrOCR"
+# default_config:
+#   model_path: "kazars24/trocr-base-handwritten-ru"
+#   num_beams: 4
+#   normalize_background: false

web/static/app.css ADDED Viewed

	@@ -0,0 +1,1269 @@

+/* ── Self-hosted fonts ─── */
+@font-face {
+    font-family: 'Monomakh';
+    src: url('fonts/MonomakhUnicode-Regular.woff2') format('woff2');
+    font-weight: normal;
+    font-style: normal;
+    font-display: swap;
+    unicode-range: U+0000-007F, U+0080-00FF, U+0300-036F, U+0400-04FF,
+        U+0500-052F, U+1C80-1C8F, U+2DE0-2DFF, U+A640-A69F, U+FE2E-FE2F;
+}
+/* ── Design tokens ───────────────────────────────────────────────────── */
+:root {
+    --bg:          #111827;
+    --bg-panel:    #1f2937;
+    --bg-section:  #1a2333;
+    --bg-input:    #111827;
+    --bg-hover:    #2a3a52;
+    --text:        #e2e8f0;
+    --text-muted:  #64748b;
+    --text-dim:    #94a3b8;
+    --accent:      #e94560;
+    --accent-hover:#ff6b81;
+    --primary:     #3b82f6;
+    --primary-hover:#60a5fa;
+    --success:     #22c55e;
+    --warning:     #f59e0b;
+    --danger:      #ef4444;
+    --border:      #2d3f59;
+    --border-light:#3a4f6e;
+    --radius:      6px;
+    --radius-lg:   10px;
+    --font:        'Segoe UI', system-ui, -apple-system, sans-serif;
+    --font-mono:   'Consolas', 'Fira Code', 'Cascadia Code', monospace;
+    --header-h:    44px;
+    --tabs-h:      56px;
+    --shadow:      0 4px 20px rgba(0,0,0,0.4);
+}
+/* ── Reset & base ───────────────────────────────────────────────────── */
+*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
+html { height: 100%; }
+body {
+    font-family: var(--font);
+    background: var(--bg);
+    color: var(--text);
+    height: 100%;
+    display: flex;
+    flex-direction: column;
+    overflow: hidden;
+}
+/* ── Header ─────────────────────────────────────────────────────────── */
+#header {
+    height: var(--header-h);
+    display: flex;
+    align-items: center;
+    justify-content: space-between;
+    padding: 0 14px;
+    background: var(--bg-panel);
+    border-bottom: 1px solid var(--border);
+    flex-shrink: 0;
+    gap: 12px;
+}
+.header-left {
+    display: flex;
+    align-items: center;
+    gap: 8px;
+    min-width: 0;
+}
+.header-logo {
+    font-size: 1.3rem;
+    color: var(--primary);
+    line-height: 1;
+}
+#header h1 {
+    font-size: 1rem;
+    font-weight: 700;
+    letter-spacing: 0.3px;
+    white-space: nowrap;
+}
+.header-sub {
+    font-weight: 400;
+    color: var(--text-muted);
+    font-size: 0.9rem;
+    letter-spacing: 2px;
+    margin-left: 2px;
+}
+.header-right {
+    display: flex;
+    align-items: center;
+    gap: 8px;
+    flex-shrink: 0;
+}
+.gpu-badge {
+    font-size: 0.75rem;
+    padding: 3px 10px;
+    border-radius: 12px;
+    background: var(--bg-input);
+    color: var(--text-muted);
+    border: 1px solid var(--border);
+    white-space: nowrap;
+    max-width: 280px;
+    overflow: hidden;
+    text-overflow: ellipsis;
+}
+.btn-icon {
+    width: 30px;
+    height: 30px;
+    border: 1px solid var(--border);
+    border-radius: 50%;
+    background: var(--bg-input);
+    color: var(--text-muted);
+    font-size: 0.85rem;
+    font-weight: 700;
+    cursor: pointer;
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    flex-shrink: 0;
+    transition: border-color 0.15s, color 0.15s;
+}
+.btn-icon:hover { border-color: var(--primary); color: var(--primary); }
+/* ── GPU widget ─────────────────────────────────────────────────────── */
+.gpu-widget {
+    display: flex;
+    gap: 8px;
+    align-items: center;
+}
+.gpu-card {
+    display: flex;
+    flex-direction: column;
+    gap: 3px;
+    font-size: 0.7rem;
+    color: var(--text-muted);
+    min-width: 90px;
+    max-width: 160px;
+}
+.gpu-card-name {
+    display: flex;
+    justify-content: space-between;
+    align-items: center;
+    gap: 4px;
+    white-space: nowrap;
+    overflow: hidden;
+}
+.gpu-card-name span { overflow: hidden; text-overflow: ellipsis; }
+.gpu-util-pct {
+    font-size: 0.68rem;
+    color: var(--text-dim);
+    flex-shrink: 0;
+}
+.gpu-mem-bar {
+    height: 4px;
+    background: var(--bg-input);
+    border-radius: 2px;
+    overflow: hidden;
+}
+.gpu-mem-fill {
+    height: 100%;
+    border-radius: 2px;
+    background: var(--primary);
+    transition: width 0.5s ease;
+}
+.gpu-mem-fill.warm  { background: var(--warning); }
+.gpu-mem-fill.hot   { background: var(--danger); }
+.gpu-mem-label {
+    font-size: 0.65rem;
+    color: var(--text-muted);
+}
+/* ── Toast notifications ─────────────────────────────────────────────── */
+#toast-container {
+    position: fixed;
+    bottom: 20px;
+    right: 20px;
+    display: flex;
+    flex-direction: column;
+    gap: 8px;
+    z-index: 9999;
+    pointer-events: none;
+}
+.toast {
+    padding: 10px 16px;
+    border-radius: var(--radius);
+    font-size: 0.85rem;
+    box-shadow: var(--shadow);
+    pointer-events: auto;
+    animation: toast-in 0.2s ease;
+    max-width: 320px;
+}
+.toast-error   { background: #7f1d1d; color: #fca5a5; border: 1px solid #991b1b; }
+.toast-success { background: #14532d; color: #86efac; border: 1px solid #15803d; }
+.toast-info    { background: #1e3a5f; color: #93c5fd; border: 1px solid #1d4ed8; }
+@keyframes toast-in { from { opacity: 0; transform: translateY(8px); } to { opacity: 1; transform: none; } }
+/* ── Main layout (3 columns) ────────────────────────────────────────── */
+#app {
+    display: grid;
+    grid-template-columns: var(--panel-left, 260px) 5px 1fr 5px var(--panel-right, 360px);
+    grid-template-rows: 1fr;
+    gap: 0;
+    flex: 1;
+    min-height: 0;
+    background: var(--border);
+}
+.panel-resize-handle {
+    background: var(--border);
+    cursor: col-resize;
+    transition: background 0.15s;
+    z-index: 10;
+    position: relative;
+}
+.panel-resize-handle:hover,
+.panel-resize-handle.dragging {
+    background: var(--primary);
+}
+.panel-resize-handle::after {
+    content: '';
+    position: absolute;
+    inset: 0 -4px; /* wider hit area */
+}
+.panel {
+    background: var(--bg-panel);
+    overflow-y: auto;
+    overflow-x: hidden;
+    min-height: 0;
+}
+/* Left panel flex column */
+#engine-panel {
+    display: flex;
+    flex-direction: column;
+    gap: 0;
+    padding: 0;
+}
+.panel-section {
+    padding: 12px 12px 8px;
+    display: flex;
+    flex-direction: column;
+    gap: 7px;
+}
+.panel-footer {
+    padding: 10px 12px;
+    border-top: 1px solid var(--border);
+    margin-top: auto;
+}
+.footer-btn-row {
+    display: flex;
+    gap: 6px;
+}
+.footer-btn-row .btn {
+    flex: 1;
+}
+.panel h2 {
+    font-size: 0.7rem;
+    text-transform: uppercase;
+    letter-spacing: 1.2px;
+    color: var(--text-muted);
+    margin-bottom: 2px;
+}
+#engine-panel hr {
+    border: none;
+    border-top: 1px solid var(--border);
+    flex-shrink: 0;
+}
+/* ── Form elements ──────────────────────────────────────────────────── */
+label {
+    font-size: 0.78rem;
+    color: var(--text-dim);
+}
+select,
+input[type="text"],
+input[type="number"],
+input[type="password"] {
+    width: 100%;
+    padding: 6px 9px;
+    background: var(--bg-input);
+    color: var(--text);
+    border: 1px solid var(--border);
+    border-radius: var(--radius);
+    font-size: 0.83rem;
+    font-family: var(--font);
+    transition: border-color 0.15s;
+}
+select:focus, input:focus, textarea:focus {
+    outline: none;
+    border-color: var(--primary);
+    box-shadow: 0 0 0 2px rgba(59,130,246,0.12);
+}
+textarea {
+    width: 100%;
+    padding: 6px 9px;
+    background: var(--bg-input);
+    color: var(--text);
+    border: 1px solid var(--border);
+    border-radius: var(--radius);
+    font-size: 0.83rem;
+    font-family: var(--font);
+    transition: border-color 0.15s;
+    box-sizing: border-box;
+}
+input::placeholder,
+textarea::placeholder {
+    color: var(--text-dim);
+    font-style: italic;
+    opacity: 0.65;
+}
+select option { background: var(--bg-panel); color: var(--text); }
+/* Config form fields */
+.config-field {
+    display: flex;
+    flex-direction: column;
+    gap: 3px;
+}
+/* Select + refresh button row */
+.select-row {
+    display: flex;
+    gap: 4px;
+    align-items: center;
+}
+.select-row select { flex: 1; min-width: 0; width: auto; }
+.btn-refresh {
+    flex-shrink: 0;
+    width: 28px;
+    height: 28px;
+    border: 1px solid var(--border);
+    border-radius: var(--radius);
+    background: var(--bg-input);
+    color: var(--text-muted);
+    font-size: 1rem;
+    cursor: pointer;
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    transition: background 0.15s, border-color 0.15s, color 0.15s;
+}
+.btn-refresh:hover:not(:disabled) { background: var(--bg-hover); border-color: var(--primary); color: var(--primary); }
+.btn-refresh:disabled { opacity: 0.5; cursor: not-allowed; }
+.dynamic-hint {
+    font-size: 0.7rem;
+    color: var(--text-muted);
+    min-height: 1em;
+}
+.config-field label { font-size: 0.75rem; color: var(--text-muted); }
+.config-field-checkbox {
+    flex-direction: row;
+    align-items: center;
+    gap: 7px;
+}
+.config-field-checkbox input[type="checkbox"] {
+    width: auto;
+    accent-color: var(--primary);
+    cursor: pointer;
+}
+.config-field-checkbox label { font-size: 0.82rem; color: var(--text); cursor: pointer; }
+#blla-options {
+    display: flex;
+    align-items: center;
+    gap: 8px;
+}
+#blla-options label { flex-shrink: 0; }
+#blla-options input { width: 64px; }
+/* ── Buttons ────────────────────────────────────────────────────────── */
+.btn {
+    padding: 8px 14px;
+    border: none;
+    border-radius: var(--radius);
+    font-size: 0.83rem;
+    font-family: var(--font);
+    cursor: pointer;
+    transition: background 0.15s, transform 0.1s;
+    white-space: nowrap;
+}
+.btn:active:not(:disabled) { transform: translateY(1px); }
+.btn:disabled { opacity: 0.38; cursor: not-allowed; }
+.btn-full { width: 100%; }
+.btn-primary { background: var(--primary); color: white; }
+.btn-primary:hover:not(:disabled) { background: var(--primary-hover); }
+.btn-accent { background: var(--accent); color: white; }
+.btn-accent:hover:not(:disabled) { background: var(--accent-hover); }
+.btn-small {
+    padding: 5px 10px;
+    font-size: 0.78rem;
+    background: var(--bg-input);
+    color: var(--text-dim);
+    border: 1px solid var(--border);
+}
+.btn-small:hover:not(:disabled) { background: var(--bg-hover); border-color: var(--border-light); color: var(--text); }
+.btn-outline {
+    background: transparent;
+    border: 1px solid var(--border);
+    cursor: pointer;
+    border-radius: var(--radius);
+    display: inline-flex;
+    align-items: center;
+    font-size: 0.78rem;
+    color: var(--text-dim);
+    padding: 4px 8px;
+    transition: background 0.15s, border-color 0.15s;
+}
+.btn-outline:hover { background: var(--bg-hover); border-color: var(--primary); color: var(--text); }
+.btn-row {
+    display: flex;
+    gap: 6px;
+    flex-wrap: wrap;
+    margin-top: 6px;
+}
+/* Save key row */
+.key-save-row {
+    display: flex;
+    align-items: center;
+    gap: 6px;
+    margin-top: 4px;
+    font-size: 0.76rem;
+    color: var(--text-muted);
+}
+.key-save-row input[type="checkbox"] { width: auto; margin: 0; accent-color: var(--primary); }
+.key-save-row label { cursor: pointer; }
+input[disabled] { opacity: 0.45; cursor: not-allowed; }
+/* ── Upload area ────────────────────────────────────────────────────── */
+.upload-area {
+    border: 2px dashed var(--border);
+    border-radius: var(--radius-lg);
+    padding: 18px 12px;
+    text-align: center;
+    cursor: pointer;
+    transition: border-color 0.2s, background 0.2s;
+    font-size: 0.83rem;
+    color: var(--text-muted);
+    display: flex;
+    flex-direction: column;
+    align-items: center;
+    gap: 8px;
+}
+.upload-area:hover, .upload-area.dragover {
+    border-color: var(--primary);
+    background: rgba(59,130,246,0.06);
+    color: var(--text-dim);
+}
+.upload-icon {
+    width: 28px;
+    height: 28px;
+    opacity: 0.5;
+}
+.upload-area:hover .upload-icon,
+.upload-area.dragover .upload-icon { opacity: 0.8; }
+/* XML row */
+.xml-row {
+    display: flex;
+    align-items: center;
+    gap: 8px;
+}
+.xml-row .muted {
+    flex: 1;
+    font-size: 0.78rem;
+    overflow: hidden;
+    text-overflow: ellipsis;
+    white-space: nowrap;
+}
+.xml-ok { color: var(--success) !important; }
+/* ── Image viewer (center) ──────────────────────────────────────────── */
+#viewer-panel {
+    padding: 0;
+    position: relative;
+    overflow: hidden;
+    display: flex;
+    flex-direction: column;
+}
+/* Zoom toolbar */
+.zoom-toolbar {
+    display: flex;
+    align-items: center;
+    gap: 4px;
+    padding: 5px 8px;
+    background: var(--bg-panel);
+    border-bottom: 1px solid var(--border);
+    flex-shrink: 0;
+    z-index: 2;
+}
+.zoom-btn {
+    width: 26px;
+    height: 26px;
+    border: 1px solid var(--border);
+    border-radius: var(--radius);
+    background: var(--bg-input);
+    color: var(--text-dim);
+    font-size: 1rem;
+    line-height: 1;
+    cursor: pointer;
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    transition: background 0.15s, border-color 0.15s;
+}
+.zoom-btn:hover { background: var(--bg-hover); border-color: var(--border-light); color: var(--text); }
+.zoom-fit { font-size: 0.8rem; width: auto; padding: 0 7px; }
+.zoom-toolbar-sep { width: 1px; background: var(--border); margin: 0 4px; align-self: stretch; }
+.nav-btn { padding: 2px 8px; font-size: .8rem; line-height: 1.6; }
+.nav-btn:disabled { opacity: 0.3; cursor: default; }
+.batch-nav-label-toolbar { font-size: .78rem; color: var(--text-muted); min-width: 36px; text-align: center; }
+.zoom-level {
+    font-size: 0.75rem;
+    color: var(--text-muted);
+    min-width: 3.5em;
+    text-align: center;
+    font-family: var(--font-mono);
+}
+/* Scrollable image area */
+#viewer-scroll {
+    flex: 1;
+    overflow: auto;
+    display: flex;
+    align-items: flex-start;
+    justify-content: flex-start;
+    min-height: 0;
+    position: relative;
+}
+/* Placeholder — fills scroll area and centers content */
+.viewer-placeholder {
+    width: 100%;
+    height: 100%;
+    min-height: 200px;
+    display: flex;
+    flex-direction: column;
+    align-items: center;
+    justify-content: center;
+    gap: 14px;
+    color: var(--text-muted);
+    font-size: 0.9rem;
+    user-select: none;
+}
+.viewer-placeholder.dragover {
+    color: var(--primary);
+    background: rgba(59, 130, 246, 0.08);
+}
+.viewer-placeholder svg {
+    width: 56px;
+    height: 56px;
+    opacity: 0.25;
+}
+.viewer-placeholder p { opacity: 0.6; }
+/* Image container — only shows when image is loaded */
+#image-container {
+    position: relative;
+    flex-shrink: 0;
+    line-height: 0;
+}
+#page-image {
+    display: block;
+    /* width controlled by zoom JS; height auto */
+    transition: width 0.08s ease-out, height 0.08s ease-out;
+}
+#overlay-canvas {
+    position: absolute;
+    top: 0;
+    left: 0;
+    pointer-events: auto;
+    cursor: crosshair;
+    transition: width 0.08s ease-out, height 0.08s ease-out;
+}
+/* ── Results panel (right) ──────────────────────────────────────────── */
+#results-panel {
+    display: flex;
+    flex-direction: column;
+}
+.results-header {
+    padding: 12px 12px 8px;
+    border-bottom: 1px solid var(--border);
+    flex-shrink: 0;
+}
+.results-header-row {
+    display: flex;
+    align-items: center;
+    justify-content: space-between;
+    margin-bottom: 0;
+}
+.results-header-row h2 { margin-bottom: 0; }
+.results-header-controls {
+    display: flex;
+    align-items: center;
+    gap: 5px;
+}
+.btn-icon.active { border-color: var(--primary); color: var(--primary); background: rgba(59,130,246,0.1); }
+/* Font selector in results header */
+.font-select {
+    width: auto !important;
+    padding: 3px 5px !important;
+    font-size: 0.72rem !important;
+    height: 26px;
+    border-radius: var(--radius);
+    color: var(--text-muted);
+    max-width: 140px;
+}
+#transcription-lines {
+    flex: 1;
+    overflow-y: auto;
+    font-family: var(--font-results, var(--font-mono));
+    font-size: 0.83rem;
+    line-height: 1.5;
+    padding: 4px 0;
+}
+.line-result {
+    padding: 5px 10px;
+    border-bottom: 1px solid rgba(45,63,89,0.5);
+    cursor: pointer;
+    transition: background 0.1s;
+}
+.line-result:last-child { border-bottom: none; }
+.line-result:hover { background: var(--bg-hover); }
+.line-num {
+    color: var(--text-muted);
+    font-size: 0.68rem;
+    margin-right: 7px;
+    user-select: none;
+    display: inline-block;
+    min-width: 2.2em;
+    text-align: right;
+}
+.confidence {
+    float: right;
+    font-size: 0.68rem;
+    padding: 1px 6px;
+    border-radius: 8px;
+    margin-left: 6px;
+    margin-top: 2px;
+}
+.conf-high { background: rgba(34,197,94,0.15);  color: var(--success); }
+.conf-mid  { background: rgba(245,158,11,0.15); color: var(--warning); }
+.conf-low  { background: rgba(239,68,68,0.15);  color: var(--danger);  }
+.line-result.line-active {
+    background: rgba(233,69,96,0.12);
+    border-left: 3px solid var(--accent);
+}
+.line-result.highlight {
+    background: rgba(59,130,246,0.12);
+    border-left: 3px solid var(--primary);
+}
+/* Dimmed lines (below confidence threshold) */
+.line-result.line-dimmed {
+    opacity: 0.28;
+}
+/* Inline editing */
+.line-text {
+    display: inline;
+    outline: none;
+    border-radius: 2px;
+}
+.line-text[contenteditable="true"] {
+    background: rgba(58, 134, 255, 0.08);
+    outline: 1px dashed var(--primary);
+    padding: 0 3px;
+    cursor: text;
+}
+/* Gemini thinking/reasoning block */
+.thinking-block {
+    display: block;
+    width: 100%;
+    margin-top: 4px;
+}
+.thinking-toggle {
+    font-size: 0.7rem;
+    color: var(--text-dim);
+    cursor: pointer;
+    user-select: none;
+    letter-spacing: 0.04em;
+    text-transform: uppercase;
+}
+.thinking-toggle:hover { color: var(--primary); }
+.thinking-text {
+    margin: 4px 0 0 0;
+    padding: 6px 10px;
+    font-size: 0.72rem;
+    font-family: monospace;
+    white-space: pre-wrap;
+    word-break: break-word;
+    background: var(--bg-input);
+    border-left: 2px solid var(--border);
+    color: var(--text-dim);
+    border-radius: 0 3px 3px 0;
+    max-height: 300px;
+    overflow-y: auto;
+}
+.line-result.line-edited .line-num::after {
+    content: '✎';
+    color: var(--primary);
+    font-size: 0.6rem;
+    margin-left: 2px;
+}
+/* Results search row */
+.results-search-row {
+    display: flex;
+    align-items: center;
+    gap: 6px;
+    padding: 4px 0;
+}
+.results-search-row input[type="search"] {
+    flex: 1;
+    min-width: 0;
+    background: var(--bg-input);
+    border: 1px solid var(--border);
+    border-radius: var(--radius);
+    color: var(--text);
+    font-size: 0.8rem;
+    padding: 3px 8px;
+}
+.results-search-row input[type="search"]:focus {
+    outline: none;
+    border-color: var(--primary);
+}
+#results-search-count {
+    font-size: 0.72rem;
+    white-space: nowrap;
+}
+.line-result.line-hidden { display: none; }
+.line-result mark { background: color-mix(in srgb, var(--accent) 35%, transparent); border-radius: 2px; }
+/* Thinking / reasoning block (Gemini, Claude) — declared after the earlier .thinking-* rules, so these win wherever both set the same property */
+.thinking-block {
+    margin-top: 4px;
+    border-left: 2px solid var(--accent);
+    border-radius: 0 var(--radius-sm) var(--radius-sm) 0;
+    background: color-mix(in srgb, var(--accent) 6%, var(--bg-secondary));
+    font-size: 0.75rem;
+}
+.thinking-toggle {
+    cursor: pointer;
+    padding: 2px 6px;
+    color: var(--accent);
+    user-select: none;
+    font-style: italic;
+    list-style: none;
+}
+.thinking-toggle::marker,
+.thinking-toggle::-webkit-details-marker { display: none; }
+.thinking-toggle::before {
+    content: '▶ ';
+    font-style: normal;
+    font-size: 0.65rem;
+    transition: transform 0.15s;
+}
+details[open] > .thinking-toggle::before { content: '▼ '; }
+.thinking-text {
+    margin: 0;
+    padding: 4px 8px 6px;
+    white-space: pre-wrap;
+    word-break: break-word;
+    color: var(--text-secondary);
+    font-family: inherit;
+    line-height: 1.45;
+    max-height: 200px;
+    overflow-y: auto;
+}
+.conf-filter-row {
+    display: flex;
+    align-items: center;
+    gap: 8px;
+    padding: 4px 0 6px;
+    font-size: 0.75rem;
+    color: var(--text-muted);
+}
+.conf-filter-row input[type="range"] {
+    flex: 1;
+    width: auto;
+    height: 3px;
+    cursor: pointer;
+    accent-color: var(--primary);
+    padding: 0;
+    background: none;
+    border: none;
+}
+/* Batch queue */
+#batch-queue-section {
+    margin-top: 6px;
+    border-top: 1px solid var(--border);
+    padding-top: 8px;
+}
+.batch-queue-header {
+    display: flex;
+    align-items: center;
+    justify-content: space-between;
+    margin-bottom: 4px;
+}
+.section-label {
+    font-size: 0.7rem;
+    font-weight: 600;
+    text-transform: uppercase;
+    letter-spacing: .06em;
+    color: var(--text-muted);
+}
+.batch-overall-progress {
+    font-size: 0.72rem;
+    font-family: var(--font-mono);
+    color: var(--accent);
+    background: color-mix(in srgb, var(--accent) 12%, transparent);
+    padding: 1px 7px;
+    border-radius: 10px;
+}
+.batch-item {
+    display: flex;
+    align-items: center;
+    gap: 6px;
+    padding: 4px 2px;
+    font-size: 0.78rem;
+    border-bottom: 1px solid rgba(45,63,89,0.4);
+}
+.batch-item:last-child { border-bottom: none; }
+.batch-drag-handle {
+    cursor: grab;
+    color: var(--text-muted);
+    font-size: 0.9rem;
+    line-height: 1;
+    padding: 0 2px;
+    flex-shrink: 0;
+    user-select: none;
+}
+.batch-drag-handle:active { cursor: grabbing; }
+.batch-item.batch-dragging { opacity: 0.4; }
+.batch-item.batch-drag-over {
+    border-top: 2px solid var(--accent);
+    margin-top: -1px;
+}
+.batch-item-name {
+    flex: 1;
+    overflow: hidden;
+    text-overflow: ellipsis;
+    white-space: nowrap;
+    color: var(--text);
+}
+.batch-status {
+    font-size: 0.68rem;
+    flex-shrink: 0;
+    min-width: 56px;
+    text-align: right;
+    color: var(--text-muted);
+}
+.batch-status.done   { color: var(--success); }
+.batch-status.error  { color: var(--danger); }
+.batch-status.active { color: var(--primary); }
+.batch-nav-row {
+    display: flex;
+    align-items: center;
+    gap: 4px;
+    margin-top: 6px;
+}
+.batch-options-row {
+    display: flex;
+    gap: 14px;
+    align-items: center;
+    margin-top: 6px;
+    flex-wrap: wrap;
+}
+.checkbox-label {
+    display: flex;
+    align-items: center;
+    gap: 5px;
+    font-size: .8rem;
+    color: var(--text-muted);
+    cursor: pointer;
+    user-select: none;
+}
+.checkbox-label input[type="checkbox"] { cursor: pointer; }
+/* Column layout (multi-region side-by-side) */
+#transcription-lines.col-layout {
+    display: flex;
+    flex-direction: row;
+    align-items: flex-start;  /* columns grow to their content height */
+    overflow-x: auto;
+    /* overflow-y stays 'auto' from the base rule — unified scroll */
+    padding: 0;
+    gap: 0;
+}
+.region-column {
+    flex: 0 0 auto;       /* don't shrink; grow to content */
+    min-width: 220px;
+    width: max-content;   /* each column as wide as its widest line */
+    max-width: min(520px, 75vw);
+    display: flex;
+    flex-direction: column;
+    border-right: 1px solid var(--border);
+    /* No overflow-y — parent handles the single scrollbar */
+}
+.region-column:last-child { border-right: none; }
+/* Prevent line text from wrapping inside column cells */
+.region-column .line-result { white-space: nowrap; }
+.region-col-header {
+    display: flex;
+    align-items: center;
+    justify-content: space-between;
+    gap: 4px;
+    padding: 4px 8px;
+    font-size: 0.68rem;
+    font-weight: 600;
+    text-transform: uppercase;
+    letter-spacing: 0.8px;
+    color: var(--text-muted);
+    background: var(--bg-section);
+    border-bottom: 1px solid var(--border);
+    position: sticky;
+    top: 0;
+    z-index: 1;
+}
+.region-col-close {
+    flex-shrink: 0;
+    background: none;
+    border: none;
+    color: var(--text-muted);
+    cursor: pointer;
+    font-size: 1rem;
+    line-height: 1;
+    padding: 0 2px;
+    border-radius: 3px;
+    transition: color 0.1s, background 0.1s;
+}
+.region-col-close:hover { color: var(--danger); background: rgba(239,68,68,0.1); }
+/* Detected region list (below segmentation controls) */
+#seg-regions-list {
+    margin: 0 12px 8px;
+    border: 1px solid var(--border);
+    border-radius: var(--radius);
+    background: var(--bg-section);
+    overflow: hidden;
+}
+.seg-regions-header {
+    padding: 5px 10px;
+    font-size: 0.68rem;
+    font-weight: 600;
+    text-transform: uppercase;
+    letter-spacing: 0.8px;
+    color: var(--text-muted);
+    border-bottom: 1px solid var(--border);
+}
+.seg-region-row {
+    display: flex;
+    align-items: center;
+    gap: 7px;
+    padding: 5px 10px;
+    border-bottom: 1px solid rgba(45,63,89,0.4);
+    font-size: 0.78rem;
+}
+.seg-region-row:last-child { border-bottom: none; }
+.seg-region-dot {
+    width: 10px;
+    height: 10px;
+    border-radius: 50%;
+    flex-shrink: 0;
+}
+.seg-region-label { font-weight: 600; color: var(--text); min-width: 2em; }
+.seg-region-count { flex: 1; color: var(--text-muted); }
+.seg-region-del {
+    width: 22px !important;
+    height: 22px !important;
+    font-size: 0.9rem !important;
+    flex-shrink: 0;
+}
+/* Region separator */
+.region-separator {
+    padding: 4px 10px;
+    font-size: 0.68rem;
+    color: var(--text-muted);
+    background: var(--bg-section);
+    border-bottom: 1px solid var(--border);
+    letter-spacing: 0.5px;
+    text-transform: uppercase;
+}
+#results-footer {
+    padding: 8px 12px;
+    border-top: 1px solid var(--border);
+    flex-shrink: 0;
+}
+/* ── Progress bar ───────────────────────────────────────────────────── */
+.progress-row {
+    display: flex;
+    align-items: center;
+    justify-content: space-between;
+    gap: 8px;
+    margin-top: 4px;
+}
+#progress-bar {
+    height: 4px;
+    background: var(--bg-input);
+    border-radius: 2px;
+    overflow: hidden;
+    margin-top: 8px;
+}
+#progress-fill {
+    height: 100%;
+    width: 0%;
+    background: linear-gradient(90deg, var(--primary), var(--accent));
+    transition: width 0.25s ease;
+    border-radius: 2px;
+}
+/* ── Status badges ──────────────────────────────────────────────────── */
+.status-badge {
+    font-size: 0.78rem;
+    padding: 4px 10px;
+    border-radius: var(--radius);
+    text-align: center;
+}
+.status-loaded  { background: rgba(34,197,94,0.12);  color: var(--success); border: 1px solid rgba(34,197,94,0.25); }
+.status-loading { background: rgba(59,130,246,0.12); color: var(--primary); border: 1px solid rgba(59,130,246,0.25); }
+/* ── Spinner on buttons ─────────────────────────────────────────────── */
+.btn.loading { pointer-events: none; opacity: 0.7; }
+.btn.loading::after {
+    content: '';
+    display: inline-block;
+    width: 11px;
+    height: 11px;
+    margin-left: 7px;
+    border: 2px solid transparent;
+    border-top-color: currentColor;
+    border-radius: 50%;
+    animation: spin 0.65s linear infinite;
+    vertical-align: middle;
+}
+@keyframes spin { to { transform: rotate(360deg); } }
+/* ── Utilities ──────────────────────────────────────────────────────── */
+.muted   { color: var(--text-muted); font-size: 0.8rem; }
+.hidden  { display: none !important; }
+/* ── Help modal ─────────────────────────────────────────────────────── */
+#help-modal {
+    background: var(--bg-panel);
+    color: var(--text);
+    border: 1px solid var(--border);
+    border-radius: var(--radius-lg);
+    box-shadow: var(--shadow);
+    padding: 0;
+    width: min(680px, 96vw);
+    max-height: 82vh;
+    overflow: hidden;
+}
+#help-modal[open] {
+    display: flex;
+    flex-direction: column;
+}
+#help-modal::backdrop {
+    background: rgba(0,0,0,0.65);
+    backdrop-filter: blur(2px);
+}
+.modal-header {
+    display: flex;
+    align-items: center;
+    justify-content: space-between;
+    padding: 14px 18px;
+    border-bottom: 1px solid var(--border);
+    flex-shrink: 0;
+}
+.modal-header h2 { font-size: 1rem; font-weight: 600; }
+.modal-body {
+    overflow-y: auto;
+    padding: 18px;
+    display: flex;
+    flex-direction: column;
+    gap: 16px;
+    font-size: 0.88rem;
+    line-height: 1.6;
+}
+.modal-body h3 {
+    font-size: 0.8rem;
+    text-transform: uppercase;
+    letter-spacing: 0.8px;
+    color: var(--text-muted);
+    border-bottom: 1px solid var(--border);
+    padding-bottom: 4px;
+    margin-top: 4px;
+}
+.modal-body ol, .modal-body ul { padding-left: 1.4em; display: flex; flex-direction: column; gap: 5px; }
+.modal-body li { color: var(--text-dim); }
+.modal-body strong { color: var(--text); font-weight: 600; }
+.modal-body table {
+    width: 100%;
+    border-collapse: collapse;
+    font-size: 0.83rem;
+}
+.modal-body th, .modal-body td {
+    padding: 5px 10px;
+    text-align: left;
+    border-bottom: 1px solid var(--border);
+}
+.modal-body th { color: var(--text-muted); font-weight: 600; font-size: 0.75rem; text-transform: uppercase; letter-spacing: 0.5px; }
+.modal-body td:first-child { color: var(--text); font-weight: 500; white-space: nowrap; }
+.modal-body tr:last-child td { border-bottom: none; }
+.modal-body tr:hover td { background: var(--bg-hover); }
+.modal-body kbd {
+    display: inline-block;
+    padding: 1px 6px;
+    background: var(--bg-input);
+    border: 1px solid var(--border);
+    border-radius: 4px;
+    font-family: var(--font-mono);
+    font-size: 0.78rem;
+    color: var(--text-dim);
+}
+.demo-badge {
+    font-size: 0.72rem;
+    padding: 1px 6px;
+    border-radius: 8px;
+    display: inline-block;
+    margin: 0 2px;
+}
+/* ── Mobile tab bar ─────────────────────────────────────────────────── */
+#mobile-tabs {
+    display: none;   /* hidden on desktop */
+    height: var(--tabs-h);
+    background: var(--bg-panel);
+    border-top: 1px solid var(--border);
+    flex-shrink: 0;
+}
+.tab-btn {
+    flex: 1;
+    display: flex;
+    flex-direction: column;
+    align-items: center;
+    justify-content: center;
+    gap: 3px;
+    background: none;
+    border: none;
+    color: var(--text-muted);
+    font-size: 0.68rem;
+    cursor: pointer;
+    padding: 6px 4px;
+    transition: color 0.15s;
+}
+.tab-btn svg { width: 20px; height: 20px; }
+.tab-btn.active { color: var(--primary); }
+.tab-btn:hover { color: var(--text-dim); }
+.tab-btn.active:hover { color: var(--primary-hover); }
+/* ── Responsive — tablet (≤ 960px) ─────────────────────────────────── */
+@media (max-width: 960px) {
+    #app { grid-template-columns: var(--panel-left, 240px) 5px 1fr 5px var(--panel-right, 300px); }
+    .gpu-badge { max-width: 160px; font-size: 0.7rem; }
+}
+@media (max-width: 780px) and (min-width: 641px) {
+    #app { grid-template-columns: var(--panel-left, 200px) 5px 1fr 5px var(--panel-right, 240px); }
+    .gpu-badge { max-width: 120px; font-size: 0.68rem; }
+}
+/* ── Responsive — mobile (≤ 640px) ─────────────────────────────────── */
+@media (max-width: 640px) {
+    :root { --header-h: 48px; }
+    #header h1 { font-size: 0.9rem; }
+    .gpu-badge { display: none; }      /* too little space */
+    /* Single-column; tab bar controls which panel is visible */
+    #app {
+        grid-template-columns: 1fr;
+        grid-template-rows: 1fr;
+    }
+    .panel-resize-handle { display: none; }
+    #mobile-tabs { display: flex; }
+    /* All panels are hidden by default; JS adds panel-active */
+    [data-panel] {
+        display: none;
+    }
+    [data-panel].panel-active {
+        display: flex;
+        flex-direction: column;
+    }
+    /* Engine panel needs special treatment (flex column) */
+    [data-panel="settings"].panel-active {
+        overflow-y: auto;
+    }
+    body { overflow: hidden; }
+    /* Account for tab bar height. The 100vh line is the fallback; where dynamic
+       viewport units are supported, 100dvh follows the visible viewport so the
+       tab bar is not pushed underneath collapsing mobile browser chrome. */
+    #app {
+        height: calc(100vh - var(--header-h) - var(--tabs-h));
+        height: calc(100dvh - var(--header-h) - var(--tabs-h));
+    }
+    /* Results panel: stack vertically */
+    #results-panel.panel-active { gap: 0; }
+    /* Larger touch targets */
+    .btn { padding: 10px 16px; font-size: 0.9rem; }
+    .btn-small { padding: 7px 12px; font-size: 0.82rem; }
+    select, input[type="text"], input[type="number"], input[type="password"] {
+        padding: 8px 10px;
+        font-size: 0.9rem;
+    }
+    /* Upload area takes less vertical space */
+    .upload-area { padding: 14px 10px; }
+    /* Full-width help modal */
+    #help-modal { width: 100vw; max-height: 90vh; border-radius: var(--radius-lg) var(--radius-lg) 0 0; }
+}

web/static/app.js ADDED Viewed

	@@ -0,0 +1,298 @@

+/**
+ * Polyscriptor Web UI — Main application entry point
+ *
+ * Central state + event bus, wires up components.
+ * No framework, no build step — native ES modules.
+ */
+import { initEnginePanel } from './components/engine-panel.js';
+import { initImageViewer } from './components/image-viewer.js';
+import { initTranscriptionPanel } from './components/transcription-panel.js';
+import { initBatchPanel } from './components/batch-panel.js';
+// ── Global state ───────────────────────────────────────────────────────
+/**
+ * Shared application state, exported so component modules can read and update it.
+ * This is a plain object with no change tracking of its own.
+ */
+export const state = {
+    engines: [],
+    currentEngine: null,
+    engineLoaded: false,
+    imageId: null,       // truthy once an image is loaded (batch-panel checks this to detect a "second image")
+    imageInfo: null,
+    lines: [],           // [{index, text, confidence, bbox, region}]
+    regions: [],         // [{id, bbox, num_lines}] — from latest segmentation
+    isProcessing: false,
+};
+// ── Event bus ──────────────────────────────────────────────────────────
+export const events = new EventTarget();
+export function emit(name, detail) {
+    events.dispatchEvent(new CustomEvent(name, { detail }));
+}
+export function on(name, fn) {
+    events.addEventListener(name, e => fn(e.detail));
+}
+// ── API helper ─────────────────────────────────────────────────────────
+export async function api(path, options = {}) {
+    const resp = await fetch(path, {
+        headers: { 'Content-Type': 'application/json', ...options.headers },
+        ...options,
+    });
+    if (!resp.ok) {
+        const err = await resp.json().catch(() => ({ detail: resp.statusText }));
+        throw new Error(err.detail || err.message || 'API error');
+    }
+    return resp;
+}
+// ── Toast notifications ────────────────────────────────────────────────
+export function toast(message, type = 'info', durationMs = 4000) {
+    const container = document.getElementById('toast-container');
+    const el = document.createElement('div');
+    el.className = `toast toast-${type}`;
+    el.textContent = message;
+    container.appendChild(el);
+    setTimeout(() => el.remove(), durationMs);
+}
+// ── GPU status widget ──────────────────────────────────────────────────
+function shortName(name) {
+    // Abbreviate long GPU names for the header
+    return name
+        .replace('NVIDIA ', '')
+        .replace('GeForce ', '')
+        .replace('Tesla ', '')
+        .replace('Quadro ', '');
+}
+async function updateGpuStatus() {
+    const widget = document.getElementById('gpu-status');
+    try {
+        const resp = await api('/api/gpu');
+        const data = await resp.json();
+        if (!data.available || data.gpus.length === 0) {
+            widget.innerHTML = '<span class="gpu-card-name"><span>GPU: N/A</span></span>';
+            return;
+        }
+        widget.innerHTML = data.gpus.map(g => {
+            const usedPct = Math.round((g.memory_used_mb / g.memory_total_mb) * 100);
+            const fillClass = usedPct >= 85 ? 'hot' : usedPct >= 60 ? 'warm' : '';
+            const usedGb   = (g.memory_used_mb / 1000).toFixed(1);
+            const totalGb  = (g.memory_total_mb / 1000).toFixed(0);
+            const utilHtml = g.utilization_gpu_pct != null
+                ? `<span class="gpu-util-pct">${g.utilization_gpu_pct}%</span>` : '';
+            return `<div class="gpu-card">
+                <div class="gpu-card-name">
+                    <span title="${g.name}">${shortName(g.name)}</span>${utilHtml}
+                </div>
+                <div class="gpu-mem-bar">
+                    <div class="gpu-mem-fill ${fillClass}" style="width:${usedPct}%"></div>
+                </div>
+                <div class="gpu-mem-label">${usedGb}/${totalGb} GB VRAM</div>
+            </div>`;
+        }).join('');
+    } catch {
+        widget.innerHTML = '<span style="font-size:.75rem;color:var(--text-muted)">GPU: error</span>';
+    }
+}
+// ── Zoom controls ──────────────────────────────────────────────────────
+let zoomLevel = 1.0;
+const ZOOM_STEP = 0.25;
+let   ZOOM_MIN  = 0.25; // updated per image in fitZoom() so large images are always reachable
+const ZOOM_MAX  = 4.0;
+function applyZoom(level) {
+    const img    = document.getElementById('page-image');
+    const canvas = document.getElementById('overlay-canvas');
+    if (!img || !img.naturalWidth) return;
+    zoomLevel = Math.max(ZOOM_MIN, Math.min(ZOOM_MAX, level));
+    const w = Math.round(img.naturalWidth  * zoomLevel);
+    const h = Math.round(img.naturalHeight * zoomLevel);
+    img.style.width  = w + 'px';
+    img.style.height = h + 'px';
+    canvas.style.width  = w + 'px';
+    canvas.style.height = h + 'px';
+    document.getElementById('zoom-level').textContent =
+        Math.round(zoomLevel * 100) + '%';
+}
+export function fitZoom() {
+    const img    = document.getElementById('page-image');
+    const scroll = document.getElementById('viewer-scroll');
+    if (!img || !img.naturalWidth || !scroll) return;
+    const scaleW = scroll.clientWidth  / img.naturalWidth;
+    const scaleH = scroll.clientHeight / img.naturalHeight;
+    const fit = Math.min(scaleW, scaleH, 1.0); // never zoom in beyond 100% on fit
+    // Ensure the fit level is always reachable: lower ZOOM_MIN for large images (min 5%)
+    ZOOM_MIN = Math.min(0.25, Math.max(0.05, fit));
+    applyZoom(fit);
+}
+function initZoomControls() {
+    document.getElementById('btn-zoom-in') .addEventListener('click', () => applyZoom(zoomLevel + ZOOM_STEP));
+    document.getElementById('btn-zoom-out').addEventListener('click', () => applyZoom(zoomLevel - ZOOM_STEP));
+    document.getElementById('btn-zoom-fit').addEventListener('click', fitZoom);
+    // Mouse-wheel zoom in viewer — multiplicative for smooth feel
+    document.getElementById('viewer-scroll').addEventListener('wheel', e => {
+        if (!e.ctrlKey && !e.metaKey) return;
+        e.preventDefault();
+        const factor = e.deltaY < 0 ? 1.10 : 1 / 1.10;
+        applyZoom(zoomLevel * factor);
+    }, { passive: false });
+    on('image-uploaded', () => {
+        document.getElementById('zoom-toolbar').classList.remove('hidden');
+        // fit after short delay to let image render
+        setTimeout(fitZoom, 80);
+    });
+    // Also show toolbar when a batch item is displayed in the viewer
+    on('batch-item-start', () => {
+        document.getElementById('zoom-toolbar').classList.remove('hidden');
+    });
+}
+// ── Sticky engine config (localStorage) ───────────────────────────────
+const LS_ENGINE = 'polyscriptor_last_engine';
+const LS_CONFIG = name => `polyscriptor_config_${name}`;
+export function saveEngineConfig(engineName, configObj) {
+    try {
+        localStorage.setItem(LS_ENGINE, engineName);
+        localStorage.setItem(LS_CONFIG(engineName), JSON.stringify(configObj));
+    } catch { /* storage full or private mode */ }
+}
+export function loadSavedEngineName() {
+    try { return localStorage.getItem(LS_ENGINE); } catch { return null; }
+}
+export function loadSavedEngineConfig(engineName) {
+    try {
+        const raw = localStorage.getItem(LS_CONFIG(engineName));
+        return raw ? JSON.parse(raw) : null;
+    } catch { return null; }
+}
+// ── Mobile tab helper ───────────────────────────────────────────────────
+function mobileActivateTab(target) {
+    const tabBtns = document.querySelectorAll('.tab-btn');
+    const panels  = document.querySelectorAll('[data-panel]');
+    if (!tabBtns.length) return;
+    tabBtns.forEach(b => b.classList.toggle('active', b.dataset.target === target));
+    panels.forEach(p => p.classList.toggle('panel-active', p.dataset.panel === target));
+}
+// ── Resizable panels ───────────────────────────────────────────────────
+const LS_PANEL_LEFT  = 'polyscriptor_panel_left';
+const LS_PANEL_RIGHT = 'polyscriptor_panel_right';
+function initResizablePanels() {
+    const app = document.getElementById('app');
+    const handleLeft  = document.getElementById('resize-left');
+    const handleRight = document.getElementById('resize-right');
+    if (!handleLeft || !handleRight) return;
+    // Restore saved widths
+    const savedLeft  = localStorage.getItem(LS_PANEL_LEFT);
+    const savedRight = localStorage.getItem(LS_PANEL_RIGHT);
+    if (savedLeft)  document.documentElement.style.setProperty('--panel-left',  savedLeft);
+    if (savedRight) document.documentElement.style.setProperty('--panel-right', savedRight);
+    function startDrag(handle, isLeft) {
+        handle.classList.add('dragging');
+        document.body.style.cursor = 'col-resize';
+        document.body.style.userSelect = 'none';
+        const onMove = (e) => {
+            const appRect = app.getBoundingClientRect();
+            const x = (e.touches ? e.touches[0].clientX : e.clientX) - appRect.left;
+            const totalW = appRect.width;
+            if (isLeft) {
+                const w = Math.max(160, Math.min(x, totalW * 0.4));
+                const val = Math.round(w) + 'px';
+                document.documentElement.style.setProperty('--panel-left', val);
+                localStorage.setItem(LS_PANEL_LEFT, val);
+            } else {
+                const w = Math.max(200, Math.min(totalW - x, totalW * 0.5));
+                const val = Math.round(w) + 'px';
+                document.documentElement.style.setProperty('--panel-right', val);
+                localStorage.setItem(LS_PANEL_RIGHT, val);
+            }
+        };
+        const onUp = () => {
+            handle.classList.remove('dragging');
+            document.body.style.cursor = '';
+            document.body.style.userSelect = '';
+            document.removeEventListener('mousemove', onMove);
+            document.removeEventListener('mouseup', onUp);
+            document.removeEventListener('touchmove', onMove);
+            document.removeEventListener('touchend', onUp);
+        };
+        document.addEventListener('mousemove', onMove);
+        document.addEventListener('mouseup', onUp);
+        document.addEventListener('touchmove', onMove, { passive: true });
+        document.addEventListener('touchend', onUp);
+    }
+    handleLeft.addEventListener('mousedown',  e => { e.preventDefault(); startDrag(handleLeft,  true); });
+    handleRight.addEventListener('mousedown', e => { e.preventDefault(); startDrag(handleRight, false); });
+    handleLeft.addEventListener('touchstart',  e => startDrag(handleLeft,  true),  { passive: true });
+    handleRight.addEventListener('touchstart', e => startDrag(handleRight, false), { passive: true });
+}
+// ── Keyboard shortcuts ─────────────────────────────────────────────────
+function initKeyboardShortcuts() {
+    document.addEventListener('keydown', e => {
+        // Ignore when typing in an input / textarea / contenteditable
+        const tag = e.target.tagName;
+        const editable = e.target.isContentEditable;
+        if (tag === 'INPUT' || tag === 'TEXTAREA' || tag === 'SELECT' || editable) return;
+        // Ctrl+Enter — transcribe
+        if ((e.ctrlKey || e.metaKey) && e.key === 'Enter') {
+            e.preventDefault();
+            document.getElementById('btn-transcribe')?.click();
+            return;
+        }
+        // ArrowLeft / ArrowRight — batch prev/next
+        if (e.key === 'ArrowLeft')  { e.preventDefault(); document.getElementById('btn-nav-prev')?.click(); }
+        if (e.key === 'ArrowRight') { e.preventDefault(); document.getElementById('btn-nav-next')?.click(); }
+    });
+}
+// ── Prevent browser from opening dropped files in a new tab ────────────
+function initGlobalDropBlocker() {
+    document.addEventListener('dragover', e => e.preventDefault());
+    document.addEventListener('drop',     e => e.preventDefault());
+}
+// ── Init ───────────────────────────────────────────────────────────────
+document.addEventListener('DOMContentLoaded', () => {
+    initEnginePanel();
+    initImageViewer();
+    initTranscriptionPanel();
+    initBatchPanel();
+    initZoomControls();
+    initResizablePanels();
+    initKeyboardShortcuts();
+    initGlobalDropBlocker();
+    updateGpuStatus();
+    setInterval(updateGpuStatus, 15000); // refresh every 15s
+    // On mobile: auto-switch tab after key events
+    on('image-uploaded',        () => mobileActivateTab('image'));
+    on('segment-preview',       () => mobileActivateTab('image'));
+    on('transcription-start',   () => mobileActivateTab('results'));
+});

web/static/components/batch-panel.js ADDED Viewed

	@@ -0,0 +1,735 @@

+/**
+ * Batch Panel — multi-image queue, sequential processing, combined export
+ *
+ * Activated when the user selects/drops multiple images.
+ * Each item is processed using the existing upload + transcribe flow.
+ * Results are stored per-item and can be exported as combined TXT or CSV.
+ */
+import { state, emit, on, api, toast } from '../app.js';
+/** Shorthand for document.getElementById; returns null when no element has that id. */
+const $ = id => document.getElementById(id);
+// Batch state (separate from state.lines which tracks the current single image)
+// Module-private: mutated in place by the functions in this file.
+const batch = {
+    items: [],      // { file, imageId, status, lines, filename }
+    running: false,
+    cancelled: false,     // set by the cancel button while a batch is running
+    currentIndex: -1,     // item currently shown in the viewer
+    processingIndex: -1,  // item currently being transcribed (may differ when user navigates away)
+    userNavigated: false, // user manually navigated away from auto-advance
+    abortController: null, // aborted by the cancel button when set
+};
+/**
+ * Wire up the batch panel.
+ *
+ * Registers capture-phase listeners on #file-input, #xml-input and #upload-area
+ * that take over from image-viewer when a selection or drop should become a
+ * batch, subscribes to 'pdf-pages-ready' to queue already-uploaded PDF pages,
+ * binds the batch buttons (process, clear, exports, prev/next, cancel) and
+ * restores the two batch option checkboxes from localStorage.
+ *
+ * Expects all referenced elements to exist; lookups are not null-checked.
+ */
+export function initBatchPanel() {
+    // Hook into the file input to detect multiple files, PDFs, or second image.
+    // Use capture:true so this fires before image-viewer's bubble listener, letting us
+    // stopImmediatePropagation() and own the upload when batch-panel takes over.
+    const fileInput = $('file-input');
+    fileInput.addEventListener('change', e => {
+        const files = Array.from(fileInput.files);
+        const hasPdf = files.some(f => f.name.toLowerCase().endsWith('.pdf'));
+        // Intercept: multiple files, PDF, or single image when one is already loaded
+        if (files.length > 1 || hasPdf || (files.length === 1 && !hasPdf && state.imageId)) {
+            e.stopImmediatePropagation(); // prevent image-viewer from also uploading the PDF
+            handleMultipleFiles(files);
+            // Reset so selecting the same file(s) again fires another change event
+            fileInput.value = '';
+        }
+        // Single non-PDF image with no existing image → handled by image-viewer.js
+    }, true); // capture:true — fires before image-viewer's non-capture listener
+    // Multiple XML selection from the Upload XML button
+    const xmlInput = $('xml-input');
+    xmlInput.addEventListener('change', e => {
+        if (xmlInput.files.length <= 1) return; // single XML → image-viewer handles normally
+        e.stopImmediatePropagation();
+        uploadXmlFiles(Array.from(xmlInput.files));
+        xmlInput.value = '';
+    }, true); // capture — fires before image-viewer's listener
+    // Drag-drop: intercept multiple images/PDFs or any drop when image already loaded
+    const uploadArea = $('upload-area');
+    uploadArea.addEventListener('drop', e => {
+        const files = Array.from(e.dataTransfer.files);
+        // Files are classified by filename extension only (case-insensitive)
+        const xmlFiles = files.filter(f => f.name.toLowerCase().endsWith('.xml'));
+        const nonXml  = files.filter(f => !f.name.toLowerCase().endsWith('.xml'));
+        const hasPdf  = nonXml.some(f => f.name.toLowerCase().endsWith('.pdf'));
+        // Take over if: multiple images, a PDF, a second image on top of existing, or multiple XMLs
+        const takeBatch = nonXml.length > 1 || hasPdf || (nonXml.length === 1 && state.imageId);
+        const takeXml   = xmlFiles.length > 1 || (xmlFiles.length === 1 && batch.items.length > 0);
+        if (takeBatch || takeXml) {
+            e.preventDefault();
+            e.stopImmediatePropagation();
+            // Once either condition holds, every dropped file is handled here
+            if (nonXml.length > 0) handleMultipleFiles(nonXml);
+            if (xmlFiles.length > 0) uploadXmlFiles(xmlFiles);
+        }
+    }, true); // capture phase — fires before image-viewer's bubble handler
+    // PDF pages from single-PDF drop on image-viewer — add to batch
+    on('pdf-pages-ready', data => {
+        // De-duplicate by filename against what is already queued
+        const existing = new Set(batch.items.map(i => i.filename));
+        for (const page of data.pages) {
+            if (!existing.has(page.filename)) {
+                batch.items.push({
+                    file: null,
+                    imageId: page.image_id,
+                    status: 'pending',
+                    lines: [],
+                    filename: page.filename,
+                    preUploaded: true,
+                });
+                existing.add(page.filename);
+            }
+        }
+        if (batch.items.length > 0) {
+            renderQueue();
+            // PDF pages are already uploaded — always preview the first one directly,
+            // bypassing the state.imageId guard in previewFirstBatchItem().
+            // NOTE(review): items[0] may be an earlier, non-pre-uploaded item, in which
+            // case nothing is previewed here — confirm that is intended.
+            const first = batch.items[0];
+            if (first && first.preUploaded && first.imageId) {
+                batch.currentIndex = 0;
+                emit('batch-item-start', { imageId: first.imageId, filename: first.filename });
+                updateNavButtons();
+            }
+        }
+    });
+    $('btn-process-batch').addEventListener('click', processBatch);
+    $('btn-clear-batch').addEventListener('click', clearBatch);
+    $('btn-export-batch-txt').addEventListener('click', exportAllTxt);
+    $('btn-export-batch-csv').addEventListener('click', exportAllCsv);
+    $('btn-export-batch-txt-zip').addEventListener('click', exportAllTxtZip);
+    $('btn-export-batch-thinking-zip').addEventListener('click', exportAllThinkingZip);
+    $('btn-export-batch-xml').addEventListener('click', exportAllXml);
+    $('btn-nav-prev').addEventListener('click', () => navigate(-1));
+    $('btn-nav-next').addEventListener('click', () => navigate(+1));
+    // Persist PAGE XML and resume checkboxes across sessions
+    // (values are stored as the strings 'true' / 'false'; a missing key keeps the markup default)
+    const usePageXmlEl = $('batch-use-pagexml');
+    const resumeEl     = $('batch-resume');
+    const savedPageXml = localStorage.getItem('batch_use_pagexml');
+    const savedResume  = localStorage.getItem('batch_resume');
+    if (savedPageXml !== null) usePageXmlEl.checked = savedPageXml === 'true';
+    if (savedResume  !== null) resumeEl.checked     = savedResume  === 'true';
+    usePageXmlEl.addEventListener('change', () => localStorage.setItem('batch_use_pagexml', usePageXmlEl.checked));
+    resumeEl.addEventListener('change',     () => localStorage.setItem('batch_resume',      resumeEl.checked));
+    // Cancel during batch: abort current SSE + stop the queue loop
+    $('btn-cancel').addEventListener('click', () => {
+        if (!batch.running) return; // not a batch run — this handler does nothing
+        batch.cancelled = true;
+        batch.abortController?.abort();
+    }, { capture: true });
+}
+// ── XML matching for batch ────────────────────────────────────────────────────
+// Match XML files to batch items by filename stem (e.g. page001.xml → page001.jpg)
+async function uploadXmlFiles(xmlFiles) {
+    if (!xmlFiles.length) return;
+    const stem = name => name.replace(/\.[^/.]+$/, '').toLowerCase();
+    let matched = 0, deferred = 0, skipped = 0;
+    for (const xml of xmlFiles) {
+        const xmlStem = stem(xml.name);
+        const item = batch.items.find(it => stem(it.filename) === xmlStem);
+        if (!item) { skipped++; continue; }
+        if (item.imageId) {
+            // Already uploaded → send to server immediately
+            try {
+                const fd = new FormData();
+                fd.append('file', xml);
+                const resp = await fetch(`/api/image/${item.imageId}/xml`, { method: 'POST', body: fd });
+                if (!resp.ok) throw new Error((await resp.json()).detail);
+                item.xmlUploaded = true;
+                matched++;
+            } catch (err) {
+                toast(`XML ${xml.name}: ${err.message}`, 'error');
+            }
+        } else {
+            // Image not yet uploaded — store XML, send during processBatch
+            item.xmlFile = xml;
+            deferred++;
+        }
+    }
+    const parts = [];
+    if (matched  > 0) parts.push(`${matched} uploaded`);
+    if (deferred > 0) parts.push(`${deferred} queued for batch`);
+    if (skipped  > 0) parts.push(`${skipped} unmatched`);
+    toast(`XML files: ${parts.join(', ')}`, matched + deferred > 0 ? 'success' : 'error');
+}
+// ── Queue management ─────────────────────────────────────────────────────────
+function handleMultipleFiles(files) {
+    // If a single image is already loaded (not yet in batch), add it first
+    if (batch.items.length === 0 && state.imageId) {
+        batch.items.push({
+            file: null,
+            imageId: state.imageId,
+            status: 'pending',
+            lines: state.lines.length ? state.lines : [],
+            filename: (state.imageInfo && state.imageInfo.filename) || 'current image',
+            preUploaded: true,
+        });
+    }
+    // Add new files (skip duplicates by name)
+    const existing = new Set(batch.items.map(i => i.filename));
+    const added = files.filter(f => !existing.has(f.name));
+    added.forEach(f => {
+        batch.items.push({ file: f, imageId: null, status: 'pending', lines: [], filename: f.name });
+    });
+    if (batch.items.length > 0) { renderQueue(); previewFirstBatchItem(); }
+}
+// Auto-preview all batch items (upload if needed), expanding PDFs into pages immediately
+async function previewFirstBatchItem() {
+    if (batch.running) return;
+    let i = 0;
+    let safetyCounter = 0;
+    while (i < batch.items.length && safetyCounter < 100) {
+        safetyCounter++;
+        const item = batch.items[i];
+        if (item.preUploaded && item.imageId) {
+            i++;
+            continue;
+        }
+        if (item.file) {
+            try {
+                const fd = new FormData();
+                fd.append('file', item.file);
+                const resp = await fetch('/api/image/upload', { method: 'POST', body: fd });
+                if (!resp.ok) { i++; continue; }
+                const data = await resp.json();
+                if (data.is_pdf) {
+                    const newItems = data.pages.map(p => ({
+                        file: null, imageId: p.image_id, status: 'pending',
+                        lines: [], filename: p.filename, preUploaded: true,
+                    }));
+                    batch.items.splice(i, 1, ...newItems);
+                    renderQueue();
+                    continue;
+                }
+                item.imageId = data.image_id;
+                item.preUploaded = true;
+                renderQueue();
+                if (i === 0 && !state.imageId) {
+                    batch.currentIndex = 0;
+                    emit('batch-item-start', { imageId: item.imageId, filename: item.filename });
+                    updateNavButtons();
+                }
+                i++;
+            } catch (err) {
+                console.error('Error pre-uploading batch item:', err);
+                i++;
+            }
+        } else {
+            i++;
+        }
+    }
+}
+function clearBatch() {
+    if (batch.running) return;
+    batch.items = [];
+    batch.currentIndex = -1;
+    $('batch-queue-section').classList.add('hidden');
+    $('batch-export-row').classList.add('hidden');
+    updateNavButtons();
+}
+let _dragSrcIndex = null;
+function renderQueue() {
+    const section = $('batch-queue-section');
+    const list = $('batch-list');
+    section.classList.remove('hidden');
+    list.innerHTML = '';
+    batch.items.forEach((item, i) => {
+        const row = document.createElement('div');
+        row.className = 'batch-item';
+        row.id = `batch-item-${i}`;
+        row.dataset.index = i;
+        // Drag handle
+        const handle = document.createElement('span');
+        handle.className = 'batch-drag-handle';
+        handle.textContent = '⠿';
+        handle.title = 'Drag to reorder';
+        const name = document.createElement('span');
+        name.className = 'batch-item-name';
+        name.title = item.filename;
+        name.textContent = item.filename;
+        const status = document.createElement('span');
+        status.className = 'batch-status';
+        status.id = `batch-status-${i}`;
+        _setStatusEl(status, item.status, item.lines.length);
+        row.appendChild(handle);
+        row.appendChild(name);
+        row.appendChild(status);
+        // Click a done item to reload it, or a preUploaded pending item to load for manual transcription
+        const canPreview = item.status === 'done' || (item.preUploaded && item.imageId);
+        if (canPreview) {
+            row.style.cursor = 'pointer';
+            row.addEventListener('click', e => {
+                if (e.target === handle) return; // don't trigger on drag handle click
+                if (item.status === 'done') {
+                    loadBatchItem(i);
+                } else {
+                    // Load preUploaded pending page so user can manually segment/transcribe it
+                    batch.currentIndex = i;
+                    emit('batch-item-start', { imageId: item.imageId, filename: item.filename });
+                    updateNavButtons();
+                }
+            });
+        }
+        // Drag-to-reorder (only when not running)
+        if (!batch.running) {
+            row.draggable = true;
+            row.addEventListener('dragstart', e => {
+                _dragSrcIndex = i;
+                e.dataTransfer.effectAllowed = 'move';
+                row.classList.add('batch-dragging');
+            });
+            row.addEventListener('dragend', () => {
+                row.classList.remove('batch-dragging');
+                list.querySelectorAll('.batch-item').forEach(r => r.classList.remove('batch-drag-over'));
+            });
+            row.addEventListener('dragover', e => {
+                e.preventDefault();
+                e.dataTransfer.dropEffect = 'move';
+                list.querySelectorAll('.batch-item').forEach(r => r.classList.remove('batch-drag-over'));
+                row.classList.add('batch-drag-over');
+            });
+            row.addEventListener('dragleave', () => row.classList.remove('batch-drag-over'));
+            row.addEventListener('drop', e => {
+                e.preventDefault();
+                row.classList.remove('batch-drag-over');
+                const destIndex = parseInt(row.dataset.index, 10);
+                if (_dragSrcIndex == null || _dragSrcIndex === destIndex) return;
+                // Reorder batch.items
+                const [moved] = batch.items.splice(_dragSrcIndex, 1);
+                batch.items.splice(destIndex, 0, moved);
+                // Fix currentIndex if it pointed to a moved item
+                if (batch.currentIndex === _dragSrcIndex) {
+                    batch.currentIndex = destIndex;
+                } else if (_dragSrcIndex < destIndex) {
+                    if (batch.currentIndex > _dragSrcIndex && batch.currentIndex <= destIndex) batch.currentIndex--;
+                } else {
+                    if (batch.currentIndex >= destIndex && batch.currentIndex < _dragSrcIndex) batch.currentIndex++;
+                }
+                _dragSrcIndex = null;
+                renderQueue();
+            });
+        }
+        list.appendChild(row);
+    });
+    // Show export row if any item is done
+    const anyDone = batch.items.some(i => i.status === 'done');
+    $('batch-export-row').classList.toggle('hidden', !anyDone);
+    updateNavButtons();
+}
+function _setStatusEl(el, status, lineCount) {
+    el.className = 'batch-status';
+    if (status === 'pending')    { el.textContent = 'pending'; }
+    else if (status === 'active'){ el.textContent = 'running…'; el.classList.add('active'); }
+    else if (status === 'done')  { el.textContent = `✓ ${lineCount} lines`; el.classList.add('done'); }
+    else if (status === 'error') { el.textContent = 'error'; el.classList.add('error'); }
+}
+function updateItemStatus(index, status, lineCount = 0) {
+    batch.items[index].status = status;
+    const el = $(`batch-status-${index}`);
+    if (el) _setStatusEl(el, status, lineCount);
+}
+function updateOverallProgress(current = null, total = null) {
+    const el = $('batch-overall-progress');
+    if (current == null) {
+        el.classList.add('hidden');
+        el.textContent = '';
+    } else {
+        el.textContent = `${current} / ${total}`;
+        el.classList.remove('hidden');
+    }
+}
+function updateNavButtons() {
+    const done = batch.items.filter(i => i.status === 'done');
+    const hasBatch = done.length > 0;
+    const idx = batch.currentIndex;
+    // Allow navigation to done items even while batch is running
+    const prevDone = hasBatch && batch.items.slice(0, idx).some(i => i.status === 'done');
+    const nextDone = hasBatch && batch.items.slice(idx + 1).some(i => i.status === 'done');
+    $('btn-nav-prev').disabled = !prevDone;
+    $('btn-nav-next').disabled = !nextDone;
+    const label = $('batch-nav-label');
+    if (hasBatch && idx >= 0) {
+        const pos = done.indexOf(batch.items[idx]) + 1;
+        label.textContent = `${pos}/${done.length}`;
+    } else {
+        label.textContent = '';
+    }
+}
+function navigate(delta) {
+    const indices = batch.items
+        .map((item, i) => item.status === 'done' ? i : -1)
+        .filter(i => i >= 0);
+    if (indices.length < 2) return;
+    const cur = indices.indexOf(batch.currentIndex);
+    const next = indices[cur + delta];
+    if (next != null) loadBatchItem(next);
+}
+// ── Processing ───────────────────────────────────────────────────────────────
+async function processBatch() {
+    if (batch.running || !state.engineLoaded) {
+        if (!state.engineLoaded) toast('Load an engine first', 'error');
+        return;
+    }
+    batch.running = true;
+    batch.cancelled = false;
+    batch.userNavigated = false;  // reset: auto-advance viewer from scratch
+    $('btn-process-batch').disabled = true;
+    $('btn-cancel').classList.remove('hidden');
+    const segMethod   = $('seg-method').value;
+    const segDevice   = $('seg-device').value;
+    const maxColumns  = parseInt($('seg-max-columns')?.value || '6', 10);
+    const splitWidth  = parseFloat($('seg-split-width')?.value || '40') / 100;
+    const textDirection = $('seg-text-direction')?.value || 'horizontal-lr';
+    const usePageXml  = $('batch-use-pagexml').checked;
+    const resume      = $('batch-resume').checked;
+    const pending = batch.items.filter(i => resume ? i.status === 'pending' : i.status !== 'done').length;
+    let doneThisRun = 0;
+    updateOverallProgress(0, pending);
+    for (let i = 0; i < batch.items.length; i++) {
+        if (batch.cancelled) {
+            // Mark remaining pending items back to pending (they stay pending)
+            break;
+        }
+        const item = batch.items[i];
+        if (item.status === 'done') {
+            // Resume mode: skip done; non-resume mode: also skip done
+            continue;
+        }
+        batch.processingIndex = i;
+        updateItemStatus(i, 'active');
+        updateNavButtons();
+        try {
+            // 1. Upload image (skip if already uploaded, e.g. PDF page pre-rendered by server)
+            if (item.preUploaded && item.imageId) {
+                // Already registered server-side — no upload needed
+            } else {
+                const fd = new FormData();
+                fd.append('file', item.file);
+                const upResp = await fetch('/api/image/upload', { method: 'POST', body: fd });
+                if (!upResp.ok) throw new Error(`Upload failed: ${upResp.statusText}`);
+                const upData = await upResp.json();
+                // PDF uploaded directly: expand into sub-items and skip this placeholder
+                if (upData.is_pdf) {
+                    const newItems = upData.pages.map(p => ({
+                        file: null, imageId: p.image_id, status: 'pending',
+                        lines: [], filename: p.filename, preUploaded: true,
+                    }));
+                    batch.items.splice(i + 1, 0, ...newItems);
+                    updateItemStatus(i, 'done', 0);
+                    renderQueue();
+                    continue;
+                }
+                item.imageId = upData.image_id;
+            }
+            // Upload deferred XML if one was matched earlier
+            if (item.xmlFile && item.imageId) {
+                try {
+                    const fd = new FormData();
+                    fd.append('file', item.xmlFile);
+                    await fetch(`/api/image/${item.imageId}/xml`, { method: 'POST', body: fd });
+                    item.xmlUploaded = true;
+                } catch { /* non-fatal */ }
+            }
+            // Show in viewer — skip if user manually navigated to a different item
+            if (!batch.userNavigated) {
+                batch.currentIndex = i;
+                emit('batch-item-start', { imageId: item.imageId, filename: item.filename });
+            }
+            // 2. Transcribe via SSE (abortable)
+            batch.abortController = new AbortController();
+            const result = await transcribeSSE(
+                item.imageId, segMethod, segDevice, maxColumns, splitWidth, usePageXml, batch.abortController.signal, textDirection
+            );
+            item.lines        = result.lines;
+            item.time_s       = result.time_s;
+            item.token_usage  = result.token_usage;
+            updateItemStatus(i, 'done', result.lines.length);
+            doneThisRun++;
+            updateOverallProgress(doneThisRun, pending);
+            // Fire sse-complete so the panel shows footer, column toggle, confidence filter, etc.
+            if (batch.currentIndex === i) {
+                emit('sse-complete', { lines: item.lines, total_time_s: item.time_s, engine: '(batch)', token_usage: item.token_usage });
+            }
+        } catch (err) {
+            if (err.name === 'AbortError' || batch.cancelled) {
+                updateItemStatus(i, 'pending');
+            } else {
+                updateItemStatus(i, 'error');
+                toast(`${item.filename}: ${err.message}`, 'error');
+            }
+        }
+        // Re-render to make done items clickable
+        renderQueue();
+    }
+    batch.running = false;
+    batch.processingIndex = -1;
+    batch.userNavigated = false;
+    batch.abortController = null;
+    $('btn-process-batch').disabled = false;
+    $('btn-cancel').classList.add('hidden');
+    $('batch-export-row').classList.remove('hidden');
+    updateOverallProgress(null);
+    updateNavButtons();
+    const doneCount = batch.items.filter(i => i.status === 'done').length;
+    if (batch.cancelled) {
+        toast(`Batch cancelled — ${doneCount} image(s) done`, 'info', 4000);
+    } else {
+        toast(`Batch complete: ${doneCount}/${batch.items.length} images`, 'success', 5000);
+    }
+    emit('batch-complete', { items: batch.items });
+}
+function _collectLiveOverrides() {
+    const overrides = {};
+    const form = document.getElementById('config-form');
+    if (!form) return overrides;
+    for (const el of form.querySelectorAll('[data-key]')) {
+        if (el.dataset.saveFor) continue;
+        if (el.dataset.passwordField) continue;
+        const key = el.dataset.key;
+        if (el.type === 'checkbox')     overrides[key] = el.checked;
+        else if (el.type === 'number')  overrides[key] = Number(el.value);
+        else                            overrides[key] = el.value;
+    }
+    return overrides;
+}
+function transcribeSSE(imageId, segMethod, segDevice, maxColumns, splitWidthFraction = 0.4, usePageXml = true, signal = null, textDirection = 'horizontal-lr') {
+    return new Promise((resolve, reject) => {
+        const lines = [];
+        let startTime = null;
+        let lastTokenUsage = null;
+        const body = JSON.stringify({
+            image_id: imageId, seg_method: segMethod,
+            seg_device: segDevice, max_columns: maxColumns,
+            split_width_fraction: splitWidthFraction,
+            text_direction: textDirection,
+            use_pagexml: usePageXml,
+            engine_config_overrides: _collectLiveOverrides(),
+        });
+        const finish = (cancelled = false) => {
+            const time_s = startTime ? Math.round((Date.now() - startTime) / 100) / 10 : 0;
+            resolve({ lines, time_s, token_usage: lastTokenUsage, cancelled });
+        };
+        fetch('/api/transcribe', {
+            method: 'POST',
+            headers: { 'Content-Type': 'application/json' },
+            body,
+            signal,
+        }).then(resp => {
+            if (!resp.ok) return reject(new Error(resp.statusText));
+            const reader = resp.body.getReader();
+            const decoder = new TextDecoder();
+            let buf = '';
+            const pump = () => reader.read().then(({ done, value }) => {
+                if (done) { finish(); return; }
+                buf += decoder.decode(value, { stream: true });
+                const parts = buf.split('\n\n');
+                buf = parts.pop();
+                for (const chunk of parts) {
+                    const evLine  = chunk.split('\n').find(l => l.startsWith('event:'));
+                    const dataLine = chunk.split('\n').find(l => l.startsWith('data:'));
+                    if (!evLine || !dataLine) continue;
+                    const event = evLine.slice(7).trim();
+                    const data  = JSON.parse(dataLine.slice(5).trim());
+                    if (event === 'progress') {
+                        if (!startTime) startTime = Date.now();
+                        if (data.token_usage) lastTokenUsage = data.token_usage;
+                        lines.push(data.line);
+                        // Only stream to panel when user is watching this item
+                        if (batch.currentIndex === batch.processingIndex) emit('sse-progress', data);
+                    } else if (event === 'segmentation') {
+                        // Store bboxes/regions so loadBatchItem can restore them later
+                        if (batch.items[batch.processingIndex]) {
+                            batch.items[batch.processingIndex].bboxes  = data.bboxes  || [];
+                            batch.items[batch.processingIndex].regions = data.regions || [];
+                        }
+                        if (batch.currentIndex === batch.processingIndex) emit('sse-segmentation', data);
+                    } else if (event === 'complete') {
+                        if (data.token_usage) lastTokenUsage = data.token_usage;
+                        finish();
+                    } else if (event === 'error') {
+                        reject(new Error(data.message));
+                    } else if (event === 'cancelled') {
+                        finish(true);
+                    }
+                }
+                pump();
+            }).catch(reject);
+            pump();
+        }).catch(reject);
+    });
+}
+/**
+ * Load a completed batch item back into the viewer / results panel.
+ *
+ * Does nothing unless the item's status is 'done'. Marks the item as current,
+ * flags that the user navigated manually (so a running batch stops
+ * auto-advancing the viewer), then replays the stored result as events:
+ * 'batch-item-start', 'sse-segmentation', one 'sse-progress' per line and a
+ * final 'sse-complete'.
+ *
+ * @param {number} index - Position of the item in batch.items.
+ */
+function loadBatchItem(index) {
+    const item = batch.items[index];
+    if (item.status !== 'done') return;
+    batch.currentIndex = index;
+    batch.userNavigated = true;  // user left auto-advance mode
+    emit('batch-item-start', { imageId: item.imageId, filename: item.filename });
+    updateNavButtons();
+    // Restore segmentation data so line-click highlighting works.
+    // batch-item-start clears currentBboxes in the image viewer; re-populate them here.
+    const bboxes  = item.bboxes  || [];
+    const regions = item.regions || [];
+    emit('sse-segmentation', { num_lines: item.lines.length, bboxes, regions, source: 'batch-restore' });
+    // Re-populate state.lines so exports and confidence filter work
+    // (each line is shallow-copied and re-indexed by its array position)
+    state.lines = item.lines.map((l, i) => ({ ...l, index: i }));
+    // Re-emit each line to rebuild the transcription panel
+    $('transcription-lines').innerHTML = '';
+    $('conf-filter-row').classList.add('hidden');
+    // NOTE(review): `total` is read from state.lines on every iteration, so it
+    // reflects any change an 'sse-progress' listener makes to state.lines — confirm
+    // whether listeners touch it before restructuring this loop.
+    state.lines.forEach(l => emit('sse-progress', {
+        current: l.index + 1, total: state.lines.length, line: l
+    }));
+    emit('sse-complete', { lines: state.lines, total_time_s: item.time_s || 0, engine: '(batch)', token_usage: item.token_usage || null });
+}
+// ── Export ────────────────────────────────────────────────────────────────────
+function exportAllTxt() {
+    const done = batch.items.filter(i => i.status === 'done');
+    if (!done.length) return;
+    const text = done.map(item =>
+        `=== ${item.filename} ===\n` + item.lines.map(l => l.text).join('\n')
+    ).join('\n\n');
+    downloadFile('batch_transcription.txt', text, 'text/plain');
+}
+function exportAllCsv() {
+    const done = batch.items.filter(i => i.status === 'done');
+    if (!done.length) return;
+    const header = 'File,Line,Text,Confidence\n';
+    const rows = done.flatMap(item =>
+        item.lines.map(l => {
+            const conf = l.confidence != null ? l.confidence.toFixed(4) : '';
+            return `"${item.filename.replace(/"/g,'""')}",${l.index + 1},"${l.text.replace(/"/g,'""')}",${conf}`;
+        })
+    );
+    downloadFile('batch_transcription.csv', header + rows.join('\n'), 'text/csv');
+}
+async function exportAllThinkingZip() {
+    const done = batch.items.filter(i => i.status === 'done' && i.imageId);
+    if (!done.length) return;
+    try {
+        const resp = await fetch('/api/batch/export-thinking', {
+            method: 'POST',
+            headers: { 'Content-Type': 'application/json' },
+            body: JSON.stringify({ image_ids: done.map(i => i.imageId) }),
+        });
+        if (!resp.ok) throw new Error(await resp.text());
+        const blob = await resp.blob();
+        const url = URL.createObjectURL(blob);
+        const a = document.createElement('a');
+        a.href = url; a.download = 'batch_thinking.zip'; a.click();
+        URL.revokeObjectURL(url);
+    } catch (err) {
+        toast(`Thinking export failed: ${err.message}`, 'error');
+    }
+}
+async function exportAllTxtZip() {
+    const done = batch.items.filter(i => i.status === 'done' && i.imageId);
+    if (!done.length) return;
+    try {
+        const resp = await fetch('/api/batch/export-txt', {
+            method: 'POST',
+            headers: { 'Content-Type': 'application/json' },
+            body: JSON.stringify({ image_ids: done.map(i => i.imageId) }),
+        });
+        if (!resp.ok) throw new Error(await resp.text());
+        const blob = await resp.blob();
+        const url = URL.createObjectURL(blob);
+        const a = document.createElement('a');
+        a.href = url; a.download = 'batch_export_txt.zip'; a.click();
+        URL.revokeObjectURL(url);
+    } catch (err) {
+        toast(`TXT ZIP export failed: ${err.message}`, 'error');
+    }
+}
+async function exportAllXml() {
+    const done = batch.items.filter(i => i.status === 'done' && i.imageId);
+    if (!done.length) return;
+    try {
+        const resp = await fetch('/api/batch/export-xml', {
+            method: 'POST',
+            headers: { 'Content-Type': 'application/json' },
+            body: JSON.stringify({ image_ids: done.map(i => i.imageId) }),
+        });
+        if (!resp.ok) throw new Error(await resp.text());
+        const blob = await resp.blob();
+        const url = URL.createObjectURL(blob);
+        const a = document.createElement('a');
+        a.href = url; a.download = 'batch_export.zip'; a.click();
+        URL.revokeObjectURL(url);
+    } catch (err) {
+        toast(`XML export failed: ${err.message}`, 'error');
+    }
+}
+function downloadFile(filename, content, mime) {
+    const blob = new Blob([content], { type: mime });
+    const url = URL.createObjectURL(blob);
+    const a = document.createElement('a');
+    a.href = url; a.download = filename; a.click();
+    URL.revokeObjectURL(url);
+}

web/static/components/engine-panel.js ADDED Viewed

	@@ -0,0 +1,1091 @@

+/**
+ * Engine Panel — engine selection, dynamic config form, model loading
+ */
+import { state, emit, on, api, saveEngineConfig, loadSavedEngineName, loadSavedEngineConfig, toast } from '../app.js';
+// Shorthand for document.getElementById
+const $ = id => document.getElementById(id);
+// --- API Key localStorage helpers (keys never stored on server) ---
+// Prefix prepended to a slot name to build the localStorage key of an API key
+const _KEY_PREFIX = 'polyscriptor_key_';
+// NOTE(review): not referenced in the visible part of this file — presumably the
+// OpenWebUI config used for browser-side requests; confirm against its users.
+let _browserOpenWebUIConfig = null;
+// When set while the OpenWebUI engine is selected, the cancel button calls
+// .abort() on this instead of asking the server to cancel.
+let _browserOpenWebUIAbort = null;
+function _loadBrowserKey(slot) {
+    try { return localStorage.getItem(_KEY_PREFIX + slot) || ''; }
+    catch (_) { return ''; }
+}
+function _saveBrowserKey(slot, key) {
+    try {
+        if (key) localStorage.setItem(_KEY_PREFIX + slot, key);
+        else localStorage.removeItem(_KEY_PREFIX + slot);
+        return true;
+    } catch (_) { /* private browsing etc. */ }
+    return false;
+}
+function _hasBrowserKey(slot) {
+    return !!_loadBrowserKey(slot);
+}
+function _normalizeBaseUrl(baseUrl) {
+    return (baseUrl || '').trim().replace(/\/+$/, '');
+}
+function _openWebUIModelUrls(baseUrl) {
+    const base = _normalizeBaseUrl(baseUrl);
+    if (!base) return [];
+    const urls = [`${base}/models`];
+    if (base.endsWith('/api')) {
+        urls.push(`${base}/v1/models`);
+        urls.push(`${base.slice(0, -4)}/v1/models`);
+    } else if (base.endsWith('/api/v1')) {
+        urls.push(`${base.slice(0, -3)}/models`);
+        urls.push(`${base}/models`);
+    } else if (base.endsWith('/v1')) {
+        urls.push(`${base.slice(0, -3)}/api/models`);
+    } else {
+        urls.push(`${base}/api/models`);
+        urls.push(`${base}/api/v1/models`);
+        urls.push(`${base}/v1/models`);
+    }
+    return [...new Set(urls)];
+}
+function _extractModelIds(payload) {
+    if (Array.isArray(payload)) {
+        return [...new Set(payload.map(item => {
+            if (typeof item === 'string') return item;
+            if (item && typeof item === 'object') return item.id || item.name || item.model;
+            return null;
+        }).filter(Boolean))].sort();
+    }
+    if (payload && typeof payload === 'object') {
+        for (const key of ['data', 'models']) {
+            if (Array.isArray(payload[key])) return _extractModelIds(payload[key]);
+        }
+        return _extractModelIds(Object.values(payload));
+    }
+    return [];
+}
+async function _fetchOpenWebUIModelsInBrowser(baseUrl, apiKey) {
+    const errors = [];
+    for (const url of _openWebUIModelUrls(baseUrl)) {
+        try {
+            const resp = await fetch(url, {
+                headers: {
+                    'Authorization': `Bearer ${apiKey}`,
+                    'Content-Type': 'application/json',
+                    'Accept': 'application/json',
+                },
+            });
+            const contentType = resp.headers.get('content-type') || '';
+            const text = await resp.text();
+            if (!resp.ok) {
+                errors.push(`${url}: HTTP ${resp.status}`);
+                continue;
+            }
+            if (!contentType.includes('json')) {
+                const sample = text.trim().replace(/\s+/g, ' ').slice(0, 120) || '<empty response>';
+                errors.push(`${url}: non-JSON response: ${sample}`);
+                continue;
+            }
+            const models = _extractModelIds(JSON.parse(text));
+            if (models.length) return models;
+            errors.push(`${url}: no model ids in response`);
+        } catch (err) {
+            errors.push(`${url}: ${err.message}`);
+        }
+    }
+    throw new Error(errors.join('; ') || 'No OpenWebUI model endpoint tried');
+}
+async function _blobToDataUrl(blob) {
+    return await new Promise((resolve, reject) => {
+        const reader = new FileReader();
+        reader.onload = () => resolve(reader.result);
+        reader.onerror = () => reject(reader.error || new Error('Could not read image'));
+        reader.readAsDataURL(blob);
+    });
+}
+function _resolveOpenWebUIModel(config) {
+    if (config.model === '__custom__') return (config.model_custom || '').trim();
+    return (config.model || '').trim();
+}
+export function initEnginePanel() {
+    loadEngines();
+    $('engine-select').addEventListener('change', onEngineSelected);
+    $('btn-load-model').addEventListener('click', onLoadModel);
+    $('btn-transcribe').addEventListener('click', onTranscribe);
+    $('btn-segment').addEventListener('click', onSegment);
+    // Show/hide blla-specific options
+    const segMethodSel = $('seg-method');
+    const bllaopts = $('blla-options');
+    const syncBllaOpts = () => {
+        if (bllaopts) bllaopts.style.display = segMethodSel.value === 'kraken-blla' ? '' : 'none';
+    };
+    segMethodSel.addEventListener('change', syncBllaOpts);
+    syncBllaOpts();
+    // Cancel button — visible during transcription
+    $('btn-cancel').addEventListener('click', async () => {
+        if ($('engine-select')?.value === 'OpenWebUI' && _browserOpenWebUIAbort) {
+            _browserOpenWebUIAbort.abort();
+            return;
+        }
+        try {
+            await fetch('/api/transcribe/cancel', { method: 'POST' });
+        } catch (_) { /* ignore */ }
+    });
+    // Enable transcribe/segment buttons when image is ready
+    on('engine-loaded',     () => { updateTranscribeBtn(); updateSegmentBtn(); });
+    on('image-uploaded',    () => { updateTranscribeBtn(); updateSegmentBtn(); });
+    on('batch-item-start',  () => { updateTranscribeBtn(); updateSegmentBtn(); });
+    on('transcription-complete', () => {
+        state.isProcessing = false;
+        $('btn-transcribe').classList.remove('loading');
+        $('btn-transcribe').textContent = 'Transcribe';
+        $('btn-cancel').classList.add('hidden');
+        updateTranscribeBtn();
+        updateSegmentBtn();
+    });
+    // Region list — appears after segmentation, cleared on new image/transcription
+    on('sse-segmentation', data => renderRegionList(data.regions || []));
+    on('image-uploaded',   () => { $('seg-regions-list').classList.add('hidden'); $('seg-regions-list').innerHTML = ''; });
+}
+async function loadEngines() {
+    try {
+        const resp = await api('/api/engines');
+        state.engines = await resp.json();
+        const select = $('engine-select');
+        select.innerHTML = '';
+        const available = state.engines.filter(e => e.available);
+        const unavailable = state.engines.filter(e => !e.available);
+        if (available.length === 0) {
+            select.innerHTML = '<option>No engines available</option>';
+            return;
+        }
+        const savedEngine = loadSavedEngineName();
+        for (const eng of available) {
+            const opt = document.createElement('option');
+            opt.value = eng.name;
+            opt.textContent = eng.name;
+            select.appendChild(opt);
+        }
+        if (unavailable.length > 0) {
+            const group = document.createElement('optgroup');
+            group.label = 'Unavailable';
+            for (const eng of unavailable) {
+                const opt = document.createElement('option');
+                opt.value = eng.name;
+                opt.textContent = `${eng.name} (${eng.unavailable_reason || 'missing deps'})`;
+                opt.disabled = true;
+                group.appendChild(opt);
+            }
+            select.appendChild(group);
+        }
+        // Restore last used engine if available
+        if (savedEngine && available.find(e => e.name === savedEngine)) {
+            select.value = savedEngine;
+        }
+        select.disabled = false;
+        onEngineSelected();
+    } catch (err) {
+        $('engine-description').textContent = `Error loading engines: ${err.message}`;
+    }
+}
+async function onEngineSelected() {
+    const name = $('engine-select').value;
+    const eng = state.engines.find(e => e.name === name);
+    state.currentEngine = eng;
+    // Description
+    $('engine-description').textContent = eng?.description || '';
+    // Show/hide segmentation controls based on engine capability
+    updateSegmentationVisibility(eng);
+    // Load config schema
+    const configForm = $('config-form');
+    configForm.innerHTML = '';
+    if (!eng) return;
+    try {
+        const resp = await api(`/api/engine/${encodeURIComponent(name)}/config-schema`);
+        const schema = await resp.json();
+        for (const field of schema.fields || []) {
+            configForm.appendChild(createField(field));
+        }
+        // Restore saved config values for this engine (skip password fields for security)
+        const savedCfg = loadSavedEngineConfig(name);
+        if (savedCfg) {
+            for (const el of configForm.querySelectorAll('[data-key]')) {
+                if (el.dataset.passwordField) continue;  // never prefill secrets
+                const val = savedCfg[el.dataset.key];
+                if (val == null) continue;
+                if (el.type === 'checkbox') el.checked = !!val;
+                else el.value = val;
+            }
+        }
+        $('btn-load-model').disabled = false;
+        // For Commercial APIs: when provider changes, swap model list and update key hint
+        const providerSel = $('cfg-provider');
+        const modelSel    = $('cfg-model');
+        if (providerSel && modelSel) {
+            const syncModelList = async () => {
+                // Clear model list and auto-fetch from live API if a key is available
+                _populateSelect(modelSel, []);  // show "— click ↻ to load —"
+                modelSel.dispatchEvent(new Event('change'));
+                // Auto-trigger fetch if we have a browser key for this provider
+                const prov = providerSel.value.toLowerCase();
+                const keyEl = $('cfg-api_key');
+                const hasBrowser = _hasBrowserKey(prov);
+                const hasTyped   = keyEl?.value?.trim().length > 0;
+                if (hasBrowser || hasTyped) {
+                    const refreshBtn = modelSel.closest('.config-field')?.querySelector('.btn-refresh');
+                    if (refreshBtn) refreshBtn.click();
+                }
+            };
+            providerSel.addEventListener('change', syncModelList);
+            syncModelList();  // run once on load to match default provider
+        }
+        const keyInput = $('cfg-api_key');
+        if (providerSel && keyInput) {
+            const updateKeyHint = () => {
+                const slot = providerSel.value.toLowerCase();
+                const hasBrowser = _hasBrowserKey(slot);
+                const saveRow = keyInput.closest('.config-field')?.querySelector('.key-save-row');
+                const saveBox = saveRow?.querySelector('input[type="checkbox"]');
+                if (hasBrowser) {
+                    keyInput.placeholder = '••••••••  (saved in browser — leave blank to keep)';
+                    keyInput.dataset.hasBrowser = 'true';
+                    keyInput.disabled = false;
+                    if (saveRow) { saveRow.style.display = ''; saveRow.querySelector('label').textContent = 'Key saved in browser'; }
+                    if (saveBox) saveBox.checked = true;
+                } else {
+                    keyInput.placeholder = 'Paste API key here';
+                    keyInput.disabled = false;
+                    delete keyInput.dataset.hasBrowser;
+                    if (saveRow) { saveRow.style.display = ''; saveRow.querySelector('label').textContent = 'Save key in browser'; }
+                    if (saveBox) saveBox.checked = false;
+                }
+            };
+            providerSel.addEventListener('change', updateKeyHint);
+            updateKeyHint();  // run once on load
+        }
+        // Kraken: show preset dropdown and load preset list
+        const krakenPresetRow = $('kraken-preset-row');
+        if (krakenPresetRow) {
+            if (name === 'Kraken') {
+                krakenPresetRow.classList.remove('hidden');
+                _loadKrakenPresets();
+            } else {
+                krakenPresetRow.classList.add('hidden');
+            }
+        }
+        // Auto-load model if this engine was previously configured.
+        // Skip engines with dynamic model lists (need live fetch first — user loads manually).
+        const hasDynamic = schema.fields?.some(f => f.dynamic);
+        if (savedCfg && !hasDynamic) {
+            onLoadModel();
+        }
+    } catch (err) {
+        configForm.innerHTML = `<p class="muted">Error: ${err.message}</p>`;
+    }
+}
+let _krakenPresetsLoaded = false;
+async function _loadKrakenPresets() {
+    if (_krakenPresetsLoaded) return;
+    const sel = $('kraken-preset-select');
+    const status = $('kraken-preset-status');
+    if (!sel) return;
+    try {
+        const resp = await fetch('/api/kraken/presets');
+        const data = await resp.json();
+        sel.innerHTML = '';
+        const blank = document.createElement('option');
+        blank.value = '';
+        blank.textContent = '— use model path above —';
+        sel.appendChild(blank);
+        for (const p of data.presets || []) {
+            const opt = document.createElement('option');
+            opt.value = p.id;
+            const icon = p.source === 'local' ? '📁' : '⬇️';
+            opt.textContent = `${icon} ${p.label} (${p.language})`;
+            sel.appendChild(opt);
+        }
+        _krakenPresetsLoaded = true;
+    } catch (e) {
+        if (status) status.textContent = 'Could not load presets';
+    }
+    sel.addEventListener('change', () => {
+        const status = $('kraken-preset-status');
+        const modelPathEl = $('cfg-model_path');
+        const val = sel.value;
+        if (!val) {
+            if (status) status.textContent = '';
+            return;
+        }
+        if (status) {
+            status.textContent = val === 'blla-local'
+                ? '📁 Local model — loads instantly'
+                : '⬇️ Auto-downloads from Zenodo on first use (~30–120s)';
+        }
+        // Pre-fill model_path field with the preset ID so server knows what to load
+        if (modelPathEl) modelPathEl.value = '';  // clear — preset_id takes priority
+    });
+}
+/**
+ * Show or hide segmentation controls depending on whether the selected engine
+ * requires line segmentation. Page-level engines (VLMs, Commercial APIs, etc.)
+ * do their own segmentation internally — showing these controls is misleading.
+ */
+function updateSegmentationVisibility(eng) {
+    const needsSeg = eng ? eng.requires_line_segmentation : true;
+    const segControls = $('seg-controls');
+    if (segControls) {
+        segControls.style.display = needsSeg ? '' : 'none';
+    }
+}
+function createField(field) {
+    const wrapper = document.createElement('div');
+    if (field.type === 'checkbox') {
+        wrapper.className = 'config-field config-field-checkbox';
+        const input = document.createElement('input');
+        input.type = 'checkbox';
+        input.id = `cfg-${field.key}`;
+        input.dataset.key = field.key;
+        input.checked = field.default ?? false;
+        const label = document.createElement('label');
+        label.htmlFor = input.id;
+        label.textContent = field.label;
+        wrapper.appendChild(input);
+        wrapper.appendChild(label);
+    } else {
+        wrapper.className = 'config-field';
+        const label = document.createElement('label');
+        label.htmlFor = `cfg-${field.key}`;
+        label.textContent = field.label;
+        wrapper.appendChild(label);
+        if (field.type === 'select') {
+            // Row: select + optional refresh button
+            const selectRow = document.createElement('div');
+            selectRow.className = 'select-row';
+            const select = document.createElement('select');
+            select.id = `cfg-${field.key}`;
+            select.dataset.key = field.key;
+            if (field.per_provider_options) {
+                // Store for later use when provider changes
+                select.dataset.perProviderOptions = JSON.stringify(field.per_provider_options);
+            }
+            _populateSelect(select, field.options || [], field.default);
+            selectRow.appendChild(select);
+            // Dynamic refresh button — fetches live model list from server
+            if (field.dynamic) {
+                const hint = document.createElement('span');
+                hint.className = 'dynamic-hint muted';
+                hint.textContent = field.dynamic_hint || 'Click ↻ to load models';
+                const refreshBtn = document.createElement('button');
+                refreshBtn.type = 'button';
+                refreshBtn.className = 'btn-refresh';
+                refreshBtn.title = 'Refresh model list from server';
+                refreshBtn.textContent = '↻';
+                refreshBtn.addEventListener('click', async () => {
+                    const engineName = $('engine-select').value;
+                    const providerEl = $('cfg-provider');
+                    const keyEl = $('cfg-api_key');
+                    const provider = providerEl?.value?.toLowerCase() || 'openai';
+                    const keySlot = engineName === 'OpenWebUI' ? 'openwebui' : provider;
+                    const apiKey = keyEl?.value?.trim() || _loadBrowserKey(keySlot);
+                    refreshBtn.textContent = '…';
+                    refreshBtn.disabled = true;
+                    try {
+                        const baseUrlEl = $('cfg-base_url');
+                        const baseUrl = baseUrlEl?.value?.trim() || '';
+                        let data;
+                        if (engineName === 'OpenWebUI') {
+                            if (!baseUrl) throw new Error('Enter your OpenWebUI base URL');
+                            if (!apiKey) throw new Error('Enter your OpenWebUI API key');
+                            const models = await _fetchOpenWebUIModelsInBrowser(baseUrl, apiKey);
+                            data = { models };
+                        } else {
+                            const params = new URLSearchParams({ provider, api_key: apiKey, base_url: baseUrl });
+                            const resp = await fetch(
+                                `/api/engine/${encodeURIComponent(engineName)}/models?${params}`
+                            );
+                            data = await resp.json();
+                        }
+                        if (data.error) {
+                            hint.textContent = `Error: ${data.error}`;
+                        } else if (data.models.length === 0) {
+                            hint.textContent = 'No models found';
+                        } else {
+                            const current = select.value;
+                            // Build options, keep __custom__ at the end if present
+                            const newOpts = data.models.map(m => ({ label: m, value: m }));
+                            if (field.custom_key) newOpts.push({ label: 'Custom model ID…', value: '__custom__' });
+                            _populateSelect(select, newOpts, current);
+                            hint.textContent = `${data.models.length} models loaded`;
+                        }
+                    } catch (e) {
+                        hint.textContent = `Error: ${e.message}`;
+                    } finally {
+                        refreshBtn.textContent = '↻';
+                        refreshBtn.disabled = false;
+                    }
+                });
+                selectRow.appendChild(refreshBtn);
+                wrapper.appendChild(selectRow);
+                wrapper.appendChild(hint);
+            } else {
+                wrapper.appendChild(selectRow);
+            }
+            // If this select can have a __custom__ sentinel, wire up a
+            // hidden text input that appears when "__custom__" is chosen.
+            if (field.custom_key) {
+                const customInput = document.createElement('input');
+                customInput.type = 'text';
+                customInput.id = `cfg-${field.custom_key}`;
+                customInput.dataset.key = field.custom_key;
+                customInput.placeholder = field.custom_placeholder || 'Enter custom value';
+                customInput.style.marginTop = '4px';
+                // Show/hide based on current select value
+                const syncCustomVisibility = () => {
+                    const isCustom = select.value === '__custom__';
+                    customInput.style.display = isCustom ? '' : 'none';
+                    customInput.required = isCustom;
+                };
+                select.addEventListener('change', syncCustomVisibility);
+                syncCustomVisibility();  // run once on creation
+                wrapper.appendChild(customInput);
+            }
+            // Upload button — lets users upload a local .mlmodel file from their machine
+            if (field.upload) {
+                const uploadRow = document.createElement('div');
+                uploadRow.className = 'upload-model-row';
+                uploadRow.style.cssText = 'display:flex;align-items:center;gap:6px;margin-top:6px;';
+                const fileInput = document.createElement('input');
+                fileInput.type = 'file';
+                fileInput.accept = '.mlmodel';
+                fileInput.style.display = 'none';
+                const uploadBtn = document.createElement('button');
+                uploadBtn.type = 'button';
+                uploadBtn.className = 'btn-secondary btn-sm';
+                uploadBtn.textContent = 'Upload .mlmodel…';
+                uploadBtn.title = 'Upload a Kraken model file from your computer';
+                const uploadStatus = document.createElement('span');
+                uploadStatus.className = 'muted';
+                uploadStatus.style.fontSize = '0.85em';
+                uploadBtn.addEventListener('click', () => fileInput.click());
+                fileInput.addEventListener('change', async () => {
+                    const f = fileInput.files[0];
+                    if (!f) return;
+                    uploadStatus.textContent = `Uploading ${f.name}…`;
+                    uploadBtn.disabled = true;
+                    try {
+                        const fd = new FormData();
+                        fd.append('file', f);
+                        const resp = await fetch('/api/models/upload', { method: 'POST', body: fd });
+                        if (!resp.ok) {
+                            const err = await resp.json().catch(() => ({ detail: resp.statusText }));
+                            throw new Error(err.detail || resp.statusText);
+                        }
+                        const data = await resp.json();
+                        // Repopulate select with fresh options returned by server
+                        const newPath = data.path;
+                        _populateSelect(select, data.options, newPath);
+                        uploadStatus.textContent = `Uploaded: ${data.filename}`;
+                        // Re-run custom visibility sync (new value might not be __custom__)
+                        if (field.custom_key) {
+                            const isCustom = select.value === '__custom__';
+                            const ci = document.getElementById(`cfg-${field.custom_key}`);
+                            if (ci) { ci.style.display = isCustom ? '' : 'none'; ci.required = isCustom; }
+                        }
+                    } catch (e) {
+                        uploadStatus.textContent = `Upload failed: ${e.message}`;
+                    } finally {
+                        uploadBtn.disabled = false;
+                        fileInput.value = '';
+                    }
+                });
+                uploadRow.appendChild(fileInput);
+                uploadRow.appendChild(uploadBtn);
+                uploadRow.appendChild(uploadStatus);
+                wrapper.appendChild(uploadRow);
+            }
+        } else if (field.type === 'number') {
+            const input = document.createElement('input');
+            input.type = 'number';
+            input.id = `cfg-${field.key}`;
+            input.dataset.key = field.key;
+            if (field.min != null) input.min = field.min;
+            if (field.max != null) input.max = field.max;
+            input.value = field.default ?? '';
+            wrapper.appendChild(input);
+        } else if (field.type === 'password') {
+            const input = document.createElement('input');
+            input.type = 'password';
+            input.id = `cfg-${field.key}`;
+            input.dataset.key = field.key;
+            input.dataset.passwordField = 'true';
+            // Determine effective key slot for localStorage lookup
+            function _getKeySlot() {
+                const providerEl = $('cfg-provider');
+                if (providerEl) return providerEl.value.toLowerCase();
+                const engineEl = $('engine-select');
+                if (engineEl?.value === 'OpenWebUI') return 'openwebui';
+                return field.key;
+            }
+            function applyKeyHint() {
+                const slot = _getKeySlot();
+                const hasBrowser = _hasBrowserKey(slot);
+                if (hasBrowser) {
+                    input.placeholder = '••••••••  (saved in browser — leave blank to keep)';
+                    input.dataset.hasBrowser = 'true';
+                } else {
+                    input.placeholder = field.placeholder || 'Paste API key here';
+                    delete input.dataset.hasBrowser;
+                }
+                input.disabled = false;
+            }
+            applyKeyHint();
+            wrapper.appendChild(input);
+            // "Save key in browser" checkbox
+            const saveRow = document.createElement('div');
+            saveRow.className = 'key-save-row';
+            const saveBox = document.createElement('input');
+            saveBox.type = 'checkbox';
+            saveBox.id = `cfg-${field.key}-save`;
+            saveBox.dataset.saveFor = field.key;
+            const slot = _getKeySlot();
+            saveBox.checked = _hasBrowserKey(slot);
+            const saveLabel = document.createElement('label');
+            saveLabel.htmlFor = saveBox.id;
+            saveLabel.textContent = _hasBrowserKey(slot)
+                ? 'Key saved in browser' : 'Save key in browser';
+            saveRow.appendChild(saveBox);
+            saveRow.appendChild(saveLabel);
+            wrapper.appendChild(saveRow);
+        } else if (field.type === 'textarea') {
+            const ta = document.createElement('textarea');
+            ta.id = `cfg-${field.key}`;
+            ta.dataset.key = field.key;
+            ta.rows = field.rows || 3;
+            ta.value = field.default ?? '';
+            if (field.placeholder) ta.placeholder = field.placeholder;
+            ta.style.width = '100%';
+            ta.style.resize = 'vertical';
+            wrapper.appendChild(ta);
+            if (field.hint) {
+                const hint = document.createElement('small');
+                hint.textContent = field.hint;
+                hint.style.color = 'var(--text-muted, #888)';
+                wrapper.appendChild(hint);
+            }
+        } else {
+            // text
+            const input = document.createElement('input');
+            input.type = 'text';
+            input.id = `cfg-${field.key}`;
+            input.dataset.key = field.key;
+            input.value = field.default ?? '';
+            if (field.placeholder) input.placeholder = field.placeholder;
+            wrapper.appendChild(input);
+        }
+    }
+    return wrapper;
+}
+function collectConfig() {
+    const config = {};
+    const fields = $('config-form').querySelectorAll('[data-key]');
+    for (const el of fields) {
+        const key = el.dataset.key;
+        if (el.dataset.saveFor) continue;  // "save key" checkboxes are not config
+        if (el.type === 'checkbox') {
+            config[key] = el.checked;
+        } else if (el.type === 'number') {
+            config[key] = Number(el.value);
+        } else if (el.dataset.passwordField && !el.value.trim()) {
+            // Blank password field — inject key from browser localStorage
+            const providerEl = $('cfg-provider');
+            let slot = key;
+            if (providerEl) slot = providerEl.value.toLowerCase();
+            else if ($('engine-select')?.value === 'OpenWebUI') slot = 'openwebui';
+            const browserKey = _loadBrowserKey(slot);
+            config[key] = browserKey;  // may be empty — server will check env next
+        } else {
+            config[key] = el.value;
+        }
+    }
+    return config;
+}
+function _persistNewKeys(engineName) {
+    // Save any typed API key to browser localStorage automatically.
+    // Unchecking "Save key" is the explicit opt-out (deletes saved key).
+    const saveBoxes = $('config-form').querySelectorAll('[data-save-for]');
+    for (const box of saveBoxes) {
+        const keyField = $(`cfg-${box.dataset.saveFor}`);
+        const newKey = keyField?.value?.trim();
+        // Determine slot from engine name
+        const slotMap = {
+            'OpenWebUI': 'openwebui',
+            'Commercial APIs': null,  // slot depends on selected provider
+        };
+        let slot = slotMap[engineName];
+        if (engineName === 'Commercial APIs') {
+            const providerEl = $('cfg-provider');
+            slot = providerEl?.value?.toLowerCase() || 'openai';
+        }
+        if (!slot) continue;
+        if (newKey) {
+            const label = box.nextElementSibling;
+            if (_saveBrowserKey(slot, newKey)) {
+                keyField.value = '';  // clear field; hint shows key is saved
+                keyField.placeholder = '••••••••  (saved in browser — leave blank to keep)';
+                keyField.dataset.hasBrowser = 'true';
+                box.checked = true;
+                if (label) label.textContent = 'Key saved in browser';
+            } else {
+                box.checked = false;
+                if (label) label.textContent = 'Could not save key in browser';
+            }
+        } else if (!box.checked && _hasBrowserKey(slot)) {
+            // Explicit opt-out: unchecked + no typed key → delete saved key
+            _saveBrowserKey(slot, '');
+            delete keyField?.dataset?.hasBrowser;
+        }
+    }
+}
+async function onLoadModel() {
+    const name = $('engine-select').value;
+    const config = collectConfig();
+    // Attach Kraken preset ID if one is selected
+    if (name === 'Kraken') {
+        const presetSel = $('kraken-preset-select');
+        if (presetSel?.value) config.preset_id = presetSel.value;
+    }
+    const btn = $('btn-load-model');
+    const status = $('engine-status');
+    btn.classList.add('loading');
+    btn.textContent = 'Loading...';
+    status.className = 'status-badge status-loading';
+    status.textContent = `Loading ${name}...`;
+    status.classList.remove('hidden');
+    try {
+        if (name === 'OpenWebUI') {
+            config.base_url = _normalizeBaseUrl(config.base_url);
+            config.model = _resolveOpenWebUIModel(config);
+            if (!config.base_url) throw new Error('Enter your OpenWebUI base URL');
+            if (!config.api_key) throw new Error('Enter your OpenWebUI API key');
+            if (!config.model) throw new Error('Load the model list or enter an OpenWebUI model ID');
+            _browserOpenWebUIConfig = { ...config };
+            state.engineLoaded = true;
+            status.className = 'status-badge status-loaded';
+            status.textContent = `${name} ready in browser (${config.model})`;
+            _persistNewKeys(name);
+            const storedConfig = { ...config };
+            delete storedConfig.api_key;
+            saveEngineConfig(name, storedConfig);
+            emit('engine-loaded', {
+                success: true,
+                load_time_s: 0,
+                engine_name: name,
+                browser_direct: true,
+            });
+            return;
+        }
+        const resp = await api('/api/engine/load', {
+            method: 'POST',
+            body: JSON.stringify({ engine_name: name, config }),
+        });
+        const data = await resp.json();
+        state.engineLoaded = true;
+        status.className = 'status-badge status-loaded';
+        status.textContent = `${name} loaded (${data.load_time_s}s)`;
+        _persistNewKeys(name);   // save keys only after the typed key was used for loading
+        // Persist engine + config for next session
+        const storedConfig = { ...config };
+        delete storedConfig.api_key;
+        saveEngineConfig(name, storedConfig);
+        emit('engine-loaded', data);
+    } catch (err) {
+        status.className = 'status-badge';
+        status.style.color = 'var(--danger)';
+        status.textContent = `Error: ${err.message}`;
+        state.engineLoaded = false;
+    } finally {
+        btn.classList.remove('loading');
+        btn.textContent = 'Load Model';
+    }
+}
+async function onTranscribe() {
+    if (state.isProcessing) return;
+    if (!state.engineLoaded || !state.imageId) return;
+    state.isProcessing = true;
+    const btn = $('btn-transcribe');
+    btn.classList.add('loading');
+    btn.textContent = 'Transcribing...';
+    btn.disabled = true;
+    $('btn-cancel').classList.remove('hidden');
+    const segMethod = $('seg-method').value;
+    const segDevice = $('seg-device').value;
+    const maxColumns = parseInt($('seg-max-columns')?.value || '6', 10);
+    const splitWidth = parseFloat($('seg-split-width')?.value || '40') / 100;
+    const textDirection = $('seg-text-direction')?.value || 'horizontal-lr';
+    emit('transcription-start');
+    try {
+        if ($('engine-select').value === 'OpenWebUI') {
+            await transcribeOpenWebUIInBrowser();
+            return;
+        }
+        // Collect live config overrides — non-password form fields are sent at
+        // transcription time so changes (e.g. custom_prompt, thinking_mode) take
+        // effect immediately without requiring a model reload.
+        const liveOverrides = {};
+        for (const el of $('config-form').querySelectorAll('[data-key]')) {
+            if (el.dataset.saveFor) continue;       // skip "save key" checkboxes
+            if (el.dataset.passwordField) continue; // never resend secrets
+            const key = el.dataset.key;
+            if (el.type === 'checkbox')      liveOverrides[key] = el.checked;
+            else if (el.type === 'number')   liveOverrides[key] = Number(el.value);
+            else                             liveOverrides[key] = el.value;
+        }
+        const resp = await fetch('/api/transcribe', {
+            method: 'POST',
+            headers: { 'Content-Type': 'application/json' },
+            body: JSON.stringify({
+                image_id: state.imageId,
+                seg_method: segMethod,
+                seg_device: segDevice,
+                max_columns: maxColumns,
+                split_width_fraction: splitWidth,
+                text_direction: textDirection,
+                engine_config_overrides: liveOverrides,
+            }),
+        });
+        if (!resp.ok) {
+            const err = await resp.json().catch(() => ({ detail: resp.statusText }));
+            throw new Error(err.detail || 'Transcription failed');
+        }
+        const reader = resp.body.getReader();
+        const decoder = new TextDecoder();
+        let buffer = '';
+        while (true) {
+            const { done, value } = await reader.read();
+            if (done) break;
+            buffer += decoder.decode(value, { stream: true });
+            const parts = buffer.split('\n\n');
+            buffer = parts.pop(); // keep incomplete
+            for (const part of parts) {
+                if (!part.trim()) continue;
+                const eventMatch = part.match(/event: (\w+)/);
+                const dataMatch = part.match(/data: (.+)/s);
+                if (eventMatch && dataMatch) {
+                    const eventName = eventMatch[1];
+                    const data = JSON.parse(dataMatch[1]);
+                    emit(`sse-${eventName}`, data);
+                }
+            }
+        }
+    } catch (err) {
+        if (err.name === 'AbortError') emit('sse-cancelled', {});
+        else emit('transcription-error', { message: err.message });
+    } finally {
+        _browserOpenWebUIAbort = null;
+    }
+}
+async function transcribeOpenWebUIInBrowser() {
+    const config = { ...(_browserOpenWebUIConfig || collectConfig()) };
+    config.base_url = _normalizeBaseUrl(config.base_url);
+    config.api_key = config.api_key || _loadBrowserKey('openwebui');
+    config.model = _resolveOpenWebUIModel(config);
+    if (!config.base_url) throw new Error('Enter your OpenWebUI base URL');
+    if (!config.api_key) throw new Error('Enter your OpenWebUI API key');
+    if (!config.model) throw new Error('Load the model list or enter an OpenWebUI model ID');
+    const imageResp = await fetch(`/api/image/${state.imageId}`);
+    if (!imageResp.ok) throw new Error('Could not load uploaded image');
+    const imageBlob = await imageResp.blob();
+    const dataUrl = await _blobToDataUrl(imageBlob);
+    emit('sse-segmentation', {
+        num_lines: 1,
+        bboxes: [[0, 0, state.imageInfo?.width || 0, state.imageInfo?.height || 0]],
+        source: 'page',
+    });
+    const prompt = (config.custom_prompt || '').trim() ||
+        'Transcribe all handwritten text in this manuscript image. Preserve the original language and layout. Output only the transcribed text without any additional commentary.';
+    const body = {
+        model: config.model,
+        messages: [{
+            role: 'user',
+            content: [
+                { type: 'text', text: prompt },
+                { type: 'image_url', image_url: { url: dataUrl } },
+            ],
+        }],
+        temperature: Number.isFinite(config.temperature) ? config.temperature : 0.1,
+    };
+    if (config.max_tokens && config.max_tokens > 0) body.max_tokens = config.max_tokens;
+    _browserOpenWebUIAbort = new AbortController();
+    const started = Date.now();
+    const resp = await fetch(`${config.base_url}/chat/completions`, {
+        method: 'POST',
+        headers: {
+            'Authorization': `Bearer ${config.api_key}`,
+            'Content-Type': 'application/json',
+            'Accept': 'application/json',
+        },
+        body: JSON.stringify(body),
+        signal: _browserOpenWebUIAbort.signal,
+    });
+    const text = await resp.text();
+    if (!resp.ok) {
+        throw new Error(`OpenWebUI HTTP ${resp.status}: ${text.slice(0, 240)}`);
+    }
+    let payload;
+    try {
+        payload = JSON.parse(text);
+    } catch (_) {
+        throw new Error(`OpenWebUI returned non-JSON response: ${text.slice(0, 240)}`);
+    }
+    const output = (payload.choices?.[0]?.message?.content || '').trim();
+    const tokenUsage = payload.usage ? {
+        prompt_tokens: payload.usage.prompt_tokens,
+        output_tokens: payload.usage.completion_tokens,
+        total_tokens: payload.usage.total_tokens,
+    } : null;
+    const line = {
+        index: 0,
+        text: output,
+        confidence: null,
+        bbox: [0, 0, state.imageInfo?.width || 0, state.imageInfo?.height || 0],
+        region: 0,
+    };
+    const progress = { current: 1, total: 1, line };
+    if (tokenUsage) progress.token_usage = tokenUsage;
+    emit('sse-progress', progress);
+    const complete = {
+        lines: [line],
+        total_time_s: Math.round((Date.now() - started) / 10) / 100,
+        engine: 'OpenWebUI',
+        browser_direct: true,
+    };
+    if (tokenUsage) complete.token_usage = tokenUsage;
+    emit('sse-complete', complete);
+}
+function updateTranscribeBtn() {
+    $('btn-transcribe').disabled = !(state.engineLoaded && state.imageId && !state.isProcessing);
+}
+function updateSegmentBtn() {
+    $('btn-segment').disabled = !(state.imageId && !state.isProcessing);
+}
+async function onSegment() {
+    if (!state.imageId || state.isProcessing) return;
+    const btn = $('btn-segment');
+    const segMethod   = $('seg-method').value;
+    const segDevice   = $('seg-device').value;
+    const maxColumns  = parseInt($('seg-max-columns')?.value || '6', 10);
+    const splitWidth  = parseFloat($('seg-split-width')?.value || '40') / 100;
+    const textDirection = $('seg-text-direction')?.value || 'horizontal-lr';
+    btn.classList.add('loading');
+    btn.textContent = 'Segmenting…';
+    btn.disabled = true;
+    try {
+        const params = new URLSearchParams({
+            method: segMethod, device: segDevice,
+            max_columns: maxColumns, split_width_fraction: splitWidth,
+            text_direction: textDirection,
+        });
+        const resp = await api(`/api/image/${state.imageId}/segment?${params}`);
+        if (!resp.ok) {
+            const err = await resp.json().catch(() => ({ detail: resp.statusText }));
+            throw new Error(err.detail || resp.statusText);
+        }
+        const data = await resp.json();
+        // Reuse the same event the transcription flow uses — draws bboxes on canvas
+        emit('sse-segmentation', data);
+        if (data.source !== 'page') {
+            toast(`${data.num_lines} lines found (${data.source})`, 'success', 3000);
+        }
+        emit('segment-preview');  // switch mobile tab to image view
+    } catch (err) {
+        toast(`Segmentation failed: ${err.message}`, 'error');
+    } finally {
+        btn.classList.remove('loading');
+        btn.textContent = 'Segment';
+        updateSegmentBtn();
+    }
+}
+/**
+ * Populate a <select> with an array of options.
+ * Each option may be a string or {label, value}.
+ * Tries to restore previousValue after repopulating.
+ */
+function _populateSelect(select, options, previousValue) {
+    select.innerHTML = '';
+    if (options.length === 0) {
+        const o = document.createElement('option');
+        o.value = '';
+        o.textContent = '— click ↻ to load —';
+        select.appendChild(o);
+        return;
+    }
+    for (const opt of options) {
+        const o = document.createElement('option');
+        o.value = typeof opt === 'object' ? opt.value : opt;
+        o.textContent = typeof opt === 'object' ? opt.label : opt;
+        select.appendChild(o);
+    }
+    if (previousValue != null) {
+        // Restore previous selection if it still exists
+        const match = Array.from(select.options).find(o => o.value === previousValue);
+        if (match) select.value = previousValue;
+    }
+}
+// Region swatch colours: same RGB values and order as REGION_COLORS in image-viewer.js, but at alpha 0.9 (the canvas overlay uses 0.55)
+const _REGION_COLORS = [
+    'rgba(255,160,30,0.9)', 'rgba(46,213,115,0.9)', 'rgba(232,65,24,0.9)',
+    'rgba(52,172,224,0.9)', 'rgba(162,16,213,0.9)', 'rgba(255,211,42,0.9)',
+    'rgba(18,203,196,0.9)', 'rgba(253,89,166,0.9)',
+];
+function renderRegionList(regions) {
+    const list = $('seg-regions-list');
+    list.innerHTML = '';
+    if (!regions.length) { list.classList.add('hidden'); return; }
+    list.classList.remove('hidden');
+    const hdr = document.createElement('div');
+    hdr.className = 'seg-regions-header';
+    hdr.textContent = `Regions (${regions.length})`;
+    list.appendChild(hdr);
+    regions.forEach((r, i) => {
+        const row = document.createElement('div');
+        row.className = 'seg-region-row';
+        const dot = document.createElement('span');
+        dot.className = 'seg-region-dot';
+        dot.style.background = _REGION_COLORS[i % _REGION_COLORS.length];
+        const label = document.createElement('span');
+        label.className = 'seg-region-label';
+        label.textContent = `R${i + 1}`;
+        const count = document.createElement('span');
+        count.className = 'seg-region-count';
+        count.textContent = `${r.num_lines} line${r.num_lines !== 1 ? 's' : ''}`;
+        const delBtn = document.createElement('button');
+        delBtn.className = 'seg-region-del btn-icon';
+        delBtn.textContent = '×';
+        delBtn.title = 'Delete this region';
+        delBtn.addEventListener('click', async () => {
+            delBtn.disabled = true;
+            try {
+                const resp = await api(`/api/image/${state.imageId}/region/${i}`, { method: 'DELETE' });
+                const data = await resp.json();
+                emit('sse-segmentation', data);
+                toast(`Region R${i + 1} removed`, 'info', 2000);
+            } catch (err) {
+                toast(`Delete failed: ${err.message}`, 'error');
+                delBtn.disabled = false;
+            }
+        });
+        row.appendChild(dot);
+        row.appendChild(label);
+        row.appendChild(count);
+        row.appendChild(delBtn);
+        list.appendChild(row);
+    });
+}

web/static/components/image-viewer.js ADDED Viewed

	@@ -0,0 +1,294 @@

+/**
+ * Image Viewer — upload, display, bbox overlay
+ */
+import { state, emit, on, api, fitZoom, toast } from '../app.js';
+const $ = id => document.getElementById(id);  // shorthand: look an element up by id
+const IMAGE_EXTENSIONS = new Set(['.jpg', '.jpeg', '.png', '.tif', '.tiff', '.bmp', '.gif', '.webp']);  // lower-case with leading dot, the form extensionOf() returns
+function extensionOf(file) {
+    const name = file?.name || '';
+    const dot = name.lastIndexOf('.');
+    return dot >= 0 ? name.slice(dot).toLowerCase() : '';
+}
+function isImageOrPdf(file) {
+    const ext = extensionOf(file);
+    return file.type.startsWith('image/') || ext === '.pdf' || IMAGE_EXTENSIONS.has(ext);
+}
+export function initImageViewer() {
+    const uploadArea = $('upload-area');
+    const fileInput = $('file-input');
+    const xmlInput = $('xml-input');
+    const viewerScroll = $('viewer-scroll');
+    const viewerPlaceholder = $('viewer-placeholder');
+    const handleDroppedFiles = files => {
+        const img = files.find(isImageOrPdf);
+        const xml = files.find(f => f.name.toLowerCase().endsWith('.xml'));
+        if (img) uploadFile(img);
+        if (xml) uploadXml(xml);  // queued after image upload sets imageId
+    };
+    // Click to browse image
+    uploadArea.addEventListener('click', () => fileInput.click());
+    // File selected
+    fileInput.addEventListener('change', () => {
+        if (fileInput.files.length > 0) uploadFile(fileInput.files[0]);
+    });
+    // Drag & drop — accept image, PDF, and XML on the upload box or viewer.
+    const dropTargets = [uploadArea, viewerScroll, viewerPlaceholder].filter(Boolean);
+    dropTargets.forEach(target => {
+        target.addEventListener('dragover', e => {
+            e.preventDefault();
+            uploadArea.classList.add('dragover');
+            if (viewerPlaceholder && !state.imageId) viewerPlaceholder.classList.add('dragover');
+        });
+        target.addEventListener('dragleave', e => {
+            if (!e.currentTarget.contains(e.relatedTarget)) {
+                uploadArea.classList.remove('dragover');
+                viewerPlaceholder?.classList.remove('dragover');
+            }
+        });
+        target.addEventListener('drop', e => {
+            e.preventDefault();
+            uploadArea.classList.remove('dragover');
+            viewerPlaceholder?.classList.remove('dragover');
+            handleDroppedFiles(Array.from(e.dataTransfer.files));
+        });
+    });
+    // Keep the explicit upload area compatible with batch-panel's capture-phase
+    // drop interception for multi-image queues.
+    uploadArea.addEventListener('drop', e => {
+        e.preventDefault();
+    });
+    // XML file picker
+    xmlInput.addEventListener('change', () => {
+        if (xmlInput.files.length > 0) uploadXml(xmlInput.files[0]);
+    });
+    // Batch panel: load a completed item's image into the viewer
+    on('batch-item-start', ({ imageId, filename }) => {
+        state.imageId = imageId;
+        // Clear bboxes immediately for the new item
+        currentBboxes  = [];
+        currentRegions = [];
+        const img = $('page-image');
+        img.src = `/api/image/${imageId}`;
+        $('image-container').classList.remove('hidden');
+        $('viewer-placeholder').classList.add('hidden');
+        img.onload = () => {
+            const canvas = $('overlay-canvas');
+            canvas.width  = img.naturalWidth;
+            canvas.height = img.naturalHeight;
+            fitZoom();
+            // Redraw any bboxes that arrived before the image finished loading
+            if (currentBboxes.length > 0) {
+                drawBboxes(currentBboxes, -1, currentRegions);
+            } else {
+                const ctx = canvas.getContext('2d');
+                ctx.clearRect(0, 0, canvas.width, canvas.height);
+            }
+        };
+        $('image-info').textContent = filename;
+        $('xml-upload-row').classList.remove('hidden');
+        $('xml-status').textContent = 'No PAGE XML';
+        $('xml-status').classList.remove('xml-ok');
+        emit('transcription-start', {});
+    });
+    // Draw bboxes after segmentation; keep state.regions in sync
+    on('sse-segmentation', data => {
+        state.regions = data.regions || [];
+        if (data.source === 'page') {
+            // Page-level engine: clear any old line bboxes, don't draw full-page box
+            drawBboxes([], -1, []);
+        } else {
+            drawBboxes(data.bboxes, -1, state.regions);
+        }
+        if (data.source === 'pagexml') {
+            $('xml-status').textContent = `PAGE XML: ${data.num_lines} lines`;
+        }
+    });
+    // Highlight line on click from transcription panel
+    on('highlight-line', ({ index }) => highlightBbox(index));
+    // Click on canvas → highlight the clicked bbox and emit highlight-line
+    const canvas = $('overlay-canvas');
+    canvas.addEventListener('click', e => {
+        if (currentBboxes.length === 0) return;
+        const img = $('page-image');
+        // Scale factor: natural image coords / displayed canvas coords
+        const scaleX = img.naturalWidth / img.clientWidth;
+        const scaleY = img.naturalHeight / img.clientHeight;
+        const rect = canvas.getBoundingClientRect();
+        const clickX = (e.clientX - rect.left) * scaleX;
+        const clickY = (e.clientY - rect.top) * scaleY;
+        for (let i = 0; i < currentBboxes.length; i++) {
+            const [x1, y1, x2, y2] = currentBboxes[i];
+            if (clickX >= x1 && clickX <= x2 && clickY >= y1 && clickY <= y2) {
+                emit('highlight-line', { index: i });
+                break;
+            }
+        }
+    });
+}
+async function uploadFile(file) {
+    const formData = new FormData();
+    formData.append('file', file);
+    $('image-info').textContent = 'Uploading...';
+    try {
+        const resp = await fetch('/api/image/upload', {
+            method: 'POST',
+            body: formData,
+        });
+        if (!resp.ok) {
+            const err = await resp.json();
+            throw new Error(err.detail);
+        }
+        const data = await resp.json();
+        // PDF: redirect all pages to batch panel
+        if (data.is_pdf) {
+            $('image-info').textContent = `PDF: ${data.num_pages} page(s) — added to batch queue`;
+            emit('pdf-pages-ready', data);
+            return;
+        }
+        state.imageId = data.image_id;
+        state.imageInfo = data;
+        // Display image — show container, hide placeholder
+        const img = $('page-image');
+        img.src = `/api/image/${data.image_id}`;
+        $('image-container').classList.remove('hidden');
+        $('viewer-placeholder').classList.add('hidden');
+        // Wait for image to load to size canvas and fit zoom
+        img.onload = () => {
+            const canvas = $('overlay-canvas');
+            canvas.width = img.naturalWidth;
+            canvas.height = img.naturalHeight;
+            fitZoom();  // sets img.style.width/height and canvas display size
+        };
+        $('image-info').textContent = `${data.filename} (${data.width}×${data.height})`;
+        // Show XML upload row
+        $('xml-upload-row').classList.remove('hidden');
+        $('xml-status').textContent = 'No PAGE XML';
+        $('xml-status').classList.remove('xml-ok');
+        emit('image-uploaded', data);
+    } catch (err) {
+        $('image-info').textContent = `Error: ${err.message}`;
+        toast(`Upload failed: ${err.message}`, 'error', 7000);
+    }
+}
+async function uploadXml(file) {
+    if (!state.imageId) {
+        // Will retry after image upload finishes
+        on('image-uploaded', () => uploadXml(file), { once: true });
+        return;
+    }
+    const xmlStatus = $('xml-status');
+    xmlStatus.textContent = 'Uploading XML...';
+    xmlStatus.classList.remove('xml-ok');
+    try {
+        const formData = new FormData();
+        formData.append('file', file);
+        const resp = await fetch(`/api/image/${state.imageId}/xml`, {
+            method: 'POST',
+            body: formData,
+        });
+        if (!resp.ok) {
+            const err = await resp.json();
+            throw new Error(err.detail);
+        }
+        xmlStatus.textContent = `✓ ${file.name}`;
+        xmlStatus.classList.add('xml-ok');
+        emit('xml-uploaded', { filename: file.name });
+    } catch (err) {
+        xmlStatus.textContent = `XML error: ${err.message}`;
+    }
+}
+let currentBboxes = [];  // line boxes last passed to drawBboxes(); read by canvas hit-testing and highlightBbox()
+let currentRegions = [];  // regions last passed to drawBboxes(), kept so a highlight redraw can repaint them
+// Distinct colours for up to 8 regions (cycling). The alpha must stay literally "0.55": drawBboxes() derives its fill and label colours with color.replace('0.55', …)
+const REGION_COLORS = [
+    'rgba(255, 160,  30, 0.55)',  // orange
+    'rgba( 46, 213, 115, 0.55)',  // green
+    'rgba(232,  65, 24,  0.55)',  // red
+    'rgba( 52, 172, 224, 0.55)',  // blue
+    'rgba(162,  16, 213, 0.55)',  // purple
+    'rgba(255, 211,  42, 0.55)',  // yellow
+    'rgba( 18, 203, 196, 0.55)',  // teal
+    'rgba(253,  89, 166, 0.55)',  // pink
+];
+function drawBboxes(bboxes, highlightIndex = -1, regions = []) {
+    currentBboxes = bboxes;
+    currentRegions = regions;
+    const canvas = $('overlay-canvas');
+    const img = $('page-image');
+    const ctx = canvas.getContext('2d');
+    // Keep canvas display size in sync with zoom-controlled img size
+    canvas.style.width  = img.style.width  || img.clientWidth  + 'px';
+    canvas.style.height = img.style.height || img.clientHeight + 'px';
+    ctx.clearRect(0, 0, canvas.width, canvas.height);
+    // Draw region outlines first (underneath line boxes)
+    regions.forEach((r, ri) => {
+        const [x1, y1, x2, y2] = r.bbox;
+        const color = REGION_COLORS[ri % REGION_COLORS.length];
+        ctx.strokeStyle = color;
+        ctx.lineWidth = 2.5;
+        ctx.setLineDash([8, 4]);
+        ctx.strokeRect(x1, y1, x2 - x1, y2 - y1);
+        ctx.setLineDash([]);
+        // Subtle fill
+        ctx.fillStyle = color.replace('0.55', '0.07');
+        ctx.fillRect(x1, y1, x2 - x1, y2 - y1);
+        // Region label
+        ctx.fillStyle = color.replace('0.55', '0.9');
+        ctx.font = 'bold 13px sans-serif';
+        ctx.fillText(`R${ri + 1} (${r.num_lines} lines)`, x1 + 4, y1 + 16);
+    });
+    // Draw line boxes on top
+    for (let i = 0; i < bboxes.length; i++) {
+        const [x1, y1, x2, y2] = bboxes[i];
+        const isHighlighted = i === highlightIndex;
+        ctx.strokeStyle = isHighlighted ? '#e94560' : 'rgba(58, 134, 255, 0.6)';
+        ctx.lineWidth = isHighlighted ? 3 : 1.5;
+        ctx.strokeRect(x1, y1, x2 - x1, y2 - y1);
+        if (isHighlighted) {
+            ctx.fillStyle = 'rgba(233, 69, 96, 0.1)';
+            ctx.fillRect(x1, y1, x2 - x1, y2 - y1);
+        }
+    }
+}
+function highlightBbox(index) {
+    if (currentBboxes.length > 0) {
+        drawBboxes(currentBboxes, index, currentRegions);
+    }
+}

web/static/components/transcription-panel.js ADDED Viewed

	@@ -0,0 +1,482 @@

+/**
+ * Transcription Panel — SSE progress, results, export
+ */
+import { state, emit, on, toast } from '../app.js';
+const $ = id => document.getElementById(id);  // shorthand: look an element up by id
+// ── Font selector ───────────────────────────────────────────────────────
+const LS_FONT = 'polyscriptor_results_font';  // localStorage key holding the chosen font value
+const FONTS = [  // label: text shown in the selector; value: CSS family name ('' removes the override); gf: `family=` parameter for Google Fonts
+    { label: 'Monospace (default)',  value: '' },
+    { label: 'Monomakh Unicode ✦',  value: 'Monomakh',         local: true            },  // `local` is not read in this file; presumably the font is bundled with the app (confirm in the CSS)
+    { label: 'Old Standard TT',     value: 'Old Standard TT',  gf: 'Old+Standard+TT'  },
+    { label: 'Noto Serif',          value: 'Noto Serif',       gf: 'Noto+Serif'        },
+    { label: 'Crimson Pro',         value: 'Crimson Pro',      gf: 'Crimson+Pro'       },
+    { label: 'IM Fell English',     value: 'IM Fell English',  gf: 'IM+Fell+English'   },
+];
+const _loadedFonts = new Set();  // Google Fonts stylesheet URLs already added to <head>
+function _loadGoogleFont(gfParam) {
+    const url = `https://fonts.googleapis.com/css2?family=${gfParam}&display=swap`;
+    if (_loadedFonts.has(url)) return;
+    const link = document.createElement('link');
+    link.rel = 'stylesheet';
+    link.href = url;
+    document.head.appendChild(link);
+    _loadedFonts.add(url);
+}
+function applyFont(value) {
+    const f = FONTS.find(f => f.value === value);
+    if (!f) return;
+    if (f.gf) _loadGoogleFont(f.gf);
+    if (f.value) {
+        document.documentElement.style.setProperty(
+            '--font-results', `"${f.value}", Georgia, serif`);
+    } else {
+        document.documentElement.style.removeProperty('--font-results');
+    }
+}
+export function initTranscriptionPanel() {
+    let _transcribeStart = null;
+    let _numRegions = 1;
+    let _columnMode = false;
+    // Confidence threshold slider
+    const slider = $('conf-threshold');
+    const sliderVal = $('conf-threshold-val');
+    slider.addEventListener('input', () => {
+        const threshold = parseInt(slider.value, 10);
+        sliderVal.textContent = threshold + '%';
+        applyConfidenceFilter(threshold);
+    });
+    // Search / filter
+    const searchInput = $('results-search');
+    searchInput.addEventListener('input', () => applySearch(searchInput.value));
+    // Clear search on new transcription
+    function resetSearch() {
+        searchInput.value = '';
+        $('results-search-row').classList.add('hidden');
+        $('results-search-count').textContent = '';
+    }
+    // Font selector — populate, restore, handle changes
+    const fontSel = $('font-select');
+    for (const f of FONTS) {
+        const o = document.createElement('option');
+        o.value = f.value;
+        o.textContent = f.label;
+        fontSel.appendChild(o);
+    }
+    const savedFont = (() => { try { return localStorage.getItem(LS_FONT) || ''; } catch { return ''; } })();
+    fontSel.value = savedFont;
+    if (savedFont) applyFont(savedFont);
+    fontSel.addEventListener('change', () => {
+        applyFont(fontSel.value);
+        try { localStorage.setItem(LS_FONT, fontSel.value); } catch { /* private mode */ }
+    });
+    // Column layout toggle
+    $('btn-col-layout').addEventListener('click', () => {
+        _columnMode = !_columnMode;
+        $('btn-col-layout').classList.toggle('active', _columnMode);
+        if (_columnMode) renderAllColumns();
+        else renderAllFlat();
+    });
+    on('transcription-start', () => {
+        state.lines = [];
+        _transcribeStart = null;
+        _numRegions = 1;
+        _columnMode = false;
+        $('btn-col-layout').classList.add('hidden');
+        $('btn-col-layout').classList.remove('active');
+        $('transcription-lines').innerHTML = '';
+        $('transcription-lines').classList.remove('col-layout');
+        $('progress-container').classList.remove('hidden');
+        $('results-footer').classList.add('hidden');
+        $('conf-filter-row').classList.add('hidden');
+        resetSearch();
+        $('progress-fill').style.width = '0%';
+        $('progress-fill').style.background = '';  // reset error colour
+        $('progress-text').textContent = 'Segmenting...';
+    });
+    // Highlight line in transcription panel when a bbox is clicked (or line clicked)
+    on('highlight-line', ({ index }) => {
+        const container = $('transcription-lines');
+        container.querySelectorAll('.line-active').forEach(el => el.classList.remove('line-active'));
+        const target = container.querySelector(`[data-index="${index}"]`);
+        if (target) {
+            target.classList.add('line-active');
+            target.scrollIntoView({ block: 'nearest', behavior: 'smooth' });
+        }
+    });
+    on('sse-status', data => {
+        $('progress-text').textContent = data.message;
+    });
+    on('sse-segmentation', data => {
+        if (data.source === 'page') {
+            $('progress-text').textContent = 'Processing full page...';
+        } else {
+            $('progress-text').textContent = `${data.num_lines} lines found. Transcribing...`;
+        }
+    });
+    on('sse-progress', data => {
+        const pct = Math.round((data.current / data.total) * 100);
+        $('progress-fill').style.width = pct + '%';
+        // ETA
+        const now = Date.now();
+        if (!_transcribeStart) _transcribeStart = now;
+        const elapsed = (now - _transcribeStart) / 1000;
+        const rate = data.current / elapsed;  // lines/s
+        const remaining = rate > 0 ? Math.round((data.total - data.current) / rate) : null;
+        const etaStr = remaining != null
+            ? ` · ~${remaining < 60 ? remaining + 's' : Math.round(remaining / 60) + 'min'} left`
+            : '';
+        let tokenStr = '';
+        if (data.token_usage) {
+            const tu = data.token_usage;
+            const parts = [];
+            if (tu.prompt_tokens  != null) parts.push(`in:${tu.prompt_tokens}`);
+            if (tu.output_tokens  != null) parts.push(`out:${tu.output_tokens}`);
+            if (tu.thinking_tokens != null && tu.thinking_tokens > 0) parts.push(`think:${tu.thinking_tokens}`);
+            if (parts.length) tokenStr = ` | ${parts.join(' ')} tok`;
+        }
+        $('progress-text').textContent = `${data.current} / ${data.total} lines${etaStr}${tokenStr}`;
+        _numRegions = Math.max(_numRegions, (data.line.region ?? 0) + 1);
+        state.lines.push(data.line);
+        appendLine(data.line);
+    });
+    on('sse-complete', data => {
+        $('progress-container').classList.add('hidden');
+        $('results-footer').classList.remove('hidden');
+        $('btn-export-xml').classList.toggle('hidden', !!data.browser_direct);
+        let summary = `${data.lines.length} lines in ${data.total_time_s}s (${data.engine})`;
+        if (data.token_usage) {
+            const tu = data.token_usage;
+            const parts = [];
+            if (tu.prompt_tokens != null)   parts.push(`in: ${tu.prompt_tokens}`);
+            if (tu.output_tokens != null)   parts.push(`out: ${tu.output_tokens}`);
+            if (tu.thinking_tokens != null && tu.thinking_tokens > 0)
+                parts.push(`think: ${tu.thinking_tokens}`);
+            if (parts.length) summary += ` | tokens: ${parts.join(', ')}`;
+        }
+        $('results-summary').textContent = summary;
+        // Show confidence filter if any line has confidence data
+        if (state.lines.some(l => l.confidence != null)) {
+            $('conf-filter-row').classList.remove('hidden');
+            slider.value = 0;
+            sliderVal.textContent = '0%';
+        }
+        // Show search if there are results
+        if (state.lines.length > 0) {
+            $('results-search-row').classList.remove('hidden');
+        }
+        // Show column layout toggle if multiple regions detected
+        if (_numRegions > 1) {
+            $('btn-col-layout').classList.remove('hidden');
+        }
+        emit('transcription-complete', data);
+    });
+    on('sse-cancelled', () => {
+        $('progress-text').textContent = 'Cancelled';
+        $('progress-fill').style.width = '0%';
+        // Show footer if we have partial results
+        if (state.lines.length > 0) {
+            $('results-footer').classList.remove('hidden');
+            $('results-summary').textContent = `Cancelled — ${state.lines.length} lines transcribed`;
+        }
+        emit('transcription-complete', {});
+    });
+    on('sse-error', data => {
+        $('progress-text').textContent = `Error: ${data.message}`;
+        $('progress-fill').style.width = '0%';
+        $('progress-fill').style.background = 'var(--danger)';
+        emit('transcription-complete', {});
+    });
+    on('transcription-error', data => {
+        $('progress-text').textContent = `Error: ${data.message}`;
+        emit('transcription-complete', {});
+    });
+    // Also hide Export XML when a new transcription starts
+    on('transcription-start', () => {
+        $('btn-export-xml').classList.add('hidden');
+    });
+    $('btn-copy-text').addEventListener('click', copyText);
+    $('btn-export-txt').addEventListener('click', exportTxt);
+    $('btn-export-csv').addEventListener('click', exportCsv);
+    $('btn-export-xml').addEventListener('click', exportXml);
+}
+function renderAllFlat() {
+    const container = $('transcription-lines');
+    container.innerHTML = '';
+    container.classList.remove('col-layout');
+    state.lines.forEach(line => appendLine(line));
+}
+function renderAllColumns() {
+    const container = $('transcription-lines');
+    container.innerHTML = '';
+    container.classList.add('col-layout');
+    const maxRegion = state.lines.reduce((m, l) => Math.max(m, l.region ?? 0), 0);
+    const groups = Array.from({ length: maxRegion + 1 }, () => []);
+    state.lines.forEach(line => groups[line.region ?? 0].push(line));
+    groups.forEach((lines, r) => {
+        const col = document.createElement('div');
+        col.className = 'region-column';
+        const hdr = document.createElement('div');
+        hdr.className = 'region-col-header';
+        const title = document.createElement('span');
+        title.textContent = `Column ${r + 1}  (${lines.length})`;
+        hdr.appendChild(title);
+        const closeBtn = document.createElement('button');
+        closeBtn.className = 'region-col-close';
+        closeBtn.textContent = '×';
+        closeBtn.title = 'Hide this column';
+        closeBtn.addEventListener('click', e => { e.stopPropagation(); col.remove(); });
+        hdr.appendChild(closeBtn);
+        col.appendChild(hdr);
+        lines.forEach(line => appendLine(line, col));
+        container.appendChild(col);
+    });
+}
+function appendLine(line, container = null) {
+    container = container || $('transcription-lines');
+    const div = document.createElement('div');
+    div.className = 'line-result';
+    div.dataset.index = line.index;
+    if (line.confidence != null) {
+        div.dataset.confidence = Math.round(line.confidence * 100);
+    }
+    // Line number
+    const numSpan = document.createElement('span');
+    numSpan.className = 'line-num';
+    numSpan.textContent = line.index + 1;
+    // Editable text span
+    const textSpan = document.createElement('span');
+    textSpan.className = 'line-text';
+    textSpan.textContent = line.text;
+    // Confidence badge
+    let confSpan = null;
+    if (line.confidence != null) {
+        const pct = Math.round(line.confidence * 100);
+        const cls = pct >= 90 ? 'conf-high' : pct >= 75 ? 'conf-mid' : 'conf-low';
+        confSpan = document.createElement('span');
+        confSpan.className = `confidence ${cls}`;
+        confSpan.textContent = pct + '%';
+    }
+    div.appendChild(numSpan);
+    div.appendChild(textSpan);
+    if (confSpan) div.appendChild(confSpan);
+    // Thinking text (Gemini reasoning) — collapsible per line
+    if (line.thinking_text) {
+        const details = document.createElement('details');
+        details.className = 'thinking-block';
+        const summary = document.createElement('summary');
+        summary.className = 'thinking-toggle';
+        summary.textContent = 'reasoning';
+        const pre = document.createElement('pre');
+        pre.className = 'thinking-text';
+        pre.textContent = line.thinking_text;
+        details.appendChild(summary);
+        details.appendChild(pre);
+        div.appendChild(details);
+    }
+    // Single click → highlight bbox on image
+    div.addEventListener('click', e => {
+        if (textSpan.contentEditable === 'true') return; // don't interfere while editing
+        emit('highlight-line', { index: line.index });
+    });
+    // Double-click → start inline editing
+    textSpan.addEventListener('dblclick', e => {
+        e.stopPropagation();
+        textSpan.contentEditable = 'true';
+        textSpan.focus();
+        // Select all text for easy replacement
+        const range = document.createRange();
+        range.selectNodeContents(textSpan);
+        const sel = window.getSelection();
+        sel.removeAllRanges();
+        sel.addRange(range);
+    });
+    // Save on blur or Enter
+    const saveEdit = () => {
+        textSpan.contentEditable = 'false';
+        const newText = textSpan.textContent;
+        if (newText !== line.text) {
+            state.lines[line.index].text = newText;
+            div.classList.add('line-edited');
+        }
+    };
+    textSpan.addEventListener('blur', saveEdit);
+    textSpan.addEventListener('keydown', e => {
+        if (e.key === 'Enter') { e.preventDefault(); saveEdit(); }
+        if (e.key === 'Escape') {
+            textSpan.textContent = state.lines[line.index].text; // revert
+            textSpan.contentEditable = 'false';
+        }
+    });
+    container.appendChild(div);
+    // Auto-scroll only for the main flat container (not column sub-divs)
+    if (container === $('transcription-lines')) {
+        container.scrollTop = container.scrollHeight;
+    }
+}
+function applyConfidenceFilter(threshold) {
+    $('transcription-lines').querySelectorAll('.line-result').forEach(div => {
+        const conf = parseInt(div.dataset.confidence ?? '100', 10);
+        div.classList.toggle('line-dimmed', conf < threshold);
+    });
+}
+function applySearch(query) {
+    const lines = $('transcription-lines').querySelectorAll('.line-result');
+    const q = query.trim().toLowerCase();
+    let matchCount = 0;
+    lines.forEach(div => {
+        const textSpan = div.querySelector('.line-text');
+        if (!textSpan) return;
+        // Use state.lines for the canonical text (survives inline edits and search markup)
+        const lineIdx = parseInt(div.dataset.index ?? '-1', 10);
+        const raw = lineIdx >= 0 && state.lines[lineIdx]
+            ? state.lines[lineIdx].text
+            : textSpan.textContent;
+        if (!q) {
+            // Clear search: restore plain text, remove hidden
+            textSpan.textContent = raw;
+            div.classList.remove('line-hidden');
+            return;
+        }
+        const lc = raw.toLowerCase();
+        const idx = lc.indexOf(q);
+        if (idx === -1) {
+            div.classList.add('line-hidden');
+        } else {
+            div.classList.remove('line-hidden');
+            matchCount++;
+            // Highlight match with <mark> using safe DOM manipulation
+            const before = raw.slice(0, idx);
+            const match  = raw.slice(idx, idx + q.length);
+            const after  = raw.slice(idx + q.length);
+            textSpan.textContent = '';
+            textSpan.appendChild(document.createTextNode(before));
+            const mark = document.createElement('mark');
+            mark.textContent = match;
+            textSpan.appendChild(mark);
+            textSpan.appendChild(document.createTextNode(after));
+        }
+    });
+    const countEl = $('results-search-count');
+    countEl.textContent = q ? `${matchCount} match${matchCount !== 1 ? 'es' : ''}` : '';
+}
+// (escapeHtml no longer needed — we use textContent/DOM directly)
+async function copyText() {
+    if (state.lines.length === 0) return;
+    const text = state.lines.map(l => l.text).join('\n');
+    try {
+        await navigator.clipboard.writeText(text);
+        const btn = $('btn-copy-text');
+        const orig = btn.textContent;
+        btn.textContent = 'Copied!';
+        setTimeout(() => { btn.textContent = orig; }, 1500);
+    } catch {
+        toast('Clipboard not available — use Export TXT instead', 'error');
+    }
+}
+function exportTxt() {
+    if (state.lines.length === 0) return;
+    const text = state.lines.map(l => l.text).join('\n');
+    downloadFile('transcription.txt', text, 'text/plain');
+}
+function exportCsv() {
+    if (state.lines.length === 0) return;
+    const header = 'Line,Text,Confidence,X1,Y1,X2,Y2\n';
+    const rows = state.lines.map(l => {
+        const conf = l.confidence != null ? l.confidence.toFixed(4) : '';
+        const bbox = l.bbox ? l.bbox.join(',') : ',,,';
+        return `${l.index + 1},"${l.text.replace(/"/g, '""')}",${conf},${bbox}`;
+    }).join('\n');
+    downloadFile('transcription.csv', header + rows, 'text/csv');
+}
+function downloadFile(filename, content, mime) {
+    const blob = new Blob([content], { type: mime });
+    const url = URL.createObjectURL(blob);
+    const a = document.createElement('a');
+    a.href = url;
+    a.download = filename;
+    a.click();
+    URL.revokeObjectURL(url);
+}
+async function exportXml() {
+    if (!state.imageId) return;
+    try {
+        const resp = await fetch(`/api/image/${state.imageId}/export-xml`, { method: 'POST' });
+        if (!resp.ok) {
+            const err = await resp.json().catch(() => ({ detail: resp.statusText }));
+            toast(`XML export failed: ${err.detail || resp.statusText}`, 'error');
+            return;
+        }
+        const blob = await resp.blob();
+        // Use filename from Content-Disposition if provided, else fall back
+        let filename = 'transcription.xml';
+        const cd = resp.headers.get('Content-Disposition');
+        if (cd) {
+            const m = cd.match(/filename="([^"]+)"/);
+            if (m) filename = m[1];
+        }
+        const url = URL.createObjectURL(blob);
+        const a = document.createElement('a');
+        a.href = url;
+        a.download = filename;
+        a.click();
+        URL.revokeObjectURL(url);
+    } catch (err) {
+        toast(`XML export error: ${err.message}`, 'error');
+    }
+}

web/static/fonts/MonomakhUnicode-Regular.woff2 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a07ebc9c97abc54866b6c8f35d6057f861f84a760127349f28c47c069a9cfea4
+size 86480

web/static/index.html ADDED Viewed

	@@ -0,0 +1,323 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Polyscriptor HTR</title>
+    <link rel="stylesheet" href="/static/app.css">
+</head>
+<body>
+    <!-- Header -->
+    <header id="header">
+        <div class="header-left">
+            <span class="header-logo">⬡</span>
+            <h1>Polyscriptor <span class="header-sub">HTR</span></h1>
+        </div>
+        <div class="header-right">
+            <div id="gpu-status" class="gpu-widget"></div>
+            <button id="btn-help" class="btn-icon" title="Help">?</button>
+        </div>
+    </header>
+    <!-- Main 3-column layout -->
+    <main id="app">
+        <!-- Left: Engine + Image controls -->
+        <aside id="engine-panel" class="panel" data-panel="settings">
+            <section class="panel-section">
+                <h2>HTR Engine</h2>
+                <label for="engine-select">Engine</label>
+                <select id="engine-select" disabled>
+                    <option>Loading engines…</option>
+                </select>
+                <p id="engine-description" class="muted"></p>
+                <div id="config-form"></div>
+                <div id="kraken-preset-row" class="hidden" style="margin-top:8px">
+                    <label for="kraken-preset-select" style="display:block;font-size:0.78rem;margin-bottom:3px">Kraken Model Preset</label>
+                    <select id="kraken-preset-select" style="width:100%">
+                        <option value="">Loading presets…</option>
+                    </select>
+                    <span id="kraken-preset-status" class="muted" style="font-size:0.72rem;display:block;margin-top:3px"></span>
+                </div>
+                <button id="btn-load-model" class="btn btn-primary" disabled>Load Model</button>
+                <div id="engine-status" class="status-badge hidden"></div>
+            </section>
+            <hr>
+            <section class="panel-section">
+                <h2>Image</h2>
+                <div id="upload-area" class="upload-area">
+                    <svg class="upload-icon" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1.5">
+                        <path stroke-linecap="round" stroke-linejoin="round"
+                              d="M3 16.5v2.25A2.25 2.25 0 005.25 21h13.5A2.25 2.25 0 0021 18.75V16.5m-13.5-9L12 3m0 0l4.5 4.5M12 3v13.5"/>
+                    </svg>
+                    <p>Drop image or PDF, or click to browse</p>
+                    <input type="file" id="file-input" accept="image/*,.pdf" multiple hidden>
+                </div>
+                <p id="image-info" class="muted"></p>
+                <div id="batch-queue-section" class="hidden">
+                    <div class="batch-queue-header">
+                        <span class="section-label">Queue</span>
+                        <span id="batch-overall-progress" class="batch-overall-progress hidden"></span>
+                    </div>
+                    <div id="batch-list"></div>
+                    <div class="batch-options-row">
+                        <label class="checkbox-label" title="Use PAGE XML segmentation if a matching .xml file was uploaded for this image">
+                            <input type="checkbox" id="batch-use-pagexml" checked>
+                            Use PAGE XML
+                        </label>
+                        <label class="checkbox-label" title="Skip images that have already been transcribed in this session">
+                            <input type="checkbox" id="batch-resume">
+                            Resume
+                        </label>
+                    </div>
+                    <div class="btn-row" style="margin-top:6px">
+                        <button id="btn-process-batch" class="btn btn-primary btn-small">Process All</button>
+                        <button id="btn-clear-batch" class="btn btn-small btn-outline">Clear</button>
+                    </div>
+                    <div id="batch-export-row" class="btn-row hidden" style="margin-top:6px">
+                        <button id="btn-export-batch-txt" class="btn btn-small">All TXT</button>
+                        <button id="btn-export-batch-csv" class="btn btn-small">All CSV</button>
+                        <button id="btn-export-batch-txt-zip" class="btn btn-small btn-primary">Download ZIP (TXT)</button>
+                        <button id="btn-export-batch-thinking-zip" class="btn btn-small btn-primary">Download ZIP (Thinking)</button>
+                        <button id="btn-export-batch-xml" class="btn btn-small btn-primary">Download ZIP (XML)</button>
+                    </div>
+                </div>
+                <div id="xml-upload-row" class="xml-row hidden">
+                    <span id="xml-status" class="muted">No PAGE XML</span>
+                    <label class="btn btn-small btn-outline" for="xml-input">
+                        Upload XML
+                        <input type="file" id="xml-input" accept=".xml" hidden multiple>
+                    </label>
+                </div>
+            </section>
+            <hr>
+            <section class="panel-section" id="seg-controls">
+                <h2>Segmentation</h2>
+                <label for="seg-method">Method</label>
+                <select id="seg-method">
+                    <option value="kraken" selected>Kraken Classical</option>
+                    <option value="hpp">HPP / projection profile fallback</option>
+                    <option value="kraken-blla" disabled>Kraken Neural / blla (server only)</option>
+                </select>
+                <label for="seg-device">Device</label>
+                <select id="seg-device">
+                    <option value="cpu">CPU</option>
+                    <option value="cuda:0">GPU 0</option>
+                    <option value="cuda:1">GPU 1</option>
+                </select>
+                <div id="blla-options" style="display:none">
+                    <div style="display:flex;gap:12px;align-items:center;flex-wrap:wrap">
+                        <div style="display:flex;flex-direction:column;gap:3px">
+                            <label for="seg-max-columns">Max columns</label>
+                            <input type="number" id="seg-max-columns" min="1" max="12" value="6" style="width:60px">
+                        </div>
+                        <div style="display:flex;flex-direction:column;gap:3px">
+                            <label for="seg-split-width">Split width %</label>
+                            <input type="number" id="seg-split-width" min="5" max="80" value="40" step="5" style="width:60px" title="Min region width (% of page) to trigger sub-column splitting. Lower = split narrower regions. Double pages: try 20.">
+                        </div>
+                    </div>
+                    <div style="margin-top:6px">
+                        <label for="seg-text-direction">Reading direction</label>
+                        <select id="seg-text-direction" title="Controls column reading order. Use horizontal-rl for Arabic, Ottoman, Hebrew manuscripts.">
+                            <option value="horizontal-lr">LTR (Latin, Cyrillic, …)</option>
+                            <option value="horizontal-rl">RTL (Arabic, Ottoman, Hebrew, …)</option>
+                            <option value="vertical-lr">Vertical LTR</option>
+                            <option value="vertical-rl">Vertical RTL</option>
+                        </select>
+                    </div>
+                </div>
+            </section>
+            <div id="seg-regions-list" class="hidden"></div>
+            <div class="panel-footer">
+                <div class="btn-row footer-btn-row">
+                    <button id="btn-segment" class="btn btn-outline" disabled title="Preview line segmentation without transcribing">Segment</button>
+                    <button id="btn-transcribe" class="btn btn-accent" disabled>Transcribe</button>
+                </div>
+            </div>
+        </aside>
+        <div class="panel-resize-handle" id="resize-left" title="Drag to resize"></div>
+        <!-- Center: Image viewer -->
+        <section id="viewer-panel" class="panel" data-panel="image">
+            <!-- Zoom toolbar — only visible when image is loaded -->
+            <div id="zoom-toolbar" class="zoom-toolbar hidden">
+                <button class="zoom-btn" id="btn-zoom-out" title="Zoom out">−</button>
+                <span id="zoom-level" class="zoom-level">100%</span>
+                <button class="zoom-btn" id="btn-zoom-in" title="Zoom in">+</button>
+                <button class="zoom-btn zoom-fit" id="btn-zoom-fit" title="Fit to view">⊡</button>
+                <span class="zoom-toolbar-sep"></span>
+                <button class="btn btn-small btn-outline nav-btn" id="btn-nav-prev" title="Previous image (←)" disabled>‹ Prev</button>
+                <span id="batch-nav-label" class="batch-nav-label-toolbar"></span>
+                <button class="btn btn-small btn-outline nav-btn" id="btn-nav-next" title="Next image (→)" disabled>Next ›</button>
+            </div>
+            <!-- Scroll area fills remaining height -->
+            <div id="viewer-scroll">
+                <div id="viewer-placeholder" class="viewer-placeholder">
+                    <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1">
+                        <rect x="3" y="3" width="18" height="18" rx="2"/>
+                        <circle cx="8.5" cy="8.5" r="1.5"/>
+                        <path stroke-linecap="round" stroke-linejoin="round" d="M21 15l-5-5L5 21"/>
+                    </svg>
+                    <p>Upload an image to begin</p>
+                </div>
+                <div id="image-container" class="hidden">
+                    <img id="page-image">
+                    <canvas id="overlay-canvas"></canvas>
+                </div>
+            </div>
+        </section>
+        <div class="panel-resize-handle" id="resize-right" title="Drag to resize"></div>
+        <!-- Right: Transcription results -->
+        <section id="results-panel" class="panel" data-panel="results">
+            <div class="results-header">
+                <div class="results-header-row">
+                    <h2>Transcription</h2>
+                    <div class="results-header-controls">
+                        <select id="font-select" class="font-select" title="Transcription font"></select>
+                        <button id="btn-col-layout" class="btn-icon hidden" title="Toggle column layout">⊞</button>
+                    </div>
+                </div>
+                <div id="results-search-row" class="results-search-row hidden">
+                    <input type="search" id="results-search" placeholder="Search lines…" autocomplete="off">
+                    <span id="results-search-count" class="muted"></span>
+                </div>
+                <div id="conf-filter-row" class="conf-filter-row hidden">
+                    <label>Min conf: <strong id="conf-threshold-val">0%</strong></label>
+                    <input type="range" id="conf-threshold" min="0" max="100" value="0" step="5">
+                </div>
+                <div id="progress-container" class="hidden">
+                    <div id="progress-bar"><div id="progress-fill"></div></div>
+                    <div class="progress-row">
+                        <p id="progress-text" class="muted">0 / 0 lines</p>
+                        <button id="btn-cancel" class="btn btn-small hidden">Cancel</button>
+                    </div>
+                </div>
+            </div>
+            <div id="transcription-lines"></div>
+            <div id="results-footer" class="hidden">
+                <p id="results-summary" class="muted"></p>
+                <div class="btn-row">
+                    <button id="btn-copy-text" class="btn btn-small">Copy Text</button>
+                    <button id="btn-export-txt" class="btn btn-small">TXT</button>
+                    <button id="btn-export-csv" class="btn btn-small">CSV</button>
+                    <button id="btn-export-xml" class="btn btn-small hidden">XML</button>
+                </div>
+            </div>
+        </section>
+    </main>
+    <!-- Mobile tab bar (visible < 700px) -->
+    <nav id="mobile-tabs">
+        <button class="tab-btn active" data-target="settings">
+            <svg viewBox="0 0 20 20" fill="currentColor"><path fill-rule="evenodd" d="M11.49 3.17c-.38-1.56-2.6-1.56-2.98 0a1.532 1.532 0 01-2.286.948c-1.372-.836-2.942.734-2.106 2.106.54.886.061 2.042-.947 2.287-1.561.379-1.561 2.6 0 2.978a1.532 1.532 0 01.947 2.287c-.836 1.372.734 2.942 2.106 2.106a1.532 1.532 0 012.287.947c.379 1.561 2.6 1.561 2.978 0a1.533 1.533 0 012.287-.947c1.372.836 2.942-.734 2.106-2.106a1.533 1.533 0 01.947-2.287c1.561-.379 1.561-2.6 0-2.978a1.532 1.532 0 01-.947-2.287c.836-1.372-.734-2.942-2.106-2.106a1.532 1.532 0 01-2.287-.947zM10 13a3 3 0 100-6 3 3 0 000 6z" clip-rule="evenodd"/></svg>
+            Settings
+        </button>
+        <button class="tab-btn" data-target="image">
+            <svg viewBox="0 0 20 20" fill="currentColor"><path fill-rule="evenodd" d="M4 3a2 2 0 00-2 2v10a2 2 0 002 2h12a2 2 0 002-2V5a2 2 0 00-2-2H4zm12 12H4l4-8 3 6 2-4 3 6z" clip-rule="evenodd"/></svg>
+            Image
+        </button>
+        <button class="tab-btn" data-target="results">
+            <svg viewBox="0 0 20 20" fill="currentColor"><path fill-rule="evenodd" d="M4 4a2 2 0 012-2h4.586A2 2 0 0112 2.586L15.414 6A2 2 0 0116 7.414V16a2 2 0 01-2 2H6a2 2 0 01-2-2V4zm2 6a1 1 0 011-1h6a1 1 0 110 2H7a1 1 0 01-1-1zm1 3a1 1 0 100 2h6a1 1 0 100-2H7z" clip-rule="evenodd"/></svg>
+            Results
+        </button>
+    </nav>
+    <!-- Help modal -->
+    <dialog id="help-modal">
+        <div class="modal-header">
+            <h2>Polyscriptor HTR — Quick Guide</h2>
+            <button id="btn-help-close" class="btn-icon">✕</button>
+        </div>
+        <div class="modal-body">
+            <h3>Quick Start</h3>
+            <ol>
+                <li><strong>Select an engine</strong> from the dropdown and configure it (model path, API key, etc.).</li>
+                <li>Click <strong>Load Model</strong> and wait for the green status badge.</li>
+                <li><strong>Upload an image</strong> by dragging it onto the upload area or clicking to browse.</li>
+                <li>Optionally click <strong>Segment</strong> to preview line detection before transcribing.</li>
+                <li>Click <strong>Transcribe</strong>. Lines appear one by one as they are processed.</li>
+                <li><strong>Export</strong> the result as TXT, CSV, or PAGE XML.</li>
+            </ol>
+            <h3>Source Code</h3>
+            <p>
+                The public Polyscriptor source code is available on
+                <a href="https://github.com/achimrabus/polyscriptor" target="_blank" rel="noopener noreferrer">GitHub</a>.
+                This Hugging Face Space runs a curated hosted demo configuration.
+            </p>
+            <h3>Engines</h3>
+            <table>
+                <tr><th>Engine</th><th>Best for</th></tr>
+                <tr><td>CRNN-CTC</td><td>Fastest; works well on Church Slavonic, Glagolitic, Ukrainian with trained models</td></tr>
+                <tr><td>TrOCR</td><td>HuggingFace Transformer OCR; good general-purpose accuracy</td></tr>
+                <tr><td>Qwen3-VL</td><td>Large vision-language model; best quality but slow, needs GPU</td></tr>
+                <tr><td>Kraken</td><td>Classical HTR; good for Latin scripts</td></tr>
+                <tr><td>Party</td><td>Whole-page transformer; requires PAGE XML with line segmentation</td></tr>
+                <tr><td>Commercial APIs</td><td>OpenAI / Gemini / Claude — cloud inference, no local GPU needed</td></tr>
+                <tr><td>OpenWebUI</td><td>Locally hosted models via OpenWebUI/Ollama</td></tr>
+            </table>
+            <h3>Segmentation</h3>
+            <ul>
+                <li><strong>Kraken Classical</strong> — default line segmentation in this Hugging Face CPU demo.</li>
+                <li><strong>HPP</strong> — horizontal projection profile fallback.</li>
+                <li><strong>Kraken Neural / blla</strong> — available on the full server setup, but not enabled in this Space.</li>
+                <li><strong>PAGE XML upload</strong> — skip segmentation entirely by uploading an existing PAGE XML annotation (e.g. from Transkribus).</li>
+            </ul>
+            <h3>Tips</h3>
+            <ul>
+                <li>Click a transcription line to highlight the corresponding bounding box in the image.</li>
+                <li>Confidence badges: <span class="conf-high demo-badge">high ≥90%</span> <span class="conf-mid demo-badge">mid ≥75%</span> <span class="conf-low demo-badge">low &lt;75%</span></li>
+                <li>Line-segmenting engines (CRNN-CTC, TrOCR, Kraken) use the segmentation method above. Page-level engines (Party, Qwen3-VL, Commercial APIs) do their own segmentation.</li>
+                <li>API keys can be saved on the server — enter the key once, check <em>Save key on server</em>.</li>
+                <li>Uploads are kept for 24 hours, then cleaned up automatically.</li>
+            </ul>
+            <h3>Keyboard</h3>
+            <ul>
+                <li><kbd>Esc</kbd> — close this dialog</li>
+            </ul>
+        </div>
+    </dialog>
+    <!-- Toast notification container -->
+    <div id="toast-container"></div>
+    <script type="module" src="/static/app.js"></script>
+    <script>
+        // Help modal: the "?" header button opens the <dialog> modally and the
+        // "✕" button inside it closes it again.
+        const modal = document.getElementById('help-modal');
+        document.getElementById('btn-help').addEventListener('click', () => modal.showModal());
+        document.getElementById('btn-help-close').addEventListener('click', () => modal.close());
+        // Close when the click target is the <dialog> element itself rather than
+        // one of its children (presumably a click on the backdrop area).
+        modal.addEventListener('click', e => { if (e.target === modal) modal.close(); });
+        // Mobile tab bar: each .tab-btn names a panel via data-target, and each
+        // panel identifies itself via data-panel.
+        const tabBtns = document.querySelectorAll('.tab-btn');
+        const panels = document.querySelectorAll('[data-panel]');
+        tabBtns.forEach(btn => {
+            btn.addEventListener('click', () => {
+                const target = btn.dataset.target;
+                // Move the 'active' marker to the clicked tab only.
+                tabBtns.forEach(b => b.classList.remove('active'));
+                btn.classList.add('active');
+                // Exactly the panel whose data-panel matches gets 'panel-active'.
+                panels.forEach(p => {
+                    p.classList.toggle('panel-active', p.dataset.panel === target);
+                });
+            });
+        });
+        // Default: settings active on mobile (matches the tab button that
+        // carries the 'active' class in the markup).
+        document.querySelector('[data-panel="settings"]').classList.add('panel-active');
+    </script>
+</body>
+</html>

web/static/pwa/demo.css ADDED Viewed

	@@ -0,0 +1,698 @@

+/* ── Design tokens (matching main app) ───────────────────────────────── */
+/* Custom properties shared by the rules in this stylesheet. */
+:root {
+  /* Surface colours */
+  --bg:           #111827;
+  --bg-panel:     #1f2937;
+  --bg-card:      #1a2333;
+  --bg-input:     #111827;
+  --bg-hover:     #2a3a52;
+  /* Text colours */
+  --text:         #e2e8f0;
+  --text-muted:   #64748b;
+  --text-dim:     #94a3b8;
+  /* Brand and state colours */
+  --accent:       #e94560;
+  --primary:      #3b82f6;
+  --primary-dark: #2563eb;
+  --primary-glow: rgba(59,130,246,0.25); /* focus ring on <select> */
+  --success:      #22c55e;
+  --warning:      #f59e0b;
+  --danger:       #ef4444;
+  /* Borders and corner radii */
+  --border:       #2d3f59;
+  --border-light: #3a4f6e;
+  --radius:       10px;
+  --radius-sm:    6px;
+  /* Typography */
+  --font:         'Segoe UI', system-ui, -apple-system, sans-serif;
+  --font-mono:    'Consolas', 'Fira Code', monospace;
+  /* Layout metrics */
+  --header-h:     52px; /* #header height; also offsets #toast-container */
+  --safe-bottom:  env(safe-area-inset-bottom, 0px); /* added to #main bottom padding */
+}
+/* ── Reset ───────────────────────────────────────────────────────────── */
+*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
+html, body {
+  height: 100%;
+  font-family: var(--font);
+  background: var(--bg);
+  color: var(--text);
+  -webkit-text-size-adjust: 100%;
+}
+body {
+  display: flex;
+  flex-direction: column;
+  min-height: 100dvh;
+  overflow-x: hidden;
+}
+/* ── Header ──────────────────────────────────────────────────────────── */
+#header {
+  position: sticky;
+  top: 0;
+  z-index: 100;
+  height: var(--header-h);
+  display: flex;
+  align-items: center;
+  justify-content: space-between;
+  padding: 0 16px;
+  padding-top: env(safe-area-inset-top, 0);
+  background: var(--bg-panel);
+  border-bottom: 1px solid var(--border);
+  flex-shrink: 0;
+}
+.header-brand {
+  display: flex;
+  align-items: center;
+  gap: 8px;
+  user-select: none;
+}
+.logo-hex {
+  font-size: 1.5rem;
+  color: var(--primary);
+  line-height: 1;
+}
+.logo-text {
+  font-size: 1.05rem;
+  font-weight: 700;
+  letter-spacing: -0.01em;
+  color: var(--text);
+}
+.logo-sub {
+  font-weight: 400;
+  color: var(--text-dim);
+  font-size: 0.9em;
+}
+/* Engine status pill */
+.engine-pill {
+  display: flex;
+  align-items: center;
+  gap: 6px;
+  padding: 4px 10px;
+  border-radius: 20px;
+  font-size: 0.75rem;
+  font-weight: 600;
+  border: 1px solid transparent;
+  transition: all 0.2s;
+}
+.engine-pill--unknown  { background: var(--bg); border-color: var(--border); color: var(--text-muted); }
+.engine-pill--loaded   { background: rgba(34,197,94,0.12); border-color: rgba(34,197,94,0.4); color: var(--success); }
+.engine-pill--unloaded { background: rgba(239,68,68,0.1); border-color: rgba(239,68,68,0.3); color: var(--danger); }
+.engine-pill--loading  { background: rgba(245,158,11,0.1); border-color: rgba(245,158,11,0.3); color: var(--warning); }
+.pill-dot {
+  width: 7px;
+  height: 7px;
+  border-radius: 50%;
+  background: currentColor;
+  flex-shrink: 0;
+}
+.engine-pill--loading .pill-dot {
+  animation: pulse-dot 1s ease-in-out infinite;
+}
+@keyframes pulse-dot {
+  0%, 100% { opacity: 1; }
+  50% { opacity: 0.3; }
+}
+/* ── Toast ───────────────────────────────────────────────────────────── */
+#toast-container {
+  position: fixed;
+  top: calc(var(--header-h) + 8px);
+  left: 50%;
+  transform: translateX(-50%);
+  z-index: 200;
+  display: flex;
+  flex-direction: column;
+  gap: 6px;
+  width: calc(100% - 32px);
+  max-width: 420px;
+  pointer-events: none;
+}
+.toast {
+  padding: 10px 14px;
+  border-radius: var(--radius-sm);
+  font-size: 0.85rem;
+  font-weight: 500;
+  pointer-events: auto;
+  animation: toast-in 0.25s ease;
+}
+.toast--info    { background: var(--bg-panel); border: 1px solid var(--border); color: var(--text); }
+.toast--success { background: rgba(34,197,94,0.15); border: 1px solid rgba(34,197,94,0.4); color: var(--success); }
+.toast--error   { background: rgba(239,68,68,0.15); border: 1px solid rgba(239,68,68,0.4); color: #fca5a5; }
+.toast--warn    { background: rgba(245,158,11,0.12); border: 1px solid rgba(245,158,11,0.35); color: var(--warning); }
+@keyframes toast-in {
+  from { opacity: 0; transform: translateY(-8px); }
+  to   { opacity: 1; transform: translateY(0); }
+}
+/* ── Main scroll area ─────────────────────────────────────────────────── */
+#main {
+  flex: 1;
+  overflow-y: auto;
+  padding: 14px;
+  padding-bottom: calc(16px + var(--safe-bottom));
+  display: flex;
+  flex-direction: column;
+  gap: 12px;
+}
+/* ── Cards ───────────────────────────────────────────────────────────── */
+.card {
+  background: var(--bg-card);
+  border: 1px solid var(--border);
+  border-radius: var(--radius);
+  padding: 16px;
+}
+.card-header {
+  display: flex;
+  align-items: center;
+  justify-content: space-between;
+  margin-bottom: 14px;
+}
+.card-header h2 {
+  font-size: 0.9rem;
+  font-weight: 700;
+  text-transform: uppercase;
+  letter-spacing: 0.06em;
+  color: var(--text-dim);
+}
+/* ── Buttons ──────────────────────────────────────────────────────────── */
+.btn {
+  display: inline-flex;
+  align-items: center;
+  justify-content: center;
+  gap: 8px;
+  padding: 12px 18px;
+  border: 1px solid transparent;
+  border-radius: var(--radius-sm);
+  font-family: var(--font);
+  font-size: 0.95rem;
+  font-weight: 600;
+  cursor: pointer;
+  transition: all 0.15s;
+  min-height: 48px;
+  user-select: none;
+  -webkit-tap-highlight-color: transparent;
+  white-space: nowrap;
+}
+.btn svg {
+  width: 18px;
+  height: 18px;
+  flex-shrink: 0;
+}
+.btn:disabled {
+  opacity: 0.4;
+  cursor: not-allowed;
+}
+.btn-primary {
+  background: var(--primary);
+  color: #fff;
+  border-color: var(--primary);
+}
+.btn-primary:not(:disabled):hover,
+.btn-primary:not(:disabled):active {
+  background: var(--primary-dark);
+  border-color: var(--primary-dark);
+}
+.btn-secondary {
+  background: var(--bg-panel);
+  color: var(--text);
+  border-color: var(--border-light);
+}
+.btn-secondary:not(:disabled):hover,
+.btn-secondary:not(:disabled):active {
+  background: var(--bg-hover);
+  border-color: var(--primary);
+  color: var(--primary);
+}
+.btn-danger {
+  background: rgba(239,68,68,0.15);
+  color: var(--danger);
+  border-color: rgba(239,68,68,0.4);
+}
+.btn-danger:not(:disabled):hover,
+.btn-danger:not(:disabled):active {
+  background: rgba(239,68,68,0.25);
+}
+.btn-ghost {
+  background: transparent;
+  color: var(--text-muted);
+  border-color: transparent;
+  padding: 4px 8px;
+  min-height: unset;
+  font-size: 0.8rem;
+}
+.btn-ghost:hover { color: var(--text); }
+.btn-small { font-size: 0.8rem; padding: 6px 10px; min-height: 32px; }
+/* Capture button — accent-colored, full-width primary CTA */
+.btn-capture {
+  background: linear-gradient(135deg, var(--primary) 0%, #6366f1 100%);
+  color: #fff;
+  border-color: transparent;
+  flex: 1;
+  padding: 14px;
+  font-size: 1rem;
+}
+.btn-capture:not(:disabled):hover,
+.btn-capture:not(:disabled):active {
+  opacity: 0.9;
+  transform: translateY(-1px);
+}
+.btn-capture:not(:disabled):active {
+  transform: translateY(0);
+}
+/* ── Upload section ───────────────────────────────────────────────────── */
+.upload-btn-row {
+  display: flex;
+  gap: 10px;
+}
+.btn-upload {
+  flex: 0 0 auto;
+  padding: 14px 16px;
+}
+/* Image preview */
+#image-preview-wrap {
+  margin-top: 14px;
+}
+#image-container {
+  position: relative;
+  display: inline-block;
+  width: 100%;
+  border-radius: var(--radius-sm);
+  overflow: hidden;
+  background: #000;
+}
+#preview-img {
+  display: block;
+  width: 100%;
+  height: auto;
+  max-height: 55vh;
+  object-fit: contain;
+}
+#bbox-canvas {
+  position: absolute;
+  top: 0;
+  left: 0;
+  width: 100%;
+  height: 100%;
+  pointer-events: none;
+}
+.preview-meta {
+  display: flex;
+  align-items: center;
+  justify-content: space-between;
+  margin-top: 8px;
+}
+.meta-filename {
+  font-size: 0.78rem;
+  color: var(--text-muted);
+  overflow: hidden;
+  text-overflow: ellipsis;
+  white-space: nowrap;
+  max-width: 70%;
+}
+/* ── Engine card ──────────────────────────────────────────────────────── */
+.field-row {
+  display: flex;
+  flex-direction: column;
+  gap: 5px;
+  margin-bottom: 10px;
+}
+.field-row label {
+  font-size: 0.75rem;
+  font-weight: 600;
+  color: var(--text-dim);
+  text-transform: uppercase;
+  letter-spacing: 0.05em;
+}
+select {
+  width: 100%;
+  padding: 10px 12px;
+  background: var(--bg-input);
+  color: var(--text);
+  border: 1px solid var(--border);
+  border-radius: var(--radius-sm);
+  font-family: var(--font);
+  font-size: 0.9rem;
+  cursor: pointer;
+  min-height: 44px;
+  -webkit-appearance: none;
+  appearance: none;
+  background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='12' height='8' viewBox='0 0 12 8'%3E%3Cpath d='M1 1l5 5 5-5' stroke='%2364748b' stroke-width='1.5' fill='none' stroke-linecap='round'/%3E%3C/svg%3E");
+  background-repeat: no-repeat;
+  background-position: right 12px center;
+  padding-right: 36px;
+}
+select:focus {
+  outline: none;
+  border-color: var(--primary);
+  box-shadow: 0 0 0 3px var(--primary-glow);
+}
+/* Badges */
+.badge {
+  padding: 3px 9px;
+  border-radius: 20px;
+  font-size: 0.72rem;
+  font-weight: 700;
+  text-transform: uppercase;
+  letter-spacing: 0.05em;
+}
+.badge--loading  { background: rgba(100,116,139,0.2); color: var(--text-muted); }
+.badge--loaded   { background: rgba(34,197,94,0.15); color: var(--success); }
+.badge--unloaded { background: rgba(239,68,68,0.12); color: var(--danger); }
+.badge--info     { background: rgba(59,130,246,0.15); color: var(--primary); }
+/* Advanced details */
+#advanced-details {
+  margin-top: 8px;
+  border-top: 1px solid var(--border);
+  padding-top: 10px;
+}
+/* Clickable header of the "Advanced options" disclosure; the custom chevron is drawn by ::before */
+#advanced-details summary {
+  font-size: 0.8rem;
+  color: var(--text-muted);
+  cursor: pointer;
+  -webkit-user-select: none; /* Safari/iOS needs the prefixed property */
+  user-select: none;
+  padding: 2px 0;
+  list-style: none;
+  display: flex;
+  align-items: center;
+  gap: 6px;
+}
+/* WebKit renders the native disclosure triangle through this pseudo-element; list-style does not remove it */
+#advanced-details summary::-webkit-details-marker {
+  display: none;
+}
+#advanced-details summary::before {
+  content: '›';
+  font-size: 1.1em;
+  transition: transform 0.2s;
+  display: inline-block;
+}
+#advanced-details[open] summary::before {
+  transform: rotate(90deg);
+}
+.advanced-inner {
+  margin-top: 10px;
+}
+/* ── Actions card ─────────────────────────────────────────────────────── */
+.actions-card {
+  display: flex;
+  flex-wrap: wrap;
+  gap: 10px;
+  padding: 12px;
+}
+#btn-cancel {
+  flex: 0 0 100%;
+}
+.btn-action {
+  flex: 1;
+  padding: 14px 10px;
+}
+.btn-segment {
+  background: var(--bg-panel);
+  color: var(--text-dim);
+  border-color: var(--border-light);
+}
+.btn-segment:not(:disabled) {
+  color: var(--text);
+}
+.btn-segment:not(:disabled):hover,
+.btn-segment:not(:disabled):active {
+  background: var(--bg-hover);
+  border-color: var(--primary);
+  color: var(--primary);
+}
+/* ── Progress card ────────────────────────────────────────────────────── */
+#progress-bar-wrap {
+  height: 6px;
+  background: var(--bg-panel);
+  border-radius: 3px;
+  overflow: hidden;
+  margin-bottom: 10px;
+}
+#progress-bar {
+  height: 100%;
+  background: linear-gradient(90deg, var(--primary), #6366f1);
+  border-radius: 3px;
+  transition: width 0.3s ease;
+}
+.status-text {
+  font-size: 0.82rem;
+  color: var(--text-dim);
+  min-height: 1.4em;
+}
+/* ── Results card ─────────────────────────────────────────────────────── */
+#results-list {
+  display: flex;
+  flex-direction: column;
+  gap: 6px;
+  margin-bottom: 14px;
+  max-height: 50vh;
+  overflow-y: auto;
+}
+.result-line {
+  display: flex;
+  gap: 10px;
+  align-items: flex-start;
+  padding: 8px 10px;
+  background: var(--bg-panel);
+  border-radius: var(--radius-sm);
+  border: 1px solid var(--border);
+  animation: line-in 0.2s ease;
+}
+@keyframes line-in {
+  from { opacity: 0; transform: translateY(4px); }
+  to   { opacity: 1; transform: translateY(0); }
+}
+.line-num {
+  font-size: 0.72rem;
+  font-weight: 700;
+  color: var(--text-muted);
+  min-width: 22px;
+  padding-top: 1px;
+  flex-shrink: 0;
+  font-family: var(--font-mono);
+}
+.line-text {
+  flex: 1;
+  font-size: 0.88rem;
+  line-height: 1.45;
+  color: var(--text);
+  word-break: break-word;
+}
+.line-conf {
+  font-size: 0.7rem;
+  font-weight: 600;
+  padding: 2px 6px;
+  border-radius: 4px;
+  flex-shrink: 0;
+  align-self: flex-start;
+  margin-top: 1px;
+}
+.conf-high { background: rgba(34,197,94,0.15);  color: var(--success); }
+.conf-mid  { background: rgba(245,158,11,0.15); color: var(--warning); }
+.conf-low  { background: rgba(239,68,68,0.12);  color: var(--danger); }
+.results-actions {
+  display: flex;
+  gap: 10px;
+}
+.results-actions .btn {
+  flex: 1;
+  font-size: 0.85rem;
+  padding: 10px 12px;
+}
+/* ── Landscape layout ─────────────────────────────────────────────────── */
+@media (orientation: landscape) and (max-height: 600px) {
+  #main {
+    display: grid;
+    grid-template-columns: 1fr 1fr;
+    grid-template-rows: auto;
+    align-items: start;
+  }
+  #upload-card    { grid-column: 1; grid-row: 1 / 3; }
+  #engine-card    { grid-column: 2; grid-row: 1; }
+  #actions-card   { grid-column: 2; grid-row: 2; }
+  #progress-card  { grid-column: 1 / 3; }
+  #results-card   { grid-column: 1 / 3; }
+  #preview-img {
+    max-height: 70vh;
+  }
+}
+/* ── Desktop (>= 768px) ───────────────────────────────────────────────── */
+@media (min-width: 768px) {
+  #main {
+    max-width: 580px;
+    margin: 0 auto;
+    padding: 20px 0 40px;
+  }
+}
+/* ── Utility ──────────────────────────────────────────────────────────── */
+.hidden { display: none !important; }
+/* Scrollbar styling */
+#results-list::-webkit-scrollbar { width: 4px; }
+#results-list::-webkit-scrollbar-track { background: transparent; }
+#results-list::-webkit-scrollbar-thumb { background: var(--border-light); border-radius: 2px; }
+/* Focus visible for accessibility */
+:focus-visible {
+  outline: 2px solid var(--primary);
+  outline-offset: 2px;
+}
+/* ── Photo Review Overlay ─────────────────────────────────────────────── */
+#photo-review {
+  position: fixed;
+  inset: 0;
+  background: #0a0a0a;
+  z-index: 500;
+  /* Use block layout instead of flex to avoid the iOS Safari flex+overflow-y scroll bug */
+  display: block;
+  overflow-y: auto;
+  -webkit-overflow-scrolling: touch;
+}
+#photo-review[hidden] {
+  display: none;
+}
+#review-inner {
+  width: 100%;
+  max-width: 600px;
+  margin: 0 auto;
+  padding: max(14px, env(safe-area-inset-top, 0px)) 14px calc(14px + env(safe-area-inset-bottom, 0px));
+  box-sizing: border-box;
+  display: flex;
+  flex-direction: column;
+  gap: 12px;
+}
+#review-warn {
+  background: rgba(245, 158, 11, 0.2);
+  color: #fef3c7;
+  border: 1px solid rgba(245, 158, 11, 0.45);
+  padding: 10px 14px;
+  border-radius: 8px;
+  font-size: 0.875rem;
+  text-align: center;
+  line-height: 1.4;
+}
+#review-warn[hidden] { display: none; }
+#review-img-outer {
+  text-align: center;
+}
+#review-img-wrap {
+  display: inline-block;
+  position: relative;
+  max-width: 100%;
+  border-radius: 8px;
+  overflow: hidden;
+  background: #111;
+  vertical-align: top;
+}
+#review-img {
+  display: block;
+  max-width: 100%;
+  max-height: 45vh;  /* fallback */
+  max-height: 45svh; /* small viewport height: excludes browser chrome on iOS/Android */
+  width: auto;
+  height: auto;
+}
+#review-crop-canvas {
+  position: absolute;
+  inset: 0;
+  width: 100%;
+  height: 100%;
+  pointer-events: none;
+  touch-action: none;
+  cursor: crosshair;
+}
+#review-toolbar {
+  display: flex;
+  gap: 8px;
+  flex-wrap: wrap;
+  align-items: center;
+  justify-content: center;
+}
+.btn-icon {
+  font-size: 1.2rem;
+  padding: 8px 16px;
+  min-width: 48px;
+}
+#review-actions {
+  display: flex;
+  gap: 10px;
+}
+#review-actions .btn {
+  flex: 1;
+}
+@media (orientation: landscape) and (max-height: 500px) {
+  #review-img {
+    max-height: 30vh;
+    max-height: 30svh;
+  }
+}

web/static/pwa/demo.html ADDED Viewed

	@@ -0,0 +1,204 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="UTF-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1.0, viewport-fit=cover">
+  <meta name="theme-color" content="#3b82f6">
+  <meta name="mobile-web-app-capable" content="yes">
+  <meta name="apple-mobile-web-app-capable" content="yes">
+  <meta name="apple-mobile-web-app-status-bar-style" content="black-translucent">
+  <meta name="apple-mobile-web-app-title" content="Polyscriptor">
+  <title>Polyscriptor HTR Demo</title>
+  <link rel="manifest" href="/manifest.json">
+  <link rel="apple-touch-icon" href="/static/pwa/icons/icon-192.png">
+  <link rel="stylesheet" href="/static/pwa/demo.css">
+</head>
+<body>
+  <!-- Header -->
+  <header id="header">
+    <div class="header-brand">
+      <span class="logo-hex">⬡</span>
+      <span class="logo-text">Polyscriptor <span class="logo-sub">HTR Demo</span></span>
+    </div>
+    <div class="header-actions">
+      <div id="engine-pill" class="engine-pill engine-pill--unknown" title="Engine status">
+        <span class="pill-dot"></span>
+        <span id="engine-pill-text">…</span>
+      </div>
+    </div>
+  </header>
+  <!-- Toast container -->
+  <div id="toast-container" aria-live="polite"></div>
+  <!-- Main content (scrollable) -->
+  <main id="main">
+    <!-- Card: Upload / Camera -->
+    <section id="upload-card" class="card">
+      <div id="upload-buttons" class="upload-btn-row">
+        <button id="btn-camera" class="btn btn-capture">
+          <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
+            <path d="M23 19a2 2 0 0 1-2 2H3a2 2 0 0 1-2-2V8a2 2 0 0 1 2-2h4l2-3h6l2 3h4a2 2 0 0 1 2 2z"/>
+            <circle cx="12" cy="13" r="4"/>
+          </svg>
+          <span>Take Photo</span>
+        </button>
+        <button id="btn-file" class="btn btn-secondary btn-upload">
+          <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
+            <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
+            <polyline points="17 8 12 3 7 8"/>
+            <line x1="12" y1="3" x2="12" y2="15"/>
+          </svg>
+          <span>Upload Image</span>
+        </button>
+      </div>
+      <!-- Hidden inputs -->
+      <input id="file-camera" type="file" accept="image/*" capture="environment" hidden>
+      <input id="file-picker" type="file" accept="image/*,application/pdf" hidden>
+      <!-- Image preview with bbox canvas overlay -->
+      <div id="image-preview-wrap" hidden>
+        <div id="image-container">
+          <img id="preview-img" alt="Uploaded image">
+          <canvas id="bbox-canvas"></canvas>
+        </div>
+        <div class="preview-meta">
+          <span id="preview-filename" class="meta-filename"></span>
+          <button id="btn-clear-image" class="btn-ghost btn-small">✕ Remove</button>
+        </div>
+      </div>
+    </section>
+    <!-- Card: Engine & Model -->
+    <section id="engine-card" class="card">
+      <div class="card-header">
+        <h2>HTR Engine</h2>
+        <span id="model-status-badge" class="badge badge--loading">checking…</span>
+      </div>
+      <div id="engine-controls">
+        <div class="field-row">
+          <label for="engine-select">Engine</label>
+          <select id="engine-select">
+            <option value="">Loading…</option>
+          </select>
+        </div>
+        <div class="field-row" id="model-row" hidden>
+          <label for="model-select">Model</label>
+          <select id="model-select">
+            <option value="">Select engine first</option>
+          </select>
+        </div>
+        <button id="btn-load-model" class="btn btn-secondary" hidden>
+          Load Model
+        </button>
+      </div>
+      <!-- Advanced: segmentation -->
+      <details id="advanced-details">
+        <summary>Advanced options</summary>
+        <div class="advanced-inner">
+          <div class="field-row">
+            <label for="seg-method-select">Line segmentation</label>
+            <select id="seg-method-select">
+              <option value="kraken" selected>Kraken Classical</option>
+              <option value="hpp">Projection Profile fallback</option>
+              <option value="kraken-blla" disabled>Kraken Neural / blla (server only)</option>
+            </select>
+          </div>
+        </div>
+      </details>
+    </section>
+    <!-- Card: Actions -->
+    <section id="actions-card" class="card actions-card">
+      <button id="btn-segment" class="btn btn-action btn-segment" disabled>
+        <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
+          <rect x="3" y="3" width="18" height="18" rx="2"/>
+          <line x1="3" y1="9" x2="21" y2="9"/>
+          <line x1="3" y1="15" x2="21" y2="15"/>
+        </svg>
+        Detect Lines
+      </button>
+      <button id="btn-transcribe" class="btn btn-action btn-primary" disabled>
+        <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
+          <path d="M12 20h9"/><path d="M16.5 3.5a2.121 2.121 0 0 1 3 3L7 19l-4 1 1-4L16.5 3.5z"/>
+        </svg>
+        Transcribe
+      </button>
+      <button id="btn-cancel" class="btn btn-danger" hidden>
+        Cancel
+      </button>
+    </section>
+    <!-- Card: Progress -->
+    <section id="progress-card" class="card" hidden>
+      <div id="progress-bar-wrap">
+        <div id="progress-bar" style="width:0%"></div>
+      </div>
+      <p id="status-text" class="status-text"></p>
+    </section>
+    <!-- Card: Results -->
+    <section id="results-card" class="card" hidden>
+      <div class="card-header">
+        <h2>Transcription</h2>
+        <span id="line-count" class="badge badge--info"></span>
+      </div>
+      <div id="results-list"></div>
+      <div id="results-actions" class="results-actions">
+        <button id="btn-copy" class="btn btn-secondary">
+          <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
+            <rect x="9" y="9" width="13" height="13" rx="2"/>
+            <path d="M5 15H4a2 2 0 0 1-2-2V4a2 2 0 0 1 2-2h9a2 2 0 0 1 2 2v1"/>
+          </svg>
+          Copy All
+        </button>
+        <button id="btn-export-txt" class="btn btn-secondary">
+          <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
+            <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
+            <polyline points="7 10 12 15 17 10"/>
+            <line x1="12" y1="15" x2="12" y2="3"/>
+          </svg>
+          Save TXT
+        </button>
+      </div>
+    </section>
+  </main>
+  <!-- Photo Review Overlay — shown after camera capture, before upload -->
+  <div id="photo-review" hidden>
+    <div id="review-inner">
+      <div id="review-warn" hidden>
+        Landscape photo detected - for line segmentation, please rotate to portrait (↺ or ↻)
+      </div>
+      <div id="review-img-outer">
+        <div id="review-img-wrap">
+          <img id="review-img" alt="Photo preview">
+          <canvas id="review-crop-canvas"></canvas>
+        </div>
+      </div>
+      <!-- Rotate / crop tools. Icon-only buttons carry aria-label so assistive tech does not announce the raw arrow glyph. -->
+      <div id="review-toolbar">
+        <button id="btn-rotate-ccw" class="btn btn-secondary btn-icon" title="Rotate 90° left" aria-label="Rotate 90° left">↺</button>
+        <button id="btn-rotate-cw"  class="btn btn-secondary btn-icon" title="Rotate 90° right" aria-label="Rotate 90° right">↻</button>
+        <button id="btn-auto-crop"   class="btn btn-secondary">Auto crop page</button>
+        <button id="btn-crop-start"  class="btn btn-secondary">✂ Manual crop</button>
+        <button id="btn-crop-apply"  class="btn btn-primary"   hidden>Apply crop</button>
+        <button id="btn-crop-cancel" class="btn btn-ghost btn-small" hidden>Cancel</button>
+      </div>
+      <div id="review-actions">
+        <button id="btn-retake"    class="btn btn-secondary">Retake</button>
+        <button id="btn-use-photo" class="btn btn-primary">Use photo →</button>
+      </div>
+    </div>
+  </div>
+  <script src="/static/pwa/demo.js" type="module"></script>
+</body>
+</html>

web/static/pwa/demo.js ADDED Viewed

	@@ -0,0 +1,1069 @@

+/**
+ * Polyscriptor PWA Demo — App Logic
+ * Self-contained (no imports from main app.js).
+ * Cache-bust: 2026-05-18 (photo review CSS fix)
+ */
+// ── LocalStorage keys ──────────────────────────────────────────────────
+const LS_ENGINE     = 'pwa_last_engine'; // last engine chosen in #engine-select
+const LS_SEG_METHOD = 'pwa_seg_method'; // last line-segmentation method used
+const LS_MODEL = name => `pwa_last_model_${name}`; // per-engine key: last model value picked for engine `name`
+// ── State ──────────────────────────────────────────────────────────────
+const state = {
+  imageId:        null,
+  imageInfo:      null,   // { width, height, filename }
+  bboxes:         [],     // [[x1,y1,x2,y2], …]
+  lines:          [],     // [{index, text, confidence, bbox}, …]
+  engines:        [],     // from /api/engines
+  loadedEngine:   null,   // currently active engine name in pool
+  engineChangeSeq: 0,     // guards against stale async schema responses
+  isSegmenting:   false,
+  isTranscribing: false,
+  sseAbort:       null,   // AbortController for active SSE
+};
+// ── DOM refs ───────────────────────────────────────────────────────────
+const $  = id => document.getElementById(id); // shorthand for an id lookup
+const el = { // DOM references, looked up once when this statement runs
+  btnCamera:        $('btn-camera'),
+  btnFile:          $('btn-file'),
+  fileCamera:       $('file-camera'),
+  filePicker:       $('file-picker'),
+  previewWrap:      $('image-preview-wrap'),
+  previewImg:       $('preview-img'),
+  bboxCanvas:       $('bbox-canvas'), // overlay canvas that drawBboxes() paints on
+  previewFilename:  $('preview-filename'),
+  btnClearImage:    $('btn-clear-image'),
+  engineSelect:     $('engine-select'),
+  modelRow:         $('model-row'),
+  modelSelect:      $('model-select'),
+  btnLoadModel:     $('btn-load-model'),
+  modelStatusBadge: $('model-status-badge'),
+  enginePill:       $('engine-pill'),
+  enginePillText:   $('engine-pill-text'),
+  segMethodSelect:  $('seg-method-select'),
+  btnSegment:       $('btn-segment'),
+  btnTranscribe:    $('btn-transcribe'),
+  btnCancel:        $('btn-cancel'),
+  progressCard:     $('progress-card'),
+  progressBar:      $('progress-bar'),
+  statusText:       $('status-text'),
+  resultsCard:      $('results-card'),
+  resultsList:      $('results-list'),
+  lineCount:        $('line-count'),
+  btnCopy:          $('btn-copy'),
+  btnExportTxt:     $('btn-export-txt'),
+  // Photo review overlay
+  photoReview:      $('photo-review'),
+  reviewImg:        $('review-img'),
+  reviewCropCanvas: $('review-crop-canvas'),
+  reviewWarn:       $('review-warn'),
+  btnRotateCCW:     $('btn-rotate-ccw'),
+  btnRotateCW:      $('btn-rotate-cw'),
+  btnAutoCrop:      $('btn-auto-crop'),
+  btnCropStart:     $('btn-crop-start'),
+  btnCropApply:     $('btn-crop-apply'),
+  btnCropCancel:    $('btn-crop-cancel'),
+  btnRetake:        $('btn-retake'),
+  btnUsePhoto:      $('btn-use-photo'),
+};
+// ── Photo Review State ─────────────────────────────────────────────────
+const reviewState = { // mutable state for the photo-review overlay
+  canvas:      null,   // off-screen working canvas (rotated / cropped)
+  cropMode:    false, // presumably true while manual crop is active — confirm against the crop handlers
+  cropStart:   null,   // image-coord pointer-down position
+  cropRect:    null,   // {x, y, w, h} in image coords
+  srcFilename: '', // presumably the name of the file under review — confirm where it is assigned
+};
+// ── Toast ──────────────────────────────────────────────────────────────
+function toast(msg, type = 'info', ms = 4000) {
+  const container = $('toast-container');
+  const div = document.createElement('div');
+  div.className = `toast toast--${type}`;
+  div.textContent = msg;
+  container.appendChild(div);
+  setTimeout(() => div.remove(), ms);
+}
+// ── API helper ─────────────────────────────────────────────────────────
+async function api(path, options = {}) {
+  const headers = { 'Content-Type': 'application/json', ...(options.headers || {}) };
+  const resp = await fetch(path, { ...options, headers });
+  if (!resp.ok) {
+    const err = await resp.json().catch(() => ({ detail: resp.statusText }));
+    throw new Error(err.detail || err.message || `HTTP ${resp.status}`);
+  }
+  return resp;
+}
+// ── Engine pill ────────────────────────────────────────────────────────
+function setPill(state, text) {
+  el.enginePill.className = `engine-pill engine-pill--${state}`;
+  el.enginePillText.textContent = text;
+}
+// ── Engine status (check pool) ─────────────────────────────────────────
+async function checkEngineStatus() {
+  try {
+    const resp = await api('/api/engine/status');
+    const data = await resp.json();
+    // Response: { loaded: bool, engine_name: str, config: {...} }
+    if (data.loaded && data.engine_name) {
+      state.loadedEngine = data.engine_name;
+      setPill('loaded', data.engine_name);
+      setBadge('loaded', 'Model loaded');
+      // Pre-select the matching engine in the dropdown
+      if (el.engineSelect.querySelector(`option[value="${data.engine_name}"]`)) {
+        el.engineSelect.value = data.engine_name;
+      }
+      // Hide load controls — engine already active
+      el.btnLoadModel.hidden = true;
+      el.modelRow.hidden = true;
+    } else {
+      state.loadedEngine = null;
+      setPill('unloaded', 'No model');
+      setBadge('unloaded', 'No model loaded');
+      el.btnLoadModel.hidden = false;
+    }
+    updateActionButtons();
+  } catch {
+    setPill('unknown', 'Offline');
+    setBadge('loading', 'Checking…');
+  }
+}
+function setBadge(type, text) {
+  el.modelStatusBadge.className = `badge badge--${type}`;
+  el.modelStatusBadge.textContent = text;
+}
+// ── Load engines list ──────────────────────────────────────────────────
+async function loadEngines() {
+  try {
+    const resp  = await api('/api/engines');
+    const data  = await resp.json();
+    // /api/engines returns a plain array
+    state.engines = Array.isArray(data) ? data : (data.engines || []);
+    el.engineSelect.innerHTML = '';
+    const avail = state.engines.filter(e => e.available);
+    if (avail.length === 0) {
+      el.engineSelect.innerHTML = '<option value="">No engines available</option>';
+      return;
+    }
+    for (const eng of avail) {
+      const opt = document.createElement('option');
+      opt.value = eng.name;
+      opt.textContent = eng.display_name || eng.name;
+      el.engineSelect.appendChild(opt);
+    }
+    // Restore last selection
+    const last = localStorage.getItem(LS_ENGINE);
+    if (last && el.engineSelect.querySelector(`option[value="${last}"]`)) {
+      el.engineSelect.value = last;
+    }
+    await onEngineChange();
+  } catch (e) {
+    el.engineSelect.innerHTML = '<option value="">Failed to load engines</option>';
+    toast('Could not reach server', 'error');
+  }
+}
+// ── Engine selection changed ───────────────────────────────────────────
+async function onEngineChange() {
+  const name = el.engineSelect.value;
+  if (!name) return;
+  const requestSeq = ++state.engineChangeSeq;
+  localStorage.setItem(LS_ENGINE, name);
+  // If this engine is already the loaded one, hide load controls
+  if (name === state.loadedEngine) {
+    el.modelRow.hidden = true;
+    el.btnLoadModel.hidden = true;
+    return;
+  }
+  el.modelRow.hidden = false;
+  el.modelSelect.innerHTML = '<option>Loading…</option>';
+  el.btnLoadModel.hidden = false;
+  el.btnLoadModel.disabled = true;
+  state.modelFieldKey = null;
+  try {
+    // Use config-schema (same as main app) — it has the full model option list
+    const resp = await api(`/api/engine/${encodeURIComponent(name)}/config-schema`);
+    const schema = await resp.json();
+    if (requestSeq !== state.engineChangeSeq || el.engineSelect.value !== name) {
+      return;
+    }
+    // Find first non-dynamic select field → that's the model selector
+    const selectField = (schema.fields || []).find(
+      f => f.type === 'select' && !f.dynamic
+    );
+    el.modelSelect.innerHTML = '';
+    if (selectField && (selectField.options || []).length > 0) {
+      state.modelFieldKey = selectField.key;
+      for (const opt of selectField.options) {
+        const o = document.createElement('option');
+        o.value = typeof opt === 'object' ? opt.value : opt;
+        o.textContent = typeof opt === 'object' ? opt.label : opt;
+        el.modelSelect.appendChild(o);
+      }
+      // Restore last selection or apply schema default
+      const lastModel = localStorage.getItem(LS_MODEL(name));
+      if (lastModel && el.modelSelect.querySelector(`option[value="${lastModel}"]`)) {
+        el.modelSelect.value = lastModel;
+      } else if (selectField.default != null) {
+        el.modelSelect.value = selectField.default;
+      }
+    } else {
+      // No static options (e.g. API-based engines) — show Default
+      state.modelFieldKey = selectField?.key || 'model_path';
+      const o = document.createElement('option');
+      o.value = '';
+      o.textContent = 'Default';
+      el.modelSelect.appendChild(o);
+    }
+    el.btnLoadModel.disabled = false;
+  } catch {
+    if (requestSeq !== state.engineChangeSeq || el.engineSelect.value !== name) {
+      return;
+    }
+    el.modelSelect.innerHTML = '<option value="">Default</option>';
+    state.modelFieldKey = 'model_path';
+    el.btnLoadModel.disabled = false;
+  }
+}
+// ── Load model ─────────────────────────────────────────────────────────
+async function loadModel() {
+  const engineName = el.engineSelect.value;
+  if (!engineName) return;
+  const modelVal = el.modelSelect.value || '';
+  localStorage.setItem(LS_MODEL(engineName), modelVal);
+  el.btnLoadModel.disabled = true;
+  el.btnLoadModel.textContent = 'Loading…';
+  setPill('loading', 'Loading…');
+  setBadge('loading', 'Loading…');
+  try {
+    // Use the field key from the config schema (e.g. 'model_path' for CRNN-CTC/TrOCR/Kraken)
+    const fieldKey = state.modelFieldKey || 'model_path';
+    const config = modelVal ? { [fieldKey]: modelVal } : {};
+    await api('/api/engine/load', {
+      method: 'POST',
+      body: JSON.stringify({ engine_name: engineName, config }),
+    });
+    state.loadedEngine = engineName;
+    setPill('loaded', engineName);
+    setBadge('loaded', 'Model loaded');
+    el.btnLoadModel.hidden = true;
+    el.modelRow.hidden = true;
+    toast(`${engineName} loaded`, 'success');
+  } catch (e) {
+    setPill('unloaded', 'Load failed');
+    setBadge('unloaded', 'Load failed');
+    toast(`Load failed: ${e.message}`, 'error');
+  } finally {
+    el.btnLoadModel.disabled = false;
+    el.btnLoadModel.textContent = 'Load Model';
+    updateActionButtons();
+  }
+}
+// ── Update action button states ────────────────────────────────────────
+function updateActionButtons() {
+  const hasImage  = !!state.imageId;
+  const hasEngine = !!state.loadedEngine;
+  const busy      = state.isSegmenting || state.isTranscribing;
+  el.btnSegment.disabled   = !hasImage || !hasEngine || busy;
+  el.btnTranscribe.disabled = !hasImage || !hasEngine || busy;
+  el.btnCancel.hidden       = !busy;
+}
+// ── File upload ────────────────────────────────────────────────────────
+async function uploadFile(file) {
+  if (!file) return;
+  const fd = new FormData();
+  fd.append('file', file);
+  setStatus('Uploading…');
+  el.progressCard.hidden = false;
+  setProgress(0);
+  try {
+    const resp = await fetch('/api/image/upload?max_dim=2400', { method: 'POST', body: fd });
+    if (!resp.ok) {
+      const err = await resp.json().catch(() => ({ detail: resp.statusText }));
+      throw new Error(err.detail || 'Upload failed');
+    }
+    const data = await resp.json();
+    if (data.is_pdf) {
+      // PDF: use first page
+      const first = data.pages[0];
+      state.imageId   = first.image_id;
+      state.imageInfo = { width: first.width, height: first.height, filename: first.filename };
+      toast(`PDF uploaded — using page 1 of ${data.pages.length}`, 'info');
+    } else {
+      state.imageId   = data.image_id;
+      state.imageInfo = { width: data.width, height: data.height, filename: data.filename };
+    }
+    // Show preview
+    el.previewImg.src              = `/api/image/${state.imageId}`;
+    el.previewFilename.textContent = state.imageInfo.filename || file.name;
+    el.previewWrap.hidden          = false;
+    clearBboxes();
+    // Clear old results
+    hideResults();
+    setStatus('Image ready');
+    setProgress(100);
+    setTimeout(() => { el.progressCard.hidden = true; }, 800);
+    updateActionButtons();
+  } catch (e) {
+    toast(`Upload failed: ${e.message}`, 'error');
+    setStatus('');
+    el.progressCard.hidden = true;
+  }
+}
+// ── Clear image ────────────────────────────────────────────────────────
+function clearImage() {
+  state.imageId   = null;
+  state.imageInfo = null;
+  state.bboxes    = [];
+  state.lines     = [];
+  el.previewWrap.hidden = true;
+  el.previewImg.src     = '';
+  clearBboxes();
+  hideResults();
+  updateActionButtons();
+}
+// ── BBox canvas ────────────────────────────────────────────────────────
+function clearBboxes() {
+  const canvas = el.bboxCanvas;
+  const ctx    = canvas.getContext('2d');
+  ctx.clearRect(0, 0, canvas.width, canvas.height);
+  state.bboxes = [];
+}
+// Draw bounding boxes scaled to displayed image size
+function drawBboxes(bboxes, highlightIdx = -1) {
+  const img    = el.previewImg;
+  const canvas = el.bboxCanvas;
+  const ctx    = canvas.getContext('2d');
+  // Match canvas to displayed size
+  canvas.width  = img.offsetWidth;
+  canvas.height = img.offsetHeight;
+  ctx.clearRect(0, 0, canvas.width, canvas.height);
+  if (!bboxes || bboxes.length === 0 || !state.imageInfo) return;
+  const scaleX = img.offsetWidth  / state.imageInfo.width;
+  const scaleY = img.offsetHeight / state.imageInfo.height;
+  // Color palette for lines — use distinct hues
+  const COLORS = [
+    'rgba(59,130,246,', // blue
+    'rgba(99,102,241,', // indigo
+    'rgba(34,197,94,',  // green
+    'rgba(245,158,11,', // amber
+    'rgba(239,68,68,',  // red
+    'rgba(168,85,247,', // purple
+    'rgba(20,184,166,', // teal
+    'rgba(249,115,22,', // orange
+  ];
+  bboxes.forEach((bbox, i) => {
+    const [x1, y1, x2, y2] = bbox;
+    const x = x1 * scaleX;
+    const y = y1 * scaleY;
+    const w = (x2 - x1) * scaleX;
+    const h = (y2 - y1) * scaleY;
+    const colorBase = COLORS[i % COLORS.length];
+    const isHighlighted = i === highlightIdx;
+    const fillAlpha    = isHighlighted ? 0.25 : 0.10;
+    const strokeAlpha  = isHighlighted ? 1.0  : 0.7;
+    ctx.fillStyle   = `${colorBase}${fillAlpha})`;
+    ctx.strokeStyle = `${colorBase}${strokeAlpha})`;
+    ctx.lineWidth   = isHighlighted ? 2 : 1.5;
+    ctx.fillRect(x, y, w, h);
+    ctx.strokeRect(x, y, w, h);
+    // Line number label
+    ctx.font      = 'bold 10px monospace';
+    ctx.fillStyle = `${colorBase}0.9)`;
+    const label   = String(i + 1);
+    const pad     = 3;
+    const tw      = ctx.measureText(label).width + pad * 2;
+    ctx.fillStyle = `${colorBase}0.85)`;
+    ctx.fillRect(x, y - 14, tw, 14);
+    ctx.fillStyle = '#fff';
+    ctx.fillText(label, x + pad, y - 3);
+  });
+}
+// ── Segment ────────────────────────────────────────────────────────────
+async function segmentImage() {
+  if (!state.imageId) return;
+  state.isSegmenting = true;
+  updateActionButtons();
+  el.progressCard.hidden = false;
+  setProgress(0);
+  setStatus('Detecting lines…');
+  clearBboxes();
+  const method = el.segMethodSelect.value || 'kraken';
+  localStorage.setItem(LS_SEG_METHOD, method);
+  try {
+    const url  = `/api/image/${state.imageId}/segment?method=${encodeURIComponent(method)}&device=cuda%3A0`;
+    const resp = await api(url);
+    const data = await resp.json();
+    state.bboxes = data.bboxes || [];
+    drawBboxes(state.bboxes);
+    setStatus(`${state.bboxes.length} line${state.bboxes.length !== 1 ? 's' : ''} detected`);
+    setProgress(100);
+    toast(`${state.bboxes.length} lines detected`, 'success', 2500);
+  } catch (e) {
+    toast(`Segmentation failed: ${e.message}`, 'error');
+    setStatus('Segmentation failed');
+  } finally {
+    state.isSegmenting = false;
+    updateActionButtons();
+    setTimeout(() => { if (!state.isTranscribing) el.progressCard.hidden = true; }, 1500);
+  }
+}
+// ── Transcribe (SSE) ───────────────────────────────────────────────────
+async function startTranscription() {
+  if (!state.imageId || !state.loadedEngine) return;
+  state.isTranscribing = true;
+  state.lines          = [];
+  updateActionButtons();
+  el.progressCard.hidden = false;
+  setProgress(0);
+  setStatus('Starting transcription…');
+  el.resultsCard.hidden  = true;
+  el.resultsList.innerHTML = '';
+  const method = el.segMethodSelect.value || 'kraken';
+  const body = JSON.stringify({
+    image_id:   state.imageId,
+    seg_method: method,
+    seg_device: 'cuda:0',
+  });
+  const abort = new AbortController();
+  state.sseAbort = abort;
+  try {
+    const resp = await fetch('/api/transcribe', {
+      method:  'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body,
+      signal: abort.signal,
+    });
+    if (!resp.ok) {
+      const err = await resp.json().catch(() => ({ detail: resp.statusText }));
+      throw new Error(err.detail || 'Transcription failed');
+    }
+    const reader  = resp.body.getReader();
+    const decoder = new TextDecoder();
+    let   buffer  = '';
+    while (true) {
+      const { done, value } = await reader.read();
+      if (done) break;
+      buffer += decoder.decode(value, { stream: true });
+      const parts = buffer.split('\n\n');
+      buffer = parts.pop(); // last part may be incomplete
+      for (const part of parts) {
+        const eventLine = part.split('\n').find(l => l.startsWith('event:'));
+        const dataLine  = part.split('\n').find(l => l.startsWith('data:'));
+        if (!dataLine) continue;
+        const event   = eventLine ? eventLine.slice(7).trim() : 'message';
+        const payload = JSON.parse(dataLine.slice(5).trim());
+        handleSSEEvent(event, payload);
+      }
+    }
+  } catch (e) {
+    if (e.name !== 'AbortError') {
+      toast(`Transcription error: ${e.message}`, 'error');
+      setStatus('Error');
+    }
+  } finally {
+    state.isTranscribing = false;
+    state.sseAbort       = null;
+    updateActionButtons();
+  }
+}
+function handleSSEEvent(event, payload) {
+  switch (event) {
+    case 'status':
+      setStatus(payload.message || '');
+      break;
+    case 'segmentation': {
+      state.bboxes = payload.bboxes || [];
+      drawBboxes(state.bboxes);
+      setStatus(`${state.bboxes.length} lines detected — transcribing…`);
+      break;
+    }
+    case 'progress': {
+      const { current, total, line } = payload;
+      setProgress(total > 0 ? (current / total) * 100 : 0);
+      setStatus(`Transcribing line ${current} / ${total}…`);
+      if (line) {
+        state.lines.push(line);
+        appendResultLine(line);
+        // Highlight corresponding bbox
+        drawBboxes(state.bboxes, line.index);
+      }
+      // Show results card on first result
+      if (el.resultsCard.hidden && state.lines.length === 1) {
+        el.resultsCard.hidden = false;
+        el.resultsCard.scrollIntoView({ behavior: 'smooth', block: 'nearest' });
+      }
+      break;
+    }
+    case 'complete': {
+      setProgress(100);
+      const count = (payload.lines || []).length;
+      const secs  = payload.total_time_s ? ` in ${payload.total_time_s}s` : '';
+      setStatus(`Done — ${count} lines${secs}`);
+      el.lineCount.textContent = `${count} lines`;
+      el.lineCount.className   = 'badge badge--info';
+      // Redraw all bboxes without highlight
+      drawBboxes(state.bboxes);
+      toast(`Transcription complete (${count} lines)`, 'success');
+      setTimeout(() => { el.progressCard.hidden = true; }, 1200);
+      break;
+    }
+    case 'cancelled':
+      setStatus('Cancelled');
+      toast('Transcription cancelled', 'warn', 2500);
+      setTimeout(() => { el.progressCard.hidden = true; }, 1000);
+      break;
+    case 'error':
+      toast(`Error: ${payload.message}`, 'error');
+      setStatus('Error');
+      break;
+  }
+}
+// ── Result line DOM ────────────────────────────────────────────────────
+function appendResultLine(line) {
+  const div = document.createElement('div');
+  div.className = 'result-line';
+  const numSpan = document.createElement('span');
+  numSpan.className = 'line-num';
+  numSpan.textContent = String(line.index + 1);
+  const textSpan = document.createElement('span');
+  textSpan.className = 'line-text';
+  textSpan.textContent = line.text || '';
+  div.appendChild(numSpan);
+  div.appendChild(textSpan);
+  if (line.confidence !== null && line.confidence !== undefined) {
+    const pct = Math.round(line.confidence * 100);
+    const confSpan = document.createElement('span');
+    confSpan.className = `line-conf ${pct >= 90 ? 'conf-high' : pct >= 75 ? 'conf-mid' : 'conf-low'}`;
+    confSpan.textContent = `${pct}%`;
+    div.appendChild(confSpan);
+  }
+  el.resultsList.appendChild(div);
+  // Auto-scroll to latest
+  el.resultsList.scrollTop = el.resultsList.scrollHeight;
+}
+// ── Cancel ─────────────────────────────────────────────────────────────
+async function cancelTranscription() {
+  if (state.sseAbort) state.sseAbort.abort();
+  try {
+    await api('/api/transcribe/cancel', { method: 'POST', body: '{}' });
+  } catch { /* ignore */ }
+}
+// ── Progress helpers ───────────────────────────────────────────────────
+function setProgress(pct) {
+  el.progressBar.style.width = `${Math.min(100, Math.max(0, pct))}%`;
+}
+function setStatus(msg) {
+  el.statusText.textContent = msg;
+}
+// ── Hide results ───────────────────────────────────────────────────────
+function hideResults() {
+  el.resultsCard.hidden    = true;
+  el.resultsList.innerHTML = '';
+  state.lines              = [];
+  el.lineCount.textContent = '';
+}
+// ── Copy all ───────────────────────────────────────────────────────────
+function copyAll() {
+  const text = state.lines.map(l => l.text || '').join('\n');
+  if (!text) { toast('Nothing to copy', 'warn', 2000); return; }
+  navigator.clipboard.writeText(text)
+    .then(() => toast('Copied to clipboard', 'success', 2000))
+    .catch(() => toast('Copy failed', 'error'));
+}
+// ── Export TXT ─────────────────────────────────────────────────────────
+function exportTxt() {
+  const text = state.lines.map(l => l.text || '').join('\n');
+  if (!text) { toast('Nothing to export', 'warn', 2000); return; }
+  const blob  = new Blob([text], { type: 'text/plain;charset=utf-8' });
+  const url   = URL.createObjectURL(blob);
+  const a     = document.createElement('a');
+  a.href      = url;
+  a.download  = (state.imageInfo?.filename?.replace(/\.[^.]+$/, '') || 'transcription') + '.txt';
+  a.click();
+  URL.revokeObjectURL(url);
+}
+// ── Redraw bboxes on image resize ──────────────────────────────────────
+function onImageResize() {
+  if (state.bboxes.length > 0) drawBboxes(state.bboxes);
+}
+// ── Photo Review ────────────────────────────────────────────────────────
+function openPhotoReview(file) {
+  reviewState.srcFilename = file.name || 'photo.jpg';
+  reviewState.cropMode    = false;
+  reviewState.cropStart   = null;
+  reviewState.cropRect    = null;
+  const img = new Image();
+  const url = URL.createObjectURL(file);
+  img.onload = () => {
+    URL.revokeObjectURL(url);
+    const canvas = document.createElement('canvas');
+    canvas.width  = img.naturalWidth;
+    canvas.height = img.naturalHeight;
+    canvas.getContext('2d').drawImage(img, 0, 0);
+    reviewState.canvas = canvas;
+    updateReviewDisplay();
+    el.photoReview.hidden = false;
+    document.body.style.overflow = 'hidden';
+  };
+  img.onerror = () => {
+    URL.revokeObjectURL(url);
+    toast('Could not load photo', 'error');
+  };
+  img.src = url;
+}
+function closePhotoReview() {
+  el.photoReview.hidden = true;
+  document.body.style.overflow = '';
+  reviewState.canvas   = null;
+  reviewState.cropMode = false;
+  reviewState.cropRect = null;
+  resetCropUI();
+}
+function updateReviewDisplay() {
+  if (!reviewState.canvas) return;
+  el.reviewImg.onload = () => {
+    syncCropCanvas();
+    checkReviewOrientation();
+  };
+  el.reviewImg.src = reviewState.canvas.toDataURL('image/jpeg', 0.9);
+}
+function checkReviewOrientation() {
+  const landscape = reviewState.canvas.width > reviewState.canvas.height;
+  el.reviewWarn.hidden = !landscape;
+}
+function syncCropCanvas() {
+  const c    = el.reviewCropCanvas;
+  const rect = el.reviewImg.getBoundingClientRect();
+  if (!rect.width) return;
+  c.width  = Math.round(rect.width);
+  c.height = Math.round(rect.height);
+  c.getContext('2d').clearRect(0, 0, c.width, c.height);
+}
+// ── Auto-Crop (adaptive page detection) ────────────────────────────────
+function autoDetectAndCrop() {
+  if (!reviewState.canvas) return;
+  exitCropMode();
+  const canvas = reviewState.canvas;
+  const { width, height } = canvas;
+  const data = canvas.getContext('2d').getImageData(0, 0, width, height).data;
+  // Single pass: accumulate page-likelihood per row and per column.
+  // Heuristic: white paper is typically bright with low saturation.
+  const rowSum = new Float32Array(height);
+  const colSum = new Float32Array(width);
+  let borderSum = 0;
+  let borderCount = 0;
+  const borderBandY = Math.max(1, Math.floor(height * 0.08));
+  const borderBandX = Math.max(1, Math.floor(width * 0.08));
+  for (let y = 0; y < height; y++) {
+    for (let x = 0; x < width; x++) {
+      const i = (y * width + x) * 4;
+      const r = data[i];
+      const g = data[i + 1];
+      const b = data[i + 2];
+      const v = Math.max(r, g, b);
+      const min = Math.min(r, g, b);
+      const s = v === 0 ? 0 : (v - min) / v;
+      const pageScore = v - (s * 90);
+      rowSum[y] += pageScore;
+      colSum[x] += pageScore;
+      const isBorderPixel = y < borderBandY || y >= (height - borderBandY) || x < borderBandX || x >= (width - borderBandX);
+      if (isBorderPixel) {
+        borderSum += pageScore;
+        borderCount += 1;
+      }
+    }
+  }
+  const borderMean = borderCount > 0 ? (borderSum / borderCount) : 40;
+  const THRESHOLD = Math.min(230, borderMean + 14);
+  const PAD       = 12;
+  let top = 0, bottom = height - 1, left = 0, right = width - 1;
+  for (let y = 0;          y < height; y++) { if (rowSum[y] / width  > THRESHOLD) { top    = y; break; } }
+  for (let y = height - 1; y >= 0;    y--) { if (rowSum[y] / width  > THRESHOLD) { bottom = y; break; } }
+  for (let x = 0;          x < width; x++) { if (colSum[x] / height > THRESHOLD) { left   = x; break; } }
+  for (let x = width - 1;  x >= 0;    x--) { if (colSum[x] / height > THRESHOLD) { right  = x; break; } }
+  // Apply padding and clamp
+  top    = Math.max(0,         top    - PAD);
+  bottom = Math.min(height - 1, bottom + PAD);
+  left   = Math.max(0,         left   - PAD);
+  right  = Math.min(width - 1, right  + PAD);
+  const w = right - left;
+  const h = bottom - top;
+  // Sanity check: don't crop to less than 20% of original
+  if (w < width * 0.2 || h < height * 0.2) {
+    toast('Page not detected clearly - please crop manually', 'warn');
+    return;
+  }
+  const dst = document.createElement('canvas');
+  dst.width  = w;
+  dst.height = h;
+  dst.getContext('2d').drawImage(canvas, left, top, w, h, 0, 0, w, h);
+  reviewState.canvas = dst;
+  updateReviewDisplay();
+}
+// ── Rotate ─────────────────────────────────────────────────────────────
+function rotateReview(angle) {
+  if (!reviewState.canvas) return;
+  exitCropMode();
+  const src = reviewState.canvas;
+  const dst = document.createElement('canvas');
+  dst.width  = src.height;
+  dst.height = src.width;
+  const ctx = dst.getContext('2d');
+  ctx.translate(dst.width / 2, dst.height / 2);
+  ctx.rotate(angle * Math.PI / 180);
+  ctx.drawImage(src, -src.width / 2, -src.height / 2);
+  reviewState.canvas = dst;
+  updateReviewDisplay();
+}
+// ── Crop ───────────────────────────────────────────────────────────────
+function enterCropMode() {
+  reviewState.cropMode  = true;
+  reviewState.cropRect  = null;
+  reviewState.cropStart = null;
+  el.btnCropStart.hidden  = true;
+  el.btnCropApply.hidden  = true;
+  el.btnCropCancel.hidden = false;
+  el.reviewCropCanvas.style.pointerEvents = 'auto';
+  syncCropCanvas();
+}
+function exitCropMode() {
+  reviewState.cropMode  = false;
+  reviewState.cropStart = null;
+  reviewState.cropRect  = null;
+  el.reviewCropCanvas.style.pointerEvents = 'none';
+  resetCropUI();
+  syncCropCanvas();
+}
+function resetCropUI() {
+  el.btnCropStart.hidden  = false;
+  el.btnCropApply.hidden  = true;
+  el.btnCropCancel.hidden = true;
+}
+function pointerToImageCoords(e) {
+  const c    = el.reviewCropCanvas;
+  const rect = c.getBoundingClientRect();
+  return {
+    x: Math.max(0, Math.min(reviewState.canvas.width,  (e.clientX - rect.left) * (reviewState.canvas.width  / rect.width))),
+    y: Math.max(0, Math.min(reviewState.canvas.height, (e.clientY - rect.top)  * (reviewState.canvas.height / rect.height))),
+  };
+}
+function onCropPointerDown(e) {
+  if (!reviewState.cropMode) return;
+  e.preventDefault();
+  el.reviewCropCanvas.setPointerCapture(e.pointerId);
+  reviewState.cropStart = pointerToImageCoords(e);
+  reviewState.cropRect  = null;
+  el.btnCropApply.hidden = true;
+}
+function onCropPointerMove(e) {
+  if (!reviewState.cropMode || !reviewState.cropStart) return;
+  e.preventDefault();
+  const cur = pointerToImageCoords(e);
+  reviewState.cropRect = {
+    x: Math.min(reviewState.cropStart.x, cur.x),
+    y: Math.min(reviewState.cropStart.y, cur.y),
+    w: Math.abs(cur.x - reviewState.cropStart.x),
+    h: Math.abs(cur.y - reviewState.cropStart.y),
+  };
+  drawCropOverlay();
+}
+function onCropPointerUp(e) {
+  if (!reviewState.cropMode) return;
+  e.preventDefault();
+  reviewState.cropStart = null;
+  const r = reviewState.cropRect;
+  if (r && r.w > 20 && r.h > 20) {
+    el.btnCropApply.hidden = false;
+  }
+}
+function drawCropOverlay() {
+  const c    = el.reviewCropCanvas;
+  const ctx  = c.getContext('2d');
+  const r    = reviewState.cropRect;
+  if (!r) return;
+  const scaleX = c.width  / reviewState.canvas.width;
+  const scaleY = c.height / reviewState.canvas.height;
+  const rx = r.x * scaleX, ry = r.y * scaleY;
+  const rw = r.w * scaleX, rh = r.h * scaleY;
+  ctx.clearRect(0, 0, c.width, c.height);
+  ctx.fillStyle = 'rgba(0,0,0,0.55)';
+  ctx.fillRect(0, 0, c.width, c.height);
+  ctx.clearRect(rx, ry, rw, rh);
+  ctx.strokeStyle = 'rgba(255,255,255,0.9)';
+  ctx.lineWidth   = 2;
+  ctx.strokeRect(rx, ry, rw, rh);
+}
+function applyReviewCrop() {
+  const r = reviewState.cropRect;
+  if (!r || r.w < 20 || r.h < 20) return;
+  const dst = document.createElement('canvas');
+  dst.width  = Math.round(r.w);
+  dst.height = Math.round(r.h);
+  dst.getContext('2d').drawImage(
+    reviewState.canvas,
+    Math.round(r.x), Math.round(r.y), Math.round(r.w), Math.round(r.h),
+    0, 0, Math.round(r.w), Math.round(r.h)
+  );
+  reviewState.canvas = dst;
+  exitCropMode();
+  updateReviewDisplay();
+}
+// ── Confirm / Retake ────────────────────────────────────────────────────
+function retakePhoto() {
+  closePhotoReview();
+  el.fileCamera.value = '';
+  el.fileCamera.click();
+}
+function confirmPhoto() {
+  if (!reviewState.canvas) return;
+  el.btnUsePhoto.disabled = true;
+  reviewState.canvas.toBlob(blob => {
+    if (!blob) {
+      toast('Error while processing photo', 'error');
+      el.btnUsePhoto.disabled = false;
+      return;
+    }
+    const baseName = reviewState.srcFilename.replace(/\.[^.]+$/, '');
+    const file = new File([blob], baseName + '.jpg', { type: 'image/jpeg' });
+    closePhotoReview();
+    el.btnUsePhoto.disabled = false;
+    uploadFile(file);
+  }, 'image/jpeg', 0.92);
+}
+// ── Register service worker ─────────────────────────────────────────────
+async function detectPwaVersion() {
+  try {
+    const resp = await fetch('/static/pwa/demo.js', {
+      method: 'HEAD',
+      cache: 'no-store',
+    });
+    const lastModified = resp.headers.get('last-modified');
+    if (lastModified) {
+      const ts = Date.parse(lastModified);
+      if (Number.isFinite(ts) && ts > 0) return String(ts);
+    }
+  } catch {
+    // Fallback below
+  }
+  return 'dev';
+}
+if ('serviceWorker' in navigator) {
+  window.addEventListener('load', async () => {
+    try {
+      const version = await detectPwaVersion();
+      const reg = await navigator.serviceWorker.register(`/sw.js?v=${encodeURIComponent(version)}`, { scope: '/' });
+      reg.update().catch(() => {});
+    } catch (e) {
+      console.warn('SW registration failed:', e);
+    }
+  });
+}
+// ── Init ───────────────────────────────────────────────────────────────
+function init() {
+  // Camera button — open review overlay instead of uploading directly
+  el.btnCamera.addEventListener('click', () => el.fileCamera.click());
+  el.fileCamera.addEventListener('change', () => {
+    if (el.fileCamera.files[0]) openPhotoReview(el.fileCamera.files[0]);
+    el.fileCamera.value = '';
+  });
+  // Photo review
+  el.btnRotateCCW.addEventListener('click',  () => rotateReview(-90));
+  el.btnRotateCW.addEventListener('click',   () => rotateReview(90));
+  el.btnAutoCrop.addEventListener('click',   autoDetectAndCrop);
+  el.btnCropStart.addEventListener('click',  enterCropMode);
+  el.btnCropApply.addEventListener('click',  applyReviewCrop);
+  el.btnCropCancel.addEventListener('click', exitCropMode);
+  el.btnRetake.addEventListener('click',     retakePhoto);
+  el.btnUsePhoto.addEventListener('click',   confirmPhoto);
+  el.reviewCropCanvas.addEventListener('pointerdown', onCropPointerDown);
+  el.reviewCropCanvas.addEventListener('pointermove', onCropPointerMove);
+  el.reviewCropCanvas.addEventListener('pointerup',   onCropPointerUp);
+  // File picker button
+  el.btnFile.addEventListener('click', () => el.filePicker.click());
+  el.filePicker.addEventListener('change', () => {
+    if (el.filePicker.files[0]) uploadFile(el.filePicker.files[0]);
+    el.filePicker.value = '';
+  });
+  // Clear image
+  el.btnClearImage.addEventListener('click', clearImage);
+  // Engine select
+  el.engineSelect.addEventListener('change', onEngineChange);
+  // Load model
+  el.btnLoadModel.addEventListener('click', loadModel);
+  // Segment
+  el.btnSegment.addEventListener('click', segmentImage);
+  // Transcribe
+  el.btnTranscribe.addEventListener('click', startTranscription);
+  // Cancel
+  el.btnCancel.addEventListener('click', cancelTranscription);
+  // Export
+  el.btnCopy.addEventListener('click', copyAll);
+  el.btnExportTxt.addEventListener('click', exportTxt);
+  // Seg method persistence
+  const savedSeg = localStorage.getItem(LS_SEG_METHOD);
+  const savedSegOption = savedSeg ? el.segMethodSelect.querySelector(`option[value="${savedSeg}"]`) : null;
+  if (savedSegOption && !savedSegOption.disabled) {
+    el.segMethodSelect.value = savedSeg;
+  }
+  el.segMethodSelect.addEventListener('change', () => {
+    localStorage.setItem(LS_SEG_METHOD, el.segMethodSelect.value);
+  });
+  // Redraw bboxes on layout changes (image resize)
+  const ro = new ResizeObserver(onImageResize);
+  ro.observe(el.previewImg);
+  // Initial data load
+  loadEngines().then(checkEngineStatus);
+}
+document.addEventListener('DOMContentLoaded', init);

web/static/pwa/icons/icon-192.png ADDED Viewed

Git LFS Details

SHA256: cac30492acbc8fff49fd2e166c0a5610148dd73c832c6cfe9f48bbacca2b94b6
Pointer size: 130 Bytes
Size of remote file: 37.5 kB

web/static/pwa/icons/icon-512.png ADDED Viewed

Git LFS Details

SHA256: 006af190dcb8989e09a2d36566fea9371aa92e5aa95b9f772176f262be35401a
Pointer size: 131 Bytes
Size of remote file: 160 kB

web/static/pwa/manifest.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+  "name": "Polyscriptor HTR Demo",
+  "short_name": "Polyscriptor",
+  "description": "Handwritten Text Recognition — capture a photo and transcribe it instantly",
+  "start_url": "/demo",
+  "scope": "/",
+  "display": "standalone",
+  "orientation": "portrait-primary",
+  "background_color": "#111827",
+  "theme_color": "#3b82f6",
+  "icons": [
+    {
+      "src": "/static/pwa/icons/icon-192.png",
+      "sizes": "192x192",
+      "type": "image/png",
+      "purpose": "any maskable"
+    },
+    {
+      "src": "/static/pwa/icons/icon-512.png",
+      "sizes": "512x512",
+      "type": "image/png",
+      "purpose": "any maskable"
+    }
+  ],
+  "categories": ["productivity", "utilities"],
+  "lang": "en"
+}

web/static/pwa/sw.js ADDED Viewed

	@@ -0,0 +1,60 @@

+/**
+ * Polyscriptor PWA — Service Worker
+ * Caches static assets for faster startup; API calls always go to network.
+ */
+const SW_VERSION = new URL(self.location.href).searchParams.get('v') || 'dev';
+const CACHE = `polyscriptor-pwa-${SW_VERSION}`;
+const STATIC = [
+  '/demo',
+  '/static/pwa/demo.html',
+  '/static/pwa/demo.css',
+  '/static/pwa/demo.js',
+  '/static/pwa/manifest.json',
+  '/static/pwa/icons/icon-192.png',
+  '/static/pwa/icons/icon-512.png',
+];
+self.addEventListener('install', e => {
+  e.waitUntil(
+    caches.open(CACHE)
+      .then(async c => {
+        const freshRequests = STATIC.map(url => new Request(url, { cache: 'reload' }));
+        await c.addAll(freshRequests);
+      })
+      .then(() => self.skipWaiting())
+  );
+});
+self.addEventListener('activate', e => {
+  e.waitUntil(
+    caches.keys().then(keys =>
+      Promise.all(keys.filter(k => k !== CACHE).map(k => caches.delete(k)))
+    ).then(() => self.clients.claim())
+  );
+});
+self.addEventListener('fetch', e => {
+  const url = new URL(e.request.url);
+  // API calls: always network-only (no caching)
+  if (url.pathname.startsWith('/api/')) {
+    e.respondWith(fetch(e.request).catch(() =>
+      new Response(JSON.stringify({ detail: 'No server connection' }), {
+        status: 503,
+        headers: { 'Content-Type': 'application/json' },
+      })
+    ));
+    return;
+  }
+  // Static assets: cache-first
+  e.respondWith(
+    caches.match(e.request).then(cached => cached || fetch(e.request).then(resp => {
+      if (resp.ok && STATIC.some(s => url.pathname === s || url.pathname.startsWith(s))) {
+        caches.open(CACHE).then(c => c.put(e.request, resp.clone()));
+      }
+      return resp;
+    }))
+  );
+});