Achim Rabus committed on
Commit
78431ff
·
1 Parent(s): c8ba8c4

Deploy Polyscriptor HTR Space demo

Browse files
.dockerignore ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
# Paths excluded from the Docker build context (and therefore from `COPY . /app`).
.git
.pytest_cache
__pycache__
**/__pycache__
*.pyc
*.ipynb
*.zip
models
htr_gui
Documentation
# engines/commercial_api_engine.py loads `<project root>/.env` at import time;
# keep any local secrets file out of the image.
.env
.gitattributes CHANGED
@@ -19,6 +19,7 @@
19
  *.pb filter=lfs diff=lfs merge=lfs -text
20
  *.pickle filter=lfs diff=lfs merge=lfs -text
21
  *.pkl filter=lfs diff=lfs merge=lfs -text
 
22
  *.pt filter=lfs diff=lfs merge=lfs -text
23
  *.pth filter=lfs diff=lfs merge=lfs -text
24
  *.rar filter=lfs diff=lfs merge=lfs -text
@@ -29,6 +30,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
29
  *.tflite filter=lfs diff=lfs merge=lfs -text
30
  *.tgz filter=lfs diff=lfs merge=lfs -text
31
  *.wasm filter=lfs diff=lfs merge=lfs -text
 
32
  *.xz filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
 
19
  *.pb filter=lfs diff=lfs merge=lfs -text
20
  *.pickle filter=lfs diff=lfs merge=lfs -text
21
  *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.png filter=lfs diff=lfs merge=lfs -text
23
  *.pt filter=lfs diff=lfs merge=lfs -text
24
  *.pth filter=lfs diff=lfs merge=lfs -text
25
  *.rar filter=lfs diff=lfs merge=lfs -text
 
30
  *.tflite filter=lfs diff=lfs merge=lfs -text
31
  *.tgz filter=lfs diff=lfs merge=lfs -text
32
  *.wasm filter=lfs diff=lfs merge=lfs -text
33
+ *.woff2 filter=lfs diff=lfs merge=lfs -text
34
  *.xz filter=lfs diff=lfs merge=lfs -text
35
  *.zip filter=lfs diff=lfs merge=lfs -text
36
  *.zst filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Container image for the hosted demo: installs the Python requirements and
# serves the `web.polyscriptor_server:app` ASGI application with uvicorn.
FROM python:3.11-slim

# Runtime environment.
# NOTE(review): PORT is exported here, but the CMD below passes the literal
# 7860 to uvicorn rather than reading $PORT — keep the two in sync.
# NOTE(review): POLYSCRIPTOR_DEMO_MODE is presumably read by the web app to
# enable the restricted hosted mode — confirm against the server code.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    POLYSCRIPTOR_DEMO_MODE=hf_space \
    HF_HOME=/tmp/huggingface \
    PORT=7860

WORKDIR /app

# System shared libraries; the apt package lists are removed in the same layer.
RUN apt-get update && apt-get install -y --no-install-recommends \
    libgl1 \
    libglib2.0-0 \
    && rm -rf /var/lib/apt/lists/*

# Requirements are copied and installed before the sources so this layer can be
# reused when only application code changes.
COPY hf-space/requirements.txt /tmp/requirements-hf-space.txt
RUN pip install --no-cache-dir -r /tmp/requirements-hf-space.txt

# Copy the rest of the build context (filtered by .dockerignore).
COPY . /app

EXPOSE 7860

CMD ["python", "-m", "uvicorn", "web.polyscriptor_server:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,12 +1,78 @@
1
  ---
2
- title: Polyscriptor Htr Demo
3
- emoji: 📉
4
- colorFrom: gray
5
- colorTo: green
6
  sdk: docker
7
  pinned: false
8
- license: mit
9
- short_description: Demo of Polyscriptor HTR
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Polyscriptor HTR Demo
3
+ emoji: 📝
4
+ colorFrom: blue
5
+ colorTo: gray
6
  sdk: docker
7
  pinned: false
8
+ license: apache-2.0
 
9
  ---
10
 
11
+ # Polyscriptor HTR Demo
12
+
13
+ Polyscriptor is a browser-based demo for handwritten text recognition (HTR) on
14
+ historical Slavic manuscript material. This Hugging Face Space runs a constrained
15
+ public version of the Polyscriptor FastAPI/Web interface.
16
+
17
+ The hosted demo is intended for quick inspection and teaching. It is not the full
18
+ local research environment used for training, batch processing, GPU inference, or
19
+ private manuscript collections.
20
+
21
+ ## Source Code
22
+
23
+ The public Polyscriptor source code is available on GitHub:
24
+
25
+ https://github.com/achimrabus/polyscriptor
26
+
27
+ This Hugging Face Space contains the curated hosted demo deployment. The GitHub
28
+ repository contains the broader Polyscriptor codebase, including the web UI,
29
+ engine plugins, segmentation code, training utilities, and local workflows.
30
+
31
+ ## What This Demo Supports
32
+
33
+ - CRNN-CTC / PyLaia-inspired HTR presets for selected public model repositories.
34
+ - User-supplied API keys for OpenAI, Gemini, Claude, and OpenWebUI-compatible
35
+ endpoints.
36
+ - Public model download from the Hugging Face Hub, primarily under
37
+ `achimrabus/*`.
38
+ - CPU-only inference.
39
+ - Kraken Classical line segmentation, with HPP as a lightweight fallback.
40
+ - Temporary image uploads during the active session.
41
+
42
+ ## Limitations
43
+
44
+ - No private models are bundled with this Space.
45
+ - API-based engines require users to paste their own API key in the browser
46
+ form. The Space does not ship with shared provider credentials.
47
+ - Uploaded files are treated as temporary runtime data and are not part of the
48
+ repository.
49
+ - Large local GPU/VLM engines from the full Polyscriptor workflow are not
50
+ enabled here.
51
+ - Accuracy depends strongly on script, language, writing style, image quality,
52
+ and segmentation quality.
53
+
54
+ ## Model Notes
55
+
56
+ The demo uses publicly available model presets. For best results, choose a model
57
+ that matches the manuscript tradition as closely as possible. The current public
58
+ Polyscriptor model cards are available at:
59
+
60
+ https://huggingface.co/achimrabus
61
+
62
+ ## Project Context
63
+
64
+ Polyscriptor is developed for historical HTR workflows, with a focus on Slavic
65
+ manuscripts and reproducible comparison of OCR/HTR engines. The full development
66
+ repository contains additional tooling for local use, training, evaluation, and
67
+ batch processing; this Space contains only the hosted demo configuration.
68
+
69
+ ## Privacy
70
+
71
+ Do not upload sensitive or unpublished manuscript images unless you are
72
+ comfortable processing them in a hosted public demo environment. The application
73
+ uses temporary server-side files during processing, but this Space should be
74
+ treated as a public demonstration service rather than a secure private workflow.
75
+
76
+ For API-based engines, provider keys are entered by the user at runtime. Do not
77
+ commit keys to this repository or add them to the Space configuration unless you
78
+ intend to provide a shared project credential.
engines/__init__.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""HTR Engine Plugins

This package contains plugin implementations for different HTR engines.
Each engine module implements the HTREngine interface defined in htr_engine_base.py.
"""

# Public engine class names.
# NOTE(review): none of these names is imported or otherwise bound in this
# module, so `from engines import *` would raise AttributeError for the first
# missing name. Callers presumably import each class from its own submodule
# (e.g. `engines.commercial_api_engine`) — confirm, and either add (lazy)
# imports here or drop `__all__`.
__all__ = [
    "TrOCREngine",
    "Qwen3Engine",
    "PyLaiaEngine",
    "KrakenEngine",
    "CommercialAPIEngine",
    "PartyEngine",
    "DeepSeekOCREngine",
    "LightOnOCREngine",
    "PaddleOCREngine",
]
engines/commercial_api_engine.py ADDED
@@ -0,0 +1,768 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Commercial API Engine Plugin
3
+
4
+ Wraps commercial HTR APIs (OpenAI, Gemini, Claude) as a unified plugin.
5
+ """
6
+
7
+ import os
8
+ from pathlib import Path
9
+ from typing import Dict, Any, Optional
10
+ import numpy as np
11
+
12
+ from htr_engine_base import HTREngine, TranscriptionResult
13
+
14
+ # Load environment variables from .env file
15
+ try:
16
+ from dotenv import load_dotenv
17
+ # Look for .env in the project root (parent of engines/)
18
+ env_path = Path(__file__).parent.parent / ".env"
19
+ if env_path.exists():
20
+ load_dotenv(env_path)
21
+ print(f"[CommercialAPIEngine] Loaded environment variables from {env_path}")
22
+ except ImportError:
23
+ print("[CommercialAPIEngine] Warning: python-dotenv not installed. API keys will not be loaded from .env file.")
24
+ print("Install with: pip install python-dotenv")
25
+
26
+ try:
27
+ from PyQt6.QtWidgets import (
28
+ QWidget, QVBoxLayout, QHBoxLayout, QLabel, QComboBox,
29
+ QPushButton, QCheckBox, QLineEdit, QGroupBox, QTextEdit
30
+ )
31
+ from PyQt6.QtCore import Qt
32
+ PYQT_AVAILABLE = True
33
+ except ImportError:
34
+ PYQT_AVAILABLE = False
35
+ QWidget = object
36
+
37
+ try:
38
+ from inference_commercial_api import (
39
+ OpenAIInference, GeminiInference, ClaudeInference,
40
+ check_api_availability,
41
+ OPENAI_MODELS, GEMINI_MODELS, CLAUDE_MODELS,
42
+ fetch_openai_models, fetch_gemini_models
43
+ )
44
+ COMMERCIAL_API_AVAILABLE = True
45
+ API_AVAILABILITY = check_api_availability()
46
+ except ImportError:
47
+ COMMERCIAL_API_AVAILABLE = False
48
+ API_AVAILABILITY = {"openai": False, "gemini": False, "claude": False}
49
+ OPENAI_MODELS = []
50
+ GEMINI_MODELS = []
51
+ CLAUDE_MODELS = []
52
+ fetch_openai_models = lambda api_key=None: []
53
+ fetch_gemini_models = lambda api_key=None: []
54
+
55
+
56
+ class CommercialAPIEngine(HTREngine):
57
+ """Commercial API HTR engine plugin."""
58
+
59
+ def __init__(self):
60
+ # Instance attributes (avoid type annotations here for broader runtime compatibility in some environments)
61
+ self.model = None # Can be OpenAI, Gemini, or Claude
62
+ self._config_widget = None
63
+ self._current_provider = None
64
+
65
+ # Widget references
66
+ self._provider_combo = None
67
+ self._model_combo = None
68
+ self._custom_model_edit = None
69
+ self._use_custom_model_check = None
70
+ self._refresh_models_btn = None
71
+ self._api_key_edit = None
72
+ self._show_key_check = None
73
+ self._prompt_edit = None
74
+ self._thinking_combo = None
75
+ self._temperature_edit = None
76
+ self._max_tokens_edit = None
77
+ self._early_exit_check = None
78
+ self._auto_continue_check = None
79
+ self._max_continuations_edit = None
80
+
81
def get_name(self) -> str:
    """Return this engine's name string."""
    engine_name = "Commercial APIs"
    return engine_name
83
+
84
def get_description(self) -> str:
    """Return the one-line description string for this engine."""
    summary = "OpenAI GPT-4V, Google Gemini, Anthropic Claude vision APIs"
    return summary
86
+
87
def is_available(self) -> bool:
    """Report whether the engine can be used.

    True only when the commercial-API wrapper module was imported
    (``COMMERCIAL_API_AVAILABLE``) and at least one entry of
    ``API_AVAILABILITY`` is truthy.
    """
    if not COMMERCIAL_API_AVAILABLE:
        return False
    return any(API_AVAILABILITY.values())
89
+
90
def get_unavailable_reason(self) -> str:
    """Explain why ``is_available`` is False.

    Returns:
        An installation hint when the wrapper module or every provider
        library is missing; the empty string when the engine is usable.
    """
    wrapper_ok = COMMERCIAL_API_AVAILABLE
    if wrapper_ok and any(API_AVAILABILITY.values()):
        return ""
    if not wrapper_ok:
        return "Commercial API support not available. Install with: pip install openai google-generativeai anthropic"
    return "No API libraries installed. Install at least one: openai, google-generativeai, or anthropic"
96
+
97
def get_config_widget(self):
    """Create Commercial API configuration panel.

    The panel is built on the first call and cached in
    ``self._config_widget``; later calls return the cached widget. Each
    group box is assembled by a private ``_build_*`` helper, which also
    stores the input widgets on ``self``.

    Returns:
        The QWidget holding all configuration groups.

    Raises:
        RuntimeError: if PyQt6 could not be imported.
    """
    if self._config_widget is not None:
        return self._config_widget

    if not PYQT_AVAILABLE:
        # Only QWidget is stubbed when the PyQt6 import fails; without this
        # guard the first QVBoxLayout() below dies with an opaque NameError.
        raise RuntimeError(
            "PyQt6 is required to build the Commercial API configuration widget. "
            "Install with: pip install PyQt6"
        )

    widget = QWidget()
    layout = QVBoxLayout()

    layout.addWidget(self._build_provider_group())
    layout.addWidget(self._build_model_group())
    layout.addWidget(self._build_api_key_group())
    layout.addWidget(self._build_prompt_group())
    layout.addWidget(self._build_thinking_group())
    layout.addWidget(self._build_gemini_advanced_group())

    layout.addStretch()
    widget.setLayout(layout)

    self._config_widget = widget

    # Initialize model list based on default provider
    self._on_provider_changed(self._provider_combo.currentText())

    return widget

def _build_provider_group(self):
    """Build the "API Provider" group and store ``self._provider_combo``."""
    provider_group = QGroupBox("API Provider")
    provider_layout = QVBoxLayout()

    self._provider_combo = QComboBox()
    # Offer only the providers flagged as available in API_AVAILABILITY.
    available_providers = [
        label
        for key, label in (("openai", "OpenAI"), ("gemini", "Gemini"), ("claude", "Claude"))
        if API_AVAILABILITY.get(key, False)
    ]
    if not available_providers:
        available_providers = ["No APIs available"]

    # Items are added before the signal is connected, so populating the combo
    # does not fire _on_provider_changed; get_config_widget calls it explicitly.
    self._provider_combo.addItems(available_providers)
    self._provider_combo.currentTextChanged.connect(self._on_provider_changed)
    provider_layout.addWidget(self._provider_combo)

    provider_group.setLayout(provider_layout)
    return provider_group

def _build_model_group(self):
    """Build the "Model" group: dropdown, refresh button and custom-ID field."""
    model_group = QGroupBox("Model")
    model_layout = QVBoxLayout()

    # Dropdown for standard models
    model_dropdown_layout = QHBoxLayout()
    self._model_combo = QComboBox()
    model_dropdown_layout.addWidget(self._model_combo)

    # Refresh models button
    self._refresh_models_btn = QPushButton("🔄 Refresh")
    self._refresh_models_btn.setToolTip("Fetch latest models from API")
    self._refresh_models_btn.setMaximumWidth(80)
    self._refresh_models_btn.clicked.connect(self._on_refresh_models)
    model_dropdown_layout.addWidget(self._refresh_models_btn)

    model_layout.addLayout(model_dropdown_layout)

    # Custom model ID checkbox and field
    custom_model_layout = QHBoxLayout()
    self._use_custom_model_check = QCheckBox("Use custom model ID:")
    self._use_custom_model_check.toggled.connect(self._on_custom_model_toggled)
    custom_model_layout.addWidget(self._use_custom_model_check)

    self._custom_model_edit = QLineEdit()
    self._custom_model_edit.setPlaceholderText("e.g., gpt-4.5, o1-preview-2024-12-17")
    self._custom_model_edit.setEnabled(False)  # Disabled by default
    custom_model_layout.addWidget(self._custom_model_edit)

    model_layout.addLayout(custom_model_layout)

    model_hint = QLabel("💡 Use custom model ID for bleeding-edge models not in the dropdown")
    model_hint.setStyleSheet("color: gray; font-size: 8pt;")
    model_hint.setWordWrap(True)
    model_layout.addWidget(model_hint)

    model_group.setLayout(model_layout)
    return model_group

def _build_api_key_group(self):
    """Build the "API Key" group: masked key field plus a "Show" checkbox."""
    key_group = QGroupBox("API Key")
    key_layout = QVBoxLayout()

    key_input_layout = QHBoxLayout()
    self._api_key_edit = QLineEdit()
    self._api_key_edit.setEchoMode(QLineEdit.EchoMode.Password)
    self._api_key_edit.setPlaceholderText("Enter your API key")
    key_input_layout.addWidget(self._api_key_edit)

    self._show_key_check = QCheckBox("Show")
    self._show_key_check.toggled.connect(self._toggle_key_visibility)
    key_input_layout.addWidget(self._show_key_check)
    key_layout.addLayout(key_input_layout)

    key_hint = QLabel("API keys are stored locally in .trocr_gui/")
    key_hint.setStyleSheet("color: gray; font-size: 9pt;")
    key_layout.addWidget(key_hint)

    key_group.setLayout(key_layout)
    return key_group

def _build_prompt_group(self):
    """Build the "Prompt & Sampling" group: prompt, temperature, max tokens."""
    prompt_group = QGroupBox("Prompt & Sampling (Optional)")
    prompt_layout = QVBoxLayout()

    self._prompt_edit = QTextEdit()
    self._prompt_edit.setPlaceholderText("Enter custom transcription prompt...")
    self._prompt_edit.setMaximumHeight(100)
    prompt_layout.addWidget(self._prompt_edit)

    # Temperature control
    temp_row = QHBoxLayout()
    temp_row.addWidget(QLabel("Temperature:"))
    self._temperature_edit = QLineEdit()
    self._temperature_edit.setPlaceholderText("1.0 (default)")
    self._temperature_edit.setToolTip(
        "Sampling temperature (web default ~1.0).\n"
        "Use 0-0.3 for deterministic; >1 can increase variability."
    )
    self._temperature_edit.setMaximumWidth(90)
    temp_row.addWidget(self._temperature_edit)
    temp_row.addStretch()
    prompt_layout.addLayout(temp_row)

    # Max output tokens control
    tokens_row = QHBoxLayout()
    tokens_row.addWidget(QLabel("Max output tokens:"))
    self._max_tokens_edit = QLineEdit()
    self._max_tokens_edit.setPlaceholderText("4096 preview / 2048 default")
    self._max_tokens_edit.setToolTip(
        "Upper limit on generated tokens. Lowering may force earlier output.\n"
        "Raising (e.g. 8192) may help high reasoning but risks long 'thinking'."
    )
    self._max_tokens_edit.setMaximumWidth(130)
    tokens_row.addWidget(self._max_tokens_edit)
    tokens_row.addStretch()
    prompt_layout.addLayout(tokens_row)

    prompt_group.setLayout(prompt_layout)
    return prompt_group

def _build_thinking_group(self):
    """Build the "Thinking Mode (Gemini only)" group and its reasoning combo."""
    thinking_group = QGroupBox("Thinking Mode (Gemini only)")
    thinking_layout = QVBoxLayout()

    # (Removed warning banner recommending alternative models; preview model retained for Church Slavonic use)

    thinking_row = QHBoxLayout()
    thinking_row.addWidget(QLabel("Reasoning:"))
    self._thinking_combo = QComboBox()
    self._thinking_combo.addItems(["Auto (Low for preview)", "Low (Fast)", "High (More reasoning)"])
    self._thinking_combo.setToolTip(
        "Low: Fast, direct output\n"
        "High: Slower, uses more tokens for reasoning\n"
        "Auto: Uses Low for preview models to avoid token waste"
    )
    thinking_row.addWidget(self._thinking_combo)
    thinking_row.addStretch()
    thinking_layout.addLayout(thinking_row)

    thinking_group.setLayout(thinking_layout)
    return thinking_group

def _add_fixed_width_field(self, row, label, default_text, tooltip, width):
    """Append ``label`` plus a pre-filled fixed-width QLineEdit to ``row``; return the edit."""
    row.addWidget(QLabel(label))
    field = QLineEdit()
    field.setText(default_text)
    field.setToolTip(tooltip)
    field.setFixedWidth(width)
    row.addWidget(field)
    return field

def _build_gemini_advanced_group(self):
    """Build the "Gemini Advanced" group: streaming, continuation and fallback controls."""
    advanced_group = QGroupBox("Gemini Advanced")
    adv_layout = QVBoxLayout()

    # Row 1: Checkboxes
    adv_row1 = QHBoxLayout()
    self._early_exit_check = QCheckBox("Early exit on first chunk")
    self._early_exit_check.setChecked(True)
    self._early_exit_check.setToolTip("If checked, streaming returns after first non-empty text chunk. Uncheck to collect full stream.")
    adv_row1.addWidget(self._early_exit_check)

    self._auto_continue_check = QCheckBox("Auto continuation")
    self._auto_continue_check.setChecked(False)  # Default: off for speed
    self._auto_continue_check.setToolTip("If checked, performs additional continuation calls to capture missed trailing text.")
    adv_row1.addWidget(self._auto_continue_check)
    adv_row1.addStretch()
    adv_layout.addLayout(adv_row1)

    # Row 2: Continuation settings (symmetrical grid)
    adv_row2 = QHBoxLayout()
    self._max_continuations_edit = self._add_fixed_width_field(
        adv_row2, "Max passes:", "2",
        "Maximum number of continuation attempts (2-3 recommended)", 60,
    )
    adv_row2.addSpacing(20)
    self._min_new_chars_edit = self._add_fixed_width_field(
        adv_row2, "Min new chars:", "50",
        "Minimum number of new characters required to accept a continuation chunk.", 60,
    )
    adv_row2.addStretch()
    adv_layout.addLayout(adv_row2)

    # Row 3: Token & fallback settings (symmetrical grid)
    adv_row3 = QHBoxLayout()
    self._low_initial_tokens_edit = self._add_fixed_width_field(
        adv_row3, "Low-mode tokens:", "6144",
        "Initial max_output_tokens for LOW thinking before fallback escalation (4096-8192).", 60,
    )
    adv_row3.addSpacing(20)
    self._reasoning_fallback_edit = self._add_fixed_width_field(
        adv_row3, "Fallback %:", "0.6",
        "Fraction of token budget consumed internally (no output) that triggers early fallback (0.5-0.8).", 60,
    )
    adv_row3.addSpacing(20)
    self._fallback_cap_edit = self._add_fixed_width_field(
        adv_row3, "Fallback cap:", "8192",
        "Maximum tokens for fallback attempt. Increase for page-wise recognition (e.g. 12288 or 16384).", 70,
    )
    adv_row3.addStretch()
    adv_layout.addLayout(adv_row3)

    advanced_group.setLayout(adv_layout)
    return advanced_group
331
+
332
+ def _get_api_key_file(self) -> 'Path':
333
+ """Get path to API key storage file."""
334
+ from pathlib import Path
335
+ storage_dir = Path.home() / ".trocr_gui"
336
+ storage_dir.mkdir(exist_ok=True)
337
+ return storage_dir / "api_keys.json"
338
+
339
+ def _load_saved_api_key(self):
340
+ """Load saved API key for current provider."""
341
+ try:
342
+ import json
343
+ key_file = self._get_api_key_file()
344
+
345
+ if key_file.exists():
346
+ with open(key_file, "r") as f:
347
+ keys = json.load(f)
348
+
349
+ provider = self._provider_combo.currentText().lower()
350
+ if provider in keys:
351
+ self._api_key_edit.setText(keys[provider])
352
+ except Exception as e:
353
+ print(f"Warning: Could not load saved API key: {e}")
354
+
355
+ def _save_api_key(self):
356
+ """Save API key for current provider."""
357
+ try:
358
+ import json
359
+ key_file = self._get_api_key_file()
360
+
361
+ # Load existing keys
362
+ keys = {}
363
+ if key_file.exists():
364
+ with open(key_file, "r") as f:
365
+ keys = json.load(f)
366
+
367
+ # Update key for current provider
368
+ provider = self._provider_combo.currentText().lower()
369
+ api_key = self._api_key_edit.text().strip()
370
+
371
+ if api_key:
372
+ keys[provider] = api_key
373
+
374
+ with open(key_file, "w") as f:
375
+ json.dump(keys, f, indent=2)
376
+ except Exception as e:
377
+ print(f"Warning: Could not save API key: {e}")
378
+
379
+ def _on_provider_changed(self, provider: str):
380
+ """Update model list when provider changes and load API key from environment."""
381
+ if self._model_combo is None:
382
+ return
383
+
384
+ self._model_combo.clear()
385
+
386
+ if provider == "OpenAI":
387
+ self._model_combo.addItems(OPENAI_MODELS)
388
+ elif provider == "Gemini":
389
+ self._model_combo.addItems(GEMINI_MODELS)
390
+ elif provider == "Claude":
391
+ self._model_combo.addItems(CLAUDE_MODELS)
392
+ else:
393
+ self._model_combo.addItem("No models available")
394
+
395
+ # Auto-load API key from environment variables
396
+ if self._api_key_edit is not None:
397
+ env_key = self._get_api_key_from_env(provider)
398
+ if env_key:
399
+ self._api_key_edit.setText(env_key)
400
+ print(f"[CommercialAPIEngine] Loaded {provider} API key from environment")
401
+
402
+ def _get_api_key_from_env(self, provider: str) -> Optional[str]:
403
+ """Get API key from environment variables based on provider."""
404
+ env_var_map = {
405
+ "OpenAI": "OPENAI_API_KEY",
406
+ "Gemini": "GOOGLE_API_KEY",
407
+ "Claude": "ANTHROPIC_API_KEY"
408
+ }
409
+
410
+ env_var = env_var_map.get(provider)
411
+ if env_var:
412
+ return os.getenv(env_var, "")
413
+
414
+ def _toggle_key_visibility(self, checked: bool):
415
+ """Toggle API key visibility."""
416
+ if checked:
417
+ self._api_key_edit.setEchoMode(QLineEdit.EchoMode.Normal)
418
+ else:
419
+ self._api_key_edit.setEchoMode(QLineEdit.EchoMode.Password)
420
+
421
+ def _on_custom_model_toggled(self, checked: bool):
422
+ """Enable/disable custom model field."""
423
+ self._custom_model_edit.setEnabled(checked)
424
+ self._model_combo.setEnabled(not checked)
425
+
426
+ def _on_refresh_models(self):
427
+ """Refresh model list from API dynamically."""
428
+ if self._model_combo is None or self._api_key_edit is None:
429
+ return
430
+
431
+ provider = self._provider_combo.currentText()
432
+ api_key = self._api_key_edit.text().strip()
433
+
434
+ if not api_key:
435
+ print(f"[CommercialAPIEngine] Cannot refresh models: No API key provided")
436
+ return
437
+
438
+ print(f"[CommercialAPIEngine] Refreshing {provider} models from API...")
439
+
440
+ # Save current selection
441
+ current_model = self._model_combo.currentText()
442
+
443
+ # Fetch models dynamically
444
+ if provider == "OpenAI":
445
+ models = fetch_openai_models(api_key)
446
+ elif provider == "Gemini":
447
+ models = fetch_gemini_models(api_key)
448
+ else:
449
+ print(f"[CommercialAPIEngine] Dynamic refresh not supported for {provider}")
450
+ return
451
+
452
+ # Update dropdown
453
+ self._model_combo.clear()
454
+ self._model_combo.addItems(models)
455
+
456
+ # Restore selection if possible
457
+ idx = self._model_combo.findText(current_model)
458
+ if idx >= 0:
459
+ self._model_combo.setCurrentIndex(idx)
460
+
461
+ print(f"[CommercialAPIEngine] Refreshed {len(models)} models for {provider}")
462
+
463
def get_config(self) -> Dict[str, Any]:
    """Extract configuration from widget controls.

    Returns:
        ``{}`` if the configuration widget has not been built; otherwise a
        dict with the keys ``provider``, ``model``, ``api_key``,
        ``custom_prompt`` (``None`` when the prompt box is empty),
        ``use_custom_model`` and ``custom_model_id``.
    """
    if self._config_widget is None:
        return {}

    prompt_text = self._prompt_edit.toPlainText().strip()

    # The custom model ID wins over the dropdown while its checkbox is ticked.
    model = (
        self._custom_model_edit.text().strip()
        if self._use_custom_model_check.isChecked()
        else self._model_combo.currentText()
    )

    settings: Dict[str, Any] = {}
    settings["provider"] = self._provider_combo.currentText()
    settings["model"] = model
    settings["api_key"] = self._api_key_edit.text().strip()
    settings["custom_prompt"] = prompt_text or None
    settings["use_custom_model"] = self._use_custom_model_check.isChecked()
    settings["custom_model_id"] = self._custom_model_edit.text().strip()
    return settings
484
+
485
def set_config(self, config: Dict[str, Any]):
    """Restore configuration to widget controls.

    Does nothing if the configuration widget has not been built. Provider
    and model are selected only when the saved text exists in the respective
    dropdown. The API-key field is always overwritten (with ``""`` when the
    key is absent), and the prompt box is only changed for a non-empty
    saved prompt.
    """
    if self._config_widget is None:
        return

    def select_if_present(combo, text):
        # Leave the current selection alone when the saved entry is not listed.
        position = combo.findText(text)
        if position >= 0:
            combo.setCurrentIndex(position)

    select_if_present(self._provider_combo, config.get("provider", ""))

    # Restore custom model checkbox and field
    use_custom = config.get("use_custom_model", False)
    self._use_custom_model_check.setChecked(use_custom)

    if not use_custom:
        select_if_present(self._model_combo, config.get("model", ""))
    else:
        self._custom_model_edit.setText(config.get("custom_model_id", ""))

    self._api_key_edit.setText(config.get("api_key", ""))

    saved_prompt = config.get("custom_prompt", "")
    if saved_prompt:
        self._prompt_edit.setPlainText(saved_prompt)
513
+
514
def load_model(self, config: Dict[str, Any]) -> bool:
    """Load (initialize) API client.

    Args:
        config: Reads ``provider`` ("OpenAI", "Gemini" or "Claude"),
            ``model`` and ``api_key``.

    Returns:
        True when a client was created. False when no API key is given, the
        provider is not recognised, or client construction raised; in the
        last case the error is printed and the engine is left unloaded.
    """
    try:
        provider = config.get("provider", "")
        model_name = config.get("model", "")
        api_key = config.get("api_key", "")

        if not api_key:
            print("Error: No API key provided")
            return False

        # Unload previous model
        self.unload_model()

        # Client classes are resolved lazily so that only the selected
        # provider's class name is ever looked up.
        provider_table = (
            ("OpenAI", "openai", lambda: OpenAIInference),
            ("Gemini", "gemini", lambda: GeminiInference),
            ("Claude", "claude", lambda: ClaudeInference),
        )
        for label, provider_id, resolve_client in provider_table:
            if provider == label:
                self.model = resolve_client()(api_key=api_key, model=model_name)
                self._current_provider = provider_id
                return True

        return False

    except Exception as e:
        print(f"Error initializing API client: {e}")
        self.model = None
        self._current_provider = None
        return False
548
+
549
def unload_model(self):
    """Unload (clear) API client.

    Drops the reference to the current client, if any, and resets both
    ``model`` and ``_current_provider`` to ``None``.
    """
    client = self.model
    if client is not None:
        del self.model
    self.model = None
    self._current_provider = None
555
+
556
def is_model_loaded(self) -> bool:
    """Return True when an API client object is currently held in ``self.model``."""
    client = self.model
    return client is not None
559
+
560
+ def transcribe_line(self, image: np.ndarray, config: Optional[Dict[str, Any]] = None) -> TranscriptionResult:
561
+ """Transcribe a line image with commercial API."""
562
+ if self.model is None:
563
+ return TranscriptionResult(text="[API client not initialized]", confidence=0.0)
564
+
565
+ if config is None:
566
+ config = self.get_config()
567
+
568
+ custom_prompt = config.get("custom_prompt")
569
+
570
+ try:
571
+ # Convert numpy array to PIL Image
572
+ from PIL import Image
573
+ if isinstance(image, np.ndarray):
574
+ pil_image = Image.fromarray(image)
575
+ else:
576
+ pil_image = image
577
+
578
+ # All API clients have transcribe() method
579
+ # It returns a string directly, not a dict
580
+ # Enable retry logic for Gemini to handle content blocking
581
+ if self._current_provider == "gemini":
582
+ # Get thinking mode setting
583
+ thinking_mode = None
584
+ temperature = None
585
+ if self._thinking_combo is not None:
586
+ thinking_text = self._thinking_combo.currentText()
587
+ if "Low" in thinking_text:
588
+ thinking_mode = "low"
589
+ fast_direct = True # low mode: request immediate output
590
+ elif "High" in thinking_text:
591
+ thinking_mode = "high"
592
+ # else: Auto = None (default)
593
+ else:
594
+ # Web UI context — get thinking_mode from config dict
595
+ thinking_mode = config.get("thinking_mode") or None
596
+ if self._temperature_edit is not None:
597
+ t_text = self._temperature_edit.text().strip()
598
+ if t_text:
599
+ try:
600
+ temperature = float(t_text)
601
+ except ValueError:
602
+ temperature = None
603
+ max_tokens = None
604
+ if self._max_tokens_edit is not None:
605
+ mt_text = self._max_tokens_edit.text().strip()
606
+ if mt_text:
607
+ try:
608
+ max_tokens = int(mt_text)
609
+ except ValueError:
610
+ max_tokens = None
611
+ # Fallback to config dict (web UI context — no Qt widgets)
612
+ if max_tokens is None:
613
+ max_tokens = config.get("max_output_tokens")
614
+ # Treat 0 as "no limit" (HTML number fields send 0 for blank)
615
+ if max_tokens is not None and max_tokens <= 0:
616
+ max_tokens = None
617
+ if temperature is None:
618
+ temperature = config.get("temperature")
619
+ # Web UI (no Qt widgets): disable early exit for full reasoning quality
620
+ if self._early_exit_check is not None:
621
+ fast_direct_early_exit = self._early_exit_check.isChecked()
622
+ else:
623
+ fast_direct_early_exit = False
624
+ # Extract continuation settings
625
+ auto_continue = False
626
+ max_auto_continuations = 2 # Default
627
+ if self._auto_continue_check is not None and self._auto_continue_check.isChecked():
628
+ auto_continue = True
629
+ if self._max_continuations_edit is not None:
630
+ mc_text = self._max_continuations_edit.text().strip()
631
+ if mc_text:
632
+ try:
633
+ max_auto_continuations = int(mc_text)
634
+ except ValueError:
635
+ pass # Keep default of 2
636
+
637
+ # Extract continuation settings with defaults
638
+ continuation_min_new_chars = 50
639
+ if hasattr(self, '_min_new_chars_edit') and self._min_new_chars_edit is not None:
640
+ mnc_text = self._min_new_chars_edit.text().strip()
641
+ if mnc_text:
642
+ try:
643
+ continuation_min_new_chars = int(mnc_text)
644
+ except ValueError:
645
+ pass # Keep default
646
+
647
+ # Web UI (no Qt widgets): disable reasoning fallback (1.0 = never trigger)
648
+ reasoning_fallback_threshold = 1.0 if not (hasattr(self, '_reasoning_fallback_edit') and self._reasoning_fallback_edit is not None) else 0.6
649
+ if hasattr(self, '_reasoning_fallback_edit') and self._reasoning_fallback_edit is not None:
650
+ rft_text = self._reasoning_fallback_edit.text().strip()
651
+ if rft_text:
652
+ try:
653
+ reasoning_fallback_threshold = float(rft_text)
654
+ except ValueError:
655
+ pass # Keep default
656
+
657
+ fallback_cap = 8192
658
+ if hasattr(self, '_fallback_cap_edit') and self._fallback_cap_edit is not None:
659
+ fc_text = self._fallback_cap_edit.text().strip()
660
+ if fc_text:
661
+ try:
662
+ fallback_cap = int(fc_text)
663
+ except ValueError:
664
+ pass # Keep default if invalid value
665
+
666
+ # Override max_tokens for LOW thinking mode if specified
667
+ if thinking_mode == 'low' and hasattr(self, '_low_initial_tokens_edit') and self._low_initial_tokens_edit is not None:
668
+ lit_text = self._low_initial_tokens_edit.text().strip()
669
+ if lit_text:
670
+ try:
671
+ lit_val = int(lit_text)
672
+ if lit_val > 0:
673
+ max_tokens = lit_val
674
+ print(f"🔧 LOW thinking mode: overriding max_output_tokens to {max_tokens}")
675
+ except ValueError:
676
+ pass # Keep existing max_tokens
677
+
678
+ # Debug: show final token budget
679
+ print(f"📊 Final settings: thinking_mode={thinking_mode}, max_output_tokens={max_tokens or 'model default'}, temp={temperature if temperature is not None else 1.0}")
680
+
681
+ text = self.model.transcribe(
682
+ pil_image,
683
+ prompt=custom_prompt,
684
+ temperature=temperature if temperature is not None else 0.0,
685
+ max_output_tokens=max_tokens, # None = no limit, model uses its own maximum
686
+ auto_retry_on_block=True,
687
+ safety_relax=True,
688
+ verbose_block_logging=True,
689
+ thinking_mode=thinking_mode,
690
+ fast_direct=fast_direct if 'fast_direct' in locals() else False,
691
+ fast_direct_early_exit=fast_direct_early_exit,
692
+ auto_continue=auto_continue,
693
+ max_auto_continuations=max_auto_continuations,
694
+ continuation_min_new_chars=continuation_min_new_chars,
695
+ reasoning_fallback_threshold=reasoning_fallback_threshold,
696
+ fallback_max_output_tokens=fallback_cap,
697
+ record_stats_csv="gemini_runs.csv",
698
+ apply_restriction_prompt=False # Let model reason freely — improves transcription quality
699
+ )
700
+ else:
701
+ temperature = None
702
+ if self._temperature_edit is not None:
703
+ t_text = self._temperature_edit.text().strip()
704
+ if t_text:
705
+ try:
706
+ temperature = float(t_text)
707
+ except ValueError:
708
+ temperature = None
709
+ max_tokens = None
710
+ if self._max_tokens_edit is not None:
711
+ mt_text = self._max_tokens_edit.text().strip()
712
+ if mt_text:
713
+ try:
714
+ max_tokens = int(mt_text)
715
+ except ValueError:
716
+ max_tokens = None
717
+ # Fallback to config dict (web UI context — no Qt widgets)
718
+ if max_tokens is None:
719
+ max_tokens = config.get("max_output_tokens")
720
+ # Treat 0 as "no limit" (HTML number fields send 0 for blank)
721
+ if max_tokens is not None and max_tokens <= 0:
722
+ max_tokens = None
723
+ if temperature is None:
724
+ temperature = config.get("temperature")
725
+ thinking_mode = config.get("thinking_mode") or None
726
+ text = self.model.transcribe(
727
+ pil_image,
728
+ prompt=custom_prompt,
729
+ temperature=temperature if temperature is not None else 0.0,
730
+ max_output_tokens=max_tokens, # None = no limit, model uses its own maximum
731
+ thinking_mode=thinking_mode,
732
+ )
733
+
734
+ meta: Dict[str, Any] = {
735
+ "provider": self._current_provider,
736
+ "model": config.get("model", ""),
737
+ }
738
+ if hasattr(self.model, "last_usage") and self.model.last_usage:
739
+ usage = dict(self.model.last_usage)
740
+ thinking_text = usage.pop("thinking_text", None)
741
+ meta["token_usage"] = usage
742
+ if thinking_text:
743
+ meta["thinking_text"] = thinking_text
744
+ return TranscriptionResult(
745
+ text=text if text else "",
746
+ confidence=1.0, # API models don't provide confidence
747
+ metadata=meta,
748
+ )
749
+
750
+ except Exception as e:
751
+ print(f"Error in API transcription: {e}")
752
+ import traceback
753
+ traceback.print_exc()
754
+ return TranscriptionResult(text=f"[API Error: {e}]", confidence=0.0)
755
+
756
def get_capabilities(self) -> Dict[str, bool]:
    """Describe which optional features the commercial-API engine offers.

    Returns:
        Mapping of capability name to a boolean flag.
    """
    # Only these two are advertised: every provider is a language model and
    # the API side takes care of preprocessing.  Batching, confidence scores
    # and beam search are not exposed to the caller.
    enabled = ("language_model", "preprocessing")
    ordered_keys = (
        "batch_processing",
        "confidence_scores",
        "beam_search",
        "language_model",
        "preprocessing",
    )
    return {key: key in enabled for key in ordered_keys}
765
+
766
def requires_line_segmentation(self) -> bool:
    """Tell the caller whether pages must be split into lines first.

    Returns:
        Always ``False``: this engine accepts full pages.
    """
    needs_line_images = False
    return needs_line_images
engines/kraken_engine.py ADDED
@@ -0,0 +1,535 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Kraken HTR Engine Plugin
3
+
4
+ Wraps the Kraken OCR system as a plugin for the unified GUI.
5
+ Kraken is specialized for historical document OCR with robust segmentation and recognition.
6
+ """
7
+
8
+ import sys
9
+ from pathlib import Path
10
+ from typing import Dict, Any, Optional
11
+ import numpy as np
12
+
13
+
14
def _print(msg: str) -> None:
    """Print ``msg``; if the console cannot encode it, print an ASCII-only copy.

    Characters the console encoding rejects (e.g. emoji on a CP-1252
    terminal) are replaced with ``?`` in the fallback output.
    """
    try:
        print(msg)
    except UnicodeEncodeError:
        ascii_only = msg.encode("ascii", errors="replace").decode("ascii")
        print(ascii_only)
20
+
21
+ from htr_engine_base import HTREngine, TranscriptionResult
22
+
23
+ try:
24
+ from PyQt6.QtWidgets import (
25
+ QWidget, QVBoxLayout, QHBoxLayout, QLabel, QComboBox,
26
+ QPushButton, QLineEdit, QFileDialog, QGroupBox, QCheckBox
27
+ )
28
+ from PyQt6.QtCore import Qt
29
+ PYQT_AVAILABLE = True
30
+ except ImportError:
31
+ PYQT_AVAILABLE = False
32
+ QWidget = object
33
+
34
+ try:
35
+ from kraken import rpred
36
+ from kraken.lib import vgsl, models
37
+ KRAKEN_AVAILABLE = True
38
+ except ImportError:
39
+ KRAKEN_AVAILABLE = False
40
+
41
+
42
+ # Local model (included in repo)
43
+ LOCAL_BLLA_MODEL = "pagexml/blla.mlmodel"
44
+
45
+ # Preset Kraken models — local + Zenodo community models (auto-download on first use)
46
+ KRAKEN_MODELS = {
47
+ "blla-local": {
48
+ "path": LOCAL_BLLA_MODEL,
49
+ "description": "BLLA Segmentation Model (Local, Default)",
50
+ "language": "multi",
51
+ "source": "local"
52
+ },
53
+ # --- VERIFIED ZENODO MODELS ---
54
+ # CATMuS-Print: printed text, multilingual, verified DOI 10.5281/zenodo.10592716
55
+ "catmus-print": {
56
+ "zenodo_id": "10.5281/zenodo.10592716",
57
+ "description": "CATMuS-Print (Modern Printed Text, multilingual)",
58
+ "language": "multi",
59
+ "source": "zenodo"
60
+ },
61
+ # Arabic handwritten segmentation (Muharaf Corpus), verified DOI 10.5281/zenodo.14295555
62
+ "arabic-muharaf": {
63
+ "zenodo_id": "10.5281/zenodo.14295555",
64
+ "description": "Arabic Handwritten Segmentation (Muharaf Corpus)",
65
+ "language": "arabic",
66
+ "source": "zenodo"
67
+ },
68
+ }
69
+
70
+
71
+ class KrakenEngine(HTREngine):
72
+ """Kraken HTR engine plugin."""
73
+
74
+ def __init__(self):
75
+ self.model: Optional[Any] = None # TorchSeqRecognizer
76
+ self._config_widget: Optional[QWidget] = None
77
+
78
+ # Widget references
79
+ self._model_source_combo: Optional[QComboBox] = None
80
+ self._preset_combo: Optional[QComboBox] = None
81
+ self._custom_model_edit: Optional[QLineEdit] = None
82
+ self._bidi_reorder_check: Optional[QCheckBox] = None
83
+
84
def get_name(self) -> str:
    """Return the display name of this engine."""
    engine_name = "Kraken"
    return engine_name
86
+
87
def get_description(self) -> str:
    """Return the one-line description shown for this engine."""
    summary = "Kraken OCR - Specialized for historical documents with .mlmodel support"
    return summary
89
+
90
def is_available(self) -> bool:
    """Report whether the ``kraken`` package could be imported at module load."""
    kraken_importable = KRAKEN_AVAILABLE
    return kraken_importable
92
+
93
def get_unavailable_reason(self) -> str:
    """Explain why the engine is unavailable; empty string when it is usable."""
    if KRAKEN_AVAILABLE:
        return ""
    return "Kraken not installed. Install with: pip install kraken"
97
+
98
def get_config_widget(self) -> QWidget:
    """Build the Kraken settings panel on first call and return it.

    Later calls return the same cached widget.

    Raises:
        RuntimeError: If PyQt6 is not installed.
    """
    if not PYQT_AVAILABLE:
        raise RuntimeError("PyQt6 not installed. Install with: pip install PyQt6")
    if self._config_widget is not None:
        return self._config_widget

    panel = QWidget()
    root_layout = QVBoxLayout()

    # --- Section 1: where the model comes from (preset list vs. custom file)
    source_box = QGroupBox("Model Source")
    source_box_layout = QVBoxLayout()
    self._model_source_combo = QComboBox()
    self._model_source_combo.addItems(["Preset Models", "Custom Model File"])
    # Connected after addItems() so filling the combo does not fire the slot.
    self._model_source_combo.currentTextChanged.connect(self._on_model_source_changed)
    source_box_layout.addWidget(self._model_source_combo)
    source_box.setLayout(source_box_layout)
    root_layout.addWidget(source_box)

    # --- Section 2: preset model chooser
    self._preset_group = QGroupBox("Preset Model")
    preset_box_layout = QVBoxLayout()
    self._preset_combo = QComboBox()
    self._populate_preset_models()
    self._preset_combo.currentIndexChanged.connect(self._on_preset_model_changed)
    preset_box_layout.addWidget(QLabel("Model:"))
    preset_box_layout.addWidget(self._preset_combo)
    download_note = QLabel("Note: Zenodo models (⬇️) auto-download on first use")
    download_note.setStyleSheet("color: gray; font-size: 9pt;")
    preset_box_layout.addWidget(download_note)
    self._preset_group.setLayout(preset_box_layout)
    root_layout.addWidget(self._preset_group)

    # --- Section 3: custom model file (hidden until "Custom Model File" is chosen)
    self._custom_group = QGroupBox("Custom Model")
    custom_box_layout = QVBoxLayout()
    custom_box_layout.addWidget(QLabel("Model File (.mlmodel):"))
    path_row = QHBoxLayout()
    self._custom_model_edit = QLineEdit()
    self._custom_model_edit.setPlaceholderText("Path to .mlmodel file")
    path_row.addWidget(self._custom_model_edit)
    browse_button = QPushButton("Browse...")
    browse_button.clicked.connect(self._browse_model)
    path_row.addWidget(browse_button)
    custom_box_layout.addLayout(path_row)
    self._custom_group.setLayout(custom_box_layout)
    self._custom_group.setVisible(False)
    root_layout.addWidget(self._custom_group)

    # --- Section 4: recognition options
    options_box = QGroupBox("Recognition Settings")
    options_box_layout = QVBoxLayout()
    self._bidi_reorder_check = QCheckBox("Bidirectional Text Reordering")
    self._bidi_reorder_check.setChecked(True)
    self._bidi_reorder_check.setToolTip("Enable for RTL languages (Arabic, Hebrew, etc.)")
    options_box_layout.addWidget(self._bidi_reorder_check)
    options_box.setLayout(options_box_layout)
    root_layout.addWidget(options_box)

    root_layout.addStretch()
    panel.setLayout(root_layout)

    self._config_widget = panel
    return panel
173
+
174
def _populate_preset_models(self):
    """Fill the preset dropdown: local model, Zenodo models, then a browse entry.

    Does nothing when the combo box has not been created yet.
    """
    combo = self._preset_combo
    if combo is None:
        return

    combo.clear()

    if not KRAKEN_MODELS:
        combo.addItem("No presets available")
        return

    # Only the first entry marked "local" is listed.
    first_local = next(
        ((key, meta) for key, meta in KRAKEN_MODELS.items() if meta.get("source") == "local"),
        None,
    )
    if first_local is not None:
        local_id, local_meta = first_local
        local_desc = local_meta.get('description', local_id)
        combo.addItem(f"📁 {local_desc}", userData=local_id)

    combo.insertSeparator(combo.count())

    # Every Zenodo-hosted model, in declaration order.
    for key, meta in KRAKEN_MODELS.items():
        if meta.get("source") != "zenodo":
            continue
        label = meta.get('description', key)
        language = meta.get('language', '')
        combo.addItem(f"⬇️ {label} ({language})", userData=key)

    combo.insertSeparator(combo.count())
    # Sentinel entry handled by _on_preset_model_changed().
    combo.addItem("📂 Browse Custom File...", userData="__custom__")
203
+
204
+ def _on_model_source_changed(self, source: str):
205
+ """Toggle between preset and custom model selection."""
206
+ is_preset = (source == "Preset Models")
207
+ self._preset_group.setVisible(is_preset)
208
+ self._custom_group.setVisible(not is_preset)
209
+
210
def _on_preset_model_changed(self, index: int):
    """React to a preset selection; the browse entry opens a file dialog.

    When the "Browse Custom File..." sentinel is chosen, a file dialog is
    shown.  A selected file switches the panel to custom-model mode and fills
    in the path.  In every case the preset combo is put back on its first
    entry, so the sentinel never stays selected (otherwise a cancelled dialog
    would leave it selected and choosing it again would not re-open the
    dialog, because the index would not change).

    Args:
        index: New combo index (unused; the item data is read instead).
    """
    if self._preset_combo.currentData() != "__custom__":
        return

    file_path, _ = QFileDialog.getOpenFileName(
        self._config_widget,
        "Select Kraken Model File",
        "",
        "Kraken Models (*.mlmodel);;All Files (*)"
    )
    if file_path:
        self._model_source_combo.setCurrentText("Custom Model File")
        self._custom_model_edit.setText(file_path)

    # Reset without re-triggering this slot.
    self._preset_combo.blockSignals(True)
    try:
        self._preset_combo.setCurrentIndex(0)
    finally:
        self._preset_combo.blockSignals(False)
226
+
227
def _browse_model(self):
    """Let the user pick a model file and copy its path into the custom-model field.

    The field is left untouched when the dialog is cancelled.
    """
    dialog_title = "Select Kraken Model"
    start_dir = "models"
    name_filter = "Kraken Models (*.mlmodel);;All Files (*)"
    chosen_path, _selected_filter = QFileDialog.getOpenFileName(
        self._config_widget, dialog_title, start_dir, name_filter
    )
    if not chosen_path:
        return
    self._custom_model_edit.setText(chosen_path)
238
+
239
def get_config(self) -> Dict[str, Any]:
    """Read the current widget state into a configuration dict.

    Returns:
        ``{}`` when the widget has not been built.  Otherwise a dict with
        ``model_source`` and ``bidi_reordering``, plus ``preset_id`` and
        ``model_path`` for a known preset, or ``model_path`` for a custom file.
    """
    if self._config_widget is None:
        return {}

    uses_preset = self._model_source_combo.currentText() == "Preset Models"
    source_kind = "preset" if uses_preset else "custom"
    settings = {
        "model_source": source_kind,
        "bidi_reordering": self._bidi_reorder_check.isChecked(),
    }

    if not uses_preset:
        settings["model_path"] = self._custom_model_edit.text()
        return settings

    selected_id = self._preset_combo.currentData()
    # Separators and the browse sentinel carry no usable preset id.
    if selected_id and selected_id in KRAKEN_MODELS:
        settings["preset_id"] = selected_id
        settings["model_path"] = KRAKEN_MODELS[selected_id].get("path")
    return settings
260
+
261
def set_config(self, config: Dict[str, Any]):
    """Push a saved configuration back into the widget controls.

    Does nothing when the widget has not been built.

    Args:
        config: Dict as produced by ``get_config``; missing keys fall back to
            preset mode, empty path/id and bidi reordering enabled.
    """
    if self._config_widget is None:
        return

    uses_preset = config.get("model_source", "preset") == "preset"
    source_label = "Preset Models" if uses_preset else "Custom Model File"
    self._model_source_combo.setCurrentText(source_label)

    if uses_preset:
        wanted_id = config.get("preset_id", "")
        combo = self._preset_combo
        # Select the first row whose item data equals the stored preset id.
        row = next(
            (i for i in range(combo.count()) if combo.itemData(i) == wanted_id),
            None,
        )
        if row is not None:
            combo.setCurrentIndex(row)
    else:
        self._custom_model_edit.setText(config.get("model_path", ""))

    self._bidi_reorder_check.setChecked(config.get("bidi_reordering", True))
279
+
280
def load_model(self, config: Dict[str, Any]) -> bool:
    """Load a Kraken recognition model described by ``config``.

    Resolution order:
      1. A known ``preset_id``: Zenodo presets are downloaded (or taken from
         the cache); local presets use their configured path.
      2. Otherwise ``model_path`` from the config.
      3. Otherwise the bundled ``LOCAL_BLLA_MODEL``.

    A relative path that does not exist relative to the current working
    directory is retried relative to the repository root (the parent of the
    ``engines/`` directory), so the bundled model is found regardless of
    where the process was started.

    Args:
        config: Engine configuration (``model_path`` and/or ``preset_id``).

    Returns:
        True when the model was loaded; False on any failure (details are
        printed and ``self.model`` is reset to None on exceptions).
    """
    try:
        model_path = config.get("model_path")
        preset_id = config.get("preset_id")

        # Resolve presets: download Zenodo models if needed.
        if preset_id and preset_id in KRAKEN_MODELS:
            model_info = KRAKEN_MODELS[preset_id]
            source = model_info.get("source")
            if source == "zenodo":
                zenodo_id = model_info.get("zenodo_id")
                model_path = self._download_zenodo_model(zenodo_id, preset_id)
                if not model_path:
                    print(f"Error: Failed to download Zenodo model '{preset_id}'")
                    return False
            elif source == "local":
                model_path = model_info.get("path")

        # Fall back to default local blla model.
        if not model_path:
            model_path = LOCAL_BLLA_MODEL
            print(f"No model specified, using default: {model_path}")

        resolved = Path(model_path)
        if not resolved.exists() and not resolved.is_absolute():
            # Repo-relative paths must not depend on the working directory.
            repo_candidate = Path(__file__).parent.parent / resolved
            if repo_candidate.exists():
                resolved = repo_candidate

        if not resolved.exists():
            print(f"Error: Model file not found: {model_path}")
            print("For Zenodo models, run: kraken get <zenodo_id>")
            return False

        vgsl_model = vgsl.TorchVGSLModel.load_model(str(resolved))
        from kraken.lib.models import TorchSeqRecognizer
        self.model = TorchSeqRecognizer(vgsl_model, device='cpu')
        print(f"Kraken model loaded from: {resolved}")
        return True

    except Exception as e:
        import traceback
        print(f"Error loading Kraken model: {e}")
        print(traceback.format_exc())
        self.model = None
        return False
320
+
321
def _download_zenodo_model(self, zenodo_id: str, model_name: str) -> Optional[str]:
    """Download a Kraken model from Zenodo via ``kraken get``.

    Models are cached in ``kraken_models/`` inside the repo root as
    ``<model_name>.mlmodel``.  A cached file (exact name, or any file whose
    stem contains ``model_name``) is reused without downloading.

    Args:
        zenodo_id: DOI of the Zenodo record, e.g. ``10.5281/zenodo.10592716``.
        model_name: Preset key used for the cached file name.

    Returns:
        Local path on success, None on failure (a hint is printed).
    """
    import os
    import shutil
    import subprocess
    import sys
    import time

    # Prefer the kraken binary from the same venv as this Python process
    # (shutil.which only searches PATH, which may not include the venv bin/ in
    # systemd services that invoke uvicorn directly without activating the venv).
    # "kraken.exe" covers a Windows venv, where scripts carry an extension.
    exe_dir = Path(sys.executable).parent
    venv_kraken = next(
        (c for c in (exe_dir / "kraken", exe_dir / "kraken.exe") if c.exists()),
        None,
    )
    kraken_cmd = str(venv_kraken) if venv_kraken is not None else shutil.which("kraken")
    if not kraken_cmd:
        # Zenodo record URLs use the numeric id, i.e. the part after the last
        # dot of the DOI ("10.5281/zenodo.10592716" -> "10592716").
        record_id = zenodo_id.rsplit('.', 1)[-1]
        _print("❌ 'kraken' command not found. Install with: pip install kraken")
        _print(f"💡 Manual download: https://zenodo.org/record/{record_id}")
        return None

    repo_root = Path(__file__).parent.parent
    models_dir = repo_root / "kraken_models"
    models_dir.mkdir(exist_ok=True)
    model_path = models_dir / f"{model_name}.mlmodel"

    if model_path.exists():
        _print(f"✅ Using cached Zenodo model: {model_path}")
        return str(model_path)

    # Check for any existing name-matched file
    for existing in models_dir.glob("*.mlmodel"):
        if model_name.lower() in existing.stem.lower():
            _print(f"✅ Found existing model: {existing}")
            return str(existing)

    _print(f"📥 Downloading Zenodo model {zenodo_id} …")
    _print(f"📂 Will save to: {model_path}")
    _print("⏳ This may take a few minutes on first use …")

    try:
        started = time.time()
        result = subprocess.run(
            [kraken_cmd, "get", zenodo_id],
            capture_output=True, text=True, timeout=300
        )
        if result.returncode == 0:
            # Where `kraken get` stores models depends on the kraken version
            # and the OS.  NOTE(review): the Linux/XDG and legacy "kraken"
            # locations below are the presumed per-user data/config dirs;
            # confirm against the installed kraken version.
            home = Path.home()
            search_dirs = [
                home / "Library" / "Application Support" / "htrmopo",
                home / ".kraken",
                home / ".local" / "share" / "htrmopo",
                home / ".config" / "kraken",
                home / "Library" / "Application Support" / "kraken",
            ]
            xdg_data_home = os.environ.get("XDG_DATA_HOME")
            if xdg_data_home:
                search_dirs.append(Path(xdg_data_home) / "htrmopo")

            # Accept anything written during this download, and at least
            # everything from the last two minutes.
            cutoff = min(started, time.time() - 120)
            fresh = [
                p
                for d in search_dirs if d.exists()
                for p in d.rglob("*.mlmodel")
                if p.stat().st_mtime >= cutoff
            ]
            # Newest file wins, so a stale sibling is never picked by accident.
            downloaded = max(fresh, key=lambda p: p.stat().st_mtime, default=None)

            if downloaded is not None and downloaded.exists():
                shutil.copy2(downloaded, model_path)
                _print(f"✅ Model saved to: {model_path}")
                return str(model_path)
            else:
                _print("⚠️ Download succeeded but couldn't locate the file")
        else:
            _print(f"❌ kraken get failed (exit {result.returncode}): {result.stderr}")
        _print(f"💡 Manual: kraken get {zenodo_id} then copy to {models_dir}/")
    except subprocess.TimeoutExpired:
        _print("⏱️ Download timeout (>5 min). Try manually: kraken get " + zenodo_id)
    except Exception as e:
        _print(f"❌ Download error: {e}")

    return None
397
+
398
def unload_model(self):
    """Drop the loaded recognizer, if any, and release cached CUDA memory."""
    if self.model is None:
        return

    del self.model
    self.model = None

    # torch is imported lazily so the module itself does not depend on it.
    import torch
    cuda_present = torch.cuda.is_available()
    if cuda_present:
        torch.cuda.empty_cache()
408
+
409
def is_model_loaded(self) -> bool:
    """Return True when a recognizer is currently held in ``self.model``."""
    loaded = self.model is not None
    return loaded
412
+
413
def transcribe_line(self, image: np.ndarray, config: Optional[Dict[str, Any]] = None) -> TranscriptionResult:
    """Recognize the text of a single line image with the loaded Kraken model.

    Args:
        image: Line image as a numpy array (any other object is passed on
            as-is and treated as a PIL image).
        config: Engine settings; when None the current widget settings are used.
            Only ``bidi_reordering`` is read here.

    Returns:
        The prediction with the mean per-character confidence (1.0 when the
        model reports none).  A placeholder result with confidence 0.0 is
        returned when no model is loaded, nothing was recognized, or an
        error occurred.
    """
    if self.model is None:
        return TranscriptionResult(text="[Model not loaded]", confidence=0.0)

    settings = self.get_config() if config is None else config

    try:
        import numpy as np
        from PIL import Image as PILImage

        # numpy -> PIL, then force 8-bit grayscale.
        line_img = PILImage.fromarray(image) if isinstance(image, np.ndarray) else image
        if line_img.mode != 'L':
            line_img = line_img.convert('L')
        # Deliberately NOT binarized: the grayscale image is handed to the
        # recognizer unchanged.

        from kraken.containers import BaselineLine, Segmentation

        img_h = line_img.height
        img_w = line_img.width
        right = img_w - 1
        bottom = img_h - 1
        mid_y = img_h // 2

        # Synthetic segmentation: one horizontal baseline through the middle
        # and a rectangle covering the whole image, both kept inside the
        # 0-indexed pixel bounds.
        whole_line = BaselineLine(
            id='line_0',
            baseline=[[0, mid_y], [right, mid_y]],
            boundary=[[0, 0], [right, 0], [right, bottom], [0, bottom]],
            text='',
            tags=None,
            split=None
        )
        segmentation = Segmentation(
            type='baselines',
            imagename='line',
            text_direction='horizontal-lr',
            script_detection=False,
            lines=[whole_line],
            regions={},
            line_orders=[]
        )

        # rpred yields one record per line; materialize the generator.
        records = list(rpred.rpred(
            network=self.model,
            im=line_img,
            bounds=segmentation,
            bidi_reordering=settings.get("bidi_reordering", True)
        ))

        if not records:
            return TranscriptionResult(text="", confidence=0.0)

        first = records[0]
        char_conf = first.confidences
        if char_conf:
            mean_conf = sum(char_conf) / len(char_conf)
        else:
            mean_conf = 1.0
        return TranscriptionResult(
            text=first.prediction,
            confidence=mean_conf,
            metadata={"model": "kraken"}
        )

    except Exception as e:
        import traceback
        print(f"Error in Kraken transcription: {e}")
        print(traceback.format_exc())
        return TranscriptionResult(text=f"[Error: {e}]", confidence=0.0)
508
+
509
def get_capabilities(self) -> Dict[str, bool]:
    """Describe which optional features the Kraken engine offers.

    Returns:
        Mapping of capability name to a boolean flag; only per-character
        confidence scores are reported as supported.
    """
    ordered_keys = (
        "batch_processing",
        "confidence_scores",
        "beam_search",
        "language_model",
        "preprocessing",
    )
    return {key: key == "confidence_scores" for key in ordered_keys}
518
+
519
+
520
def download_preset_model(preset_name: str) -> Optional[str]:
    """Resolve a Kraken preset to a local file path, downloading it if required.

    Module-level convenience for callers that do not want to construct a
    ``KrakenEngine`` themselves.

    Args:
        preset_name: Key in ``KRAKEN_MODELS``.

    Returns:
        Local file path, or None when the preset is unknown, has an
        unsupported source, or the download failed.
    """
    if preset_name not in KRAKEN_MODELS:
        print(f"Unknown Kraken preset: '{preset_name}'. Available: {list(KRAKEN_MODELS)}")
        return None

    preset = KRAKEN_MODELS[preset_name]
    origin = preset.get("source")
    if origin == "local":
        return preset.get("path")
    if origin != "zenodo":
        return None

    # __new__ skips __init__: the download helper needs no instance state.
    helper = KrakenEngine.__new__(KrakenEngine)
    return helper._download_zenodo_model(preset["zenodo_id"], preset_name)
engines/openwebui_engine.py ADDED
@@ -0,0 +1,505 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ OpenWebUI Engine Plugin
3
+
4
+ Wraps the OpenWebUI API (OpenAI-compatible) from uni-freiburg.de as an HTR engine.
5
+ Supports multiple models available on the OpenWebUI platform.
6
+ """
7
+
8
+ from typing import Dict, Any, Optional, List
9
+ from pathlib import Path
10
+ import numpy as np
11
+ from PIL import Image
12
+ import io
13
+ import base64
14
+
15
+ from htr_engine_base import HTREngine, TranscriptionResult
16
+
17
+ try:
18
+ from PyQt6.QtWidgets import (
19
+ QWidget, QVBoxLayout, QHBoxLayout, QLabel, QComboBox,
20
+ QPushButton, QCheckBox, QLineEdit, QGroupBox, QTextEdit,
21
+ QSpinBox
22
+ )
23
+ from PyQt6.QtCore import Qt
24
+ PYQT_AVAILABLE = True
25
+ except ImportError:
26
+ PYQT_AVAILABLE = False
27
+ QWidget = object
28
+
29
+ try:
30
+ from openai import OpenAI
31
+ OPENAI_AVAILABLE = True
32
+ except ImportError:
33
+ OPENAI_AVAILABLE = False
34
+
35
+ try:
36
+ from dotenv import load_dotenv
37
+ DOTENV_AVAILABLE = True
38
+ except ImportError:
39
+ DOTENV_AVAILABLE = False
40
+
41
+
42
+ class OpenWebUIEngine(HTREngine):
43
+ """OpenWebUI API HTR engine plugin (OpenAI-compatible)."""
44
+
45
+ def __init__(self):
46
+ self.client: Optional[OpenAI] = None
47
+ self._config_widget: Optional[QWidget] = None
48
+ self._available_models: List[str] = []
49
+
50
+ # Store config from load_model for batch processing
51
+ self._loaded_config: Dict[str, Any] = {}
52
+
53
+ # Widget references
54
+ self._model_combo: Optional[QComboBox] = None
55
+ self._api_key_edit: Optional[QLineEdit] = None
56
+ self._show_key_check: Optional[QCheckBox] = None
57
+ self._prompt_edit: Optional[QTextEdit] = None
58
+ self._temperature_spin: Optional[QSpinBox] = None
59
+ self._max_tokens_spin: Optional[QSpinBox] = None
60
+ self._refresh_models_btn: Optional[QPushButton] = None
61
+
62
+ # Default API configuration
63
+ self.base_url = ""
64
+
65
+ # Load environment variables from .env file (only once when instantiated)
66
+ self._load_env_variables()
67
+
68
+ def _load_env_variables(self):
69
+ """Load environment variables from .env file if available."""
70
+ try:
71
+ from dotenv import load_dotenv
72
+ # Look for .env in the project root (parent of engines/)
73
+ env_path = Path(__file__).parent.parent / ".env"
74
+ if env_path.exists():
75
+ load_dotenv(env_path)
76
+ except ImportError:
77
+ # Silently skip if python-dotenv is not installed
78
+ # Environment variables can still be set via OS
79
+ pass
80
+
81
+ # Load environment variables from .env file (if available)
82
+ self._load_env_file()
83
+
84
def _load_env_file(self):
    """Read the ``.env`` file in the project root into the environment.

    The project root is the parent of the ``engines/`` directory.  Nothing
    happens when python-dotenv is not installed or the file does not exist;
    the engine can still be used with an API key supplied through the config
    dict.
    """
    if DOTENV_AVAILABLE:
        dotenv_file = Path(__file__).parent.parent / ".env"
        if dotenv_file.exists():
            load_dotenv(dotenv_file)
99
+
100
def get_name(self) -> str:
    """Return the display name of this engine."""
    engine_name = "OpenWebUI"
    return engine_name
102
+
103
def get_description(self) -> str:
    """Return the one-line description shown for this engine."""
    summary = "OpenWebUI API from openwebui.uni-freiburg.de (OpenAI-compatible, multiple models)"
    return summary
105
+
106
def is_available(self) -> bool:
    """Report whether the ``openai`` client library could be imported."""
    client_importable = OPENAI_AVAILABLE
    return client_importable
108
+
109
def get_unavailable_reason(self) -> str:
    """Explain why the engine is unavailable; empty string when it is usable."""
    if OPENAI_AVAILABLE:
        return ""
    return "OpenAI library not installed. Install with: pip install openai"
113
+
114
def get_config_widget(self) -> QWidget:
    """Create the OpenWebUI configuration panel on first call and return it.

    Later calls return the same cached widget.  After the panel is built,
    a previously saved API key is loaded into it.

    Raises:
        RuntimeError: If PyQt6 is not installed (without this guard the
            method would fail later with an unhelpful NameError).
    """
    if not PYQT_AVAILABLE:
        raise RuntimeError("PyQt6 not installed. Install with: pip install PyQt6")
    if self._config_widget is not None:
        return self._config_widget

    widget = QWidget()
    layout = QVBoxLayout()

    # API Key section
    key_group = QGroupBox("API Key")
    key_layout = QVBoxLayout()

    key_input_layout = QHBoxLayout()
    self._api_key_edit = QLineEdit()
    # Masked by default; the "Show" checkbox toggles visibility.
    self._api_key_edit.setEchoMode(QLineEdit.EchoMode.Password)
    self._api_key_edit.setPlaceholderText("Enter your OpenWebUI API key")
    key_input_layout.addWidget(self._api_key_edit)

    self._show_key_check = QCheckBox("Show")
    self._show_key_check.toggled.connect(self._toggle_key_visibility)
    key_input_layout.addWidget(self._show_key_check)
    key_layout.addLayout(key_input_layout)

    key_hint = QLabel("Get your API key from https://openwebui.uni-freiburg.de")
    key_hint.setStyleSheet("color: gray; font-size: 9pt;")
    key_layout.addWidget(key_hint)

    key_group.setLayout(key_layout)
    layout.addWidget(key_group)

    # Model selection with refresh button
    model_group = QGroupBox("Model Selection")
    model_layout = QVBoxLayout()

    model_select_layout = QHBoxLayout()
    self._model_combo = QComboBox()
    self._model_combo.setMinimumWidth(300)
    model_select_layout.addWidget(self._model_combo)

    self._refresh_models_btn = QPushButton("Refresh Models")
    self._refresh_models_btn.clicked.connect(self._refresh_models)
    model_select_layout.addWidget(self._refresh_models_btn)

    model_layout.addLayout(model_select_layout)

    model_hint = QLabel("Click 'Refresh Models' to load available models from the server")
    model_hint.setStyleSheet("color: gray; font-size: 9pt;")
    model_layout.addWidget(model_hint)

    model_group.setLayout(model_layout)
    layout.addWidget(model_group)

    # Generation parameters
    params_group = QGroupBox("Generation Parameters")
    params_layout = QVBoxLayout()

    # Temperature: integer spin box in hundredths (10 -> 0.1)
    temp_layout = QHBoxLayout()
    temp_layout.addWidget(QLabel("Temperature:"))
    self._temperature_spin = QSpinBox()
    self._temperature_spin.setRange(0, 100)
    self._temperature_spin.setValue(10)  # 0.1
    self._temperature_spin.setSuffix(" (×0.01)")
    temp_layout.addWidget(self._temperature_spin)
    temp_layout.addStretch()
    params_layout.addLayout(temp_layout)

    # Max tokens
    tokens_layout = QHBoxLayout()
    tokens_layout.addWidget(QLabel("Max Tokens:"))
    self._max_tokens_spin = QSpinBox()
    self._max_tokens_spin.setRange(100, 4096)
    self._max_tokens_spin.setValue(500)
    tokens_layout.addWidget(self._max_tokens_spin)
    tokens_layout.addStretch()
    params_layout.addLayout(tokens_layout)

    params_group.setLayout(params_layout)
    layout.addWidget(params_group)

    # Custom prompt section
    prompt_group = QGroupBox("Custom Prompt (Optional)")
    prompt_layout = QVBoxLayout()

    self._prompt_edit = QTextEdit()
    self._prompt_edit.setPlaceholderText(
        "Enter custom transcription prompt...\n\n"
        "Default prompt:\n"
        "Transcribe the text in this historical manuscript line image. "
        "Return only the transcribed text without any explanation or formatting."
    )
    self._prompt_edit.setMaximumHeight(120)
    prompt_layout.addWidget(self._prompt_edit)

    prompt_group.setLayout(prompt_layout)
    layout.addWidget(prompt_group)

    layout.addStretch()
    widget.setLayout(layout)

    self._config_widget = widget

    # Try to load saved API key
    self._load_saved_api_key()

    return widget
220
+
221
+ def _toggle_key_visibility(self, checked: bool):
222
+ """Toggle API key visibility."""
223
+ if checked:
224
+ self._api_key_edit.setEchoMode(QLineEdit.EchoMode.Normal)
225
+ else:
226
+ self._api_key_edit.setEchoMode(QLineEdit.EchoMode.Password)
227
+
228
+ def _get_api_key_file(self) -> 'Path':
229
+ """Get path to API key storage file."""
230
+ from pathlib import Path
231
+ storage_dir = Path.home() / ".trocr_gui"
232
+ storage_dir.mkdir(exist_ok=True)
233
+ return storage_dir / "api_keys.json"
234
+
235
+ def _load_saved_api_key(self):
236
+ """Load saved API key."""
237
+ try:
238
+ import json
239
+ key_file = self._get_api_key_file()
240
+
241
+ if key_file.exists():
242
+ with open(key_file, "r") as f:
243
+ keys = json.load(f)
244
+
245
+ if "openwebui" in keys:
246
+ self._api_key_edit.setText(keys["openwebui"])
247
+ except Exception as e:
248
+ print(f"Warning: Could not load saved API key: {e}")
249
+
250
+ def _save_api_key(self):
251
+ """Save API key."""
252
+ try:
253
+ import json
254
+ key_file = self._get_api_key_file()
255
+
256
+ # Load existing keys
257
+ keys = {}
258
+ if key_file.exists():
259
+ with open(key_file, "r") as f:
260
+ keys = json.load(f)
261
+
262
+ # Update key for OpenWebUI
263
+ api_key = self._api_key_edit.text().strip()
264
+
265
+ if api_key:
266
+ keys["openwebui"] = api_key
267
+
268
+ with open(key_file, "w") as f:
269
+ json.dump(keys, f, indent=2)
270
+ except Exception as e:
271
+ print(f"Warning: Could not save API key: {e}")
272
+
273
+ def _refresh_models(self):
274
+ """Fetch available models from OpenWebUI API."""
275
+ api_key = self._api_key_edit.text().strip()
276
+
277
+ if not api_key:
278
+ self._model_combo.clear()
279
+ self._model_combo.addItem("Please enter API key first")
280
+ return
281
+
282
+ try:
283
+ # Create temporary client to fetch models
284
+ client = OpenAI(
285
+ base_url=self.base_url,
286
+ api_key=api_key
287
+ )
288
+
289
+ # Fetch models
290
+ models = client.models.list()
291
+
292
+ self._available_models = []
293
+ for model in models.data:
294
+ self._available_models.append(model.id)
295
+
296
+ # Update combo box
297
+ self._model_combo.clear()
298
+ if self._available_models:
299
+ self._model_combo.addItems(sorted(self._available_models))
300
+ print(f"[OpenWebUI] Loaded {len(self._available_models)} models")
301
+ else:
302
+ self._model_combo.addItem("No models found")
303
+
304
+ except Exception as e:
305
+ print(f"Error fetching models: {e}")
306
+ self._model_combo.clear()
307
+ self._model_combo.addItem(f"Error: {str(e)[:50]}")
308
+
309
def get_config(self) -> Dict[str, Any]:
    """Snapshot the current widget values as a plain configuration dict.

    Returns an empty dict while the configuration panel has not been built.
    """
    if self._config_widget is None:
        return {}

    settings: Dict[str, Any] = {}
    settings["api_key"] = self._api_key_edit.text().strip()
    settings["model"] = self._model_combo.currentText()
    # The spin box holds hundredths; convert to the real temperature.
    settings["temperature"] = self._temperature_spin.value() / 100.0
    settings["max_tokens"] = self._max_tokens_spin.value()
    # A blank prompt is reported as None so the default prompt is used.
    settings["custom_prompt"] = self._prompt_edit.toPlainText().strip() or None
    return settings
323
+
324
def set_config(self, config: Dict[str, Any]):
    """Restore configuration to widget controls.

    Does nothing while the configuration panel has not been built.
    Missing or ``None`` entries fall back to the defaults used elsewhere
    in this engine (temperature 0.1, max tokens 500, empty key).

    Args:
        config: Dict in the shape returned by ``get_config()``.
    """
    if self._config_widget is None:
        return

    self._api_key_edit.setText(config.get("api_key") or "")

    # Select the model only if the dropdown already lists it.
    model = config.get("model") or ""
    idx = self._model_combo.findText(model)
    if idx >= 0:
        self._model_combo.setCurrentIndex(idx)

    temperature = config.get("temperature")
    if temperature is None:
        temperature = 0.1
    # round(), not int(): 0.29 * 100 evaluates to 28.999999999999996, so
    # truncation would restore 28 for a value saved as 29.
    self._temperature_spin.setValue(round(temperature * 100))

    max_tokens = config.get("max_tokens")
    self._max_tokens_spin.setValue(500 if max_tokens is None else max_tokens)

    custom_prompt = config.get("custom_prompt", "")
    if custom_prompt:
        self._prompt_edit.setPlainText(custom_prompt)
344
+
345
def load_model(self, config: Dict[str, Any]) -> bool:
    """Initialize OpenWebUI client.

    Args:
        config: Engine configuration. ``api_key`` is required.
            ``base_url`` is optional: when it is missing or empty, the
            instance's current ``self.base_url`` is used, so a config
            produced by ``get_config()`` (which carries no ``base_url``)
            can be loaded directly.

    Returns:
        True when the client was created, False otherwise. Errors are
        printed, never raised.
    """
    try:
        api_key = config.get("api_key", "")

        if not api_key:
            print("Error: No API key provided. Paste your key in the field.")
            return False

        # An explicit config value wins; otherwise fall back to the URL
        # already stored on the instance.
        base_url = config.get("base_url") or getattr(self, "base_url", None) or ""
        base_url = base_url.strip().rstrip("/")
        if not base_url:
            print("Error: No OpenWebUI base URL provided.")
            return False

        # Store config for batch processing (model, temperature, etc.)
        self._loaded_config = config.copy()

        # Save API key for future use
        if self._api_key_edit and self._api_key_edit.text().strip():
            self._save_api_key()

        self.base_url = base_url

        # Initialize client
        self.client = OpenAI(
            base_url=self.base_url,
            api_key=api_key
        )

        model = config.get("model", config.get("model_id", "unknown"))
        print(f"[OpenWebUI] Client initialized with base URL: {self.base_url}, model: {model}")
        return True

    except Exception as e:
        print(f"Error initializing OpenWebUI client: {e}")
        self.client = None
        return False
382
+
383
def unload_model(self):
    """Unload OpenWebUI client.

    Drops the client and forgets the configuration stored by
    ``load_model()``. Both are reset unconditionally, so configuration
    left behind by a failed ``load_model()`` (which stores the config
    before creating the client) is cleared as well.
    """
    self.client = None
    self._loaded_config = {}
388
+
389
def is_model_loaded(self) -> bool:
    """Report whether an API client currently exists."""
    client_missing = self.client is None
    return not client_missing
392
+
393
def transcribe_line(self, image: np.ndarray, config: Optional[Dict[str, Any]] = None) -> TranscriptionResult:
    """Transcribe a line image with OpenWebUI API.

    Args:
        image: Image as a numpy array; any other object is passed on as
            an already constructed PIL image.
        config: Optional per-call configuration. When omitted, the
            configuration stored by ``load_model()`` is used, and the
            GUI configuration if none was stored.

    Returns:
        A TranscriptionResult. On success ``confidence`` is 1.0 (the API
        reports no confidence) and ``metadata`` holds provider, model and
        token usage. On any failure, including a response without text
        content, ``text`` is a bracketed error message and ``confidence``
        is 0.0; nothing is raised.
    """
    if self.client is None:
        return TranscriptionResult(text="[OpenWebUI client not initialized]", confidence=0.0)

    if config is None:
        # First try loaded config (from batch processing), then GUI config
        if self._loaded_config:
            config = self._loaded_config
        else:
            config = self.get_config()

    try:
        # Convert numpy array to PIL Image
        if isinstance(image, np.ndarray):
            pil_image = Image.fromarray(image)
        else:
            pil_image = image

        # Convert to RGB if needed
        if pil_image.mode != 'RGB':
            pil_image = pil_image.convert('RGB')

        # Encode image to base64
        buffered = io.BytesIO()
        pil_image.save(buffered, format="PNG")
        img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')

        # Prepare prompt
        custom_prompt = config.get("custom_prompt")
        if custom_prompt:
            prompt = custom_prompt
        else:
            prompt = (
                "Transcribe the text in this historical manuscript line image. "
                "Return only the transcribed text without any explanation or formatting."
            )

        # Get model and parameters
        model = config.get("model", "gpt-4-vision-preview")
        temperature = config.get("temperature", 0.1)
        max_tokens = config.get("max_tokens")
        # Treat 0 as "no limit" (HTML number fields send 0 for blank)
        if max_tokens is not None and max_tokens <= 0:
            max_tokens = None

        # Call OpenWebUI API (OpenAI-compatible)
        api_kwargs = dict(
            model=model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{img_base64}"
                            }
                        }
                    ]
                }
            ],
            temperature=temperature,
        )
        # Only send max_tokens when a positive limit was requested.
        if max_tokens is not None:
            api_kwargs["max_tokens"] = max_tokens
        response = self.client.chat.completions.create(**api_kwargs)

        # Extract transcription. The message content is optional in the
        # response schema, and the choice list can be empty; report both
        # cases clearly instead of failing on `.strip()` / `[0]`.
        choices = getattr(response, "choices", None)
        if not choices:
            raise ValueError("response contained no choices")
        content = choices[0].message.content
        if content is None:
            raise ValueError("response contained no text content")
        text = content.strip()

        # Extract usage info
        usage = {}
        if hasattr(response, 'usage') and response.usage:
            usage = {
                "prompt_tokens": response.usage.prompt_tokens,
                "completion_tokens": response.usage.completion_tokens,
                "total_tokens": response.usage.total_tokens
            }

        return TranscriptionResult(
            text=text,
            confidence=1.0,  # OpenWebUI doesn't provide confidence
            metadata={
                "provider": "openwebui",
                "model": model,
                "usage": usage
            }
        )

    except Exception as e:
        print(f"Error in OpenWebUI transcription: {e}")
        import traceback
        traceback.print_exc()
        return TranscriptionResult(text=f"[OpenWebUI Error: {e}]", confidence=0.0)
492
+
493
def get_capabilities(self) -> Dict[str, bool]:
    """Report which optional engine features this OpenWebUI engine declares."""
    # Features declared as unsupported first, then the supported ones.
    flags = dict.fromkeys(("batch_processing", "confidence_scores", "beam_search"), False)
    flags.update(dict.fromkeys(("language_model", "preprocessing"), True))
    return flags
502
+
503
def requires_line_segmentation(self) -> bool:
    """OpenWebUI VLMs can process full pages directly without segmentation.

    Returns:
        Always False.
    """
    # NOTE(review): the default prompt in transcribe_line() speaks of a
    # "line image"; confirm it is also suitable for full-page input.
    return False  # VLMs process full page images
engines/pylaia_engine.py ADDED
@@ -0,0 +1,414 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PyLaia Engine Plugin
3
+
4
+ Wraps the PyLaia CTC-based HTR inference system as a plugin.
5
+ """
6
+
7
+ from pathlib import Path
8
+ from typing import Dict, Any, List, Optional
9
+ import numpy as np
10
+
11
+ from htr_engine_base import HTREngine, TranscriptionResult
12
+
13
+ try:
14
+ from PyQt6.QtWidgets import (
15
+ QWidget, QVBoxLayout, QHBoxLayout, QLabel, QComboBox,
16
+ QPushButton, QCheckBox, QLineEdit, QFileDialog,
17
+ QGroupBox, QDoubleSpinBox
18
+ )
19
+ from PyQt6.QtCore import Qt
20
+ PYQT_AVAILABLE = True
21
+ except ImportError:
22
+ PYQT_AVAILABLE = False
23
+ QWidget = object
24
+
25
+ try:
26
+ # Use native Linux implementation (no WSL dependency)
27
+ from inference_pylaia_native import PyLaiaInference, PYLAIA_MODELS
28
+ PYLAIA_AVAILABLE = True
29
+ PYLAIA_LM_AVAILABLE = False # Language model not yet implemented
30
+ except ImportError:
31
+ PYLAIA_AVAILABLE = False
32
+ PYLAIA_MODELS = {}
33
+ PYLAIA_LM_AVAILABLE = False
34
+
35
+
36
class PyLaiaEngine(HTREngine):
    """PyLaia CTC-based HTR engine plugin.

    Wraps ``PyLaiaInference`` behind the ``HTREngine`` interface and
    provides a Qt configuration panel for choosing the model, the
    (currently unavailable) language model and output options.
    """

    def __init__(self):
        self.model: Optional["PyLaiaInference"] = None
        # Reserved for language-model decoding. PyLaiaInferenceLM is not
        # imported by this module, and PYLAIA_LM_AVAILABLE is always False.
        self.model_lm: Optional["PyLaiaInferenceLM"] = None
        self._config_widget: Optional[QWidget] = None
        # Configuration of the most recent successful load_model() call;
        # used by transcribe_line() when no per-call config is given.
        self._loaded_config: Dict[str, Any] = {}

        # Widget references (created in get_config_widget)
        self._model_combo: Optional[QComboBox] = None
        self._use_lm_check: Optional[QCheckBox] = None
        self._lm_weight_spin: Optional[QDoubleSpinBox] = None
        self._custom_model_edit: Optional[QLineEdit] = None
        self._custom_lm_edit: Optional[QLineEdit] = None
        self._browse_lm_btn: Optional[QPushButton] = None
        self._enable_spaces_check: Optional[QCheckBox] = None
        self._flip_rtl_check: Optional[QCheckBox] = None

    def get_name(self) -> str:
        """Return the display name of the engine."""
        return "CRNN-CTC (PyLaia-inspired)"

    def get_description(self) -> str:
        """Return a one-line description of the engine."""
        return "Puigcerver CRNN-CTC: clean-room PyTorch reimplementation of the PyLaia architecture"

    def get_aliases(self) -> List[str]:
        """Return alternative names under which the engine can be looked up."""
        return ["crnn-ctc", "CRNN-CTC", "PyLaia"]  # "PyLaia" kept for backward compatibility

    def is_available(self) -> bool:
        """Return True when the inference module could be imported."""
        return PYLAIA_AVAILABLE

    def get_unavailable_reason(self) -> str:
        """Explain why the engine is unavailable; empty string when it is available."""
        if not PYLAIA_AVAILABLE:
            return "CRNN-CTC engine not available. Check that inference_pylaia_native.py exists and dependencies are installed."
        return ""

    def get_config_widget(self) -> QWidget:
        """Create PyLaia configuration panel.

        The panel is built once and cached; later calls return the same
        widget.
        """
        if self._config_widget is not None:
            return self._config_widget

        widget = QWidget()
        layout = QVBoxLayout()

        # Model selection
        model_group = QGroupBox("Model Selection")
        model_layout = QVBoxLayout()

        # Preset models
        model_layout.addWidget(QLabel("Preset Model:"))
        self._model_combo = QComboBox()
        self._populate_preset_models()
        self._model_combo.currentTextChanged.connect(self._on_preset_changed)
        model_layout.addWidget(self._model_combo)

        # Custom model path
        model_layout.addWidget(QLabel("Custom Model Path:"))
        custom_layout = QHBoxLayout()
        self._custom_model_edit = QLineEdit()
        self._custom_model_edit.setPlaceholderText("Leave empty to use preset model")
        custom_layout.addWidget(self._custom_model_edit)
        browse_model_btn = QPushButton("Browse...")
        browse_model_btn.clicked.connect(self._browse_model)
        custom_layout.addWidget(browse_model_btn)
        model_layout.addLayout(custom_layout)

        model_group.setLayout(model_layout)
        layout.addWidget(model_group)

        # Language model settings
        lm_group = QGroupBox("Language Model (Optional)")
        lm_layout = QVBoxLayout()

        self._use_lm_check = QCheckBox("Use Language Model")
        self._use_lm_check.setChecked(False)
        self._use_lm_check.toggled.connect(self._on_lm_toggled)
        if not PYLAIA_LM_AVAILABLE:
            self._use_lm_check.setEnabled(False)
            # LM decoding is not implemented in this engine, so installing
            # a package would not enable it; say so in the tooltip.
            self._use_lm_check.setToolTip("Language model decoding is not yet implemented for this engine")
        lm_layout.addWidget(self._use_lm_check)

        # LM weight
        weight_layout = QHBoxLayout()
        weight_layout.addWidget(QLabel("LM Weight:"))
        self._lm_weight_spin = QDoubleSpinBox()
        self._lm_weight_spin.setRange(0.0, 10.0)
        self._lm_weight_spin.setValue(1.5)
        self._lm_weight_spin.setSingleStep(0.1)
        self._lm_weight_spin.setToolTip("Higher = more influence from language model")
        self._lm_weight_spin.setEnabled(False)
        weight_layout.addWidget(self._lm_weight_spin)
        weight_layout.addStretch()
        lm_layout.addLayout(weight_layout)

        # Custom LM path
        lm_layout.addWidget(QLabel("Custom LM Path:"))
        custom_lm_layout = QHBoxLayout()
        self._custom_lm_edit = QLineEdit()
        self._custom_lm_edit.setPlaceholderText("Leave empty for auto-detection")
        self._custom_lm_edit.setEnabled(False)
        custom_lm_layout.addWidget(self._custom_lm_edit)
        browse_lm_btn = QPushButton("Browse...")
        browse_lm_btn.clicked.connect(self._browse_lm)
        browse_lm_btn.setEnabled(False)
        self._browse_lm_btn = browse_lm_btn
        custom_lm_layout.addWidget(browse_lm_btn)
        lm_layout.addLayout(custom_lm_layout)

        lm_group.setLayout(lm_layout)
        layout.addWidget(lm_group)

        # Output options
        output_group = QGroupBox("Output Options")
        output_layout = QVBoxLayout()

        self._enable_spaces_check = QCheckBox("Convert <space> tokens to spaces")
        self._enable_spaces_check.setChecked(True)
        self._enable_spaces_check.setToolTip(
            "When enabled, <space> or <SPACE> tokens in the vocabulary are converted to actual spaces.\n"
            "Disable to keep them as literal <space> text."
        )
        output_layout.addWidget(self._enable_spaces_check)

        self._flip_rtl_check = QCheckBox("RTL manuscript (flip line images)")
        self._flip_rtl_check.setChecked(False)
        self._flip_rtl_check.setToolTip(
            "Flip line images horizontally for right-to-left scripts.\n"
            "Required for models trained on RTL manuscripts (Ottoman, Arabic, Hebrew, etc.)\n"
            "with left-to-right transcriptions (Latin transliteration)."
        )
        output_layout.addWidget(self._flip_rtl_check)

        output_group.setLayout(output_layout)
        layout.addWidget(output_group)

        layout.addStretch()
        widget.setLayout(layout)

        self._config_widget = widget
        return widget

    def _populate_preset_models(self):
        """Populate preset models dropdown."""
        if self._model_combo is None:
            return

        self._model_combo.clear()

        if not PYLAIA_MODELS:
            # Placeholder entry; load_model() rejects it explicitly.
            self._model_combo.addItem("No preset models found")
            return

        for model_id in PYLAIA_MODELS.keys():
            self._model_combo.addItem(model_id)

    def _on_preset_changed(self, preset_name: str):
        """Update when preset changes."""
        # Could add description display here
        pass

    def _on_lm_toggled(self, checked: bool):
        """Enable/disable LM controls."""
        self._lm_weight_spin.setEnabled(checked)
        self._custom_lm_edit.setEnabled(checked)
        self._browse_lm_btn.setEnabled(checked)

    def _browse_model(self):
        """Open file dialog to select model file."""
        file_path, _ = QFileDialog.getOpenFileName(
            self._config_widget,
            "Select CRNN-CTC Model",
            "models",
            "CRNN-CTC Models (*.ckpt *.pth *.pt);;All Files (*)"
        )

        if file_path:
            self._custom_model_edit.setText(file_path)

    def _browse_lm(self):
        """Open file dialog to select LM file."""
        file_path, _ = QFileDialog.getOpenFileName(
            self._config_widget,
            "Select KenLM Model",
            "models",
            "KenLM Models (*.arpa *.klm *.bin);;All Files (*)"
        )

        if file_path:
            self._custom_lm_edit.setText(file_path)

    def get_config(self) -> Dict[str, Any]:
        """Extract configuration from widget controls.

        Returns an empty dict while the configuration panel has not been
        built. ``lm_path`` is included only when LM use is ticked and a
        custom path was entered.
        """
        if self._config_widget is None:
            return {}

        custom_model = self._custom_model_edit.text().strip()
        preset_model = self._model_combo.currentText()

        config = {
            # A custom path takes precedence over the preset selection.
            "model_path": custom_model if custom_model else preset_model,
            "use_lm": self._use_lm_check.isChecked(),
            "lm_weight": self._lm_weight_spin.value(),
            "enable_spaces": self._enable_spaces_check.isChecked(),
            "flip_rtl": self._flip_rtl_check.isChecked(),
        }

        if config["use_lm"]:
            custom_lm = self._custom_lm_edit.text().strip()
            if custom_lm:
                config["lm_path"] = custom_lm

        return config

    def set_config(self, config: Dict[str, Any]):
        """Restore configuration to widget controls.

        Does nothing while the configuration panel has not been built.
        """
        if self._config_widget is None:
            return

        model_path = config.get("model_path", "")

        # Try to find in presets
        idx = self._model_combo.findText(model_path)
        if idx >= 0:
            self._model_combo.setCurrentIndex(idx)
            self._custom_model_edit.clear()
        else:
            self._custom_model_edit.setText(model_path)

        # Never tick the (disabled) LM box when LM support is unavailable;
        # otherwise the dependent LM controls would become enabled.
        self._use_lm_check.setChecked(bool(config.get("use_lm", False)) and PYLAIA_LM_AVAILABLE)
        self._lm_weight_spin.setValue(config.get("lm_weight", 1.5))
        self._enable_spaces_check.setChecked(config.get("enable_spaces", True))
        if self._flip_rtl_check is not None:
            self._flip_rtl_check.setChecked(config.get("flip_rtl", False))

        if "lm_path" in config:
            self._custom_lm_edit.setText(config["lm_path"])

    def load_model(self, config: Dict[str, Any]) -> bool:
        """Load PyLaia model.

        ``config["model_path"]`` is either a key of ``PYLAIA_MODELS`` (a
        preset) or a checkpoint path. Presets with a ``repo_id`` are
        downloaded from the Hugging Face Hub.

        Returns:
            True on success, False otherwise. Errors are printed together
            with a traceback and are not raised.
        """
        try:
            model_path = config.get("model_path", "")
            if not model_path or model_path == "No preset models found":
                return False

            # If it's a preset name, resolve to actual path and syms
            syms_path = None
            if model_path in PYLAIA_MODELS:
                preset_info = PYLAIA_MODELS[model_path]
                if isinstance(preset_info, dict):
                    if preset_info.get("repo_id"):
                        try:
                            from huggingface_hub import hf_hub_download
                        except ImportError as exc:
                            raise RuntimeError(
                                "huggingface_hub is required for Hugging Face model presets"
                            ) from exc
                        repo_id = preset_info["repo_id"]
                        model_path = hf_hub_download(
                            repo_id=repo_id,
                            filename=preset_info.get("checkpoint", "best_model.pt"),
                        )
                        syms_path = hf_hub_download(
                            repo_id=repo_id,
                            filename=preset_info.get("syms", "symbols.txt"),
                        )
                    else:
                        model_path = preset_info.get("checkpoint", preset_info.get("path", model_path))
                        syms_path = preset_info.get("syms")
                # If preset_info is just a string, use it as the path
                elif isinstance(preset_info, str):
                    model_path = preset_info

            use_lm = config.get("use_lm", False)

            # Unload previous model
            self.unload_model()

            if use_lm and PYLAIA_LM_AVAILABLE:
                # Load with language model.
                # Unreachable while PYLAIA_LM_AVAILABLE is False.
                lm_weight = config.get("lm_weight", 1.5)
                lm_path = config.get("lm_path")

                self.model_lm = PyLaiaInferenceLM(
                    model_path=model_path,
                    lm_path=lm_path,
                    lm_weight=lm_weight
                )
                self.model = None
            else:
                # Load without language model
                # PyLaiaInference expects checkpoint_path, syms_path, and enable_spaces
                enable_spaces = config.get("enable_spaces", True)
                self.model = PyLaiaInference(
                    checkpoint_path=model_path,
                    syms_path=syms_path,
                    enable_spaces=enable_spaces
                )
                self.model_lm = None

            # Remember the settings so transcribe_line() can honour
            # options such as flip_rtl when it is called without a config.
            self._loaded_config = dict(config)
            return True

        except Exception as e:
            import traceback
            print(f"Error loading PyLaia model: {e}")
            print(traceback.format_exc())
            self.model = None
            self.model_lm = None
            self._loaded_config = {}
            return False

    def unload_model(self):
        """Unload model from memory and release cached GPU memory."""
        if self.model is not None:
            del self.model
            self.model = None

        if self.model_lm is not None:
            del self.model_lm
            self.model_lm = None

        self._loaded_config = {}

        # Free GPU memory
        import torch
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def is_model_loaded(self) -> bool:
        """Check if model is loaded."""
        return self.model is not None or self.model_lm is not None

    def transcribe_line(self, image: np.ndarray, config: Optional[Dict[str, Any]] = None) -> TranscriptionResult:
        """Transcribe a line image with PyLaia.

        Args:
            image: Line image as a numpy array; any other object is
                passed on as an already constructed PIL image.
            config: Optional per-call configuration. When omitted, the
                configuration given to ``load_model()`` is used. Only
                ``flip_rtl`` is read here.

        Returns:
            A TranscriptionResult; on failure its text is a bracketed
            message and its confidence is 0.0. Nothing is raised.
        """
        if not self.is_model_loaded():
            return TranscriptionResult(text="[Model not loaded]", confidence=0.0)

        if config is None:
            # Fall back to the settings the model was loaded with.
            config = self._loaded_config

        try:
            # Convert numpy to PIL
            from PIL import Image as PILImage
            if isinstance(image, np.ndarray):
                pil_image = PILImage.fromarray(image)
            else:
                pil_image = image

            # Flip horizontally for RTL scripts
            if config and config.get("flip_rtl", False):
                pil_image = pil_image.transpose(PILImage.FLIP_LEFT_RIGHT)

            # Prefer the language-model decoder when one is loaded
            # (currently never the case).
            if self.model_lm is not None:
                result = self.model_lm.transcribe(pil_image)
            else:
                result = self.model.transcribe(pil_image)

            # Result is a tuple: (text, confidence)
            if isinstance(result, tuple):
                text, confidence = result
            else:
                # Fallback for dict-style results
                text = result.get("text", "")
                confidence = result.get("confidence", 1.0)

            return TranscriptionResult(
                text=text,
                confidence=confidence,
                metadata={"model": "pylaia"}
            )

        except Exception as e:
            import traceback
            print(f"Error in PyLaia transcription: {e}")
            print(traceback.format_exc())
            return TranscriptionResult(text=f"[Error: {e}]", confidence=0.0)

    def get_capabilities(self) -> Dict[str, bool]:
        """PyLaia capabilities."""
        return {
            "batch_processing": False,  # Could be implemented
            "confidence_scores": True,  # CTC provides confidence
            "beam_search": False,  # Not exposed by this engine
            "language_model": PYLAIA_LM_AVAILABLE,  # Optional KenLM
            "preprocessing": False,  # External preprocessing recommended
        }
hf-space/README.md ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Polyscriptor HTR Demo
3
+ emoji: 📝
4
+ colorFrom: blue
5
+ colorTo: gray
6
+ sdk: docker
7
+ pinned: false
8
+ license: apache-2.0
9
+ ---
10
+
11
+ # Polyscriptor HTR Demo
12
+
13
+ This is the hosted Hugging Face Spaces demo for Polyscriptor. It runs the
14
+ existing FastAPI/Web UI with a constrained demo mode:
15
+
16
+ - CRNN-CTC (PyLaia-inspired) engines only.
17
+ - Public model presets are downloaded from `achimrabus/*` Hugging Face model repos.
18
+ - CPU inference only.
19
+ - Kraken Classical line segmentation, with HPP as a lightweight fallback.
20
+ - Temporary uploads only.
21
+
22
+ The normal Polyscriptor server, local GPU workflow, and the existing mobile PWA
23
+ demo under `web/static/pwa/` are not changed by this Space configuration.
24
+
25
+ ## Source Code
26
+
27
+ Public source repository:
28
+
29
+ https://github.com/achimrabus/polyscriptor
30
+
31
+ The Space repository is a curated deployment snapshot for the hosted demo. The
32
+ GitHub repository contains the broader Polyscriptor codebase and local workflows.
33
+
34
+ ## Deployment Note
35
+
36
+ Hugging Face Docker Spaces expect the `Dockerfile` at the root of the Space
37
+ repository. This branch includes a root `Dockerfile` for direct Space builds and
38
+ keeps the Space-specific notes and dependency set in `hf-space/`.
39
+
40
+ When publishing into a dedicated Space repository under
41
+ `https://huggingface.co/spaces/achimrabus/...`, use `hf-space/SPACE_README.md`
42
+ as the Space repository root `README.md`. The Polyscriptor project README is
43
+ left untouched in this branch.
hf-space/SPACE_README.md ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Polyscriptor HTR Demo
3
+ emoji: 📝
4
+ colorFrom: blue
5
+ colorTo: gray
6
+ sdk: docker
7
+ pinned: false
8
+ license: apache-2.0
9
+ ---
10
+
11
+ # Polyscriptor HTR Demo
12
+
13
+ Polyscriptor is a browser-based demo for handwritten text recognition (HTR) on
14
+ historical Slavic manuscript material. This Hugging Face Space runs a constrained
15
+ public version of the Polyscriptor FastAPI/Web interface.
16
+
17
+ The hosted demo is intended for quick inspection and teaching. It is not the full
18
+ local research environment used for training, batch processing, GPU inference, or
19
+ private manuscript collections.
20
+
21
+ ## Source Code
22
+
23
+ The public Polyscriptor source code is available on GitHub:
24
+
25
+ https://github.com/achimrabus/polyscriptor
26
+
27
+ This Hugging Face Space contains the curated hosted demo deployment. The GitHub
28
+ repository contains the broader Polyscriptor codebase, including the web UI,
29
+ engine plugins, segmentation code, training utilities, and local workflows.
30
+
31
+ ## What This Demo Supports
32
+
33
+ - CRNN-CTC / PyLaia-inspired HTR presets for selected public model repositories.
34
+ - User-supplied API keys for OpenAI, Gemini, Claude, and OpenWebUI-compatible
35
+ endpoints.
36
+ - Public model download from the Hugging Face Hub, primarily under
37
+ `achimrabus/*`.
38
+ - CPU-only inference.
39
+ - Kraken Classical line segmentation, with HPP as a lightweight fallback.
40
+ - Temporary image uploads during the active session.
41
+
42
+ ## Limitations
43
+
44
+ - No private models are bundled with this Space.
45
+ - API-based engines require users to paste their own API key in the browser
46
+ form. The Space does not ship with shared provider credentials.
47
+ - Uploaded files are treated as temporary runtime data and are not part of the
48
+ repository.
49
+ - Large local GPU/VLM engines from the full Polyscriptor workflow are not
50
+ enabled here.
51
+ - Accuracy depends strongly on script, language, writing style, image quality,
52
+ and segmentation quality.
53
+
54
+ ## Model Notes
55
+
56
+ The demo uses publicly available model presets. For best results, choose a model
57
+ that matches the manuscript tradition as closely as possible. The current public
58
+ Polyscriptor model cards are available at:
59
+
60
+ https://huggingface.co/achimrabus
61
+
62
+ ## Project Context
63
+
64
+ Polyscriptor is developed for historical HTR workflows, with a focus on Slavic
65
+ manuscripts and reproducible comparison of OCR/HTR engines. The full development
66
+ repository contains additional tooling for local use, training, evaluation, and
67
+ batch processing; this Space contains only the hosted demo configuration.
68
+
69
+ ## Privacy
70
+
71
+ Do not upload sensitive or unpublished manuscript images unless you are
72
+ comfortable processing them in a hosted public demo environment. The application
73
+ uses temporary server-side files during processing, but this Space should be
74
+ treated as a public demonstration service rather than a secure private workflow.
75
+
76
+ For API-based engines, provider keys are entered by the user at runtime. Do not
77
+ commit keys to this repository or add them to the Space configuration unless you
78
+ intend to provide a shared project credential.
hf-space/requirements.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ --extra-index-url https://download.pytorch.org/whl/cpu
2
+
3
+ torch>=2.5.1,<2.10
4
+ torchvision>=0.20.1,<0.25
5
+ numpy>=2.0,<2.1
6
+ pillow==11.1.0
7
+ opencv-python-headless==4.11.0.86
8
+ scikit-learn>=1.5,<1.6
9
+ scipy>=1.13,<1.14
10
+ kraken>=6.0.0,<7.0.0
11
+ fastapi>=0.111.0
12
+ uvicorn[standard]>=0.29.0
13
+ python-multipart>=0.0.9
14
+ pymupdf>=1.24.0
15
+ pyyaml==6.0.2
16
+ huggingface_hub>=0.23.0
17
+ python-Levenshtein>=0.23.0
18
+ openai>=1.50.0
19
+ anthropic>=0.34.0
20
+ google-genai>=1.0.0
21
+ google-generativeai>=0.8.0
htr_engine_base.py ADDED
@@ -0,0 +1,398 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HTR Engine Plugin System - Base Classes and Registry
3
+
4
+ This module defines the plugin architecture for HTR (Handwritten Text Recognition) engines.
5
+ All HTR engines (TrOCR, Qwen3, CRNN-CTC, Kraken, etc.) implement the HTREngine interface.
6
+
7
+ Design principles:
8
+ - Abstraction: Each engine is self-contained and interchangeable
9
+ - Scalability: New engines can be added without modifying existing code
10
+ - Consistency: All engines expose the same interface to the GUI
11
+ - Flexibility: Each engine can have custom configuration widgets
12
+ """
13
+
14
+ from abc import ABC, abstractmethod
15
+ from typing import Dict, Any, Optional, List
16
+ from dataclasses import dataclass
17
+ import os
18
+ import numpy as np
19
+
20
+ try:
21
+ from PyQt6.QtWidgets import QWidget, QVBoxLayout, QLabel
22
+ PYQT_AVAILABLE = True
23
+ except ImportError:
24
+ PYQT_AVAILABLE = False
25
+ QWidget = object
26
+
27
+
28
@dataclass
class TranscriptionResult:
    """Result of transcribing one image with an HTR engine.

    Attributes:
        text: The transcribed text.
        confidence: Confidence reported for the transcription (defaults to 1.0).
        metadata: Free-form, engine-specific extras. ``None`` (the default) is
            replaced by a fresh empty dict in ``__post_init__`` so that
            instances never share a mutable default.
    """
    text: str
    confidence: float = 1.0
    # Optional because the default really is None; it is normalised below.
    metadata: Optional[Dict[str, Any]] = None

    def __post_init__(self):
        # A dataclass field cannot use ``{}`` as a default directly (it would be
        # shared); create one dict per instance instead.
        if self.metadata is None:
            self.metadata = {}
38
+
39
+
40
class HTREngine(ABC):
    """Interface that every HTR engine plugin implements.

    The GUI and the batch tooling only talk to engines through this class, so
    a concrete engine must provide every abstract method below. The remaining
    methods have defaults that an engine may override.
    """

    @abstractmethod
    def get_name(self) -> str:
        """Return the display name of the engine.

        Returns:
            str: Human-readable engine name (e.g., "TrOCR", "Qwen3 VLM")
        """

    @abstractmethod
    def get_description(self) -> str:
        """Return a short description of the engine.

        Returns:
            str: One-line description (e.g., "Transformer-based OCR for manuscripts")
        """

    @abstractmethod
    def is_available(self) -> bool:
        """Report whether the engine's dependencies are installed and usable.

        Returns:
            bool: True if the engine can be used, False otherwise
        """

    @abstractmethod
    def get_unavailable_reason(self) -> str:
        """Explain why the engine cannot be used (when is_available() is False).

        Returns:
            str: Explanation and installation instructions
        """

    @abstractmethod
    def get_config_widget(self) -> QWidget:
        """Build the configuration widget for this engine.

        The widget holds all engine-specific controls (model selection, beam
        search, preprocessing options, ...) and is embedded by the GUI in its
        configuration panel.

        Returns:
            QWidget: Qt widget with engine configuration controls
        """

    @abstractmethod
    def get_config(self) -> Dict[str, Any]:
        """Read the current settings out of the config widget.

        The returned dictionary can be handed to transcribe_line().

        Returns:
            Dict[str, Any]: Configuration parameters
        """

    @abstractmethod
    def set_config(self, config: Dict[str, Any]):
        """Push configuration values into the config widget.

        Used to restore saved settings when switching engines.

        Args:
            config: Configuration parameters
        """

    @abstractmethod
    def load_model(self, config: Dict[str, Any]) -> bool:
        """Load the HTR model described by ``config``.

        Args:
            config: Configuration parameters (from get_config())

        Returns:
            bool: True if the model loaded successfully, False otherwise
        """

    @abstractmethod
    def unload_model(self):
        """Release the model to free resources.

        Called when switching to a different engine or closing the application.
        """

    @abstractmethod
    def is_model_loaded(self) -> bool:
        """Report whether a model is currently loaded.

        Returns:
            bool: True if the model is ready for inference
        """

    @abstractmethod
    def transcribe_line(self, image: np.ndarray, config: Optional[Dict[str, Any]] = None) -> TranscriptionResult:
        """Transcribe one line image.

        Args:
            image: Line image as numpy array (RGB, shape: H x W x 3)
            config: Optional configuration overrides

        Returns:
            TranscriptionResult: Transcription text and metadata
        """

    def requires_line_segmentation(self) -> bool:
        """Tell the caller whether lines must be segmented before transcription.

        Returns:
            bool: True if lines must be segmented first (TrOCR, CRNN-CTC),
                False if the engine handles full pages (Qwen3, Commercial APIs)
        """
        # Default: most engines need line segmentation
        return True

    def transcribe_lines(self, images: List[np.ndarray], config: Optional[Dict[str, Any]] = None) -> List[TranscriptionResult]:
        """Transcribe several line images.

        This default simply calls transcribe_line() once per image, in order.
        Engines with real batch inference can override it.

        Args:
            images: List of line images
            config: Optional configuration overrides

        Returns:
            List[TranscriptionResult]: One result per input image
        """
        results: List[TranscriptionResult] = []
        for line_image in images:
            results.append(self.transcribe_line(line_image, config))
        return results

    def supports_batch(self) -> bool:
        """Report whether transcribe_lines() is an optimized batch path.

        Returns:
            bool: True if transcribe_lines() is optimized, False if it just loops
        """
        return False

    def get_aliases(self) -> List[str]:
        """Return alternative names for this engine (e.g., short CLI aliases).

        Returns:
            List[str]: Alternative names accepted by the registry (default: none)
        """
        return list()

    def get_capabilities(self) -> Dict[str, bool]:
        """Describe what the engine can do.

        Returns:
            Dict with capability flags:
                - batch_processing: Supports batch inference
                - confidence_scores: Returns confidence scores
                - beam_search: Supports beam search decoding
                - language_model: Uses language model for post-processing
                - preprocessing: Has built-in preprocessing
        """
        # batch_processing mirrors supports_batch(); every other flag is off
        # unless a subclass overrides this method.
        capabilities: Dict[str, bool] = {"batch_processing": self.supports_batch()}
        for flag in ("confidence_scores", "beam_search", "language_model", "preprocessing"):
            capabilities[flag] = False
        return capabilities
219
+
220
+
221
class HTREngineRegistry:
    """Registry of available HTR engines.

    Manages discovery, registration, and lookup of HTR engines. Engines are
    kept in registration order in ``engines`` and are additionally indexed by
    display name and by every alias in ``_engine_cache``.
    """

    # Each entry is (label used in warning messages, module path, class name).
    # Engines offered when POLYSCRIPTOR_DEMO_MODE == "hf_space".
    _DEMO_ENGINES = (
        ("CRNN-CTC", "engines.pylaia_engine", "PyLaiaEngine"),
        ("Commercial APIs", "engines.commercial_api_engine", "CommercialAPIEngine"),
        ("OpenWebUI", "engines.openwebui_engine", "OpenWebUIEngine"),
    )

    # Engines tried, in this order, outside demo mode.
    _ALL_ENGINES = (
        ("TrOCR", "engines.trocr_engine", "TrOCREngine"),
        ("Qwen3", "engines.qwen3_engine", "Qwen3Engine"),
        ("Churro", "engines.churro_engine", "ChurroEngine"),
        ("CRNN-CTC", "engines.pylaia_engine", "PyLaiaEngine"),
        ("Kraken", "engines.kraken_engine", "KrakenEngine"),
        ("Commercial API", "engines.commercial_api_engine", "CommercialAPIEngine"),
        ("Party", "engines.party_engine", "PartyEngine"),
        ("OpenWebUI", "engines.openwebui_engine", "OpenWebUIEngine"),
        ("DeepSeek-OCR", "engines.deepseek_ocr_engine", "DeepSeekOCREngine"),
        ("LightOnOCR", "engines.lighton_ocr_engine", "LightOnOCREngine"),
        ("PaddleOCR", "engines.paddle_engine", "PaddleOCREngine"),
    )

    def __init__(self):
        self.engines: List[HTREngine] = []
        self._engine_cache: Dict[str, HTREngine] = {}

    def register(self, engine: HTREngine):
        """Register an HTR engine under its name and all of its aliases.

        Args:
            engine: HTREngine instance to register
        """
        self.engines.append(engine)
        self._engine_cache[engine.get_name()] = engine
        for alias in engine.get_aliases():
            self._engine_cache[alias] = engine

    def _try_register(self, label: str, module_name: str, class_name: str) -> bool:
        """Import one engine class, instantiate it and register it.

        An ImportError raised while importing the module or constructing the
        engine is reported as a warning and swallowed, so one missing optional
        dependency does not stop discovery of the other engines.

        Args:
            label: Engine label used in the warning message
            module_name: Dotted module path to import
            class_name: Name of the engine class inside that module

        Returns:
            bool: True if the engine was registered, False if loading failed
        """
        try:
            module = __import__(module_name, fromlist=[class_name])
            try:
                engine_cls = getattr(module, class_name)
            except AttributeError as e:
                # Behave like ``from module import name``: a missing class is an
                # import failure, not a crash of the whole discovery pass.
                raise ImportError(
                    f"cannot import name {class_name!r} from {module_name!r}"
                ) from e
            self.register(engine_cls())
        except ImportError as e:
            print(f"Warning: Failed to load {label} engine: {e}")
            return False
        return True

    def discover_engines(self):
        """Automatically discover and register all available engines.

        When the environment variable POLYSCRIPTOR_DEMO_MODE equals
        "hf_space", only the demo engine set is tried; otherwise every known
        engine is tried. Engines that fail to import are skipped with a
        printed warning.
        """
        demo_mode = os.environ.get("POLYSCRIPTOR_DEMO_MODE") == "hf_space"
        candidates = self._DEMO_ENGINES if demo_mode else self._ALL_ENGINES
        for label, module_name, class_name in candidates:
            self._try_register(label, module_name, class_name)

    def get_available_engines(self) -> List[HTREngine]:
        """Get list of engines with satisfied dependencies.

        Returns:
            List[HTREngine]: Engines whose is_available() returns True
        """
        return [e for e in self.engines if e.is_available()]

    def get_all_engines(self) -> List[HTREngine]:
        """Get all registered engines (including unavailable ones).

        Returns:
            List[HTREngine]: All registered engines (the registry's own list,
                not a copy)
        """
        return self.engines

    def get_engine_by_name(self, name: str) -> Optional[HTREngine]:
        """Get engine by display name or alias.

        Args:
            name: Engine display name or one of its aliases

        Returns:
            Optional[HTREngine]: Engine instance or None if not found
        """
        return self._engine_cache.get(name)

    def get_engine_names(self) -> List[str]:
        """Get list of available engine names.

        Returns:
            List[str]: Display names of the engines that are available
        """
        return [e.get_name() for e in self.get_available_engines()]
372
+
373
+
374
# Process-wide registry, created on first use by get_global_registry().
_global_registry: Optional[HTREngineRegistry] = None


def get_global_registry() -> HTREngineRegistry:
    """Return the shared HTR engine registry, building it on first call.

    The first call creates the registry and runs engine discovery on it;
    later calls return that same instance.

    Returns:
        HTREngineRegistry: Global registry instance
    """
    global _global_registry
    if _global_registry is not None:
        return _global_registry
    # Assign before discovering so the module-level name already refers to the
    # new registry while discovery runs, as in the original ordering.
    _global_registry = HTREngineRegistry()
    _global_registry.discover_engines()
    return _global_registry
389
+
390
+
391
+ # Convenience function for GUI
392
def get_available_engine_names() -> List[str]:
    """Shortcut returning the names of the usable engines in the global registry.

    Returns:
        List[str]: Engine display names
    """
    registry = get_global_registry()
    return registry.get_engine_names()
inference_commercial_api.py ADDED
@@ -0,0 +1,760 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Commercial VLM/LLM API inference for manuscript transcription.
3
+
4
+ Supports:
5
+ - OpenAI GPT-4 Vision / GPT-4o
6
+ - Google Gemini Pro Vision / Gemini Flash
7
+ - Anthropic Claude 3 (Opus, Sonnet, Haiku)
8
+
9
+ Usage:
10
+ # OpenAI
11
+ api = OpenAIInference(api_key="YOUR_OPENAI_API_KEY")
12
+ text = api.transcribe(image)
13
+
14
+ # Gemini
15
+ api = GeminiInference(api_key="YOUR_GEMINI_API_KEY")
16
+ text = api.transcribe(image)
17
+
18
+ # Claude
19
+ api = ClaudeInference(api_key="YOUR_ANTHROPIC_API_KEY")
20
+ text = api.transcribe(image)
21
+ """
22
+
23
+ import base64
24
+ import io
25
+ import time
26
+ from abc import ABC, abstractmethod
27
+ from pathlib import Path
28
+ from typing import Optional, Dict, Any
29
+ from PIL import Image
30
+
31
+ # API clients (install with: pip install openai google-genai anthropic; google-generativeai is only the legacy Gemini fallback)
32
+ try:
33
+ from openai import OpenAI
34
+ OPENAI_AVAILABLE = True
35
+ except ImportError:
36
+ OPENAI_AVAILABLE = False
37
+
38
+ try:
39
+ from google import genai as _google_genai_new
40
+ from google.genai import types as _google_genai_types
41
+ GEMINI_AVAILABLE = True
42
+ GEMINI_NEW_SDK = True
43
+ except ImportError:
44
+ GEMINI_NEW_SDK = False
45
+ try:
46
+ import google.generativeai as genai # legacy fallback
47
+ GEMINI_AVAILABLE = True
48
+ except ImportError:
49
+ GEMINI_AVAILABLE = False
50
+
51
+ try:
52
+ from anthropic import Anthropic
53
+ CLAUDE_AVAILABLE = True
54
+ except ImportError:
55
+ CLAUDE_AVAILABLE = False
56
+
57
+
58
class BaseAPIInference(ABC):
    """Base class for commercial API inference.

    Stores the API key and the default prompt, and offers two image helpers
    (base64 encoding and aspect-preserving downscaling) shared by the
    provider-specific subclasses.
    """

    def __init__(self, api_key: str, default_prompt: Optional[str] = None):
        """
        Initialize API client.

        Args:
            api_key: API key for the service
            default_prompt: Default transcription prompt; when empty or None
                the subclass's _get_default_prompt() is used
        """
        self.api_key = api_key
        self.default_prompt = default_prompt or self._get_default_prompt()

    @abstractmethod
    def _get_default_prompt(self) -> str:
        """Get default transcription prompt."""
        pass

    @abstractmethod
    def transcribe(
        self,
        image: Image.Image,
        prompt: Optional[str] = None,
        **kwargs
    ) -> str:
        """
        Transcribe a manuscript line image.

        Args:
            image: PIL Image
            prompt: Custom prompt (uses default if None)
            **kwargs: Provider-specific parameters

        Returns:
            Transcribed text
        """
        pass

    @staticmethod
    def encode_image_base64(image: Image.Image, format: str = "PNG") -> str:
        """
        Encode PIL Image to base64 string.

        Args:
            image: PIL Image
            format: Image format (PNG, JPEG, etc.)

        Returns:
            Base64-encoded image string
        """
        buffered = io.BytesIO()
        image.save(buffered, format=format)
        return base64.b64encode(buffered.getvalue()).decode("utf-8")

    @staticmethod
    def resize_image_if_needed(
        image: Image.Image,
        max_dimension: int = 2048
    ) -> Image.Image:
        """
        Resize image if larger than max dimension while preserving aspect ratio.

        The longer side is scaled down to exactly ``max_dimension``; the
        shorter side is scaled by the same factor and truncated to an integer,
        but never below 1 pixel.

        Args:
            image: PIL Image
            max_dimension: Maximum width or height

        Returns:
            Resized image (or original if already small enough)
        """
        width, height = image.size

        if width <= max_dimension and height <= max_dimension:
            return image

        # Calculate new size preserving aspect ratio
        if width > height:
            new_width = max_dimension
            new_height = int(height * (max_dimension / width))
        else:
            new_height = max_dimension
            new_width = int(width * (max_dimension / height))

        # Very elongated images (e.g. a 10000x2 line strip) would otherwise
        # truncate to a 0-pixel side, which Image.resize rejects.
        new_width = max(1, new_width)
        new_height = max(1, new_height)

        return image.resize((new_width, new_height), Image.Resampling.LANCZOS)
142
+
143
+
144
class OpenAIInference(BaseAPIInference):
    """OpenAI GPT-4 Vision / GPT-4o inference."""

    def __init__(
        self,
        api_key: str,
        model: str = "gpt-4o",  # gpt-4o, gpt-4-vision-preview, gpt-4-turbo
        default_prompt: Optional[str] = None
    ):
        """
        Initialize OpenAI inference.

        Args:
            api_key: OpenAI API key
            model: Model name
            default_prompt: Default transcription prompt

        Raises:
            ImportError: If the openai package could not be imported
        """
        if not OPENAI_AVAILABLE:
            raise ImportError("OpenAI library not installed. Install with: pip install openai")

        super().__init__(api_key, default_prompt)
        self.model = model
        self.client = OpenAI(api_key=api_key)

    def _get_default_prompt(self) -> str:
        """Return the built-in transcription prompt."""
        return (
            "Transcribe all handwritten text in this manuscript image. "
            "Preserve the original language (Cyrillic, Latin, etc.) and layout. "
            "Output only the transcribed text without any additional commentary."
        )

    def transcribe(
        self,
        image: Image.Image,
        prompt: Optional[str] = None,
        max_tokens: int = 500,
        temperature: float = 1.0,
        **kwargs
    ) -> str:
        """
        Transcribe with OpenAI GPT-4 Vision.

        The image is downscaled to at most 2048 px per side, PNG-encoded and
        sent inline as a base64 data URL in a single user message.

        Args:
            image: PIL Image
            prompt: Custom prompt
            max_tokens: Maximum tokens to generate
            temperature: Sampling temperature (web default ~1.0). Lower (0-0.3) = deterministic; higher = more variation.
            **kwargs: Additional OpenAI parameters

        Returns:
            Transcribed text; an empty string when the API returns a message
            without text content
        """
        prompt = prompt or self.default_prompt

        # Resize if needed (GPT-4V supports up to 2048x2048)
        image = self.resize_image_if_needed(image, max_dimension=2048)

        # Encode image
        base64_image = self.encode_image_base64(image, format="PNG")

        # API call
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{base64_image}"
                            }
                        }
                    ]
                }
            ],
            max_tokens=max_tokens,
            temperature=temperature,
            **kwargs
        )

        # message.content is nullable in the API (e.g. on a refusal); avoid
        # calling .strip() on None.
        content = response.choices[0].message.content
        return (content or "").strip()
227
+
228
+
229
class GeminiInference(BaseAPIInference):
    """Google Gemini inference via google-genai SDK (with legacy google-generativeai fallback)."""

    # thinking_mode string -> thinking_budget token count (max tokens for internal reasoning)
    #   "low":  8000 — moderate budget; fast enough for most lines
    #   "high": None — no thinking_budget is set; the model decides dynamically (no cap)
    _THINKING_BUDGETS = {"low": 8000, "high": None}

    def __init__(
        self,
        api_key: str,
        model: str = "gemini-2.0-flash",
        default_prompt: Optional[str] = None,
    ):
        """Create a Gemini client.

        Args:
            api_key: Google API key
            model: Model name
            default_prompt: Default transcription prompt

        Raises:
            ImportError: If neither Gemini SDK could be imported
        """
        if not GEMINI_AVAILABLE:
            raise ImportError(
                "Google AI library not installed. Install with: pip install google-genai"
            )
        super().__init__(api_key, default_prompt)
        self.model_name = model
        # Populated after each transcribe() call — for UI token display
        self.last_usage: Dict[str, Any] = {}
        self._last_call_usage: Dict[str, Any] = {}

        if GEMINI_NEW_SDK:
            self._client = _google_genai_new.Client(api_key=api_key)
        else:
            # Legacy fallback
            genai.configure(api_key=api_key)
            self._legacy_model = genai.GenerativeModel(model)

    def _get_default_prompt(self) -> str:
        """Return the built-in transcription prompt."""
        return (
            "Transcribe all handwritten text in this manuscript image. "
            "Preserve the original language (Cyrillic, Latin, etc.) and layout. "
            "Output only the transcribed text without any additional commentary."
        )

    def _build_config(self, temperature, max_output_tokens, thinking_budget, safety_settings,
                      request_thoughts: bool = True):
        """Build GenerateContentConfig for google-genai SDK.

        request_thoughts=True (default): always sets include_thoughts=True so thought parts
        appear in candidates[].content.parts[] and can be exported. Pass False when retrying
        against a model that rejects ThinkingConfig entirely.
        """
        kw: Dict[str, Any] = {"temperature": temperature}
        if max_output_tokens:
            kw["max_output_tokens"] = max_output_tokens
        if safety_settings:
            kw["safety_settings"] = safety_settings
        if request_thoughts:
            # Always request thought text back; only cap thinking_budget when explicitly set
            tc_kw: Dict[str, Any] = {"include_thoughts": True}
            if thinking_budget is not None:
                tc_kw["thinking_budget"] = thinking_budget
            kw["thinking_config"] = _google_genai_types.ThinkingConfig(**tc_kw)
        return _google_genai_types.GenerateContentConfig(**kw)

    @staticmethod
    def _extract_text(resp) -> str:
        """Return the stripped response text, or raise if the response has none.

        Raises:
            ValueError: If ``resp.text`` is missing or None
        """
        text = getattr(resp, "text", None)
        if text is None:
            raise ValueError(
                "Gemini returned no text (the response may have been blocked "
                "or contain no text parts)"
            )
        return text.strip()

    def _generate(self, prompt, image, temperature, thinking_budget, safety_settings, verbose):
        """Single generate call. Handles thinking-not-supported gracefully.

        Also refreshes ``self._last_call_usage`` with the token counts and any
        thought text of this call (left empty on the legacy SDK path).

        Raises:
            ValueError: If the response carries no text
        """
        if not GEMINI_NEW_SDK:
            # Legacy google-generativeai path
            gen_cfg = genai.GenerationConfig(temperature=temperature or 0.0)
            resp = self._legacy_model.generate_content(
                [prompt, image], generation_config=gen_cfg, safety_settings=safety_settings
            )
            self._last_call_usage = {}
            return self._extract_text(resp)

        config = self._build_config(temperature or 0.0, None, thinking_budget, safety_settings,
                                    request_thoughts=True)
        try:
            resp = self._client.models.generate_content(
                model=self.model_name, contents=[prompt, image], config=config
            )
        except Exception as e:
            err = str(e)
            # Non-thinking models reject ThinkingConfig with a 400/invalid error — retry without it
            if "thinking" in err.lower() or ("400" in err and "invalid" in err.lower()):
                if verbose:
                    print("Model does not support ThinkingConfig, retrying without.")
                config = self._build_config(temperature or 0.0, None, thinking_budget,
                                            safety_settings, request_thoughts=False)
                resp = self._client.models.generate_content(
                    model=self.model_name, contents=[prompt, image], config=config
                )
            else:
                raise

        usage = getattr(resp, "usage_metadata", None)
        self._last_call_usage = {
            "prompt_tokens": getattr(usage, "prompt_token_count", None) if usage else None,
            "output_tokens": getattr(usage, "candidates_token_count", None) if usage else None,
            "thinking_tokens": getattr(usage, "thoughts_token_count", None) if usage else None,
            "total_tokens": getattr(usage, "total_token_count", None) if usage else None,
        }
        # Extract thinking text from thought parts (present when include_thoughts=True was sent)
        thinking_parts = []
        try:
            for cand in (getattr(resp, "candidates", None) or []):
                for part in (getattr(getattr(cand, "content", None), "parts", None) or []):
                    if getattr(part, "thought", False) and getattr(part, "text", None):
                        thinking_parts.append(part.text)
        except Exception:
            # Best effort only: thought text is optional diagnostic output.
            pass
        self._last_call_usage["thinking_text"] = "\n\n".join(thinking_parts) if thinking_parts else None
        return self._extract_text(resp)

    def _maybe_continue(
        self,
        current_text: str,
        prompt: str,
        image,
        thinking_budget,
        safety_settings,
        auto_continue: bool,
        max_auto_continuations: int,
        continuation_min_new_chars: int,
        verbose_block_logging: bool,
    ) -> str:
        """Optionally ask the model to continue a transcription.

        Up to ``max_auto_continuations`` follow-up calls are made. The loop
        stops early when a call fails, returns nothing, or adds fewer than
        ``continuation_min_new_chars`` characters.

        Returns:
            ``current_text`` unchanged when ``auto_continue`` is False,
            otherwise the text with all accepted continuations appended.
        """
        if not auto_continue:
            return current_text
        accumulated = current_text
        for pass_idx in range(1, max_auto_continuations + 1):
            continuation_prompt = (
                f"{prompt}\n\nPartial transcription so far (DO NOT repeat it):\n"
                f"{accumulated}\n\nContinue transcribing remaining, previously UNTRANSCRIBED text. "
                "Output ONLY the new continuation without repeating prior characters."
            )
            try:
                new_chunk = self._generate(
                    continuation_prompt, image, None, thinking_budget,
                    safety_settings, verbose_block_logging
                )
            except Exception as e:
                if verbose_block_logging:
                    print(f"Continuation {pass_idx} failed: {e}")
                break
            if not new_chunk:
                if verbose_block_logging:
                    print(f"Continuation {pass_idx}: no new text, stopping.")
                break
            # Guard against repetition: if the model restarted from the top,
            # drop everything up to the end of the already-known tail.
            if accumulated and new_chunk.startswith(accumulated[:200]):
                tail = accumulated[-50:]
                overlap_pos = new_chunk.find(tail)
                # find() returns -1 when absent; index 0 is a valid match
                # (it happens whenever the accumulated text is <= 50 chars).
                if overlap_pos >= 0:
                    new_chunk = new_chunk[overlap_pos + len(tail):]
            delta = len(new_chunk)
            if delta < continuation_min_new_chars:
                if verbose_block_logging:
                    print(f"Continuation {pass_idx}: only {delta} chars, stopping.")
                break
            accumulated += ("\n" if not accumulated.endswith("\n") else "") + new_chunk
            if verbose_block_logging:
                print(f"Continuation {pass_idx}: +{delta} chars (total {len(accumulated)})")
        return accumulated

    def transcribe(
        self,
        image,
        prompt: Optional[str] = None,
        temperature: float = 0.0,
        max_output_tokens: Optional[int] = None,
        auto_retry_on_block: bool = True,
        safety_relax: bool = True,
        verbose_block_logging: bool = True,
        thinking_mode: Optional[str] = None,
        fast_direct: bool = False,
        fast_direct_early_exit: bool = True,
        auto_continue: bool = False,
        max_auto_continuations: int = 2,
        continuation_min_new_chars: int = 50,
        reasoning_fallback_threshold: float = 1.0,
        record_stats_csv: Optional[str] = None,
        apply_restriction_prompt: bool = False,
        fallback_max_output_tokens: int = 8192,
        **kwargs,
    ) -> str:
        """Transcribe a manuscript image with Google Gemini.

        Args:
            image: PIL Image or numpy array
            prompt: Transcription prompt (uses default if None)
            temperature: Sampling temperature (0.0 = deterministic)
            max_output_tokens: Output token cap (None = model default)
            safety_relax: With the google-genai SDK, send BLOCK_NONE safety
                settings for the four harm categories listed below
            verbose_block_logging: Print token usage and retry/continuation notes
            thinking_mode: None | "low" | "high" -- maps to thinking_budget
            record_stats_csv: Path to append usage CSV row (None to skip)
            auto_continue: Request continuation calls after the first response
            max_auto_continuations: Maximum number of continuation calls
            continuation_min_new_chars: Stop continuing once a call adds fewer
                characters than this

        NOTE(review): max_output_tokens, auto_retry_on_block, fast_direct,
        fast_direct_early_exit, reasoning_fallback_threshold,
        apply_restriction_prompt, fallback_max_output_tokens and **kwargs are
        accepted but not used by this implementation (max_output_tokens is not
        forwarded to the request) — confirm whether that is intended.

        Returns:
            Transcribed text (including any accepted continuations)

        Raises:
            ValueError: If the initial generate call fails for any reason
        """
        from PIL import Image as _PIL_Image
        import numpy as np
        if isinstance(image, np.ndarray):
            image = _PIL_Image.fromarray(image)
        image = self.resize_image_if_needed(image, max_dimension=3072)
        prompt = prompt or self.default_prompt

        # Map thinking_mode to thinking_budget
        thinking_budget = self._THINKING_BUDGETS.get(thinking_mode)  # None if mode is None/unknown

        # Safety settings
        safety_settings = None
        if safety_relax and GEMINI_NEW_SDK:
            safety_settings = [
                _google_genai_types.SafetySetting(category=cat, threshold="BLOCK_NONE")
                for cat in (
                    "HARM_CATEGORY_HARASSMENT",
                    "HARM_CATEGORY_HATE_SPEECH",
                    "HARM_CATEGORY_SEXUALLY_EXPLICIT",
                    "HARM_CATEGORY_DANGEROUS_CONTENT",
                )
            ]

        self._last_call_usage = {}

        try:
            result_text = self._generate(
                prompt, image, temperature, thinking_budget, safety_settings, verbose_block_logging
            )
        except Exception as e:
            raise ValueError(f"Gemini transcription failed: {e}") from e

        # Persist usage for callers (e.g. statistics panel, CSV logging)
        self.last_usage = dict(self._last_call_usage)
        u = self.last_usage
        if verbose_block_logging and u.get("total_tokens"):
            print(
                f"[tokens] prompt={u.get('prompt_tokens')} "
                f"output={u.get('output_tokens')} "
                f"thinking={u.get('thinking_tokens')} "
                f"total={u.get('total_tokens')}"
            )

        if record_stats_csv:
            try:
                from datetime import datetime, timezone
                # Naive UTC timestamp: same text as datetime.utcnow().isoformat()
                # without using the deprecated utcnow().
                timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
                with open(record_stats_csv, "a") as f:
                    f.write(
                        f"{timestamp},"
                        f"{self.model_name},"
                        f"{thinking_mode or 'default'},"
                        f"final_success,"
                        f"{u.get('prompt_tokens')},"
                        f"{u.get('output_tokens')},"
                        f"{u.get('thinking_tokens')},"
                        f"{u.get('total_tokens')},"
                        f"{len(result_text)}\n"
                    )
            except Exception as csv_e:
                # Stats logging is best effort and must not fail the transcription.
                if verbose_block_logging:
                    print(f"Stats logging failed: {csv_e}")

        return self._maybe_continue(
            result_text, prompt, image, thinking_budget, safety_settings,
            auto_continue, max_auto_continuations, continuation_min_new_chars,
            verbose_block_logging,
        )
486
+
487
class ClaudeInference(BaseAPIInference):
    """Anthropic Claude 3 inference (Opus, Sonnet, Haiku)."""

    def __init__(
        self,
        api_key: str,
        model: str = "claude-3-5-sonnet-20241022",  # claude-3-5-sonnet-20241022, claude-3-opus-20240229, claude-3-haiku-20240307
        default_prompt: Optional[str] = None
    ):
        """
        Initialize Claude inference.

        Args:
            api_key: Anthropic API key
            model: Model name
            default_prompt: Default transcription prompt

        Raises:
            ImportError: If the anthropic package could not be imported
        """
        if not CLAUDE_AVAILABLE:
            raise ImportError("Anthropic library not installed. Install with: pip install anthropic")

        super().__init__(api_key, default_prompt)
        self.model = model
        self.client = Anthropic(api_key=api_key)

    def _get_default_prompt(self) -> str:
        """Return the built-in transcription prompt."""
        return (
            "Transcribe all handwritten text in this manuscript image. "
            "Preserve the original language (Cyrillic, Latin, etc.) and layout. "
            "Output only the transcribed text without any additional commentary."
        )

    def transcribe(
        self,
        image: Image.Image,
        prompt: Optional[str] = None,
        max_tokens: int = 500,
        temperature: float = 0.0,
        **kwargs
    ) -> str:
        """
        Transcribe with Anthropic Claude.

        The image is downscaled to at most 1568 px per side, PNG-encoded and
        sent as a base64 image block followed by the text prompt.

        Args:
            image: PIL Image
            prompt: Custom prompt
            max_tokens: Maximum tokens to generate
            temperature: Sampling temperature (0.0 = deterministic)
            **kwargs: Additional Claude parameters

        Returns:
            Transcribed text: the text of all text blocks in the response,
            concatenated; an empty string if the response has none
        """
        prompt = prompt or self.default_prompt

        # Resize if needed (Claude supports up to 1568px on longest side)
        image = self.resize_image_if_needed(image, max_dimension=1568)

        # Encode image
        base64_image = self.encode_image_base64(image, format="PNG")

        # API call
        response = self.client.messages.create(
            model=self.model,
            max_tokens=max_tokens,
            temperature=temperature,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": base64_image
                            }
                        },
                        {
                            "type": "text",
                            "text": prompt
                        }
                    ]
                }
            ],
            **kwargs
        )

        # The response content is a list of blocks; it can be empty, and blocks
        # other than text blocks carry no ``text``. Collect only real text
        # instead of assuming content[0] is a text block.
        text_parts = [
            block.text
            for block in (response.content or [])
            if isinstance(getattr(block, "text", None), str)
        ]
        return "".join(text_parts).strip()
575
+
576
+
577
+ # Model availability checks
578
def check_api_availability() -> Dict[str, bool]:
    """Report, per provider name, whether its client library could be imported."""
    return dict(
        openai=OPENAI_AVAILABLE,
        gemini=GEMINI_AVAILABLE,
        claude=CLAUDE_AVAILABLE,
    )
585
+
586
+
587
# Static model lists, returned whenever the live model listing cannot be fetched.

# OpenAI
OPENAI_MODELS_FALLBACK = [
    "gpt-4o", "gpt-4o-mini", "gpt-4o-2024-11-20", "chatgpt-4o-latest",
    "gpt-4-turbo", "gpt-4-vision-preview",
    "o1-preview", "o1-mini",
]

# Google Gemini
GEMINI_MODELS_FALLBACK = [
    # Free tier models (generally available)
    "gemini-1.5-flash", "gemini-1.5-flash-002", "gemini-1.5-flash-8b", "gemini-2.0-flash-exp",
    # Paid/preview models (may require upgrade)
    "gemini-1.5-pro", "gemini-1.5-pro-002", "gemini-1.5-pro-exp-0827",
    # Experimental (may not be available to all users)
    "gemini-exp-1206", "gemini-exp-1121",
    # Gemini 3 preview models (latest, may have restrictions)
    "gemini-3-pro-preview",
]

# Anthropic Claude
CLAUDE_MODELS_FALLBACK = [
    "claude-opus-4-6", "claude-sonnet-4-6", "claude-haiku-4-5-20251001",
    "claude-3-5-sonnet-20241022", "claude-3-5-haiku-20241022",
    "claude-3-opus-20240229", "claude-3-haiku-20240307",
]
625
+
626
+
627
def fetch_openai_models(api_key: str = None) -> list:
    """
    Query the OpenAI API for the model IDs this key can access.

    Only IDs containing one of the vision-capable family markers are kept,
    sorted in reverse lexicographic order.

    Args:
        api_key: OpenAI API key; without one the fallback list is returned.

    Returns:
        Matching model IDs, or OPENAI_MODELS_FALLBACK when the library is
        missing, no key is given, nothing matches, or the request fails.
    """
    if not OPENAI_AVAILABLE or not api_key:
        return OPENAI_MODELS_FALLBACK

    # Substrings identifying GPT-4 vision models and o1 models
    vision_markers = (
        "gpt-4o", "gpt-4-turbo", "gpt-4-vision",
        "chatgpt-4o", "o1-", "gpt-4.5",  # Include potential GPT-4.5
    )

    try:
        listing = OpenAI(api_key=api_key).models.list()
        matching_ids = sorted(
            (
                entry.id
                for entry in listing.data
                if any(marker in entry.id for marker in vision_markers)
            ),
            reverse=True,
        )
    except Exception as e:
        print(f"[OpenAI] Could not fetch models dynamically: {e}")
        print("[OpenAI] Using fallback model list")
        return OPENAI_MODELS_FALLBACK

    # An empty match set is treated like a failed fetch
    return matching_ids or OPENAI_MODELS_FALLBACK
668
+
669
+
670
def fetch_gemini_models(api_key: str = None) -> list:
    """
    List Gemini models that support ``generateContent`` for the given key.

    Uses the new google-genai client when GEMINI_NEW_SDK is set, otherwise the
    legacy ``genai`` module. Returns GEMINI_MODELS_FALLBACK when the library
    is missing, no key is given, nothing matches, or the request fails.
    """
    if not (GEMINI_AVAILABLE and api_key):
        return GEMINI_MODELS_FALLBACK

    try:
        if GEMINI_NEW_SDK:
            listing = _google_genai_new.Client(api_key=api_key).models.list()
            capable = (
                entry for entry in listing
                if "generateContent" in (getattr(entry, "supported_actions", None) or [])
            )
        else:
            genai.configure(api_key=api_key)
            capable = (
                entry for entry in genai.list_models()
                if "generateContent" in entry.supported_generation_methods
            )

        # Drop the "models/" prefix, keep only gemini-* IDs, reverse-sorted
        short_names = (entry.name.replace("models/", "") for entry in capable)
        gemini_ids = sorted(
            (name for name in short_names if name.startswith("gemini")),
            reverse=True,
        )
    except Exception as e:
        print(f"[Gemini] Could not fetch models: {e}")
        return GEMINI_MODELS_FALLBACK

    return gemini_ids or GEMINI_MODELS_FALLBACK
697
+
698
def fetch_claude_models(api_key: str = None) -> list:
    """
    Ask the Anthropic API which Claude models are available for this key.

    Returns:
        Model IDs in reverse lexicographic order, or CLAUDE_MODELS_FALLBACK
        when the library is missing, no key is given, the listing is empty,
        or the request fails.
    """
    if not CLAUDE_AVAILABLE or not api_key:
        return CLAUDE_MODELS_FALLBACK

    try:
        first_page = Anthropic(api_key=api_key).models.list()
        # Reverse sort puts higher dates/version numbers in the IDs first
        claude_ids = sorted((entry.id for entry in first_page.data), reverse=True)
    except Exception as e:
        print(f"[Claude] Could not fetch models dynamically: {e}")
        return CLAUDE_MODELS_FALLBACK

    return claude_ids or CLAUDE_MODELS_FALLBACK
722
+
723
+
724
# Working model lists: start as independent copies of the fallbacks so they
# can be replaced or extended later without touching the fallback constants.
OPENAI_MODELS = list(OPENAI_MODELS_FALLBACK)
GEMINI_MODELS = list(GEMINI_MODELS_FALLBACK)
CLAUDE_MODELS = list(CLAUDE_MODELS_FALLBACK)
728
+
729
+
730
if __name__ == "__main__":
    # Minimal command-line driver: <provider> <api_key> <image_path>
    import sys

    if len(sys.argv) < 4:
        print("Usage: python inference_commercial_api.py <provider> <api_key> <image_path>")
        print("Providers: openai, gemini, claude")
        sys.exit(1)

    provider = sys.argv[1].lower()
    api_key, image_path = sys.argv[2], sys.argv[3]

    # The image is opened before the provider is validated
    image = Image.open(image_path).convert("RGB")

    # Lazy factories: a client class is only looked up for the chosen provider
    client_factories = {
        "openai": lambda: OpenAIInference(api_key),
        "gemini": lambda: GeminiInference(api_key),
        "claude": lambda: ClaudeInference(api_key),
    }
    make_client = client_factories.get(provider)
    if make_client is None:
        print(f"Unknown provider: {provider}")
        sys.exit(1)
    api = make_client()

    # Run the transcription and show the result
    print(f"Transcribing with {provider}...")
    text = api.transcribe(image)
    print(f"\nResult: {text}")
inference_page.py ADDED
@@ -0,0 +1,946 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Whole-page OCR inference for Ukrainian handwritten text using TrOCR.
3
+
4
+ This script performs line segmentation and transcription on unsegmented page images.
5
+
6
+ Usage:
7
+ # Basic usage with checkpoint
8
+ python inference_page.py --image path/to/page.jpg --checkpoint models/ukrainian_model/checkpoint-3000
9
+
10
+ # With custom settings
11
+ python inference_page.py --image page.jpg --checkpoint checkpoint-3000 --num_beams 4 --output output.txt
12
+
13
+ # With Transkribus PAGE XML (uses existing segmentation)
14
+ python inference_page.py --image page.jpg --xml page.xml --checkpoint checkpoint-3000
15
+
16
+ Future: Can be extended with a GUI using tkinter or PyQt.
17
+ """
18
+
19
+ import argparse
20
+ import torch
21
+ from pathlib import Path
22
+ from PIL import Image, ImageDraw
23
+ import numpy as np
24
+ from typing import List, Tuple, Optional
25
+ import xml.etree.ElementTree as ET
26
+ from dataclasses import dataclass
27
+ import cv2
28
+
29
+ # Disable PIL DecompressionBomb protection for large manuscript images
30
+ Image.MAX_IMAGE_PIXELS = None
31
+
32
+ # Optional: the hosted Hugging Face Space uses this module for segmentation, but
33
+ # does not enable TrOCR inference. Avoid making transformers a startup dependency.
34
+ try:
35
+ from transformers import VisionEncoderDecoderModel, TrOCRProcessor
36
+ except ImportError:
37
+ VisionEncoderDecoderModel = None
38
+ TrOCRProcessor = None
39
+
40
+
41
@dataclass
class LineSegment:
    """One text line cut out of a page, plus optional recognition results."""

    # Cropped line image
    image: Image.Image
    # Bounding box on the page as (x1, y1, x2, y2)
    bbox: Tuple[int, int, int, int]
    # Polygon outline of the line, when the segmentation source provides one
    coords: Optional[List[Tuple[int, int]]] = None
    # Transcription result
    text: Optional[str] = None
    # Average confidence score (0-1)
    confidence: Optional[float] = None
    # Per-character confidence scores
    char_confidences: Optional[List[float]] = None
50
+
51
+
52
+ def sort_lines_by_region(regions, lines):
53
+ """
54
+ Sort lines in reading order: regions left-to-right, lines top-to-bottom
55
+ within each region.
56
+
57
+ Works with SegRegion objects from kraken_segmenter (which carry bbox and
58
+ line_ids) and any list of line-like objects that have a ``.bbox`` attribute
59
+ with (x1, y1, x2, y2) format.
60
+
61
+ Args:
62
+ regions: List of SegRegion (from kraken_segmenter) with .bbox and .line_ids.
63
+ If empty/None, lines are returned sorted top-to-bottom.
64
+ lines: List of LineSegment (or kraken LineSegment).
65
+
66
+ Returns:
67
+ List of lines re-ordered by region reading order.
68
+ """
69
+ if not regions or not lines:
70
+ # No region info — simple top-to-bottom sort
71
+ return sorted(lines, key=lambda l: l.bbox[1])
72
+
73
+ # Sort regions left-to-right by mean x-center
74
+ sorted_regions = sorted(
75
+ regions,
76
+ key=lambda r: (r.bbox[0] + r.bbox[2]) / 2,
77
+ )
78
+
79
+ # Assign each line to the region whose bbox contains the line's center
80
+ region_buckets = {r.id: [] for r in sorted_regions}
81
+ unassigned = []
82
+
83
+ for line in lines:
84
+ cx = (line.bbox[0] + line.bbox[2]) / 2
85
+ cy = (line.bbox[1] + line.bbox[3]) / 2
86
+ assigned = False
87
+ for r in sorted_regions:
88
+ rx1, ry1, rx2, ry2 = r.bbox
89
+ if rx1 <= cx <= rx2 and ry1 <= cy <= ry2:
90
+ region_buckets[r.id].append(line)
91
+ assigned = True
92
+ break
93
+ if not assigned:
94
+ unassigned.append(line)
95
+
96
+ # Build ordered list: per-region top-to-bottom, then unassigned at the end
97
+ ordered = []
98
+ for r in sorted_regions:
99
+ bucket = region_buckets[r.id]
100
+ bucket.sort(key=lambda l: l.bbox[1])
101
+ ordered.extend(bucket)
102
+
103
+ unassigned.sort(key=lambda l: l.bbox[1])
104
+ ordered.extend(unassigned)
105
+ return ordered
106
+
107
+
108
def normalize_background(image: Image.Image) -> Image.Image:
    """
    Normalize background to light gray (similar to Efendiev dataset).

    CRITICAL for Ukrainian dataset: Models trained on data with background
    normalization MUST have normalization applied at inference time as well.

    Pipeline: CLAHE on the L channel of the LAB representation, back to RGB,
    collapse to grayscale, then expand to a 3-channel image again.

    Args:
        image: PIL Image with potentially aged/colored background. Any mode
            is accepted; non-RGB images are converted to RGB first.

    Returns:
        PIL Image (3 identical channels) with normalized background
    """
    # cv2.COLOR_RGB2LAB needs a 3-channel array, so grayscale, RGBA and
    # palette inputs must be converted before handing them to OpenCV.
    if image.mode != "RGB":
        image = image.convert("RGB")

    # Convert PIL to OpenCV format
    img_array = np.array(image)

    # Convert to LAB color space for better lighting normalization
    lab = cv2.cvtColor(img_array, cv2.COLOR_RGB2LAB)
    l, a, b = cv2.split(lab)

    # Apply CLAHE (Contrast Limited Adaptive Histogram Equalization) to L channel
    # This normalizes lighting variations across the image
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    l_normalized = clahe.apply(l)

    # Merge back and convert to RGB
    lab_normalized = cv2.merge([l_normalized, a, b])
    rgb_normalized = cv2.cvtColor(lab_normalized, cv2.COLOR_LAB2RGB)

    # Convert to grayscale to remove color variations (aged paper tones)
    gray = cv2.cvtColor(rgb_normalized, cv2.COLOR_RGB2GRAY)

    # Convert back to RGB with uniform background
    # This creates a light gray background similar to Efendiev dataset
    normalized_rgb = cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB)

    return Image.fromarray(normalized_rgb)
145
+
146
+
147
class LineSegmenter:
    """Line segmentation based on a horizontal projection profile."""

    def __init__(self, min_line_height: int = 15, min_gap: int = 5,
                 sensitivity: float = 0.02, use_morph: bool = True):
        """
        Initialize LineSegmenter.

        Args:
            min_line_height: Minimum height of a line in pixels (default: 15, lowered for tighter spacing)
            min_gap: Minimum gap between lines in pixels (default: 5, lowered for tight spacing)
            sensitivity: Threshold for detecting text (0.01-0.1, lower = more sensitive, default: 0.02)
            use_morph: Apply morphological operations to clean up detection (default: True)
        """
        self.min_line_height = min_line_height
        self.min_gap = min_gap
        self.sensitivity = sensitivity
        self.use_morph = use_morph

    def segment_lines(self, image: Image.Image, debug: bool = False) -> List[LineSegment]:
        """
        Segment page image into text lines using horizontal projection.

        Algorithm:
        1. Binarize twice (global Otsu on a blurred copy + local adaptive
           Gaussian threshold) and OR the two masks
        2. Optional morphological closing to connect broken text
        3. Row-wise ink count, thresholded at ``sensitivity`` * max row count
        4. Runs of text rows separated by at least ``min_gap`` blank rows
           become lines; runs shorter than ``min_line_height`` are dropped
        5. Lines closer than 2 * ``min_gap`` are merged

        Args:
            image: Input page image (PIL Image)
            debug: If True, visualize segmentation

        Returns:
            List of LineSegment objects (full-width crops, 8 px vertical padding)
        """
        # Convert to grayscale
        gray = np.array(image.convert('L'))

        # Try multiple binarization strategies and combine
        from scipy.ndimage import gaussian_filter
        blurred = gaussian_filter(gray, sigma=1.0)

        # Strategy 1: Otsu's method (global threshold)
        threshold_otsu = self._otsu_threshold(blurred)
        binary_otsu = blurred < threshold_otsu

        # Strategy 2: Adaptive threshold (local threshold, better for varying contrast)
        binary_adaptive = self._adaptive_threshold(gray)

        # Combine both strategies (logical OR to catch text in both)
        binary = np.logical_or(binary_otsu, binary_adaptive)

        # Apply morphological closing to connect broken characters
        if self.use_morph:
            from scipy.ndimage import binary_closing
            # Horizontal structuring element to connect characters on same line
            struct = np.ones((3, 5))  # 3 pixels tall, 5 pixels wide
            binary = binary_closing(binary, structure=struct, iterations=2)

        # Horizontal projection (sum of black pixels per row)
        h_projection = binary.sum(axis=1)

        # Adaptive threshold based on image statistics
        # Use lower threshold for better sensitivity
        if h_projection.max() > 0:
            threshold = h_projection.max() * self.sensitivity
        else:
            # Fallback if no text detected
            threshold = 1

        is_text = h_projection > threshold

        # Apply median filter to smooth out noise in projection
        from scipy.ndimage import median_filter
        is_text_smoothed = median_filter(is_text.astype(float), size=3) > 0.5

        # Find continuous text regions with improved gap detection
        lines = []
        in_line = False
        start_y = 0
        gap_count = 0

        for y in range(len(is_text_smoothed)):
            if is_text_smoothed[y]:
                if not in_line:
                    # Start of new line
                    start_y = y
                    in_line = True
                    gap_count = 0
                else:
                    # Continue line, reset gap counter
                    gap_count = 0
            else:
                if in_line:
                    # Potential gap - count consecutive gap pixels
                    gap_count += 1
                    if gap_count >= self.min_gap:
                        # End of line (gap is large enough)
                        end_y = y - gap_count
                        if end_y - start_y >= self.min_line_height:
                            lines.append((start_y, end_y))
                        in_line = False
                        gap_count = 0

        # Don't forget last line if image ends with text
        if in_line and len(is_text_smoothed) - start_y >= self.min_line_height:
            lines.append((start_y, len(is_text_smoothed)))

        # Post-process: Merge lines that are too close (likely one line split incorrectly)
        merged_lines = self._merge_close_lines(lines, max_gap=self.min_gap * 2)

        # Create LineSegment objects
        segments = []
        width = image.width

        for y1, y2 in merged_lines:
            # Add padding (larger padding for better context)
            padding = 8
            y1_pad = max(0, y1 - padding)
            y2_pad = min(image.height, y2 + padding)

            # Crop line (full width for now, could be refined with vertical projection)
            bbox = (0, y1_pad, width, y2_pad)
            line_img = image.crop(bbox)

            segments.append(LineSegment(
                image=line_img,
                bbox=bbox
            ))

        if debug:
            self._visualize_segmentation(image, segments, h_projection)

        print(f"[LineSegmenter] Detected {len(segments)} lines (sensitivity={self.sensitivity}, min_height={self.min_line_height})")

        return segments

    @staticmethod
    def _adaptive_threshold(gray: np.ndarray, block_size: int = 35) -> np.ndarray:
        """
        Apply adaptive thresholding using a local window.
        Better for images with varying illumination or contrast.

        Args:
            gray: Grayscale image array
            block_size: Neighbourhood size passed to cv2.adaptiveThreshold

        Returns:
            Boolean mask, True where a pixel is classified as ink. If the
            OpenCV path fails, a global threshold of mean - 0.5 * std is used.
        """
        # Use cv2 if available, otherwise fallback to simple method
        try:
            import cv2
            # Adaptive Gaussian thresholding
            binary = cv2.adaptiveThreshold(
                gray.astype(np.uint8),
                255,
                cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY_INV,
                block_size,
                10
            )
            return binary > 0
        except Exception:
            # Best-effort fallback: simple global threshold. Only Exception is
            # caught so KeyboardInterrupt/SystemExit still propagate.
            threshold = np.mean(gray) - np.std(gray) * 0.5
            return gray < threshold

    @staticmethod
    def _merge_close_lines(lines: List[Tuple[int, int]], max_gap: int = 10) -> List[Tuple[int, int]]:
        """
        Merge lines that are very close together (likely one line split incorrectly).

        Args:
            lines: (y_start, y_end) spans in top-to-bottom order
            max_gap: Spans whose gap to the previous span is <= this are merged

        Returns:
            List of merged (y_start, y_end) spans
        """
        if not lines:
            return lines

        merged = [lines[0]]
        for y1, y2 in lines[1:]:
            prev_y1, prev_y2 = merged[-1]
            gap = y1 - prev_y2

            if gap <= max_gap:
                # Merge with previous line
                merged[-1] = (prev_y1, y2)
            else:
                # Add as new line
                merged.append((y1, y2))

        return merged

    @staticmethod
    def _otsu_threshold(gray_array: np.ndarray) -> float:
        """
        Compute Otsu's threshold from a 256-bin histogram over [0, 256).

        Returns:
            The bin index maximizing the between-class variance, as a float.
        """
        hist, bin_edges = np.histogram(gray_array, bins=256, range=(0, 256))
        hist = hist.astype(float)

        # Normalize
        hist /= hist.sum()

        # Cumulative sums
        weight1 = np.cumsum(hist)
        weight2 = np.cumsum(hist[::-1])[::-1]

        # Cumulative means
        mean1 = np.cumsum(hist * np.arange(256))
        mean2 = (np.cumsum((hist * np.arange(256))[::-1])[::-1])

        # Avoid division by zero
        weight1 = np.clip(weight1, 1e-10, 1)
        weight2 = np.clip(weight2, 1e-10, 1)

        # Between-class variance
        variance = weight1 * weight2 * ((mean1 / weight1) - (mean2 / weight2)) ** 2

        # Cast so the return value matches the annotated type
        return float(np.argmax(variance))

    @staticmethod
    def _visualize_segmentation(image: Image.Image, segments: List[LineSegment],
                                h_projection: Optional[np.ndarray] = None):
        """
        Visualize line segmentation for debugging.

        Draws each segment's bbox on a copy of the page and shows it; if a
        projection profile is given, it is also plotted with matplotlib.
        """
        vis = image.copy()
        draw = ImageDraw.Draw(vis)

        for i, seg in enumerate(segments):
            x1, y1, x2, y2 = seg.bbox
            # Alternate colors for visibility
            color = 'red' if i % 2 == 0 else 'blue'
            draw.rectangle([x1, y1, x2, y2], outline=color, width=2)
            draw.text((x1 + 5, y1 + 5), f"Line {i+1}", fill=color)

        vis.show()

        # Optionally show projection profile
        if h_projection is not None:
            import matplotlib.pyplot as plt
            plt.figure(figsize=(12, 4))
            plt.plot(h_projection)
            plt.title("Horizontal Projection Profile")
            plt.xlabel("Y Position")
            plt.ylabel("Text Density")
            plt.grid(True)
            plt.show()
381
+
382
+
383
class PageXMLSegmenter:
    """Segment using existing Transkribus PAGE XML annotations."""

    NS = {'page': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'}

    def __init__(self, xml_path: str):
        """
        Args:
            xml_path: Path to the PAGE XML file describing the page.
        """
        self.xml_path = Path(xml_path)
        # Filled by segment_lines(); created here so the attribute always
        # exists, even before the first segmentation run.
        self.region_data: list = []

    def segment_lines(self, image: Image.Image) -> List[LineSegment]:
        """
        Extract lines using PAGE XML coordinates with correct reading order.

        Coordinates are rescaled when the image size differs from the
        imageWidth/imageHeight recorded in the XML. Regions and lines are
        ordered by their ``readingOrder`` index when present, otherwise by
        their top Y coordinate. Lines without usable coordinates are skipped.

        Side effect: ``self.region_data`` is rebuilt with one dict per region
        that has coordinates (keys: "id", "bbox", "num_lines").

        Args:
            image: The page image the XML refers to.

        Returns:
            List of LineSegment objects in reading order.
        """
        tree = ET.parse(self.xml_path)
        root = tree.getroot()

        # Determine scale factors: PAGE XML stores absolute pixel coords for the
        # original scan. If the uploaded image was resized, we must scale coords.
        ns = self.NS
        # Try both common PAGE XML namespaces (2013 and 2019 Transkribus variants)
        page_elem = root.find('.//page:Page', ns)
        if page_elem is None:
            ns_2019 = {'page': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15'}
            page_elem = root.find('.//page:Page', ns_2019)
            if page_elem is not None:
                ns = ns_2019
        xml_w = int(page_elem.get('imageWidth', image.width)) if page_elem is not None else image.width
        xml_h = int(page_elem.get('imageHeight', image.height)) if page_elem is not None else image.height
        scale_x = image.width / xml_w if xml_w > 0 else 1.0
        scale_y = image.height / xml_h if xml_h > 0 else 1.0

        # Will be populated below for visualization in the viewer
        self.region_data = []

        # Store regions with their reading order
        regions_with_order = []

        for region in root.findall('.//page:TextRegion', ns):
            # Extract region reading order from custom attribute
            region_order = self._extract_reading_order(region.get('custom', ''))

            # Get region Y coordinate as fallback (from first TextLine or Coords)
            region_y = self._get_region_y_position(region, ns)

            # Store lines for this region with their reading order
            lines_with_order = []

            for text_line in region.findall('.//page:TextLine', ns):
                # Get coordinates
                coords_elem = text_line.find('page:Coords', ns)
                if coords_elem is None:
                    continue

                coords_str = coords_elem.get('points')
                if not coords_str:
                    continue

                # Parse coordinates and scale to uploaded image dimensions
                coords = self._parse_coords(coords_str)
                if not coords:
                    # Whitespace-only points attribute: no polygon to crop
                    continue
                if scale_x != 1.0 or scale_y != 1.0:
                    coords = [(int(x * scale_x), int(y * scale_y)) for x, y in coords]
                x1, y1, x2, y2 = self._get_bounding_box(coords)

                # Crop line with padding
                padding = 5
                x1_pad = max(0, x1 - padding)
                y1_pad = max(0, y1 - padding)
                x2_pad = min(image.width, x2 + padding)
                y2_pad = min(image.height, y2 + padding)

                bbox = (x1_pad, y1_pad, x2_pad, y2_pad)
                line_img = image.crop(bbox)

                segment = LineSegment(
                    image=line_img,
                    bbox=bbox,
                    coords=coords
                )

                # Extract line reading order from custom attribute
                line_order = self._extract_reading_order(text_line.get('custom', ''))

                # Use line reading order if available, otherwise Y coordinate
                sort_key = line_order if line_order is not None else y1
                lines_with_order.append((sort_key, segment))

            # Sort lines within this region
            lines_with_order.sort(key=lambda x: x[0])
            sorted_lines = [seg for _, seg in lines_with_order]

            # Collect TextRegion bbox for viewer visualization
            region_id = region.get('id', f'region_{len(regions_with_order)}')
            region_coords_elem = region.find('page:Coords', ns)
            if region_coords_elem is not None:
                rc_str = region_coords_elem.get('points', '')
                rc = self._parse_coords(rc_str) if rc_str else []
                if rc:
                    if scale_x != 1.0 or scale_y != 1.0:
                        rc = [(int(x * scale_x), int(y * scale_y)) for x, y in rc]
                    rx1, ry1, rx2, ry2 = self._get_bounding_box(rc)
                    self.region_data.append({
                        "id": region_id,
                        "bbox": [rx1, ry1, rx2, ry2],
                        "num_lines": len(sorted_lines),
                    })

            # Use region reading order if available, otherwise region Y position
            region_sort_key = region_order if region_order is not None else region_y
            regions_with_order.append((region_sort_key, sorted_lines))

        # Sort regions by reading order (or Y position fallback)
        regions_with_order.sort(key=lambda x: x[0])

        # Flatten: concatenate all lines from all regions in order
        segments = []
        for _, region_lines in regions_with_order:
            segments.extend(region_lines)

        return segments

    @staticmethod
    def _extract_reading_order(custom_attr: str) -> Optional[int]:
        """Extract reading order index from custom attribute.

        Format: custom="readingOrder {index:5;}"
        Returns: 5 (or None if not found/parseable)
        """
        if not custom_attr or 'readingOrder' not in custom_attr:
            return None

        try:
            # Find "index:X;" pattern
            start = custom_attr.index('index:') + 6
            end = custom_attr.index(';', start)
            return int(custom_attr[start:end])
        except (ValueError, IndexError):
            return None

    def _get_region_y_position(self, region, ns=None) -> int:
        """Get Y position of region for fallback sorting.

        Uses the top Y coordinate (unscaled XML units) of the region's Coords,
        or of its first TextLine when the region has no usable Coords.
        Returns 0 when neither is available.
        """
        if ns is None:
            ns = self.NS
        # Try region Coords first
        coords_elem = region.find('page:Coords', ns)
        if coords_elem is not None:
            coords_str = coords_elem.get('points')
            if coords_str:
                coords = self._parse_coords(coords_str)
                if coords:
                    _, y1, _, _ = self._get_bounding_box(coords)
                    return y1

        # Fallback: use first TextLine Y position
        text_line = region.find('.//page:TextLine', ns)
        if text_line is not None:
            coords_elem = text_line.find('page:Coords', ns)
            if coords_elem is not None:
                coords_str = coords_elem.get('points')
                if coords_str:
                    coords = self._parse_coords(coords_str)
                    if coords:
                        _, y1, _, _ = self._get_bounding_box(coords)
                        return y1

        # Default fallback
        return 0

    @staticmethod
    def _parse_coords(coords_str: str) -> List[Tuple[int, int]]:
        """Parse a PAGE XML ``points`` string ("x1,y1 x2,y2 ...").

        Integer values are parsed exactly; decimal values are rounded to the
        nearest integer.
        Returns an empty list for an empty or whitespace-only string.
        """
        def to_int(value: str) -> int:
            try:
                return int(value)
            except ValueError:
                return int(round(float(value)))

        parsed = []
        for point in coords_str.split():
            parts = point.split(',')
            parsed.append((to_int(parts[0]), to_int(parts[1])))
        return parsed

    @staticmethod
    def _get_bounding_box(coords: List[Tuple[int, int]]) -> Tuple[int, int, int, int]:
        """Get bounding box (min_x, min_y, max_x, max_y) from polygon coordinates.

        ``coords`` must be non-empty.
        """
        xs = [p[0] for p in coords]
        ys = [p[1] for p in coords]
        return min(xs), min(ys), max(xs), max(ys)
560
+
561
+
562
+ class TrOCRInference:
563
+ """TrOCR model inference."""
564
+
565
+ def __init__(self, model_path: str, device: Optional[str] = None,
566
+ base_model: str = "kazars24/trocr-base-handwritten-ru",
567
+ normalize_bg: bool = False,
568
+ flip_rtl: bool = False,
569
+ is_huggingface: bool = False):
570
+ """
571
+ Initialize TrOCR inference.
572
+
573
+ Args:
574
+ model_path: Path to local checkpoint or HuggingFace model ID
575
+ device: 'cuda', 'cpu', or None for auto-detect
576
+ base_model: Base model for processor (used with local checkpoints)
577
+ normalize_bg: Apply background normalization
578
+ flip_rtl: Flip line images horizontally for RTL scripts
579
+ is_huggingface: If True, load from HuggingFace Hub instead of local path
580
+ """
581
+ self.model_path = model_path
582
+ self.base_model = base_model
583
+ self.normalize_bg = normalize_bg
584
+ self.flip_rtl = flip_rtl
585
+ self.is_huggingface = is_huggingface
586
+
587
+ if device is None:
588
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
589
+ else:
590
+ self.device = device
591
+
592
+ print(f"Loading model from {'HuggingFace Hub' if is_huggingface else 'local checkpoint'}: {model_path}...")
593
+ print(f"Using device: {self.device}")
594
+ print(f"Background normalization: {'Enabled' if self.normalize_bg else 'Disabled'}")
595
+
596
+ if VisionEncoderDecoderModel is None or TrOCRProcessor is None:
597
+ raise ImportError("TrOCR inference requires transformers to be installed")
598
+
599
+ if is_huggingface:
600
+ # Load both processor and model from HuggingFace Hub
601
+ print(f"Downloading from HuggingFace Hub (if not cached): {model_path}")
602
+
603
+ # Try to load processor from model first, fallback to base_model if it fails
604
+ try:
605
+ print(f"Attempting to load processor from {model_path}...")
606
+ self.processor = TrOCRProcessor.from_pretrained(model_path)
607
+ # Some models (e.g. dh-unibe/trocr-kurrent) ship a truncated tokenizer
608
+ # with only special tokens (vocab_size=5). The model itself uses the full
609
+ # microsoft/trocr-base-handwritten vocabulary (50265 tokens). Detect this
610
+ # by checking vocab_size and replace only the tokenizer – keep the image
611
+ # processor from the model so preprocessing stays correct.
612
+ if self.processor.tokenizer.vocab_size < 100:
613
+ print(f"WARNING: tokenizer from '{model_path}' has vocab_size="
614
+ f"{self.processor.tokenizer.vocab_size} (looks broken). "
615
+ f"Replacing tokenizer with microsoft/trocr-base-handwritten.")
616
+ _fallback = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
617
+ self.processor.tokenizer = _fallback.tokenizer
618
+ except Exception as e:
619
+ print(f"Failed to load processor from model: {e}")
620
+ print(f"Falling back to base model processor: {self.base_model}")
621
+ self.processor = TrOCRProcessor.from_pretrained(self.base_model)
622
+
623
+ self.model = VisionEncoderDecoderModel.from_pretrained(
624
+ model_path, low_cpu_mem_usage=False)
625
+ # For backwards compatibility
626
+ self.checkpoint_path = model_path
627
+ else:
628
+ # Load processor from base model, model from local checkpoint
629
+ self.checkpoint_path = Path(model_path)
630
+
631
+ # If model_path points to a specific file (e.g., model.safetensors),
632
+ # use the parent directory for from_pretrained()
633
+ if self.checkpoint_path.is_file():
634
+ model_dir = self.checkpoint_path.parent
635
+ print(f"Model path is a file, using directory: {model_dir}")
636
+ else:
637
+ model_dir = self.checkpoint_path
638
+
639
+ # Try to load processor from the local model first (correct tokenizer),
640
+ # fall back to base_model for old checkpoints that lack processor files.
641
+ try:
642
+ print(f"Attempting to load processor from local model: {model_dir}")
643
+ self.processor = TrOCRProcessor.from_pretrained(model_dir)
644
+ except Exception as e:
645
+ print(f"Local processor not found ({e}), falling back to base model: {self.base_model}")
646
+ self.processor = TrOCRProcessor.from_pretrained(self.base_model)
647
+ self.model = VisionEncoderDecoderModel.from_pretrained(
648
+ model_dir, low_cpu_mem_usage=False)
649
+
650
+ self.model.to(self.device)
651
+ # mBART decoder creates _float_tensor lazily on CPU; force it to the right device now.
652
+ for m in self.model.modules():
653
+ if hasattr(m, '_float_tensor'):
654
+ m._float_tensor = m._float_tensor.to(self.device)
655
+ self.model.eval()
656
+
657
+ print("Model loaded successfully!")
658
+
659
def transcribe_line(self, line_image: Image.Image, num_beams: int = 4,
                    max_length: int = 128, return_confidence: bool = False):
    """
    Transcribe a single line image.

    The image is optionally background-normalized (``self.normalize_bg``),
    optionally mirrored (``self.flip_rtl``), converted to RGB, run through
    ``self.processor`` and decoded with ``self.model.generate``.

    Args:
        line_image: PIL Image of text line
        num_beams: Number of beams for beam search (higher = better quality, slower)
        max_length: Maximum sequence length
        return_confidence: If True, also return confidence information

    Returns:
        If return_confidence=False: Transcribed text string
        If return_confidence=True: Tuple of (text, confidence_score, char_confidences).
        ``char_confidences`` holds one probability per generated token (one entry
        per generation step, not per character); ``confidence_score`` is their
        mean, or 0.0 when no scores are available.
    """
    # Apply background normalization if enabled
    if self.normalize_bg:
        line_image = normalize_background(line_image)

    # Flip horizontally for RTL scripts (model trained on flipped images)
    if self.flip_rtl:
        line_image = line_image.transpose(Image.FLIP_LEFT_RIGHT)

    # Ensure image is in RGB mode (TrOCR requires 3 channels)
    if line_image.mode != 'RGB':
        line_image = line_image.convert('RGB')

    # Prepare image
    pixel_values = self.processor(
        images=line_image,
        return_tensors="pt"
    ).pixel_values.to(self.device)

    avg_confidence = None
    char_confidences = None

    with torch.no_grad():
        if return_confidence:
            # Generate with per-step scores so token probabilities can be recovered
            outputs = self.model.generate(
                pixel_values,
                num_beams=num_beams,
                max_length=max_length,
                early_stopping=True,
                output_scores=True,
                return_dict_in_generate=True
            )
            generated_ids = outputs.sequences

            char_confidences = []
            scores = getattr(outputs, 'scores', None)
            if scores:
                # generated_ids[0] is the first (and only) sequence in the batch
                generated_tokens = generated_ids[0].tolist()

                # Each score tensor has shape (batch_size * num_beams, vocab_size).
                # With beam search the returned hypothesis may have lived in a
                # different beam row at every step, so row 0 is NOT guaranteed to
                # be the right one. beam_indices (when provided by generate) maps
                # each step of the returned sequence to its beam row; unused
                # trailing steps are marked with a negative index.
                beam_indices = getattr(outputs, 'beam_indices', None)
                beam_rows = None
                if beam_indices is not None:
                    beam_rows = [int(b) for b in beam_indices[0]]

                for step_idx, score_tensor in enumerate(scores):
                    # Position 0 of the sequence is the decoder start token,
                    # so the token produced at this step sits at step_idx + 1.
                    token_pos = step_idx + 1
                    if token_pos >= len(generated_tokens):
                        break

                    if beam_rows is None:
                        row = 0  # greedy decoding: a single row per batch item
                    elif step_idx < len(beam_rows) and beam_rows[step_idx] >= 0:
                        row = beam_rows[step_idx]
                    else:
                        break  # past the end of the returned hypothesis

                    probs = torch.softmax(score_tensor[row], dim=-1)
                    char_confidences.append(probs[generated_tokens[token_pos]].item())

            # Calculate average confidence
            avg_confidence = (
                sum(char_confidences) / len(char_confidences) if char_confidences else 0.0
            )
        else:
            generated_ids = self.model.generate(
                pixel_values,
                num_beams=num_beams,
                max_length=max_length,
                early_stopping=True
            )

    # Decode
    text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    if return_confidence:
        return text, avg_confidence, char_confidences
    return text
757
+
758
def transcribe_segments(self, segments: List[LineSegment],
                        num_beams: int = 4, max_length: int = 128,
                        show_progress: bool = True) -> List[LineSegment]:
    """
    Run ``transcribe_line`` on every segment and store the result on it.

    Args:
        segments: List of LineSegment objects; each must expose ``image`` and
            accept assignment to ``text``
        num_beams: Beam search parameter forwarded to ``transcribe_line``
        max_length: Max sequence length forwarded to ``transcribe_line``
        show_progress: Wrap the iteration in a tqdm progress bar

    Returns:
        The same ``segments`` list, with each segment's ``text`` filled in
    """
    lines = segments
    if show_progress:
        # Imported lazily so tqdm is only needed when a progress bar is requested.
        from tqdm import tqdm
        lines = tqdm(segments, desc="Transcribing lines")

    for line in lines:
        line.text = self.transcribe_line(line.image, num_beams=num_beams, max_length=max_length)

    return segments
787
+
788
+
789
def main():
    """
    Command-line entry point for whole-page TrOCR inference.

    Parses the CLI arguments, loads the page image, splits it into lines
    (from PAGE XML when ``--xml`` is given, otherwise with ``LineSegmenter``),
    transcribes every line with ``TrOCRInference`` and writes the joined text
    to the output file.

    Raises:
        SystemExit: with status 1 when no lines are detected (argparse also
            exits on invalid arguments).
    """
    parser = argparse.ArgumentParser(
        description="Whole-page OCR inference for Ukrainian handwritten text"
    )
    parser.add_argument(
        '--image',
        type=str,
        required=True,
        help='Path to input page image'
    )
    parser.add_argument(
        '--checkpoint',
        type=str,
        required=True,
        help='Path to TrOCR checkpoint directory'
    )
    parser.add_argument(
        '--xml',
        type=str,
        default=None,
        help='Optional: PAGE XML file for line segmentation (if not provided, automatic segmentation is used)'
    )
    parser.add_argument(
        '--output',
        type=str,
        default=None,
        help='Output text file (default: <image_name>_transcription.txt)'
    )
    parser.add_argument(
        '--num_beams',
        type=int,
        default=4,
        help='Number of beams for beam search (default: 4, higher=better quality but slower)'
    )
    parser.add_argument(
        '--max_length',
        type=int,
        default=128,
        help='Maximum sequence length (default: 128)'
    )
    parser.add_argument(
        '--min_line_height',
        type=int,
        default=20,
        help='Minimum line height for automatic segmentation (default: 20)'
    )
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Visualize line segmentation'
    )
    parser.add_argument(
        '--device',
        type=str,
        default=None,
        choices=['cuda', 'cpu'],
        help='Device to use for inference (default: auto-detect)'
    )
    parser.add_argument(
        '--base_model',
        type=str,
        default='kazars24/trocr-base-handwritten-ru',
        help='Base model for processor (default: kazars24/trocr-base-handwritten-ru)'
    )
    parser.add_argument(
        '--normalize-background',
        action='store_true',
        help='Apply background normalization (REQUIRED if model was trained with --normalize-background)'
    )
    parser.add_argument(
        '--flip-rtl',
        action='store_true',
        help='Flip line images horizontally for RTL scripts (REQUIRED if model was trained with --flip-rtl)'
    )

    args = parser.parse_args()

    print("=" * 80)
    print("TrOCR Whole-Page Inference")
    print("=" * 80)
    print(f"Input image: {args.image}")
    print(f"Checkpoint: {args.checkpoint}")
    print(f"Segmentation: {'PAGE XML' if args.xml else 'Automatic'}")
    print(f"Beam search: {args.num_beams}")
    print("=" * 80)

    # Load image
    print("\nLoading image...")
    Image.MAX_IMAGE_PIXELS = None  # Allow large images
    from PIL import ImageOps
    # Open inside a context manager so the underlying file handle is released
    # as soon as the pixel data has been copied into the RGB image.
    with Image.open(args.image) as raw_image:
        image = ImageOps.exif_transpose(raw_image)  # Fix EXIF orientation
        image = image.convert('RGB')
    print(f"Image size: {image.width}x{image.height}")

    # Segment lines
    print("\nSegmenting lines...")
    if args.xml:
        segmenter = PageXMLSegmenter(args.xml)
        segments = segmenter.segment_lines(image)
        print(f"Found {len(segments)} lines in PAGE XML")
    else:
        segmenter = LineSegmenter(
            min_line_height=args.min_line_height
        )
        segments = segmenter.segment_lines(image, debug=args.debug)
        print(f"Detected {len(segments)} lines")

    if not segments:
        print("ERROR: No lines detected!")
        # Signal the failure to the calling shell instead of exiting with status 0.
        raise SystemExit(1)

    # Initialize TrOCR
    print("\nInitializing TrOCR model...")
    ocr = TrOCRInference(
        args.checkpoint,
        device=args.device,
        base_model=args.base_model,
        normalize_bg=args.normalize_background,
        flip_rtl=args.flip_rtl
    )

    # Transcribe
    print(f"\nTranscribing {len(segments)} lines...")
    segments = ocr.transcribe_segments(
        segments,
        num_beams=args.num_beams,
        max_length=args.max_length
    )

    # Prepare output (lines with empty text are dropped)
    transcription = "\n".join(seg.text for seg in segments if seg.text)

    # Determine output path
    if args.output:
        output_path = Path(args.output)
    else:
        image_path = Path(args.image)
        output_path = image_path.parent / f"{image_path.stem}_transcription.txt"

    # Save (create the target directory first so --output may point to a new folder)
    print(f"\nSaving transcription to {output_path}...")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(transcription)

    # Print results
    print("\n" + "=" * 80)
    print("TRANSCRIPTION RESULT")
    print("=" * 80)
    print(transcription)
    print("=" * 80)
    print(f"\nTranscription saved to: {output_path}")
    print(f"Total lines: {len(segments)}")
    print("Average confidence: N/A (not implemented yet)")
943
+
944
+
945
+ if __name__ == '__main__':
946
+ main()
inference_pylaia_native.py ADDED
@@ -0,0 +1,453 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Native PyLaia Inference (No WSL)
3
+
4
+ This module provides inference for PyLaia CRNN models trained with train_pylaia.py.
5
+ It loads the PyTorch checkpoint directly and runs inference natively on Linux.
6
+ """
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ from pathlib import Path
11
+ from typing import Tuple, Optional, List
12
+ from PIL import Image
13
+ import torchvision.transforms as transforms
14
+ import logging
15
+ import json
16
+ import os
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class CRNN(nn.Module):
    """
    CRNN architecture (same as train_pylaia.py).

    A stack of Conv2d -> BatchNorm2d -> LeakyReLU blocks (each optionally
    followed by a 2x2 max-pool), a bidirectional LSTM over the width axis,
    dropout, and a linear projection to ``num_classes`` followed by
    log-softmax. Submodule names (``cnn``, ``rnn``, ``lin_dropout``, ``fc``)
    and layer order determine the state-dict keys and must stay stable so
    existing checkpoints keep loading.
    """

    def __init__(
        self,
        img_height: int = 128,
        num_channels: int = 1,
        num_classes: int = 100,
        cnn_filters: Optional[List[int]] = None,
        cnn_poolsize: Optional[List[int]] = None,
        rnn_hidden: int = 256,
        rnn_layers: int = 3,
        dropout: float = 0.5
    ):
        """
        Args:
            img_height: Height of the input images in pixels
            num_channels: Number of input image channels
            num_classes: Size of the output layer
            cnn_filters: Output channels of each conv block
                (default: [12, 24, 48, 48])
            cnn_poolsize: One entry per conv block; a value > 0 adds a 2x2
                max-pool after that block (default: [2, 2, 0, 2])
            rnn_hidden: Hidden size of the LSTM (per direction)
            rnn_layers: Number of stacked LSTM layers
            dropout: Dropout probability (between LSTM layers and before ``fc``)

        Raises:
            ValueError: if ``cnn_filters`` is empty or its length differs from
                ``cnn_poolsize``.
        """
        super().__init__()

        # Fresh lists per instance: the previous list defaults were shared
        # objects, so mutating self.cnn_poolsize leaked into later instances.
        cnn_filters = [12, 24, 48, 48] if cnn_filters is None else list(cnn_filters)
        cnn_poolsize = [2, 2, 0, 2] if cnn_poolsize is None else list(cnn_poolsize)
        if not cnn_filters or len(cnn_filters) != len(cnn_poolsize):
            raise ValueError(
                "cnn_filters and cnn_poolsize must be non-empty and have the same length "
                f"(got {len(cnn_filters)} and {len(cnn_poolsize)})"
            )

        self.img_height = img_height
        self.num_classes = num_classes
        self.cnn_poolsize = cnn_poolsize

        # CNN layers
        cnn_layers = []
        in_channels = num_channels
        for out_channels, pool in zip(cnn_filters, cnn_poolsize):
            cnn_layers.extend([
                nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, dilation=1),
                nn.BatchNorm2d(out_channels),
                nn.LeakyReLU(0.2, inplace=True)
            ])
            if pool > 0:
                cnn_layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
            in_channels = out_channels

        self.cnn = nn.Sequential(*cnn_layers)

        # Each pooling layer halves the height; the RNN sees channels * height per column
        num_pools = sum(1 for p in cnn_poolsize if p > 0)
        cnn_output_height = img_height // (2 ** num_pools)
        rnn_input_size = cnn_filters[-1] * cnn_output_height

        # Bidirectional LSTM
        self.rnn = nn.LSTM(
            input_size=rnn_input_size,
            hidden_size=rnn_hidden,
            num_layers=rnn_layers,
            dropout=dropout if rnn_layers > 1 else 0,
            bidirectional=True,
            batch_first=False
        )

        self.lin_dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(rnn_hidden * 2, num_classes)

    def forward(self, x):
        """
        Args:
            x: [batch, channels, height, width]

        Returns:
            log_probs: [width, batch, num_classes], where width is the width
            of the CNN feature map (input width reduced by the pooling layers)
        """
        # CNN
        conv = self.cnn(x)

        # Reshape for RNN: one time step per feature-map column
        batch, channels, height, width = conv.size()
        conv = conv.permute(3, 0, 1, 2)  # [width, batch, channels, height]
        conv = conv.reshape(width, batch, channels * height)

        # RNN
        rnn_out, _ = self.rnn(conv)
        rnn_out = self.lin_dropout(rnn_out)

        # Output projection
        output = self.fc(rnn_out)

        # Log softmax for CTC
        return torch.nn.functional.log_softmax(output, dim=2)
106
+
107
+
108
class PyLaiaInference:
    """
    Native PyLaia inference (no WSL dependency).
    Loads PyTorch checkpoint directly and runs inference on Linux.

    After construction the instance exposes ``symbols``, ``char2idx``,
    ``idx2char``, ``model`` (a ``CRNN`` in eval mode), ``device`` and
    ``transform``.
    """

    def __init__(self, checkpoint_path: str, syms_path: str = None, enable_spaces: bool = True):
        """
        Initialize PyLaia inference.

        Args:
            checkpoint_path: Path to .ckpt checkpoint file
            syms_path: Path to symbols file. If None, ``symbols.txt`` / ``syms.txt``
                next to the checkpoint is used, then ``data/pylaia_glagolitic/syms.txt``.
            enable_spaces: If True, convert <space> tokens to actual spaces. If False, keep as <space>.

        Raises:
            FileNotFoundError: if the checkpoint or the symbols file does not exist.
        """
        self.enable_spaces = enable_spaces
        self.checkpoint_path = Path(checkpoint_path)

        if not self.checkpoint_path.exists():
            raise FileNotFoundError(f"Checkpoint not found: {checkpoint_path}")

        # Find symbols file
        if syms_path is None:
            # First: look alongside the checkpoint for symbols.txt or syms.txt
            model_dir = self.checkpoint_path.parent
            for _candidate in ("symbols.txt", "syms.txt"):
                _candidate_path = model_dir / _candidate
                if _candidate_path.exists():
                    syms_path = _candidate_path
                    logger.info(f"Found symbols file alongside checkpoint: {syms_path}")
                    break
        if syms_path is None:
            # Last-resort fallback
            syms_path = Path("data/pylaia_glagolitic/syms.txt")

        self.syms_path = Path(syms_path)
        if not self.syms_path.exists():
            raise FileNotFoundError(f"Symbols file not found: {syms_path}")

        # Load symbols (handle both list and KALDI formats)
        # CRITICAL: Use rstrip('\n\r') not strip() to preserve leading/trailing whitespace in symbols (e.g., TAB)
        with open(self.syms_path, 'r', encoding='utf-8') as f:
            symbols_raw = [line.rstrip('\n\r') for line in f if line.rstrip('\n\r')]

        self.symbols = self._parse_symbols(symbols_raw)

        # Remove <ctc> token if present (CTC blank is handled separately as index 0)
        if self.symbols and self.symbols[0] == '<ctc>':
            self.symbols = self.symbols[1:]
            logger.info("Removed <ctc> token from vocabulary (using index 0 for CTC blank)")

        # Create char-to-index mapping (0 reserved for CTC blank)
        self.char2idx = {char: idx + 1 for idx, char in enumerate(self.symbols)}
        self.idx2char = {idx: char for char, idx in self.char2idx.items()}
        self.idx2char[0] = ''  # CTC blank

        # Map <SPACE> or <space> to actual space (if enabled)
        if self.enable_spaces:
            if '<SPACE>' in self.char2idx:
                self.idx2char[self.char2idx['<SPACE>']] = ' '
            elif '<space>' in self.char2idx:
                self.idx2char[self.char2idx['<space>']] = ' '

        # Load checkpoint
        logger.info(f"Loading PyLaia checkpoint: {checkpoint_path}")
        # SECURITY NOTE(review): weights_only=False unpickles arbitrary objects and
        # can execute code embedded in the file. Only load checkpoints from trusted
        # sources; switching to weights_only=True needs confirmation that the
        # checkpoints contain nothing but tensors and plain containers.
        checkpoint = torch.load(self.checkpoint_path, map_location='cpu', weights_only=False)

        # CRITICAL: If checkpoint has idx2char, use it instead of vocabulary file
        # This handles models trained with different vocabulary parsing (strip vs rstrip)
        if 'idx2char' in checkpoint:
            logger.info(f"Using idx2char from checkpoint ({len(checkpoint['idx2char'])} characters)")
            self.idx2char = checkpoint['idx2char']
            self.char2idx = checkpoint.get('char2idx', {char: idx for idx, char in self.idx2char.items()})
            # Still apply enable_spaces setting
            if self.enable_spaces:
                for idx, char in list(self.idx2char.items()):
                    if char == '<SPACE>' or char == '<space>':
                        self.idx2char[idx] = ' '

        # Extract model state dict from checkpoint
        # train_pylaia.py saves checkpoints with 'model_state_dict' key
        state_dict = checkpoint.get('model_state_dict', checkpoint.get('state_dict', checkpoint))

        # Infer number of classes from checkpoint (fc.weight shape is [num_classes, rnn_hidden*2])
        num_classes = state_dict['fc.weight'].shape[0]

        logger.info(f"Inferred {num_classes} output classes from checkpoint")
        logger.info(f"Vocabulary has {len(self.symbols)} symbols (+ 1 blank = {len(self.symbols)+1} expected)")

        # Initialize model
        self.model = CRNN(
            img_height=128,
            num_channels=1,
            num_classes=num_classes,
            cnn_filters=[12, 24, 48, 48],
            cnn_poolsize=[2, 2, 0, 2],
            rnn_hidden=256,
            rnn_layers=3,
            dropout=0.5
        )

        # Load weights
        self.model.load_state_dict(state_dict, strict=True)

        # Set device
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = self.model.to(self.device)
        self.model.eval()

        # Image preprocessing (same as training)
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5], std=[0.5])
        ])

        logger.info(f"Loaded PyLaia model with {num_classes} output classes")
        logger.info(f"Using device: {self.device}")

    @staticmethod
    def _parse_symbols(symbols_raw: List[str]) -> List[str]:
        """
        Turn the raw lines of a symbols file into the ordered symbol list.

        Two layouts are recognized, decided from the first line only:
        KALDI ("symbol index" per line) and plain list (one symbol per line).
        For KALDI files the symbol is everything before the trailing index and
        its single separating space/tab, so whitespace symbols (e.g. TAB)
        survive. Lines without a numeric last token are skipped.
        """
        if not symbols_raw or ' ' not in symbols_raw[0]:
            return symbols_raw  # List format (one symbol per line)

        first_parts = symbols_raw[0].split()
        if not (len(first_parts) == 2 and first_parts[1].isdigit()):
            return symbols_raw  # List format (one symbol per line)

        symbols = []
        for line in symbols_raw:
            tokens = line.split()
            # A whitespace-only line has no tokens at all; indexing tokens[-1]
            # unguarded used to raise IndexError here.
            if not tokens or not tokens[-1].isdigit():
                continue
            trimmed = line.rstrip()
            head = trimmed[:len(trimmed) - len(tokens[-1])]
            # Drop exactly one separator character (space or tab) before the index.
            if head[-1:] in (' ', '\t'):
                head = head[:-1]
            symbols.append(head)
        logger.info("Detected KALDI format vocabulary")
        return symbols

    def preprocess_image(self, image: Image.Image) -> torch.Tensor:
        """
        Preprocess image for inference.

        Converts to grayscale, rescales to a height of 128 pixels while keeping
        the aspect ratio, then applies ``self.transform``.

        Args:
            image: PIL Image (RGB or grayscale)

        Returns:
            Preprocessed tensor [1, 1, height, width]
        """
        # Convert to grayscale
        if image.mode != 'L':
            image = image.convert('L')

        # Resize to target height (128) while preserving aspect ratio
        target_height = 128
        aspect_ratio = image.width / image.height
        new_width = int(target_height * aspect_ratio)
        image = image.resize((new_width, target_height), Image.LANCZOS)

        # Apply transforms
        img_tensor = self.transform(image)  # [1, height, width]
        return img_tensor.unsqueeze(0)  # [1, 1, height, width]

    def decode_ctc(self, log_probs: torch.Tensor) -> Tuple[str, float]:
        """
        Decode CTC output using greedy decoding.

        Takes the most likely class at every time step, collapses consecutive
        repeats, drops blanks (index 0) and maps the remaining indices through
        ``self.idx2char``. A blank between two identical indices separates
        them, so both are emitted.

        Args:
            log_probs: [seq_len, 1, num_classes]

        Returns:
            Tuple of (decoded_text, confidence), where confidence is the mean
            probability of the emitted characters (0.0 if none were emitted)
        """
        probs = torch.exp(log_probs)
        best_per_step = probs.argmax(dim=2).squeeze(1).tolist()  # [seq_len]

        decoded_chars = []
        confidences = []
        prev_idx = -1

        for t, idx in enumerate(best_per_step):
            if idx != 0 and idx != prev_idx:  # Not blank and not duplicate
                char = self.idx2char.get(idx, '')
                if char:
                    decoded_chars.append(char)
                    # Probability of the chosen class at this time step
                    confidences.append(probs[t, 0, idx].item())
            # Track every step (including blanks) so a blank resets the
            # duplicate check.
            prev_idx = idx

        text = ''.join(decoded_chars)
        confidence = sum(confidences) / len(confidences) if confidences else 0.0
        return text, confidence

    def transcribe(self, image: Image.Image) -> Tuple[str, float]:
        """
        Transcribe a single line image.

        Best-effort: any exception raised during preprocessing, the forward
        pass or decoding is logged (with traceback) and swallowed.

        Args:
            image: PIL Image of text line

        Returns:
            Tuple of (transcription_text, confidence_score); ("", 0.0) on error
        """
        try:
            # Preprocess
            img_tensor = self.preprocess_image(image).to(self.device)

            # Forward pass
            with torch.no_grad():
                log_probs = self.model(img_tensor)  # [width, 1, num_classes]

            # Decode
            return self.decode_ctc(log_probs)

        except Exception as e:
            # logger.exception records the traceback through the logging system
            # instead of printing it straight to stderr.
            logger.exception(f"Error during PyLaia inference: {e}")
            return "", 0.0
342
+
343
+
344
# Model registry (updated for trained models)
# Maps a display name to a dict with:
#   "checkpoint"  - path to the model's best_model.pt
#   "syms"        - path to the symbols file used with that checkpoint
#   "description" - human-readable description
# The registry is mutated at import time: _register_hf_space_demo_models()
# replaces it when POLYSCRIPTOR_DEMO_MODE is "hf_space", and
# _scan_pylaia_models() appends checkpoints discovered under models/.
PYLAIA_MODELS = {
    "Church Slavonic (2.89% CER)": {
        "checkpoint": "models/pylaia_church_slavonic_20251103_222215/best_model.pt",
        "syms": "models/pylaia_church_slavonic_20251103_222215/symbols.txt",
        "description": "PyLaia CRNN - Church Slavonic manuscript (2.89% CER)"
    },
    "Prosta Mova (3.77% CER)": {
        "checkpoint": "models/pylaia_prosta_mova_v4_20251121_155322/best_model.pt",
        "syms": "models/pylaia_prosta_mova_v4_20251121_155322/symbols.txt",
        "description": "PyLaia CRNN - Prosta Mova (3.77% CER)"
    },
    # NOTE(review): this is the only entry whose symbols file lives under data/
    # rather than next to the checkpoint - confirm this is intentional.
    "Glagolitic (5.33% CER)": {
        "checkpoint": "models/pylaia_glagolitic_with_spaces_20251102_182103/best_model.pt",
        "syms": "data/pylaia_glagolitic/syms.txt",
        "description": "PyLaia CRNN - Glagolitic manuscript (76 symbols, 5.33% CER)"
    },
    "Ukrainian (4.76% CER)": {
        "checkpoint": "models/pylaia_ukrainian_v2c_20251124_180634/best_model.pt",
        "syms": "models/pylaia_ukrainian_v2c_20251124_180634/symbols.txt",
        "description": "PyLaia CRNN - Ukrainian manuscript (4.76% CER)"
    },
    "Ukrainian (13.53% CER - OLD)": {
        "checkpoint": "models/pylaia_ukrainian_retrain_20251102_213431/best_model.pt",
        "syms": "models/pylaia_ukrainian_retrain_20251102_213431/symbols.txt",
        "description": "PyLaia CRNN - Ukrainian manuscript (180 symbols, 13.53% CER)"
    },
    "Glagolitic (old)": {
        "checkpoint": "models/pylaia_glagolitic_single_gpu/best_model.pt",
        "syms": "models/pylaia_glagolitic_single_gpu/symbols.txt",
        "description": "PyLaia model - old Glagolitic training (no spaces)"
    }
}
377
+
378
+
379
def _register_hf_space_demo_models() -> None:
    """Swap the registry for the public Hugging Face CRNN-CTC presets in hosted demo mode.

    Does nothing unless the POLYSCRIPTOR_DEMO_MODE environment variable equals
    "hf_space". Otherwise PYLAIA_MODELS is emptied and refilled with one entry
    per hosted model, each carrying "repo_id", "checkpoint", "syms" and
    "description".
    """
    if os.environ.get("POLYSCRIPTOR_DEMO_MODE") != "hf_space":
        return

    # (language name, reported CER, repository slug) for each hosted model,
    # in the order they are inserted into the registry.
    hosted_presets = (
        ("Ukrainian", "4.76", "ukrainian"),
        ("Prosta Mova", "3.77", "prosta-mova"),
        ("Church Slavonic", "2.89", "church-slavonic"),
        ("Glagolitic", "5.33", "glagolitic"),
    )

    PYLAIA_MODELS.clear()
    for language, cer, slug in hosted_presets:
        PYLAIA_MODELS[f"{language} (HF, {cer}% CER)"] = {
            "repo_id": f"achimrabus/crnn-ctc-{slug}",
            "checkpoint": "best_model.pt",
            "syms": "symbols.txt",
            "description": f"Public Hugging Face CRNN-CTC model for {language} HTR",
        }
410
+
411
+
412
def _scan_pylaia_models(models_dir: str = "models") -> None:
    """Register CRNN-CTC checkpoints found on disk that the registry does not know yet.

    Every direct subdirectory of ``models_dir`` that contains ``best_model.pt``
    is added to PYLAIA_MODELS under its folder name, unless that checkpoint
    path or that folder name is already registered. The first of
    ``symbols.txt`` / ``syms.txt`` found in the same folder becomes the
    entry's symbols file (None when neither exists). Returns silently when
    ``models_dir`` is not a directory.
    """
    root = Path(models_dir)
    if not root.is_dir():
        return

    # Checkpoint paths already present in the registry (entries may be dicts
    # or bare path-like values).
    known_checkpoints = set()
    for entry in PYLAIA_MODELS.values():
        source = entry["checkpoint"] if isinstance(entry, dict) else entry
        known_checkpoints.add(str(Path(source)))

    for found in sorted(root.glob("*/best_model.pt")):
        found_str = str(found)
        if found_str in known_checkpoints:
            continue

        folder = found.parent
        display_name = folder.name
        if display_name in PYLAIA_MODELS:
            continue

        symbol_files = (folder / filename for filename in ("symbols.txt", "syms.txt"))
        syms_file = next((str(path) for path in symbol_files if path.exists()), None)

        PYLAIA_MODELS[display_name] = {
            "checkpoint": found_str,
            "syms": syms_file,
            "description": f"CRNN-CTC model (auto-discovered): {display_name}",
        }
        logger.debug(f"Auto-discovered CRNN-CTC model: {display_name}")
449
+
450
+
451
+ # Populate registry with any models not hard-coded above
452
+ _register_hf_space_demo_models()
453
+ _scan_pylaia_models()
kraken_segmenter.py ADDED
@@ -0,0 +1,823 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Kraken-based line segmentation for historical document OCR.
3
+
4
+ This module provides an alternative to the classical HPP (Horizontal Projection Profile)
5
+ segmentation using Kraken's pre-trained neural models.
6
+
7
+ Supports two modes:
8
+ - Classical: pageseg.segment() — fast, lines only, no regions
9
+ - Neural (blla): blla.segment() — GPU-accelerated, returns regions AND baselines,
10
+ handles multi-column layouts
11
+ """
12
+
13
+ import os
14
+ import time
15
+ from dataclasses import dataclass, field
16
+ from typing import Any, List, Optional, NamedTuple, Tuple, Dict
17
+ from PIL import Image
18
+ import numpy as np
19
+
20
+ # Module-level cache: maps model path -> loaded TorchVGSLModel.
21
+ # Shared across all KrakenLineSegmenter instances so that the model is loaded
22
+ # from disk only once per process, even in batch processing loops.
23
+ _MODEL_CACHE: Dict[str, Any] = {}
24
+
25
+
26
class LineSegment(NamedTuple):
    """Represents a segmented text line.

    Immutable record (NamedTuple); ``baseline`` is optional and defaults to None.
    """
    # PIL image associated with this line
    image: Image.Image
    bbox: tuple  # (x1, y1, x2, y2)
    baseline: Optional[List[tuple]] = None  # List of (x, y) points
31
+
32
+
33
@dataclass
class SegRegion:
    """Represents a detected text region (column, marginalia, etc.)."""
    # Identifier of the region
    id: str
    bbox: Tuple[int, int, int, int]  # (x1, y1, x2, y2)
    # Identifiers of the lines assigned to this region; a fresh empty list per
    # instance (default_factory avoids a shared mutable default)
    line_ids: List[str] = field(default_factory=list)
    polygon: Optional[List[Tuple[int, int]]] = None  # Convex hull or neural polygon
    mode: str = "neural"  # "neural" or "classical"
41
+
42
+
43
+ class KrakenLineSegmenter:
44
+ """
45
+ Line segmentation using Kraken with pre-trained models.
46
+
47
+ Kraken is specifically designed for historical document OCR and provides:
48
+ - Pre-trained models that work out-of-the-box
49
+ - Baseline detection (not just bounding boxes)
50
+ - Robust handling of degraded/faded text
51
+ - Support for rotated and multi-column layouts
52
+
53
+ Performance: ~3-8s per page (CPU), ~1-3s (GPU)
54
+ Accuracy: 90-95% on historical documents
55
+ """
56
+
57
+ def __init__(self, model_path: Optional[str] = None, device: str = "cpu"):
58
+ """
59
+ Initialize Kraken segmenter.
60
+
61
+ Args:
62
+ model_path: Path to custom segmentation model (.mlmodel file).
63
+ Note: Kraken 5.x uses classical segmentation by default.
64
+ Neural baseline segmentation requires additional setup.
65
+ device: 'cpu' or 'cuda' for GPU acceleration (not used by classical segmenter)
66
+ """
67
+ self.model_path = model_path
68
+ self.device = device
69
+
70
+ # Import kraken components
71
+ try:
72
+ from kraken import binarization, pageseg
73
+ self.binarization = binarization
74
+ self.pageseg = pageseg
75
+ except ImportError as e:
76
+ raise ImportError(
77
+ "Kraken is not installed. Install it with: pip install kraken\n"
78
+ f"Original error: {e}"
79
+ )
80
+
81
+ # Note: model_path is currently not used as pageseg.segment() doesn't accept models
82
+ # The classical segmentation algorithm is robust and works well for most documents
83
+ if model_path:
84
+ print(f"[KrakenSegmenter] Warning: Custom model path provided but not used.")
85
+ print(f"[KrakenSegmenter] Kraken 5.x pageseg.segment() uses classical algorithm.")
86
+ print(f"[KrakenSegmenter] Neural baseline segmentation requires kraken.lib.models workflow.")
87
+
88
def segment_lines(
    self,
    image: Image.Image,
    text_direction: str = 'horizontal-lr',
    use_binarization: bool = True
) -> List[LineSegment]:
    """
    Segment image into text lines using Kraken's classical segmenter.

    Args:
        image: PIL Image to segment. Anything that is not already 'L' or
            '1' is converted to grayscale first.
        text_direction: Text direction - 'horizontal-lr' (left-to-right),
            'horizontal-rl', 'vertical-lr', 'vertical-rl'
        use_binarization: If True, binarize with Kraken's ``nlbin``
            (non-linear binarization, recommended for degraded
            documents). If False, binarize with a global Otsu threshold.

    Returns:
        List of LineSegment objects sorted top to bottom. Line crops are
        taken from the grayscale page, not from the binarized image.
        An empty list is returned if segmentation fails; the error and
        traceback are printed rather than raised.
    """
    print(f"[KrakenSegmenter] Segmenting image (size={image.size}, mode={image.mode}, "
          f"direction={text_direction}, binarize={use_binarization})")

    try:
        # Step 0: Convert to grayscale if needed (Kraken works better with grayscale)
        if image.mode not in ('L', '1'):
            print(f"[KrakenSegmenter] Converting from {image.mode} to grayscale...")
            image = image.convert('L')

        # Step 1: Binarize -- pageseg.segment() requires a binary image.
        if use_binarization:
            print(f"[KrakenSegmenter] Applying neural binarization...")
            processed_img = self.binarization.nlbin(image)
        else:
            print(f"[KrakenSegmenter] Applying Otsu binarization...")
            # Otsu's method: pick the grey level that maximises the
            # between-class variance of the histogram. (A median threshold
            # would cut straight through the background of a mostly-blank
            # page and produce salt-and-pepper noise.)
            gray = np.asarray(image.convert('L'), dtype=np.uint8)
            hist = np.bincount(gray.ravel(), minlength=256).astype(np.float64)
            levels = np.arange(256, dtype=np.float64)
            weight_lo = np.cumsum(hist)            # pixels with value <= t
            weight_hi = weight_lo[-1] - weight_lo  # pixels with value > t
            mass_lo = np.cumsum(hist * levels)
            with np.errstate(divide='ignore', invalid='ignore'):
                mean_lo = mass_lo / weight_lo
                mean_hi = (mass_lo[-1] - mass_lo) / weight_hi
                between = weight_lo * weight_hi * (mean_lo - mean_hi) ** 2
            # Empty classes give NaN; treat them as "no separation".
            threshold = int(np.argmax(np.nan_to_num(between)))
            binary = gray > threshold
            processed_img = Image.fromarray((binary * 255).astype(np.uint8))

        # Step 2: Line segmentation using Kraken's classical algorithm
        print(f"[KrakenSegmenter] Running line segmentation...")
        seg_result = self.pageseg.segment(
            processed_img,
            text_direction=text_direction
        )

        # Handle both dict (old Kraken) and Segmentation object (new Kraken)
        if isinstance(seg_result, dict):
            print(f"[KrakenSegmenter] pageseg.segment returned dict (old Kraken API)")
            seg_lines = seg_result.get('boxes', seg_result.get('lines', []))
        else:
            print(f"[KrakenSegmenter] pageseg.segment returned Segmentation object")
            seg_lines = seg_result.lines

        print(f"[KrakenSegmenter] Processing {len(seg_lines)} lines...")

        # Step 3: Extract line information
        lines = []
        for line in seg_lines:
            # New API: line objects expose .bbox. Old dict API: each entry
            # is either a plain (x1, y1, x2, y2) box or a dict.
            raw_bbox = getattr(line, 'bbox', None)
            if raw_bbox is None:
                raw_bbox = line.get('bbox') if isinstance(line, dict) else line
            try:
                bbox = tuple(int(v) for v in raw_bbox)
            except (TypeError, ValueError):
                bbox = ()
            if len(bbox) != 4:
                print(f"[KrakenSegmenter] Skipping line without usable bbox: {line!r}")
                continue

            # Baseline (list of (x, y) points) when the line object has one
            baseline = getattr(line, 'baseline', None)

            # Crop from the grayscale page rather than the binarized image
            line_img = image.crop(bbox)

            lines.append(LineSegment(
                image=line_img,
                bbox=bbox,
                baseline=baseline
            ))

        # Sort lines top to bottom by Y coordinate
        lines = sorted(lines, key=lambda x: x.bbox[1])

        print(f"[KrakenSegmenter] Detected {len(lines)} lines")

        return lines

    except Exception as e:
        # Deliberate best-effort: report the failure and return no lines
        # so that callers can fall back or continue with the next page.
        print(f"[KrakenSegmenter] ERROR: Segmentation failed: {e}")
        import traceback
        traceback.print_exc()
        return []
181
+
182
def segment_with_regions(
    self,
    image: Image.Image,
    model_path: Optional[str] = None,
    device: Optional[str] = None,
    min_line_height: int = 8,
    max_columns: int = 4,
    split_width_fraction: float = 0.40,
    min_lines_to_split: int = 10,
    text_direction: str = 'horizontal-lr',
) -> Tuple[List[SegRegion], List[LineSegment]]:
    """
    Neural baseline segmentation using blla.segment().

    Returns regions AND lines with baselines. Handles multi-column layouts
    by using blla's region detection, with a column-clustering pass that
    sub-splits wide regions holding at least *min_lines_to_split* lines.

    Falls back to classical pageseg.segment() + column clustering if blla
    fails, returns no regions, or the model file is missing.

    Args:
        image: PIL Image to segment (RGB or grayscale)
        model_path: Path to blla .mlmodel file. Defaults to
                    ``pagexml/blla.mlmodel`` relative to this script.
        device: 'cpu' or 'cuda' / 'cuda:0'. Defaults to self.device.
        min_line_height: Discard lines shorter than this (pixels).
        max_columns: Maximum number of columns to detect per region.
                     Also applied by the classical fallback.
        split_width_fraction: Minimum region width as fraction of page width
                    to trigger sub-column splitting (0.0-1.0). Lower values
                    split narrower regions. Default 0.40 (40%).
                    For landscape double-page spreads, try 0.20 (20%).
        min_lines_to_split: Minimum number of lines in a region before
                    attempting to split it into sub-columns.
        text_direction: Kraken text direction, e.g. 'horizontal-lr' or
                    'horizontal-rl'. Passed to the neural path only; the
                    classical fallback uses its own default.

    Returns:
        (regions, lines). Lines are grouped region by region; each
        ``SegRegion.line_ids`` lists the lines that region owns.
    """
    device = device or self.device
    if model_path is None:
        model_path = os.path.join(os.path.dirname(__file__), 'pagexml', 'blla.mlmodel')

    print(f"[KrakenSegmenter] Neural segmentation (blla) on {image.size}, device={device}")

    # ── Try neural (blla) first ──────────────────────────────────
    if os.path.isfile(model_path):
        try:
            regions, lines = self._segment_neural(
                image, model_path, device, min_line_height,
                max_columns=max_columns,
                split_width_fraction=split_width_fraction,
                min_lines_to_split=min_lines_to_split,
                text_direction=text_direction,
            )
            if regions:
                print(f"[KrakenSegmenter] blla: {len(regions)} regions, {len(lines)} lines")
                return regions, lines
            print("[KrakenSegmenter] blla returned no regions; falling back to classical + clustering")
        except Exception as e:
            # Any blla failure is non-fatal: report it and use the fallback.
            print(f"[KrakenSegmenter] blla failed ({e}); falling back to classical + clustering")
            import traceback
            traceback.print_exc()
    else:
        print(f"[KrakenSegmenter] blla model not found at {model_path}; using classical fallback")

    # ── Fallback: classical pageseg + column clustering ──────────
    # Forward max_columns so the caller's column limit also applies here.
    return self._segment_classical_with_regions(image, min_line_height, max_columns)
250
+
251
+ # ── internal: neural blla ────────────────────────────────────────
252
+
253
def _segment_neural(
    self,
    image: Image.Image,
    model_path: str,
    device: str,
    min_line_height: int,
    max_columns: int = 4,
    split_width_fraction: float = 0.40,
    min_lines_to_split: int = 10,
    text_direction: str = 'horizontal-lr',
) -> Tuple[List[SegRegion], List[LineSegment]]:
    """
    Run blla.segment() and build SegRegion / LineSegment lists.

    Args:
        image: Page image; converted to RGB for blla, crops are taken
            from the image as passed in.
        model_path: Path of the blla model to load (cached per device).
        device: Requested torch device; replaced by 'cpu' when CUDA is
            requested but unavailable.
        min_line_height: Lines whose (clipped) height is below this are dropped.
        max_columns, split_width_fraction, min_lines_to_split: Forwarded
            to _split_wide_regions(); max_columns also bounds the number
            of splitting rounds.
        text_direction: Forwarded to blla and used for region ordering.

    Returns:
        (regions, ordered_lines) as produced by _build_regions().
    """
    from kraken import blla
    from kraken.lib import vgsl
    import torch

    started_at = time.time()

    # Validate device
    if device.startswith('cuda') and not torch.cuda.is_available():
        print(f"[KrakenSegmenter] WARNING: device={device} but CUDA not available, falling back to cpu")
        device = 'cpu'

    # One cached model per (path, device): a CPU and a CUDA caller must
    # not share the same already-placed model object.
    cache_key = (model_path, device)
    if cache_key not in _MODEL_CACHE:
        print(f"[KrakenSegmenter] Loading blla model: {model_path}")
        loaded = vgsl.TorchVGSLModel.load_model(model_path)
        # blla.segment()'s device= parameter does NOT move the model,
        # so it is placed on the target device here.
        loaded.nn.to(device)
        _MODEL_CACHE[cache_key] = loaded
    model = _MODEL_CACHE[cache_key]

    # Diagnostic only: report where the model parameters actually live.
    try:
        actual_device = next(model.nn.parameters()).device
        print(f"[KrakenSegmenter] blla model on: {actual_device} (requested: {device})")
        if device.startswith('cuda') and actual_device.type != 'cuda':
            print(f"[KrakenSegmenter] WARNING: model is on {actual_device}, not GPU")
    except Exception:
        print(f"[KrakenSegmenter] blla running on device={device}")

    # blla wants RGB
    rgb_page = image if image.mode == 'RGB' else image.convert('RGB')

    # Autocast (fp16 forward pass) is only enabled on CUDA.
    baseline_seg = blla.segment(rgb_page, model=model, device=device,
                                autocast=device.startswith('cuda'),
                                text_direction=text_direction)

    page_w = image.size[0]
    seg_lines: List[LineSegment] = []
    # region key -> {'lines': [(index into seg_lines, LineSegment)], 'blla_region': ...}
    regions_dict: Dict[str, dict] = {}

    # Region rectangles are used to clip/split lines that blla drew
    # across several columns at the same vertical position.
    blla_boxes = self._extract_blla_region_boxes(baseline_seg, text_direction)
    if blla_boxes:
        print(f"[KrakenSegmenter] blla detected {len(blla_boxes)} text regions "
              f"— will clip lines to region boundaries")

    for line in baseline_seg.lines:
        bbox = self._extract_bbox(line)
        if bbox is None:
            continue

        baseline = None
        if hasattr(line, 'baseline') and line.baseline:
            baseline = [(int(p[0]), int(p[1])) for p in line.baseline]

        hits = self._overlapping_blla_boxes(bbox, blla_boxes) if blla_boxes else []

        if hits:
            # One clipped piece per overlapping region.
            pieces = [
                (
                    (max(bbox[0], rx1), max(bbox[1], ry1),
                     min(bbox[2], rx2), min(bbox[3], ry2)),
                    region_key,
                    region_obj,
                )
                for rx1, ry1, rx2, ry2, region_obj, region_key in hits
            ]
        else:
            # No region overlap (or no regions at all): keep the bbox
            # and assign the region by tag / centre point.
            region_id, blla_region = self._find_region_for_line(
                bbox, line, baseline_seg
            )
            pieces = [(bbox, region_id, blla_region)]

        for piece_bbox, region_key, region_obj in pieces:
            left, top, right, bottom = piece_bbox
            # Skip empty pieces and pieces that are too short after clipping.
            if right <= left or bottom <= top or (bottom - top) < min_line_height:
                continue

            seg_line = LineSegment(image=image.crop(piece_bbox),
                                   bbox=piece_bbox, baseline=baseline)
            seg_lines.append(seg_line)

            bucket = regions_dict.setdefault(
                region_key, {'lines': [], 'blla_region': region_obj}
            )
            bucket['lines'].append((len(seg_lines) - 1, seg_line))

    # Sub-split wide regions that likely contain several columns. Repeat
    # until a round produces no new region, at most max_columns rounds.
    for _ in range(max_columns):
        count_before = len(regions_dict)
        regions_dict = self._split_wide_regions(
            regions_dict, page_w,
            min_lines_to_split=min_lines_to_split,
            split_width_fraction=split_width_fraction,
            max_columns=max_columns,
        )
        if len(regions_dict) == count_before:
            break  # no new splits — converged

    # Build SegRegion objects
    regions, ordered_lines = self._build_regions(regions_dict, seg_lines, page_w,
                                                 text_direction=text_direction)

    elapsed = time.time() - started_at
    print(f"[KrakenSegmenter] blla completed in {elapsed:.2f}s")
    return regions, ordered_lines
394
+
395
+ # ── internal: classical fallback with column clustering ──────────
396
+
397
def segment_classical_with_regions(
    self,
    image: Image.Image,
    min_line_height: int = 15,
    max_columns: int = 4,
) -> Tuple[List[SegRegion], List[LineSegment]]:
    """
    Public entry point for classical pageseg + heuristic column clustering.

    Args:
        image: Page image to segment.
        min_line_height: Lines shorter than this (pixels) are discarded.
        max_columns: Upper bound on the number of columns to detect.

    Returns:
        (regions, lines) exactly as returned by the private implementation.
    """
    regions, lines = self._segment_classical_with_regions(
        image, min_line_height, max_columns
    )
    return regions, lines
405
+
406
def _segment_classical_with_regions(
    self,
    image: Image.Image,
    min_line_height: int = 15,
    max_columns: int = 4,
) -> Tuple[List[SegRegion], List[LineSegment]]:
    """
    Classical pageseg + heuristic column clustering.

    Runs segment_lines() with its defaults, drops lines shorter than
    *min_line_height*, clusters the rest into at most *max_columns*
    columns and marks every resulting region as "classical".

    Returns:
        (regions, ordered_lines); two empty lists when no line was found.
    """
    detected = self.segment_lines(image)
    if not detected:
        return [], []

    # Keep only lines that are tall enough to be real text
    kept = [
        seg for seg in detected
        if (seg.bbox[3] - seg.bbox[1]) >= min_line_height
    ]

    page_w = image.size[0]
    # max_columns is forwarded so that wide multi-column spreads are handled
    columns = self._cluster_into_columns(kept, page_w, max_columns=max_columns)
    regions, ordered_lines = self._build_regions(columns, kept, page_w)

    # _build_regions() leaves the SegRegion default mode ("neural") in place
    for region in regions:
        region.mode = "classical"
    return regions, ordered_lines
427
+
428
+ # ── helpers ───────────────────────────────────────────────────────
429
+
430
+ @staticmethod
431
+ def _extract_bbox(line) -> Optional[Tuple[int, int, int, int]]:
432
+ """Extract (x1,y1,x2,y2) bbox from a blla line object."""
433
+ if hasattr(line, 'bbox'):
434
+ return tuple(int(v) for v in line.bbox)
435
+ if hasattr(line, 'baseline') and line.baseline:
436
+ xs = [p[0] for p in line.baseline]
437
+ ys = [p[1] for p in line.baseline]
438
+ avg_h = 30
439
+ return (int(min(xs)), int(min(ys) - avg_h // 2),
440
+ int(max(xs)), int(max(ys) + avg_h // 2))
441
+ return None
442
+
443
+ @staticmethod
444
+ def _find_region_for_line(bbox, line, baseline_seg) -> Tuple[str, object]:
445
+ """Determine which blla region a line belongs to."""
446
+ # Check tags first
447
+ if hasattr(line, 'tags') and isinstance(line.tags, dict):
448
+ rtype = line.tags.get('type')
449
+ if rtype and isinstance(rtype, str):
450
+ return rtype, None
451
+
452
+ # Check region boundaries
453
+ if hasattr(baseline_seg, 'regions') and baseline_seg.regions:
454
+ cx = (bbox[0] + bbox[2]) // 2
455
+ cy = (bbox[1] + bbox[3]) // 2
456
+ for rtype, region_list in baseline_seg.regions.items():
457
+ for ri, region in enumerate(region_list):
458
+ if hasattr(region, 'boundary') and region.boundary:
459
+ bxs = [p[0] for p in region.boundary]
460
+ bys = [p[1] for p in region.boundary]
461
+ if (min(bxs) <= cx <= max(bxs) and
462
+ min(bys) <= cy <= max(bys)):
463
+ return f"{rtype}_{ri}", region
464
+
465
+ return 'r_1', None
466
+
467
+ @staticmethod
468
+ def _extract_blla_region_boxes(
469
+ baseline_seg,
470
+ text_direction: str = 'horizontal-lr',
471
+ ) -> List[Tuple[int, int, int, int, object, str]]:
472
+ """
473
+ Build a sorted list of (x1, y1, x2, y2, region_obj, region_key) tuples
474
+ from blla's detected regions. Used to clip / split lines that cross
475
+ column boundaries. Returns an empty list when no region boundaries are
476
+ available.
477
+ """
478
+ boxes: List[Tuple[int, int, int, int, object, str]] = []
479
+ if not (hasattr(baseline_seg, 'regions') and baseline_seg.regions):
480
+ return boxes
481
+ for rtype, region_list in baseline_seg.regions.items():
482
+ for ri, region in enumerate(region_list):
483
+ if not (hasattr(region, 'boundary') and region.boundary):
484
+ continue
485
+ bxs = [p[0] for p in region.boundary]
486
+ bys = [p[1] for p in region.boundary]
487
+ boxes.append((
488
+ int(min(bxs)), int(min(bys)),
489
+ int(max(bxs)), int(max(bys)),
490
+ region, f"{rtype}_{ri}",
491
+ ))
492
+ rtl = text_direction.endswith('-rl')
493
+ boxes.sort(key=lambda t: t[0], reverse=rtl)
494
+ return boxes
495
+
496
+ @staticmethod
497
+ def _overlapping_blla_boxes(
498
+ bbox: Tuple[int, int, int, int],
499
+ blla_boxes: List[Tuple[int, int, int, int, object, str]],
500
+ ) -> List[Tuple[int, int, int, int, object, str]]:
501
+ """
502
+ Return the blla region boxes whose bbox overlaps with *bbox*.
503
+ Overlap requires intersection in both x and y.
504
+ """
505
+ x1, y1, x2, y2 = bbox
506
+ result = []
507
+ for rb in blla_boxes:
508
+ rx1, ry1, rx2, ry2 = rb[0], rb[1], rb[2], rb[3]
509
+ if rx1 < x2 and rx2 > x1 and ry1 < y2 and ry2 > y1:
510
+ result.append(rb)
511
+ return result
512
+
513
+ @staticmethod
514
+ def _estimate_columns(
515
+ lines: list,
516
+ page_w: int,
517
+ max_columns: int = 4,
518
+ min_gap_fraction: float = 0.03,
519
+ ) -> List[int]:
520
+ """
521
+ Gap-based column clustering.
522
+
523
+ Finds natural breaks in the x-center distribution by looking for the
524
+ largest gaps in the sorted sequence of line x-centers. This is more
525
+ robust than histogram peak-finding for closely spaced columns, because
526
+ a column gap is a region with *no* line centers — it shows up as a large
527
+ jump in the sorted sequence regardless of how close the columns are.
528
+
529
+ Args:
530
+ lines: List of LineSegment objects.
531
+ page_w: Width of the region being analysed (pixels).
532
+ max_columns: Maximum number of columns to return (≥1).
533
+ min_gap_fraction: Minimum gap size as a fraction of *page_w* to be
534
+ considered a column boundary. Default 0.03 (3%).
535
+ Increase if spurious splits occur within a column.
536
+ """
537
+ if not lines:
538
+ return []
539
+
540
+ # Lines wider than 60% of the region are likely headers/footers that
541
+ # span columns — exclude them from clustering to avoid false splits.
542
+ orig_centers = [((l.bbox[0] + l.bbox[2]) // 2) for l in lines]
543
+ line_widths = [(l.bbox[2] - l.bbox[0]) for l in lines]
544
+ clustering_centers = [
545
+ cx for cx, w in zip(orig_centers, line_widths)
546
+ if w < 0.60 * page_w
547
+ ]
548
+
549
+ if not clustering_centers:
550
+ # All lines are wide (e.g. single full-width text block)
551
+ return [0] * len(lines)
552
+
553
+ min_gap_px = max(10, int(min_gap_fraction * page_w))
554
+ sorted_cx = sorted(clustering_centers)
555
+
556
+ # Compute gaps between consecutive sorted x-centers
557
+ gaps = [
558
+ (sorted_cx[i + 1] - sorted_cx[i], (sorted_cx[i] + sorted_cx[i + 1]) // 2)
559
+ for i in range(len(sorted_cx) - 1)
560
+ if sorted_cx[i + 1] - sorted_cx[i] >= min_gap_px
561
+ ]
562
+
563
+ if not gaps:
564
+ return [0] * len(lines)
565
+
566
+ # Take the largest max_columns-1 gaps as column boundaries
567
+ split_midpoints = sorted(
568
+ mid for _, mid in sorted(gaps, reverse=True)[: max_columns - 1]
569
+ )
570
+
571
+ # Assign each line (using original center) to a column
572
+ assignments = []
573
+ for cx in orig_centers:
574
+ col = sum(1 for sp in split_midpoints if cx > sp)
575
+ assignments.append(col)
576
+
577
+ return assignments
578
+
579
def _split_wide_regions(
    self,
    regions_dict: Dict[str, dict],
    page_w: int,
    min_lines_to_split: int = 10,
    split_width_fraction: float = 0.40,
    max_columns: int = 4,
) -> Dict[str, dict]:
    """
    Split regions that are wide enough to contain multiple columns.

    A region is a split candidate when it holds at least
    *min_lines_to_split* lines and the horizontal span of its lines is
    at least *split_width_fraction* of *page_w*. Candidates are run
    through _estimate_columns(); if more than one column is found the
    region is replaced by one sub-region per column (keyed
    "<key>_col<n>", with no blla region attached). All other regions
    are passed through unchanged.

    For landscape double-page spreads, lower split_width_fraction
    (e.g. 0.20) to trigger splitting on narrower regions.

    Returns:
        A new regions dict; the input dict itself is not modified.
    """
    result: Dict[str, dict] = {}
    next_suffix = 0  # running counter, shared by all regions split in this call

    for key, rdata in regions_dict.items():
        members = rdata['lines']  # list of (idx, LineSegment)
        if len(members) < min_lines_to_split:
            result[key] = rdata
            continue

        # Horizontal extent of the region, measured on its lines
        left_edge = min(seg.bbox[0] for _, seg in members)
        right_edge = max(seg.bbox[2] for _, seg in members)
        region_w = right_edge - left_edge

        if region_w < split_width_fraction * page_w:
            # Narrow enough to be a single column
            result[key] = rdata
            continue

        # Cluster in region-local coordinates: shift every line so that
        # the region's left edge sits at x = 0 and pass the region width
        # as the reference width.
        local_lines = [
            LineSegment(
                seg.image,
                (seg.bbox[0] - left_edge, seg.bbox[1],
                 seg.bbox[2] - left_edge, seg.bbox[3]),
                seg.baseline,
            )
            for _, seg in members
        ]
        assignments = self._estimate_columns(local_lines, page_w=region_w,
                                             max_columns=max_columns)

        column_ids = sorted(set(assignments))
        n_cols = len(column_ids)
        if n_cols <= 1:
            # Clustering didn't find multiple columns
            result[key] = rdata
            continue

        print(f"[KrakenSegmenter] Splitting region '{key}' ({len(members)} lines, "
              f"width={region_w}px) into {n_cols} sub-columns")

        # The sub-regions keep the original (unshifted) line entries.
        for col_id in column_ids:
            sub_key = f"{key}_col{next_suffix}"
            next_suffix += 1
            result[sub_key] = {
                'lines': [
                    member
                    for member, assigned in zip(members, assignments)
                    if assigned == col_id
                ],
                'blla_region': None,
            }

    return result
651
+
652
+ def _cluster_into_columns(
653
+ self,
654
+ lines: list,
655
+ page_w: int,
656
+ max_columns: int = 4,
657
+ ) -> Dict[str, dict]:
658
+ """Cluster lines into columns and return regions_dict."""
659
+ assignments = self._estimate_columns(lines, page_w, max_columns=max_columns)
660
+ regions_dict: Dict[str, dict] = {}
661
+ for idx, (col, line) in enumerate(zip(assignments, lines)):
662
+ key = f"col_{col}"
663
+ if key not in regions_dict:
664
+ regions_dict[key] = {'lines': [], 'blla_region': None}
665
+ regions_dict[key]['lines'].append((idx, line))
666
+ return regions_dict
667
+
668
+ @staticmethod
669
+ def _convex_hull(points: List[Tuple[int, int]]) -> List[Tuple[int, int]]:
670
+ """Monotonic chain convex hull."""
671
+ pts = sorted(set(points))
672
+ if len(pts) <= 2:
673
+ return pts
674
+
675
+ def cross(o, a, b):
676
+ return (a[0] - o[0]) * (b[1] - o[1]) - (a[1] - o[1]) * (b[0] - o[0])
677
+
678
+ lower = []
679
+ for p in pts:
680
+ while len(lower) >= 2 and cross(lower[-2], lower[-1], p) <= 0:
681
+ lower.pop()
682
+ lower.append(p)
683
+ upper = []
684
+ for p in reversed(pts):
685
+ while len(upper) >= 2 and cross(upper[-2], upper[-1], p) <= 0:
686
+ upper.pop()
687
+ upper.append(p)
688
+ return lower[:-1] + upper[:-1]
689
+
690
def _build_regions(
    self,
    regions_dict: Dict[str, dict],
    all_lines: list,
    page_w: int,
    text_direction: str = 'horizontal-lr',
) -> Tuple[List[SegRegion], List[LineSegment]]:
    """
    Build SegRegion objects from regions_dict.

    Args:
        regions_dict: region key -> {'lines': [(idx, LineSegment), ...],
            'blla_region': region object or None}.
        all_lines: Unused; kept for call compatibility.
        page_w: Unused; kept for call compatibility.
        text_direction: Directions ending in '-rl' order regions right to left.

    Returns:
        (regions, ordered_lines) where ordered_lines is sorted by region
        (left-to-right for LTR, right-to-left for RTL) then top-to-bottom
        within each region. Regions are numbered "r_1", "r_2", ... in
        that order, and each ``line_ids`` entry "l_<n>" is the 1-based
        position of the line in ordered_lines. Regions without lines
        are omitted.
    """
    rtl = text_direction.endswith('-rl')

    # Sort key: mean x-center of a region's lines
    def _region_mean_x(item):
        lines = item[1]['lines']
        return sum((l.bbox[0] + l.bbox[2]) / 2 for _, l in lines) / len(lines)

    # A region with no lines has no geometry (bbox / hull), so skip it
    # instead of failing on min() of an empty sequence.
    populated = [item for item in regions_dict.items() if item[1]['lines']]
    sorted_regions = sorted(populated, key=_region_mean_x, reverse=rtl)

    regions: List[SegRegion] = []
    ordered_lines: List[LineSegment] = []

    for ri, (region_key, rdata) in enumerate(sorted_regions, start=1):
        blla_region = rdata['blla_region']

        # Top-to-bottom within the region; sort a copy so the caller's
        # dict is left untouched.
        region_lines = sorted(rdata['lines'], key=lambda item: item[1].bbox[1])

        region_id = f"r_{ri}"
        # Number the lines by their position in ordered_lines. The stored
        # indices refer to the pre-ordering list, which callers never
        # receive, so ids built from them could not be resolved.
        first = len(ordered_lines)
        line_ids = [f"l_{first + k + 1}" for k in range(len(region_lines))]

        bboxes = [l.bbox for _, l in region_lines]
        rbbox = (
            min(b[0] for b in bboxes),
            min(b[1] for b in bboxes),
            max(b[2] for b in bboxes),
            max(b[3] for b in bboxes),
        )

        # Polygon: prefer blla boundary, else convex hull of the line boxes
        polygon = None
        if blla_region and hasattr(blla_region, 'boundary') and blla_region.boundary:
            polygon = [(int(p[0]), int(p[1])) for p in blla_region.boundary]
        else:
            pts = []
            for _, l in region_lines:
                x1, y1, x2, y2 = l.bbox
                pts.extend([(x1, y1), (x2, y1), (x2, y2), (x1, y2)])
            hull = self._convex_hull(pts)
            # Fewer than three hull points is not a polygon
            polygon = hull if len(hull) >= 3 else None

        regions.append(SegRegion(
            id=region_id,
            bbox=rbbox,
            line_ids=line_ids,
            polygon=polygon,
        ))

        ordered_lines.extend(line for _, line in region_lines)

    return regions, ordered_lines
759
+
760
def segment_lines_to_dict(
    self,
    image: Image.Image,
    text_direction: str = 'horizontal-lr',
    use_binarization: bool = True
) -> List[dict]:
    """
    Run segment_lines() and return plain dictionaries (for compatibility).

    Args:
        image, text_direction, use_binarization: Forwarded unchanged to
            segment_lines().

    Returns:
        One dict per line with 'image', 'bbox', and 'baseline' keys,
        in the order segment_lines() returned them.
    """
    as_dicts: List[dict] = []
    for segment in self.segment_lines(image, text_direction, use_binarization):
        as_dicts.append({
            'image': segment.image,
            'bbox': segment.bbox,
            'baseline': segment.baseline,
        })
    return as_dicts
781
+
782
+
783
def test_kraken_segmenter():
    """
    Command-line smoke test: segment one image and dump the line crops.

    Reads the image path from sys.argv[1] (exits with status 1 and a
    usage message when it is missing), prints the detected lines and
    saves each crop as a PNG under ``kraken_test_output``.
    """
    import os
    import sys

    if len(sys.argv) < 2:
        print("Usage: python kraken_segmenter.py <image_path>")
        sys.exit(1)

    image_path = sys.argv[1]
    print(f"Testing Kraken segmenter on: {image_path}")

    # Load the page
    page = Image.open(image_path)
    print(f"Image size: {page.size}")

    # Classical segmentation with binarization enabled
    lines = KrakenLineSegmenter().segment_lines(page, use_binarization=True)

    # Report what was found
    print(f"\nDetected {len(lines)} lines:")
    for number, line in enumerate(lines, start=1):
        n_points = len(line.baseline) if line.baseline else 0
        print(f" Line {number}: bbox={line.bbox}, "
              f"baseline_points={n_points}")

    # Write one PNG per line
    output_dir = "kraken_test_output"
    os.makedirs(output_dir, exist_ok=True)
    for number, line in enumerate(lines, start=1):
        line.image.save(os.path.join(output_dir, f"line_{number:03d}.png"))

    print(f"\nLine images saved to: {output_dir}/")
820
+
821
+
822
+ if __name__ == "__main__":
823
+ test_kraken_segmenter()
page_xml_exporter.py ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PAGE XML Exporter
3
+
4
+ Exports line segmentation and transcription data to PAGE XML format.
5
+ Compatible with party and other PAGE XML processors.
6
+ """
7
+
8
+ import xml.etree.ElementTree as ET
9
+ from xml.dom import minidom
10
+ from pathlib import Path
11
+ from typing import List, Optional
12
+ from datetime import datetime
13
+ from inference_page import LineSegment
14
+
15
+
16
+ class PageXMLExporter:
17
+ """Export line segmentation data to PAGE XML format."""
18
+
19
+ # PAGE XML namespace
20
+ NAMESPACE = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"
21
+
22
+ def __init__(self, image_path: str, image_width: int, image_height: int):
23
+ """
24
+ Initialize PAGE XML exporter.
25
+
26
+ Args:
27
+ image_path: Path to the page image file
28
+ image_width: Width of the page image in pixels
29
+ image_height: Height of the page image in pixels
30
+ """
31
+ self.image_path = Path(image_path)
32
+ self.image_width = image_width
33
+ self.image_height = image_height
34
+
35
+ def _make_root(self, creator: str, comments: Optional[str]) -> tuple:
36
+ """Build root PcGts element with Metadata and Page. Returns (root, page)."""
37
+ ET.register_namespace('', self.NAMESPACE)
38
+ root = ET.Element('PcGts', {
39
+ 'xmlns': self.NAMESPACE,
40
+ 'xmlns:xsi': 'http://www.w3.org/2001/XMLSchema-instance',
41
+ 'xsi:schemaLocation': (
42
+ f'{self.NAMESPACE} '
43
+ 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15/pagecontent.xsd'
44
+ ),
45
+ 'pcGtsId': f'pc-{self.image_path.stem}'
46
+ })
47
+ metadata = ET.SubElement(root, 'Metadata')
48
+ ET.SubElement(metadata, 'Creator').text = creator
49
+ ET.SubElement(metadata, 'Created').text = datetime.now().isoformat()
50
+ ET.SubElement(metadata, 'LastChange').text = datetime.now().isoformat()
51
+ if comments:
52
+ ET.SubElement(metadata, 'Comments').text = comments
53
+ page = ET.SubElement(root, 'Page', {
54
+ 'imageFilename': str(self.image_path.name),
55
+ 'imageWidth': str(self.image_width),
56
+ 'imageHeight': str(self.image_height)
57
+ })
58
+ return root, page
59
+
60
+ @staticmethod
61
+ def _write_xml(root: ET.Element, output_path: str) -> None:
62
+ xml_str = ET.tostring(root, encoding='utf-8', method='xml')
63
+ dom = minidom.parseString(xml_str)
64
+ pretty_xml = dom.toprettyxml(indent=' ', encoding='utf-8')
65
+ with open(output_path, 'wb') as f:
66
+ f.write(pretty_xml)
67
+
68
+ @staticmethod
69
+ def _baseline_points(segment) -> str:
70
+ """Return PAGE XML baseline points string for a segment."""
71
+ if hasattr(segment, 'baseline') and segment.baseline:
72
+ return ' '.join(f'{x},{y}' for x, y in segment.baseline)
73
+ x1, y1, x2, y2 = segment.bbox
74
+ bl_y = y2 - 5
75
+ return f'{x1},{bl_y} {x2},{bl_y}'
76
+
77
+ @staticmethod
78
+ def _coords_points(segment) -> str:
79
+ """Return PAGE XML coords points string for a segment."""
80
+ if hasattr(segment, 'coords') and segment.coords:
81
+ return ' '.join(f'{x},{y}' for x, y in segment.coords)
82
+ x1, y1, x2, y2 = segment.bbox
83
+ return f'{x1},{y1} {x2},{y1} {x2},{y2} {x1},{y2}'
84
+
85
+ def _add_text_line(self, parent: ET.Element, line_id: str, segment,
86
+ text: Optional[str], line_idx: int) -> None:
87
+ """Add a TextLine element to parent with coords, baseline and optional text."""
88
+ line_elem = ET.SubElement(parent, 'TextLine', {
89
+ 'id': line_id,
90
+ 'custom': f'readingOrder {{index:{line_idx};}}'
91
+ })
92
+ ET.SubElement(line_elem, 'Coords').set('points', self._coords_points(segment))
93
+ ET.SubElement(line_elem, 'Baseline').set('points', self._baseline_points(segment))
94
+ if text:
95
+ conf = '1.0'
96
+ if hasattr(segment, 'confidence') and segment.confidence is not None:
97
+ conf = str(segment.confidence)
98
+ text_equiv = ET.SubElement(line_elem, 'TextEquiv', {'conf': conf})
99
+ ET.SubElement(text_equiv, 'Unicode').text = text
100
+
101
def export(self, segments: List[LineSegment], output_path: str,
           creator: str = "TrOCR-GUI", comments: Optional[str] = None) -> None:
    """Write *segments* as PAGE XML with one ``TextRegion`` holding every line.

    Args:
        segments: Line segments to export; each one's ``text`` attribute,
            if present and non-empty, becomes the line transcription.
        output_path: Destination of the PAGE XML file.
        creator: Tool name recorded in the metadata.
        comments: Optional free-text comment for the metadata.
    """
    root, page = self._make_root(creator, comments)

    # Reading order: a single group that references the single region.
    ordered_group = ET.SubElement(
        ET.SubElement(page, 'ReadingOrder'),
        'OrderedGroup',
        {'id': 'ro_1', 'caption': 'Regions reading order'},
    )
    ET.SubElement(ordered_group, 'RegionRefIndexed', {'index': '0', 'regionRef': 'region_1'})

    text_region = ET.SubElement(page, 'TextRegion', {
        'id': 'region_1',
        'type': 'paragraph',
        'custom': 'readingOrder {index:0;}'
    })

    # The region outline is the bounding box enclosing every line; it is
    # only written when there is at least one segment.
    if segments:
        boxes = [seg.bbox for seg in segments]
        left = min(box[0] for box in boxes)
        top = min(box[1] for box in boxes)
        right = max(box[2] for box in boxes)
        bottom = max(box[3] for box in boxes)
        ET.SubElement(text_region, 'Coords').set(
            'points', f'{left},{top} {right},{top} {right},{bottom} {left},{bottom}'
        )

    for line_number, segment in enumerate(segments, start=1):
        text = getattr(segment, 'text', None) or None
        self._add_text_line(text_region, f'line_{line_number}', segment, text, line_number - 1)

    self._write_xml(root, output_path)
145
+
146
def export_with_regions(
    self,
    regions,
    lines,
    output_path: str,
    transcriptions: Optional[List[str]] = None,
    creator: str = "TrOCR-GUI",
    comments: Optional[str] = None,
) -> None:
    """Write PAGE XML with one ``TextRegion`` per entry of *regions*.

    Args:
        regions: Region objects (duck-typed): ``.id`` and ``.bbox`` are
            read, plus optional ``.line_ids`` and ``.polygon``.
        lines: Flat list of line segments grouped by region, in region
            order; region *i* owns the next ``len(region.line_ids)`` items.
        output_path: Destination of the PAGE XML file.
        transcriptions: Optional texts parallel to *lines*. Where an index
            is covered, the entry is used (an empty entry means no text);
            otherwise the segment's own ``text`` attribute is used.
        creator: Tool name recorded in the metadata.
        comments: Optional free-text comment for the metadata.
    """
    root, page = self._make_root(creator, comments)

    # Reading order lists the regions in the order they were passed in.
    ordered_group = ET.SubElement(
        ET.SubElement(page, 'ReadingOrder'),
        'OrderedGroup',
        {'id': 'ro_1', 'caption': 'Regions reading order'},
    )
    for region_index, region in enumerate(regions):
        ET.SubElement(ordered_group, 'RegionRefIndexed', {
            'index': str(region_index),
            'regionRef': region.id
        })

    region_start = 0  # index into the flat `lines` list of this region's first line
    for region_index, region in enumerate(regions):
        line_count = len(region.line_ids) if hasattr(region, 'line_ids') else 0
        region_end = region_start + line_count
        region_lines = lines[region_start:region_end]

        text_region = ET.SubElement(page, 'TextRegion', {
            'id': region.id,
            'type': 'paragraph',
            'custom': f'readingOrder {{index:{region_index};}}'
        })

        # Outline: a polygon with at least three vertices wins over the bbox.
        polygon = getattr(region, 'polygon', None)
        if polygon and len(polygon) >= 3:
            pts = ' '.join(f'{px},{py}' for px, py in polygon)
        else:
            left, top, right, bottom = region.bbox
            pts = f'{left},{top} {right},{top} {right},{bottom} {left},{bottom}'
        ET.SubElement(text_region, 'Coords').set('points', pts)

        for local_index, segment in enumerate(region_lines):
            flat_index = region_start + local_index
            if transcriptions and flat_index < len(transcriptions):
                text = transcriptions[flat_index] or None
            elif hasattr(segment, 'text'):
                text = getattr(segment, 'text', None) or None
            else:
                text = None
            self._add_text_line(
                text_region,
                f'line_{region_index + 1}_{local_index + 1}',
                segment,
                text,
                local_index,
            )

        region_start = region_end

    self._write_xml(root, output_path)
226
+
227
+ @staticmethod
228
def quick_export(image_path: str, segments: List[LineSegment],
                 output_path: Optional[str] = None) -> str:
    """Export *segments* for *image_path*, deriving dimensions and output path.

    Args:
        image_path: Path to the page image; opened only to read its size.
        segments: Line segments to export.
        output_path: Destination file. Defaults to the image path with its
            suffix replaced by ``.xml``.

    Returns:
        The path of the written PAGE XML file, as a string.
    """
    from PIL import Image

    # Only the dimensions are needed. The context manager closes the
    # underlying file handle right away; the previous code never closed
    # the image, leaking one handle per exported page.
    with Image.open(image_path) as img:
        width, height = img.size

    if output_path is None:
        output_path = Path(image_path).with_suffix('.xml')

    exporter = PageXMLExporter(image_path, width, height)
    exporter.export(segments, str(output_path))

    return str(output_path)
256
+
257
+
258
if __name__ == "__main__":
    # Manual smoke test: export one synthetic line and report the output file.
    from PIL import Image

    demo_segment = LineSegment(
        image=Image.new('L', (100, 30)),
        bbox=(10, 10, 200, 40),
        text="Example text",
        confidence=0.95,
    )

    demo_exporter = PageXMLExporter("test_page.jpg", 800, 1200)
    demo_exporter.export(
        [demo_segment],
        "test_output.xml",
        creator="PAGE XML Exporter Test",
        comments="This is a test export",
    )

    print("Test PAGE XML created: test_output.xml")
web/polyscriptor_server.py ADDED
@@ -0,0 +1,2237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Polyscriptor Web UI — FastAPI Backend
3
+
4
+ Thin wrapper around existing HTR engine code. Provides REST API + SSE
5
+ for browser-based transcription. All heavy lifting done by the same
6
+ modules the PyQt6 GUI uses.
7
+
8
+ Usage:
9
+ source htr_gui/bin/activate
10
+ python -m uvicorn web.polyscriptor_server:app --host 0.0.0.0 --port 8765
11
+
12
+ Author: Claude Code
13
+ Date: 2026-02-26
14
+ """
15
+
16
+ import asyncio
17
+ import hashlib
18
+ import importlib
19
+ import json
20
+ import logging
21
+ import os
22
+ import sys
23
+ import time
24
+ import uuid
25
+ from dataclasses import dataclass, field
26
+ from types import SimpleNamespace
27
+ from pathlib import Path
28
+ from typing import Any, Dict, List, Optional
29
+
30
+ import numpy as np
31
+ from PIL import Image, ImageOps
32
+ from fastapi import Cookie, FastAPI, File, HTTPException, Query, Request, UploadFile
33
+ from fastapi.responses import FileResponse, Response, StreamingResponse
34
+ from fastapi.staticfiles import StaticFiles
35
+ from pydantic import BaseModel
36
+
37
+ log = logging.getLogger("polyscriptor")
38
+ DEMO_MODE = os.environ.get("POLYSCRIPTOR_DEMO_MODE", "").strip().lower()
39
+
40
+ # Add project root to path so we can import existing modules
41
+ PROJECT_ROOT = Path(__file__).resolve().parent.parent
42
+ sys.path.insert(0, str(PROJECT_ROOT))
43
+
44
+ # Load .env from project root (same as the Qt GUI does via CommercialAPIEngine)
45
+ try:
46
+ from dotenv import load_dotenv
47
+ _env_path = PROJECT_ROOT / ".env"
48
+ if _env_path.exists():
49
+ load_dotenv(_env_path)
50
+ log.info(f"Loaded environment variables from {_env_path}")
51
+ except ImportError:
52
+ pass # python-dotenv not installed — env vars must be set externally
53
+
54
+ from htr_engine_base import get_global_registry, HTREngine, TranscriptionResult
55
+
56
+ # PDF support via PyMuPDF
57
+ try:
58
+ import fitz as _fitz # PyMuPDF
59
+ PDF_AVAILABLE = True
60
+ except ImportError:
61
+ PDF_AVAILABLE = False
62
+ log.warning("PyMuPDF not installed — PDF upload disabled. Install with: pip install pymupdf")
63
+
64
+ # Lazy imports for segmentation (avoid slow startup)
65
+ _segmenters_imported = False
66
+
67
+
68
def _import_segmenters():
    """Import the segmentation modules on first use and publish them as globals.

    Binds ``KrakenLineSegmenter``, ``LineSegmenter`` and ``PYLAIA_MODELS`` at
    module level. ``PYLAIA_MODELS`` becomes an empty dict when
    ``inference_pylaia_native`` cannot be imported. Later calls are no-ops.
    """
    global _segmenters_imported, KrakenLineSegmenter, LineSegmenter, PYLAIA_MODELS
    if not _segmenters_imported:
        from kraken_segmenter import KrakenLineSegmenter
        from inference_page import LineSegmenter
        try:
            from inference_pylaia_native import PYLAIA_MODELS
        except ImportError:
            PYLAIA_MODELS = {}
        _segmenters_imported = True
80
+
81
+
82
+ # ---------------------------------------------------------------------------
83
+ # App setup
84
+ # ---------------------------------------------------------------------------
85
+
86
+ app = FastAPI(title="Polyscriptor HTR", version="0.1.0")
87
+
88
+ # Serve static frontend files
89
+ STATIC_DIR = Path(__file__).parent / "static"
90
+ app.mount("/static", StaticFiles(directory=str(STATIC_DIR)), name="static")
91
+
92
+ # ---------------------------------------------------------------------------
93
+ # Engine pool — Phase 2: shared pool of loaded engine instances
94
+ # ---------------------------------------------------------------------------
95
+
96
@dataclass
class EngineSlot:
    """A single loaded engine instance held in ``engine_pool``."""

    # The engine object itself: a dedicated instance, not the registry singleton.
    engine: Any
    # Engine name as used by the registry and the factory table.
    engine_name: str
    # Configuration the engine was loaded with.
    config: dict
    # Key under which this slot is stored in the pool.
    pool_key: str
    # Number of sessions currently holding this slot; 0 makes it evictable.
    ref_count: int = field(default=0)
    # Timestamp (time.time()) consulted for LRU and idle eviction.
    last_used: float = field(default_factory=time.time)
    # Per-slot lock; presumably serialises use of the engine (usage not visible here).
    lock: asyncio.Lock = field(default_factory=asyncio.Lock)
106
+
107
+ engine_pool: Dict[str, EngineSlot] = {}
108
+ pool_lock = asyncio.Lock()
109
+
110
+ # VRAM budget estimates (GB) for eviction decisions
111
+ _ENGINE_VRAM_GB = {
112
+ "CRNN-CTC (PyLaia-inspired)": 2,
113
+ "TrOCR": 3,
114
+ "Qwen3-VL": 18,
115
+ "Churro VLM": 10,
116
+ "Kraken": 2,
117
+ "Party": 4,
118
+ "PaddleOCR": 2,
119
+ }
120
+ _NO_GPU_ENGINES = {"Commercial APIs", "OpenWebUI", "LightOnOCR", "DeepSeek-OCR"}
121
+ _TOTAL_VRAM_GB = 92 # 2x L40S @ 46GB each
122
+
123
+
124
+ # Factory: engine name -> (module, class) for creating fresh instances
125
+ _ENGINE_FACTORY = {
126
+ "TrOCR": ("engines.trocr_engine", "TrOCREngine"),
127
+ "CRNN-CTC (PyLaia-inspired)": ("engines.pylaia_engine", "PyLaiaEngine"),
128
+ "Qwen3-VL": ("engines.qwen3_engine", "Qwen3Engine"),
129
+ "Churro VLM": ("engines.churro_engine", "ChurroEngine"),
130
+ "Kraken": ("engines.kraken_engine", "KrakenEngine"),
131
+ "Commercial APIs": ("engines.commercial_api_engine", "CommercialAPIEngine"),
132
+ "Party": ("engines.party_engine", "PartyEngine"),
133
+ "OpenWebUI": ("engines.openwebui_engine", "OpenWebUIEngine"),
134
+ "DeepSeek-OCR": ("engines.deepseek_ocr_engine", "DeepSeekOCREngine"),
135
+ "LightOnOCR": ("engines.lighton_ocr_engine", "LightOnOCREngine"),
136
+ "PaddleOCR": ("engines.paddle_engine", "PaddleOCREngine"),
137
+ }
138
+
139
+
140
def _create_engine_instance(engine_name: str):
    """Instantiate a new engine object for *engine_name*.

    The class is looked up in ``_ENGINE_FACTORY`` and imported on demand, so
    each pool slot gets its own instance instead of the registry singleton.

    Returns:
        A new engine instance, or ``None`` when the name has no factory entry.
    """
    factory_entry = _ENGINE_FACTORY.get(engine_name)
    if not factory_entry:
        return None

    module_path, class_name = factory_entry
    engine_cls = getattr(importlib.import_module(module_path), class_name)
    return engine_cls()
153
+
154
+
155
+ def _make_pool_key(engine_name: str, config: dict) -> str:
156
+ """Build a key that uniquely identifies an engine+model combination."""
157
+ if engine_name == "Commercial APIs":
158
+ provider = config.get("provider", "unknown")
159
+ model = config.get("model", "unknown")
160
+ api_key = config.get("api_key", "")
161
+ key_hash = hashlib.sha256(api_key.encode()).hexdigest()[:8] if api_key else "nokey"
162
+ return f"{engine_name}::{provider}::{model}::{key_hash}"
163
+
164
+ if engine_name == "OpenWebUI":
165
+ model = config.get("model", "unknown")
166
+ base_url = config.get("base_url", "unknown")
167
+ api_key = config.get("api_key", "")
168
+ key_hash = hashlib.sha256(api_key.encode()).hexdigest()[:8] if api_key else "nokey"
169
+ return f"{engine_name}::{base_url}::{model}::{key_hash}"
170
+
171
+ if engine_name == "TrOCR":
172
+ return f"{engine_name}::{config.get('model_path', 'default')}"
173
+
174
+ if engine_name in ("CRNN-CTC (PyLaia-inspired)", "Kraken"):
175
+ return f"{engine_name}::{config.get('model_path', 'default')}"
176
+
177
+ if engine_name == "Qwen3-VL":
178
+ base = config.get("base_model", "default")
179
+ adapter = config.get("adapter", "")
180
+ return f"{engine_name}::{base}::{adapter or 'none'}"
181
+
182
+ if engine_name == "Churro VLM":
183
+ return f"{engine_name}::{config.get('model_name', 'default')}"
184
+
185
+ if engine_name == "LightOnOCR":
186
+ return f"{engine_name}::{config.get('model_path', 'default')}"
187
+
188
+ # Fallback: hash the config
189
+ config_hash = hashlib.sha256(str(sorted(config.items())).encode()).hexdigest()[:12]
190
+ return f"{engine_name}::{config_hash}"
191
+
192
+
193
async def _maybe_evict(new_engine_name: str):
    """Free estimated VRAM for *new_engine_name* by unloading idle slots.

    Slots with ``ref_count == 0`` are unloaded least-recently-used first
    until the estimated usage plus the new engine fits ``_TOTAL_VRAM_GB``.
    Engines listed in ``_NO_GPU_ENGINES`` never trigger or suffer eviction.
    The caller is expected to hold ``pool_lock``.
    """
    if new_engine_name in _NO_GPU_ENGINES:
        return

    def _estimate(name: str):
        # Engines without an entry are assumed to need 4 GB.
        return _ENGINE_VRAM_GB.get(name, 4)

    needed = _estimate(new_engine_name)
    used = sum(
        _estimate(slot.engine_name)
        for slot in engine_pool.values()
        if slot.engine_name not in _NO_GPU_ENGINES
    )
    if used + needed <= _TOTAL_VRAM_GB:
        return

    # Only unreferenced GPU slots may go; oldest use first.
    idle_slots = [
        (key, slot) for key, slot in engine_pool.items()
        if slot.ref_count == 0 and slot.engine_name not in _NO_GPU_ENGINES
    ]
    idle_slots.sort(key=lambda pair: pair[1].last_used)

    for key, slot in idle_slots:
        if used + needed <= _TOTAL_VRAM_GB:
            break
        log.info(f"Evicting engine slot '{key}' (last used {time.time() - slot.last_used:.0f}s ago)")
        try:
            slot.engine.unload_model()
        except Exception as e:
            log.warning(f"Error unloading evicted engine: {e}")
        del engine_pool[key]
        used -= _estimate(slot.engine_name)

    if used + needed > _TOTAL_VRAM_GB:
        log.warning(f"VRAM tight: ~{used}GB used + ~{needed}GB needed > {_TOTAL_VRAM_GB}GB total")
221
+
222
+
223
+ # Compatibility shims — will be removed after full migration
224
+ loaded_engine: Optional[HTREngine] = None
225
+ loaded_engine_name: str = ""
226
+ loaded_config: dict = {}
227
+
228
+ # Persistent upload storage (survives server restarts)
229
+ UPLOAD_DIR = Path(__file__).parent / "uploads"
230
+ UPLOAD_DIR.mkdir(exist_ok=True)
231
+
232
+ # Upload TTL: 24 hours
233
+ _UPLOAD_TTL_SECONDS = 86400
234
+
235
+ # Session TTL: 2 hours of inactivity
236
+ _SESSION_TTL_SECONDS = 7200
237
+
238
+ # Cookie name for session tracking
239
+ _SESSION_COOKIE = "polyscriptor_session"
240
+
241
+
242
+ # ---------------------------------------------------------------------------
243
+ # Per-user sessions — Phase 1 of multi-user refactoring
244
+ # ---------------------------------------------------------------------------
245
+
246
@dataclass
class UserSession:
    """Per-browser state, keyed by the session cookie value."""

    # Identifier stored in the session cookie.
    session_id: str
    # Images known to this session, keyed by image id; entries may carry
    # "path" and "xml_path" of files on disk.
    image_cache: Dict[str, dict] = field(default_factory=dict)
    # Cancellation events by key (the keying scheme is not visible here).
    cancel_events: Dict[str, asyncio.Event] = field(default_factory=dict)
    # Key of the engine_pool slot this session references, if any.
    pool_key: Optional[str] = field(default=None)
    # Creation and last-activity timestamps (time.time()).
    created_at: float = field(default_factory=time.time)
    last_active: float = field(default_factory=time.time)
254
+
255
+
256
+ sessions: Dict[str, UserSession] = {}
257
+ global_image_cache: Dict[str, dict] = {}
258
+
259
+
260
def _get_or_create_session(session_id: Optional[str]) -> tuple[UserSession, bool]:
    """Look up the session for *session_id*, creating a new one if needed.

    A known session has its ``last_active`` refreshed. A missing or unknown
    id yields a brand-new session registered under a fresh UUID.

    Returns:
        ``(session, created)``; ``created`` is True only for new sessions.
    """
    existing = sessions.get(session_id) if session_id else None
    if existing is not None:
        existing.last_active = time.time()
        return existing, False

    fresh = UserSession(session_id=str(uuid.uuid4()))
    sessions[fresh.session_id] = fresh
    return fresh, True
270
+
271
+
272
def _cleanup_expired_sessions() -> int:
    """Drop sessions idle longer than ``_SESSION_TTL_SECONDS``.

    For each expired session its engine-slot reference is released (the
    engine is unloaded and the slot removed once nobody references it) and
    the image/XML files recorded in its image cache are deleted.

    Returns:
        Number of sessions removed.
    """
    cutoff = time.time() - _SESSION_TTL_SECONDS
    expired_ids = [sid for sid, sess in sessions.items() if sess.last_active < cutoff]

    for sid in expired_ids:
        session = sessions.pop(sid)

        # Release the reference this session held on its engine slot.
        slot = engine_pool.get(session.pool_key) if session.pool_key else None
        if slot is not None:
            slot.ref_count = max(0, slot.ref_count - 1)
            if slot.ref_count == 0:
                log.info(f"Immediate eviction (session expiry): '{slot.engine_name}'")
                try:
                    slot.engine.unload_model()
                except Exception as e:
                    log.warning(f"unload_model() failed for '{slot.engine_name}': {e}")
                engine_pool.pop(session.pool_key, None)

        # Remove the files this session uploaded or generated.
        for img_data in session.image_cache.values():
            for path_field in ("path", "xml_path"):
                stored_path = img_data.get(path_field)
                if stored_path:
                    Path(stored_path).unlink(missing_ok=True)

        log.info(f"Expired session {sid[:8]}... ({len(session.image_cache)} images)")

    return len(expired_ids)
300
+
301
+
302
+ _SESSION_PASSTHROUGH_PATHS = {"/api/gpu", "/api/engines", "/api/kraken/presets"}
303
+
304
+
305
@app.middleware("http")
async def session_middleware(request: Request, call_next):
    """Attach the caller's session to ``request.state`` and issue the cookie.

    Requests to ``_SESSION_PASSTHROUGH_PATHS`` (status/discovery polling)
    must not count as activity, otherwise background polling keeps a
    session alive forever and blocks engine-slot release on expiry.

    Args:
        request: Incoming request; the session id is read from its cookies.
        call_next: Next handler in the middleware chain.

    Returns:
        The downstream response, with the session cookie set when a new
        session was created.
    """
    session_id = request.cookies.get(_SESSION_COOKIE)

    # _get_or_create_session() refreshes last_active on every hit of an
    # existing session. Remember the previous value first so it can be put
    # back for polling-only routes; without this the exclusion below had no
    # effect at all.
    known = sessions.get(session_id) if session_id else None
    previous_active = known.last_active if known is not None else None

    session, created = _get_or_create_session(session_id)
    request.state.session = session

    if request.url.path in _SESSION_PASSTHROUGH_PATHS:
        # Polling does not count as activity: undo the refresh. A session
        # created by this very request keeps its creation timestamp.
        if previous_active is not None:
            session.last_active = previous_active
    else:
        session.last_active = time.time()

    response = await call_next(request)

    if created or session_id != session.session_id:
        cookie_kwargs = {
            "key": _SESSION_COOKIE,
            "value": session.session_id,
            "httponly": True,
            "max_age": _SESSION_TTL_SECONDS,
        }
        if DEMO_MODE == "hf_space":
            # Cross-site cookie settings are used only in hf_space demo mode.
            cookie_kwargs.update({"samesite": "none", "secure": True})
        else:
            cookie_kwargs.update({"samesite": "lax"})
        response.set_cookie(**cookie_kwargs)
    return response
340
+
341
+
342
def _get_session(request: Request) -> UserSession:
    """Return the session that ``session_middleware`` stored on the request."""
    current_session = request.state.session
    return current_session
345
+
346
+
347
def _cleanup_old_uploads() -> int:
    """Delete upload files older than ``_UPLOAD_TTL_SECONDS`` and prune caches.

    After the sweep, every session's image cache loses the entries whose
    ``path`` no longer exists on disk.

    Returns:
        Number of files deleted from ``UPLOAD_DIR``.
    """
    cutoff = time.time() - _UPLOAD_TTL_SECONDS
    deleted = 0

    for entry in list(UPLOAD_DIR.iterdir()):
        if not entry.is_file():
            continue
        try:
            if entry.stat().st_mtime < cutoff:
                entry.unlink(missing_ok=True)
                deleted += 1
        except OSError:
            # Best effort: a file that cannot be inspected or removed is skipped.
            pass

    for session in sessions.values():
        missing_ids = [
            image_id
            for image_id, data in session.image_cache.items()
            if data.get("path") and not Path(data["path"]).exists()
        ]
        for image_id in missing_ids:
            del session.image_cache[image_id]

    return deleted
366
+
367
+
368
+ _SLOT_IDLE_TTL_SECONDS = 6 * 3600 # evict loaded engines idle for 6h, regardless of ref_count
369
+
370
+
371
def _evict_idle_slots() -> int:
    """Unload GPU engine slots unused for ``_SLOT_IDLE_TTL_SECONDS``.

    This caps engine residency independently of session expiry and ignores
    ``ref_count``. Sessions that pointed at an evicted slot get their
    ``pool_key`` cleared. No lock is taken; the function is meant to be
    called only from ``_periodic_cleanup``.

    Returns:
        Number of slots evicted.
    """
    cutoff = time.time() - _SLOT_IDLE_TTL_SECONDS
    stale_keys = [
        key for key, slot in engine_pool.items()
        if slot.last_used < cutoff and slot.engine_name not in _NO_GPU_ENGINES
    ]

    for key in stale_keys:
        slot = engine_pool.pop(key)
        idle_hours = (time.time() - slot.last_used) / 3600
        log.info(f"Idle eviction: '{slot.engine_name}' (idle {idle_hours:.1f}h)")
        try:
            slot.engine.unload_model()
        except Exception as e:
            log.warning(f"unload_model() failed for '{slot.engine_name}': {e}")

        # Detach every session that still referenced the evicted slot.
        for session in sessions.values():
            if session.pool_key == key:
                session.pool_key = None

    return len(stale_keys)
393
+
394
+
395
async def _periodic_cleanup():
    """Background task: hourly sweep of uploads, expired sessions and idle slots.

    Each step runs in isolation. Previously an exception in any step
    propagated out of the loop and ended the task, silently disabling all
    cleanup until the next server restart. Task cancellation still
    propagates, because ``asyncio.CancelledError`` is not an ``Exception``.
    """
    steps = (_cleanup_old_uploads, _cleanup_expired_sessions, _evict_idle_slots)
    while True:
        await asyncio.sleep(3600)

        counts = []
        for step in steps:
            try:
                counts.append(step())
            except Exception:
                # Log with traceback and carry on; the next cycle retries.
                log.exception(f"Periodic cleanup step '{step.__name__}' failed.")
                counts.append(0)

        n, m, p = counts
        if n or m or p:
            log.info(f"Periodic cleanup: {n} upload(s), {m} session(s), {p} idle engine slot(s).")
404
+
405
+
406
+ # ---------------------------------------------------------------------------
407
+ # API key resolution — keys never stored or shared server-side (Phase 3)
408
+ # Web UI users MUST provide their own keys via browser localStorage.
409
+ # Server env vars (.env) are NOT used by the web UI — they exist only for
410
+ # the PyQt GUI and CLI tools which run locally on the admin's machine.
411
+ # ---------------------------------------------------------------------------
412
+
413
+ # Known key slots (for validation only — env vars are NOT consulted)
414
+ _KEY_SLOTS = {"openai", "gemini", "claude", "openwebui"}
415
+
416
+
417
+ def _resolve_api_key(slot: str, request_value: str) -> str:
418
+ """
419
+ Return the API key from the browser request, or empty string.
420
+ Server env vars are deliberately NOT used as fallback — each web user
421
+ must supply their own key via browser localStorage.
422
+ """
423
+ if request_value and request_value.strip():
424
+ return request_value.strip()
425
+ return ""
426
+
427
+
428
+ # ---------------------------------------------------------------------------
429
+ # Startup config (web/server_config.yaml) — optional, auto-load an engine
430
+ # ---------------------------------------------------------------------------
431
+
432
def _load_startup_config() -> dict:
    """Read the optional ``server_config.yaml`` that sits next to this module.

    Returns:
        The parsed top-level mapping, or ``{}`` when the file is missing,
        unreadable, empty, or does not contain a mapping.
    """
    cfg_path = Path(__file__).parent / "server_config.yaml"
    if not cfg_path.exists():
        return {}
    try:
        import yaml
        # Read as UTF-8 explicitly so parsing does not depend on the
        # platform's default locale encoding.
        with open(cfg_path, encoding="utf-8") as f:
            cfg = yaml.safe_load(f) or {}
    except Exception as e:
        log.warning(f"Could not read server_config.yaml: {e}")
        return {}

    # A list or scalar at the top level would break callers that use .get().
    if not isinstance(cfg, dict):
        log.warning("server_config.yaml must contain a mapping at the top level; ignoring it.")
        return {}
    return cfg
443
+
444
+
445
@app.on_event("startup")
async def startup_event():
    """Sweep stale uploads, start the cleanup task, optionally pre-load an engine.

    The engine named by ``default_engine`` in ``server_config.yaml`` is
    loaded into the pool with ``ref_count=0``. Auto-load failures are logged
    and never prevent the server from starting.
    """
    global loaded_engine, loaded_engine_name, loaded_config

    # Remove uploads left over from previous server runs.
    n = _cleanup_old_uploads()
    if n:
        log.info(f"Startup cleanup: removed {n} old upload file(s).")

    # The event loop holds only a weak reference to tasks, so a task whose
    # result is discarded can be garbage-collected before it finishes.
    # Keep a strong reference on the application state.
    app.state.cleanup_task = asyncio.create_task(_periodic_cleanup())

    cfg = _load_startup_config()
    if not cfg.get("default_engine"):
        return
    engine_name = cfg["default_engine"]
    # `default_config:` with no value parses to None; treat it as empty.
    engine_config = cfg.get("default_config") or {}
    log.info(f"Auto-loading engine '{engine_name}' from server_config.yaml ...")
    try:
        registry = get_global_registry()
        reg_engine = registry.get_engine_by_name(engine_name)
        if not (reg_engine and reg_engine.is_available()):
            log.warning(f"Auto-load: engine '{engine_name}' not found or not available.")
            return

        engine = _create_engine_instance(engine_name)
        if not engine:
            log.warning(f"Auto-load: cannot create instance for '{engine_name}'.")
            return

        # Model loading blocks, so it runs in a worker thread.
        ok = await asyncio.to_thread(engine.load_model, engine_config)
        if not ok:
            log.warning(f"Auto-load of '{engine_name}' failed (load_model returned False).")
            return

        pool_key = _make_pool_key(engine_name, engine_config)
        engine_pool[pool_key] = EngineSlot(
            engine=engine, engine_name=engine_name,
            config=engine_config, pool_key=pool_key,
            ref_count=0,  # No session owns it yet
        )
        # Update compat shims
        loaded_engine = engine
        loaded_engine_name = engine_name
        loaded_config = engine_config
        log.info(f"Auto-loaded '{engine_name}' into pool as '{pool_key}'.")
    except Exception as e:
        log.warning(f"Auto-load error: {e}")
492
+
493
+
494
+ # ---------------------------------------------------------------------------
495
+ # Config schemas — replaces Qt config widgets for the web UI
496
+ # ---------------------------------------------------------------------------
497
+
498
def _get_pylaia_model_options() -> list:
    """Build the select options for the PyLaia model field.

    Returns one ``{"label", "value"}`` entry per key of ``PYLAIA_MODELS``
    followed by a trailing "custom path" entry.
    """
    _import_segmenters()
    from inference_pylaia_native import _scan_pylaia_models

    models_dir = Path(__file__).resolve().parents[1] / "models"
    # Presumably refreshes PYLAIA_MODELS from the models directory; verify in
    # inference_pylaia_native.
    _scan_pylaia_models(str(models_dir))

    return [
        *({"label": name, "value": name} for name in PYLAIA_MODELS.keys()),
        {"label": "Custom / local path…", "value": "__custom__"},
    ]
505
+
506
+
507
def _scan_kraken_models() -> list:
    """Build the select options for Kraken recognition models.

    Local ``*.mlmodel`` files under the project's ``models/`` directory come
    first, in sorted path order, followed by the Zenodo presets declared in
    ``engines.kraken_engine.KRAKEN_MODELS``.

    Returns:
        List of ``{"label", "value", "source"}`` dicts; ``source`` is
        ``"local"`` or ``"zenodo"``.
    """
    options = []
    models_root = Path(__file__).resolve().parents[1] / "models"
    if models_root.exists():
        for p in sorted(models_root.rglob("*.mlmodel")):
            rel = str(p.relative_to(models_root.parent))  # e.g. models/kraken_cs/best.mlmodel
            label = f"{p.parent.name}/{p.name}"
            options.append({"label": label, "value": rel, "source": "local"})

    # Zenodo presets are optional and best-effort: the local options must be
    # returned even when the engine module is missing or malformed. The
    # failure is logged instead of being silently discarded.
    try:
        from engines.kraken_engine import KRAKEN_MODELS
        for preset_id, info in KRAKEN_MODELS.items():
            if info.get("source") == "zenodo":
                options.append({
                    "label": f"{info.get('label', preset_id)} [Zenodo, auto-download]",
                    "value": f"__zenodo__{preset_id}",
                    "source": "zenodo",
                })
    except Exception as e:
        log.debug(f"Kraken Zenodo presets unavailable: {e}")
    return options
529
+
530
+
531
def _scan_trocr_models() -> list:
    """Scan models/ directory for TrOCR checkpoints.

    The returned list starts with the ``__custom__`` sentinel and the built-in
    HuggingFace presets, followed by one ``source == "local"`` entry per
    matching directory directly under ``PROJECT_ROOT / "models"``.

    A directory is considered a TrOCR model if it contains
    preprocessor_config.json (TrOCR/ViT-specific) AND config.json
    with model_type == 'vision-encoder-decoder'.
    This avoids picking up PyLaia/CRNN-CTC directories that also
    contain a config.json with training parameters.
    """
    import json as _json

    models_dir = PROJECT_ROOT / "models"
    options = [
        {"label": "Custom HuggingFace ID or local path…", "value": "__custom__"},
        {"label": "kazars24/trocr-base-handwritten-ru (HuggingFace)",
         "value": "kazars24/trocr-base-handwritten-ru",
         "source": "huggingface"},
        {"label": "microsoft/trocr-base-printed — printed text, base",
         "value": "microsoft/trocr-base-printed",
         "source": "huggingface"},
        {"label": "microsoft/trocr-large-printed — printed text, large",
         "value": "microsoft/trocr-large-printed",
         "source": "huggingface"},
        {"label": "dh-unibe/trocr-kurrent — German Kurrent 19th c. (CER 2.66%)",
         "value": "dh-unibe/trocr-kurrent",
         "source": "huggingface"},
        {"label": "dh-unibe/trocr-kurrent-XVI-XVII — German Kurrent 16th–18th c. (CER 5.42%)",
         "value": "dh-unibe/trocr-kurrent-XVI-XVII",
         "source": "huggingface"},
    ]
    if not models_dir.exists():
        return options

    for candidate in sorted(models_dir.iterdir()):
        if not candidate.is_dir():
            continue
        # Require BOTH preprocessor_config.json AND config.json with
        # model_type == 'vision-encoder-decoder'.
        # preprocessor_config.json is ViT/TrOCR-specific (not in PyLaia).
        # config.json model_type disambiguates from Qwen3 adapters that
        # also ship a preprocessor_config but have no config.json.
        if not (candidate / "preprocessor_config.json").exists():
            continue
        cfg_path = candidate / "config.json"
        if not cfg_path.exists():
            continue
        try:
            # Context manager closes the handle promptly; JSON files are UTF-8.
            with open(cfg_path, encoding="utf-8") as fh:
                cfg = _json.load(fh)
            if cfg.get("model_type") != "vision-encoder-decoder":
                continue
        except Exception:
            # Unreadable or malformed config.json: not a usable checkpoint.
            continue
        options.append({
            "label": candidate.name,
            "value": str(candidate),
            "source": "local",
        })
    return options
586
+
587
+
588
def _vlm_adapter_option(model_dir, adapter_dir, engine_type: str):
    """Describe the LoRA adapter in ``adapter_dir`` as a select option.

    Args:
        model_dir: Directory under models/ whose name is used for the label
            and for the qwen/churro name match.
        adapter_dir: Directory that contains ``adapter_config.json`` (either
            ``model_dir`` itself or its ``final_model`` subdirectory).
        engine_type: ``"qwen3"`` or ``"churro"``.

    Returns:
        The option dict, or ``None`` when the adapter does not belong to
        ``engine_type``.

    Raises:
        Any error from reading or parsing ``adapter_config.json``; the caller
        treats such adapters as not listable.
    """
    import json as _json

    with open(adapter_dir / "adapter_config.json", encoding="utf-8") as fh:
        adapter_cfg = _json.load(fh)
    base = adapter_cfg.get("base_model_name_or_path", "")
    dir_name = model_dir.name.lower()
    is_qwen = "qwen" in base.lower() or "qwen" in dir_name
    is_churro = "churro" in base.lower() or "churro" in dir_name

    if engine_type == "qwen3":
        # Churro adapters may sit on a Qwen base; keep them out of the Qwen3 list.
        wanted = is_qwen and not is_churro
    elif engine_type == "churro":
        wanted = is_churro
    else:
        wanted = False
    if not wanted:
        return None

    return {
        "label": f"{model_dir.name} (LoRA → {base})",
        "value": str(adapter_dir),
        "base_model": base,
        "adapter": str(adapter_dir),
    }


def _scan_vlm_models(engine_type: str = "qwen3") -> list:
    """Scan models/ directory for local VLM LoRA adapters.

    A directory directly under ``PROJECT_ROOT / "models"`` is considered when
    it contains ``adapter_config.json`` itself or in a ``final_model``
    subdirectory (the top-level file wins; ``final_model`` is then not checked).

    Args:
        engine_type: ``"qwen3"`` or ``"churro"``; selects which adapters are listed.

    Returns:
        Options list ending with a ``__custom__`` sentinel for manual entry.
    """
    models_dir = PROJECT_ROOT / "models"
    options = []

    if models_dir.exists():
        for d in sorted(models_dir.iterdir()):
            if not d.is_dir():
                continue

            if (d / "adapter_config.json").exists():
                adapter_dir = d
            else:
                final = d / "final_model"
                if not (final.is_dir() and (final / "adapter_config.json").exists()):
                    continue
                adapter_dir = final

            try:
                option = _vlm_adapter_option(d, adapter_dir, engine_type)
            except Exception as exc:
                # Best-effort listing: skip broken adapters but record why.
                log.debug(f"Skipping VLM adapter '{adapter_dir}': {exc}")
                continue
            if option is not None:
                options.append(option)

    # Always append a "Custom / HuggingFace" sentinel as the last option
    options.append({
        "label": "Custom / HuggingFace model ID...",
        "value": "__custom__",
    })
    return options
664
+
665
+
666
# Shared placeholder text for the custom-prompt textarea of the API engines.
_SCHEMA_DEFAULT_PROMPT = (
    "Transcribe all handwritten text in this manuscript image. Preserve the original language (Cyrillic, Latin, etc.) and layout. Output only the transcribed text without any additional commentary."
)


def _schema_options(pairs) -> list:
    """Turn ``(label, value)`` pairs into fresh select-option dicts."""
    return [{"label": label, "value": value} for label, value in pairs]


def _schema_flip_rtl_field() -> dict:
    """Fresh checkbox field for horizontally flipping RTL line images."""
    return {
        "key": "flip_rtl", "type": "checkbox",
        "label": "RTL manuscript (flip line images)", "default": False,
        "hint": "Flip line images horizontally for RTL scripts (Ottoman, Arabic, Hebrew)",
    }


def _schema_custom_prompt_field(rows: int) -> dict:
    """Fresh optional custom-prompt textarea with the given number of rows."""
    return {
        "key": "custom_prompt", "type": "textarea", "label": "Custom Prompt (optional)",
        "default": "",
        "rows": rows,
        "placeholder": _SCHEMA_DEFAULT_PROMPT,
        "hint": "Leave blank to use the default prompt shown above",
    }


def _schema_lighton_options() -> list:
    """Options for every entry of ``LIGHTON_MODELS`` plus the custom sentinel."""
    from lighton_models import LIGHTON_MODELS

    choices = [
        {"label": f"{name} — {info.get('description','')}", "value": info["id"]}
        for name, info in LIGHTON_MODELS.items()
    ]
    choices.append({"label": "Custom HuggingFace ID…", "value": "__custom__"})
    return choices


def _schema_crnn_ctc() -> dict:
    return {
        "fields": [
            {"key": "model_path", "type": "select", "label": "Model",
             "options": _get_pylaia_model_options(),
             "custom_key": "custom_model_path",
             "custom_placeholder": "Absolute path to best_model.pt (e.g. /home/…/models/pylaia_yiddish_20260326/best_model.pt)"},
            {"key": "enable_spaces", "type": "checkbox",
             "label": "Convert <space> tokens", "default": True},
            _schema_flip_rtl_field(),
        ]
    }


def _schema_trocr() -> dict:
    return {
        "fields": [
            {"key": "model_path", "type": "select", "label": "Model",
             "options": _scan_trocr_models(),
             "custom_key": "custom_model_path",
             "custom_placeholder": "HuggingFace model ID (e.g. microsoft/trocr-base-handwritten) or absolute local path"},
            {"key": "num_beams", "type": "number", "label": "Beam Search",
             "min": 1, "max": 10, "default": 4},
            {"key": "normalize_background", "type": "checkbox",
             "label": "Normalize Background", "default": False},
            _schema_flip_rtl_field(),
        ]
    }


def _schema_qwen3_vl() -> dict:
    return {
        "fields": [
            {"key": "model_preset", "type": "select", "label": "Model",
             "options": _scan_vlm_models("qwen3"),
             "custom_key": "base_model",
             "custom_placeholder": "HuggingFace model ID, e.g. Qwen/Qwen3-VL-8B-Instruct"},
            {"key": "max_image_size", "type": "number", "label": "Max Image Size (px)",
             "min": 512, "max": 4096, "default": 1536},
        ]
    }


def _schema_churro_vlm() -> dict:
    return {
        "fields": [
            {"key": "model_preset", "type": "select", "label": "Model",
             "options": _scan_vlm_models("churro"),
             "custom_key": "model_name",
             "custom_placeholder": "HuggingFace model ID, e.g. stanford-oval/churro-3B"},
            {"key": "device", "type": "select", "label": "Device",
             "options": _schema_options([
                 ("Auto", "auto"),
                 ("GPU 0", "cuda:0"),
                 ("GPU 1", "cuda:1"),
                 ("CPU", "cpu"),
             ])},
            {"key": "max_image_size", "type": "number", "label": "Max Image Size (px)",
             "min": 512, "max": 4096, "default": 2048},
        ]
    }


def _schema_kraken() -> dict:
    return {
        "fields": [
            {"key": "model_path", "type": "select", "label": "Model",
             "options": _scan_kraken_models(),
             "custom_key": "custom_model_path",
             "custom_placeholder": "Absolute path on server, e.g. /home/user/models/my.mlmodel",
             "upload": True},
        ]
    }


def _schema_commercial_apis() -> dict:
    return {
        "fields": [
            {"key": "provider", "type": "select", "label": "Provider",
             "options": _schema_options([
                 ("OpenAI (GPT-4o, o1, …)", "OpenAI"),
                 ("Google Gemini", "Gemini"),
                 ("Anthropic Claude", "Claude"),
             ])},
            {"key": "model", "type": "select", "label": "Model",
             "dynamic": True,
             "dynamic_hint": "Enter API key, then ↻ to load available models",
             # No static lists — always fetch live from the provider API
             "per_provider_options": {},
             "options": [],
             "custom_key": "custom_model_id",
             "custom_placeholder": "e.g. gpt-4.5, gemini-exp-1206, claude-opus-4"},
            {"key": "api_key", "type": "password", "label": "API Key",
             "default": "", "placeholder": "Paste your API key here"},
            {"key": "temperature", "type": "number", "label": "Temperature",
             "min": 0.0, "max": 2.0, "default": 0.0,
             "placeholder": "0.0 = deterministic (recommended for transcription)"},
            {"key": "max_output_tokens", "type": "number", "label": "Max output tokens (optional)",
             "min": 512, "max": 65536, "default": None,
             "placeholder": "Leave blank = model maximum"},
            _schema_custom_prompt_field(4),
            {"key": "thinking_mode", "type": "select", "label": "Thinking Mode (Gemini only)",
             "options": _schema_options([
                 ("Auto (model decides, no cap)", ""),
                 ("Low (budget: 8k tokens)", "low"),
                 ("High (no cap, max reasoning)", "high"),
             ]), "default": ""},
        ]
    }


def _schema_openwebui() -> dict:
    return {
        "fields": [
            {"key": "base_url", "type": "text", "label": "Base URL",
             "default": "",
             "placeholder": "https://your-openwebui-instance/api or .../api/v1"},
            {"key": "api_key", "type": "password", "label": "API Key",
             "default": "", "placeholder": "Your OpenWebUI API key"},
            {"key": "model", "type": "select", "label": "Model",
             "dynamic": True,
             "dynamic_hint": "Enter API key & base URL, then ↻ to load available models",
             "options": _schema_options([("Custom model ID…", "__custom__")]),
             "default": "__custom__",
             "custom_key": "model_custom",
             "custom_placeholder": "e.g. llama3.1, qwen2.5vl, gemma3, ..."},
            {"key": "temperature", "type": "number", "label": "Temperature",
             "min": 0.0, "max": 2.0, "default": 0.1},
            {"key": "max_tokens", "type": "number", "label": "Max output tokens (optional)",
             "min": 512, "max": 65536, "default": None,
             "placeholder": "Leave blank = model maximum"},
            _schema_custom_prompt_field(3),
        ]
    }


def _schema_lighton_ocr() -> dict:
    return {
        "fields": [
            {"key": "model_path", "type": "select", "label": "Model",
             "options": _schema_lighton_options(),
             "custom_key": "custom_model_path",
             "custom_placeholder": "HuggingFace model ID, e.g. lightonai/LightOnOCR-2-1B-base"},
            {"key": "max_new_tokens", "type": "number", "label": "Max new tokens",
             "min": 32, "max": 512, "default": 128},
        ]
    }


def _schema_paddle_ocr() -> dict:
    return {
        "fields": [
            {"key": "lang", "type": "select", "label": "Language / Script",
             "default": "ch",
             "options": _schema_options([
                 ("Chinese + English (mixed, recommended default)", "ch"),
                 ("English", "en"),
                 ("German", "german"),
                 ("French", "french"),
                 ("Japanese", "japan"),
                 ("Korean", "korean"),
                 ("Arabic", "arabic"),
                 ("Cyrillic (Russian/Ukrainian/Bulgarian)", "cyrillic"),
                 ("Latin script (generic)", "latin"),
                 ("Custom (enter code below)", "__custom__"),
             ]),
             "custom_key": "custom_lang",
             "custom_placeholder": "PaddleOCR lang code, e.g. ru, uk, fr, es, it, pt, …",
             "hint": "One language model per run. 'ch' is bilingual (Chinese+English) and PaddleOCR's strongest model. For mixed-script documents outside this list, run separate passes."},
            {"key": "use_angle_cls", "type": "checkbox",
             "label": "Text-angle classifier (correct 180° rotation)", "default": True},
            {"key": "use_gpu", "type": "checkbox",
             "label": "Use GPU (requires paddlepaddle-gpu)", "default": False},
        ]
    }


# Maps an engine's registry name to a zero-argument factory that builds its
# web-form schema. Factories run per request, so model lists are rescanned and
# every caller receives fresh dicts it may mutate.
ENGINE_SCHEMAS = {
    "CRNN-CTC (PyLaia-inspired)": _schema_crnn_ctc,
    "TrOCR": _schema_trocr,
    "Qwen3-VL": _schema_qwen3_vl,
    "Churro VLM": _schema_churro_vlm,
    "Kraken": _schema_kraken,
    "Commercial APIs": _schema_commercial_apis,
    "OpenWebUI": _schema_openwebui,
    "LightOnOCR": _schema_lighton_ocr,
    "PaddleOCR": _schema_paddle_ocr,
}
831
+
832
+
833
+ # ---------------------------------------------------------------------------
834
+ # Request/response models
835
+ # ---------------------------------------------------------------------------
836
+
837
class EngineLoadRequest(BaseModel):
    """Request body for POST /api/engine/load."""

    # Registry name of the engine to load (looked up via the global registry).
    engine_name: str
    # Raw form values for the engine; the load handler resolves custom-entry
    # sentinels and API keys before handing the config to the engine.
    config: Dict[str, Any] = {}
840
+
841
+
842
class TranscribeRequest(BaseModel):
    """Request body describing one transcription job and its segmentation settings."""

    # NOTE(review): presumably the ID of a previously registered image — confirm against the handler.
    image_id: str
    # Segmentation method: kraken, kraken-blla, hpp
    seg_method: str = "kraken"
    seg_device: str = "cpu"
    # blla: max sub-columns per region (iterative splitting)
    max_columns: int = 6
    # blla: min region width (fraction of page) to trigger sub-split
    split_width_fraction: float = 0.40
    # use attached PAGE XML for segmentation when available
    use_pagexml: bool = True
    # reading order for Kraken: horizontal-lr, horizontal-rl, vertical-lr, vertical-rl
    text_direction: str = "horizontal-lr"
    # live form values merged into stored config at transcription time
    engine_config_overrides: Dict[str, Any] = {}
851
+
852
+
853
+ # ---------------------------------------------------------------------------
854
+ # Routes
855
+ # ---------------------------------------------------------------------------
856
+
857
@app.get("/")
async def index():
    """Serve the main web UI page (``index.html`` from the static directory)."""
    page_path = STATIC_DIR / "index.html"
    return FileResponse(str(page_path))
860
+
861
+
862
@app.get("/demo")
async def pwa_demo():
    """Serve the PWA demo page (``pwa/demo.html`` from the static directory)."""
    page_path = STATIC_DIR / "pwa" / "demo.html"
    return FileResponse(str(page_path))
865
+
866
+
867
@app.get("/manifest.json")
async def pwa_manifest():
    """Serve the PWA manifest from root so scope / start_url are valid."""
    from fastapi.responses import FileResponse as _FR

    manifest_path = STATIC_DIR / "pwa" / "manifest.json"
    return _FR(
        str(manifest_path),
        media_type="application/manifest+json",
    )
872
+
873
+
874
@app.get("/sw.js")
async def pwa_service_worker():
    """Serve the PWA service worker from root scope so it can control /demo."""
    from fastapi.responses import FileResponse as _FR

    worker_path = STATIC_DIR / "pwa" / "sw.js"
    response = _FR(str(worker_path), media_type="application/javascript")
    # Let the worker claim the whole origin even though the file lives under pwa/.
    response.headers["Service-Worker-Allowed"] = "/"
    return response
881
+
882
+
883
@app.get("/api/engines")
async def list_engines():
    """Return one descriptor dict per engine in the global registry.

    ``unavailable_reason`` is only queried for engines that report themselves
    unavailable; ``has_config_schema`` tells whether ENGINE_SCHEMAS has a form
    for the engine.
    """
    def describe(engine) -> dict:
        is_up = engine.is_available()
        return {
            "name": engine.get_name(),
            "description": engine.get_description(),
            "available": is_up,
            "unavailable_reason": None if is_up else engine.get_unavailable_reason(),
            "requires_line_segmentation": engine.requires_line_segmentation(),
            "has_config_schema": engine.get_name() in ENGINE_SCHEMAS,
        }

    return [describe(engine) for engine in get_global_registry().get_all_engines()]
898
+
899
+
900
@app.get("/api/engine/{name}/config-schema")
async def get_config_schema(name: str):
    """Return the web-form schema for engine ``name``.

    Engines without an entry in ENGINE_SCHEMAS get an empty field list.
    """
    build_schema = ENGINE_SCHEMAS.get(name)
    if build_schema is None:
        return {"fields": []}

    schema = build_schema()

    # Key status: always "missing" from server perspective — browser localStorage
    # is the only key store. The frontend checks localStorage client-side.
    password_fields = [
        entry for entry in schema.get("fields", []) if entry.get("type") == "password"
    ]
    for entry in password_fields:
        entry["key_status"] = "missing"

    return schema
913
+
914
+
915
+ def _openwebui_model_urls(base_url: str) -> list[str]:
916
+ base = base_url.strip().rstrip("/")
917
+ if not base:
918
+ return []
919
+ urls = [f"{base}/models"]
920
+ if base.endswith("/api"):
921
+ urls.append(f"{base}/v1/models")
922
+ urls.append(f"{base[:-4]}/v1/models")
923
+ elif base.endswith("/api/v1"):
924
+ urls.append(f"{base[:-3]}/models")
925
+ urls.append(f"{base}/models")
926
+ elif base.endswith("/v1"):
927
+ urls.append(f"{base[:-3]}/api/models")
928
+ else:
929
+ urls.append(f"{base}/api/models")
930
+ urls.append(f"{base}/api/v1/models")
931
+ urls.append(f"{base}/v1/models")
932
+ return list(dict.fromkeys(urls))
933
+
934
+
935
+ def _extract_openwebui_model_ids(payload: Any) -> list[str]:
936
+ if isinstance(payload, dict):
937
+ for key in ("data", "models"):
938
+ items = payload.get(key)
939
+ if isinstance(items, list):
940
+ return _extract_openwebui_model_ids(items)
941
+ return [
942
+ str(value.get("id") or value.get("name"))
943
+ for value in payload.values()
944
+ if isinstance(value, dict) and (value.get("id") or value.get("name"))
945
+ ]
946
+
947
+ if isinstance(payload, list):
948
+ models = []
949
+ for item in payload:
950
+ if isinstance(item, str):
951
+ models.append(item)
952
+ elif isinstance(item, dict):
953
+ model_id = item.get("id") or item.get("name") or item.get("model")
954
+ if model_id:
955
+ models.append(str(model_id))
956
+ return sorted(set(models))
957
+
958
+ return []
959
+
960
+
961
+ def _fetch_openwebui_models(base_url: str, api_key: str) -> list[str]:
962
+ import urllib.error
963
+ import urllib.request
964
+
965
+ errors = []
966
+ for url in _openwebui_model_urls(base_url):
967
+ req = urllib.request.Request(
968
+ url,
969
+ headers={
970
+ "Authorization": f"Bearer {api_key}",
971
+ "x-api-key": api_key,
972
+ "Accept": "application/json",
973
+ "Content-Type": "application/json",
974
+ "User-Agent": "Polyscriptor-HTR-Demo/1.0",
975
+ },
976
+ )
977
+ try:
978
+ with urllib.request.urlopen(req, timeout=20) as resp:
979
+ status = resp.status
980
+ content_type = resp.headers.get("Content-Type", "")
981
+ body = resp.read().decode("utf-8", errors="replace")
982
+ try:
983
+ payload = json.loads(body)
984
+ except json.JSONDecodeError:
985
+ sample = body.strip().replace("\n", " ")[:120] or "<empty response>"
986
+ errors.append(f"{url}: HTTP {status}, non-JSON response ({content_type}): {sample}")
987
+ continue
988
+ models = _extract_openwebui_model_ids(payload)
989
+ if models:
990
+ return models
991
+ errors.append(f"{url}: no model ids in response")
992
+ except urllib.error.HTTPError as exc:
993
+ body = exc.read().decode("utf-8", errors="replace")[:200]
994
+ errors.append(f"{url}: HTTP {exc.code} {body}")
995
+ except Exception as exc:
996
+ errors.append(f"{url}: {exc}")
997
+ raise RuntimeError("; ".join(errors) if errors else "No OpenWebUI model endpoint tried")
998
+
999
+
1000
@app.get("/api/engine/status")
async def engine_status(request: Request):
    """Report which engine the calling session currently has loaded.

    Uses the session's pool slot when it still exists in the engine pool;
    otherwise falls back to the module-level compat shims.
    """
    session = _get_session(request)
    key = session.pool_key
    if key and key in engine_pool:
        slot = engine_pool[key]
        is_loaded = slot.engine.is_model_loaded()
        name = slot.engine_name
        cfg = slot.config
    else:
        # Fallback: compat shim for tests / startup
        is_loaded = loaded_engine is not None and loaded_engine.is_model_loaded()
        name = loaded_engine_name
        cfg = loaded_config
    return {
        "loaded": is_loaded,
        "engine_name": name,
        "config": cfg,
    }
1016
+
1017
+
1018
@app.get("/api/engine/{name}/models")
async def get_engine_models(
    name: str,
    api_key: str = "",
    provider: str = "openai",
    base_url: str = "",
):
    """
    Fetch available models for engines whose model list is dynamic.

    - OpenWebUI: queries the OpenWebUI model-listing endpoints
    - Commercial APIs: uses the ``fetch_*_models`` helpers of
      ``inference_commercial_api``

    Always returns ``{"models": [...]}``; failures are reported through an
    additional ``"error"`` string instead of an HTTP error status.
    """
    if name == "OpenWebUI":
        resolved = _resolve_api_key("openwebui", api_key)
        if not resolved:
            return {"models": [], "error": "No API key — paste one in the form"}
        effective_url = base_url.strip().rstrip("/")
        if not effective_url:
            return {"models": [], "error": "Enter your OpenWebUI base URL"}
        try:
            # Blocking HTTP call: run it off the event loop.
            models = await asyncio.to_thread(_fetch_openwebui_models, effective_url, resolved)
            return {"models": models}
        except Exception as e:
            return {"models": [], "error": str(e)}

    if name == "Commercial APIs":
        prov = provider.lower()
        resolved = _resolve_api_key(prov, api_key)
        if not resolved:
            return {"models": [], "error": "No API key — paste one in the form"}

        fetcher_names = {
            "openai": "fetch_openai_models",
            "gemini": "fetch_gemini_models",
            "claude": "fetch_claude_models",
        }
        fetcher_name = fetcher_names.get(prov)
        if fetcher_name is None:
            return {"models": [], "error": f"Unknown provider: {provider}"}

        try:
            # Make the project root importable once; inserting it on every
            # request would grow sys.path without bound.
            project_root = str(PROJECT_ROOT)
            if project_root not in sys.path:
                sys.path.insert(0, project_root)
            import inference_commercial_api

            fetcher = getattr(inference_commercial_api, fetcher_name)
            models = await asyncio.to_thread(fetcher, resolved)
            return {"models": models}
        except Exception as e:
            return {"models": [], "error": str(e)}

    return {"models": [], "error": f"Dynamic model listing not supported for '{name}'"}
1069
+
1070
+
1071
def _pop_stripped_str(config: Dict[str, Any], key: str) -> str:
    """Pop ``key`` from ``config`` and return it stripped.

    Missing keys and non-string values (e.g. JSON ``null``) count as empty, so
    a malformed form value yields a clean 400 later instead of an AttributeError.
    """
    value = config.pop(key, "")
    return value.strip() if isinstance(value, str) else ""


def _resolve_load_request_config(engine_name: str, raw_config: Dict[str, Any]) -> Dict[str, Any]:
    """Turn raw form values into the config handed to ``engine.load_model``.

    Resolves ``__custom__`` / ``__zenodo__`` sentinels per engine, resolves API
    keys for the API-backed engines and normalises an empty custom prompt to
    ``None``. The input dict is not modified.

    Raises:
        HTTPException(400): when a required custom value, base URL or API key
            is missing.
    """
    config = dict(raw_config)

    if engine_name == "CRNN-CTC (PyLaia-inspired)" and "model_path" in config:
        custom_val = _pop_stripped_str(config, "custom_model_path")
        if config["model_path"] == "__custom__":
            if not custom_val:
                raise HTTPException(400, "Please enter an absolute path to a best_model.pt file")
            config["model_path"] = custom_val
        # else: named preset from PYLAIA_MODELS — engine resolves it

    elif engine_name == "Kraken" and "model_path" in config:
        custom_val = _pop_stripped_str(config, "custom_model_path")
        val = config["model_path"]
        if val == "__custom__":
            if not custom_val:
                raise HTTPException(400, "Please enter a path to a local .mlmodel file")
            config["model_path"] = custom_val
        elif isinstance(val, str) and val.startswith("__zenodo__"):
            # Zenodo preset: pass preset_id, let engine handle download
            config["preset_id"] = val[len("__zenodo__"):]
            config["model_path"] = None
        # else: relative local path from select (e.g. "models/kraken_cs/best.mlmodel") — use as-is

    elif engine_name == "TrOCR" and "model_path" in config:
        custom_val = _pop_stripped_str(config, "custom_model_path")
        if config["model_path"] == "__custom__":
            if not custom_val:
                raise HTTPException(400, "Please enter a HuggingFace model ID or local path")
            config["model_path"] = custom_val
        from pathlib import Path as _P
        # An existing filesystem path is a local checkpoint; anything else is
        # treated as a HuggingFace model ID.
        if _P(config["model_path"]).exists():
            config["model_source"] = "local"
        else:
            config["model_source"] = "huggingface"

    elif engine_name == "Qwen3-VL" and "model_preset" in config:
        preset_val = config.pop("model_preset")
        custom_val = _pop_stripped_str(config, "base_model")
        if preset_val == "__custom__":
            config["base_model"] = custom_val or "Qwen/Qwen3-VL-8B-Instruct"
            config["adapter"] = None
        else:
            vlm_opts = _scan_vlm_models("qwen3")
            matched = next((o for o in vlm_opts if o["value"] == preset_val), None)
            if matched:
                config["base_model"] = matched.get("base_model", preset_val)
                config["adapter"] = matched.get("adapter")
            else:
                config["base_model"] = preset_val
                config["adapter"] = None

    elif engine_name == "Churro VLM" and "model_preset" in config:
        preset_val = config.pop("model_preset")
        custom_val = _pop_stripped_str(config, "model_name")
        if preset_val == "__custom__":
            config["model_name"] = custom_val or "stanford-oval/churro-3B"
            config["adapter_path"] = None
        else:
            vlm_opts = _scan_vlm_models("churro")
            matched = next((o for o in vlm_opts if o["value"] == preset_val), None)
            if matched:
                config["model_name"] = matched.get("base_model", preset_val)
                config["adapter_path"] = matched.get("adapter")
            else:
                config["model_name"] = preset_val
                config["adapter_path"] = None

    elif engine_name == "LightOnOCR" and "model_path" in config:
        custom_val = _pop_stripped_str(config, "custom_model_path")
        if config["model_path"] == "__custom__":
            if not custom_val:
                raise HTTPException(400, "Please enter a HuggingFace model ID for LightOnOCR")
            config["model_path"] = custom_val

    elif engine_name == "PaddleOCR" and "lang" in config:
        if config["lang"] == "__custom__":
            custom_lang = _pop_stripped_str(config, "custom_lang")
            if not custom_lang:
                raise HTTPException(400, "Please enter a PaddleOCR language code")
            config["lang"] = custom_lang
        else:
            config.pop("custom_lang", None)

    elif engine_name == "Commercial APIs":
        if config.get("model") == "__custom__":
            custom_model = _pop_stripped_str(config, "model_custom")
            if not custom_model:
                # The form schema declares "custom_model_id" as the custom key
                # for this engine; honour it too. It is read without popping so
                # the engine still receives the key exactly as before.
                schema_value = config.get("custom_model_id")
                if isinstance(schema_value, str):
                    custom_model = schema_value.strip()
            config["model"] = custom_model or "gpt-4o"

    elif engine_name == "OpenWebUI":
        if config.get("model") == "__custom__":
            custom_model = _pop_stripped_str(config, "model_custom")
            if not custom_model:
                raise HTTPException(400, "Please enter an OpenWebUI model ID")
            config["model"] = custom_model

    # Resolve API keys
    if engine_name == "Commercial APIs":
        provider_slot = str(config.get("provider") or "openai").lower()
        raw_key = config.get("api_key", "")
        resolved = _resolve_api_key(provider_slot, raw_key)
        if not resolved:
            raise HTTPException(400, f"No API key for {config.get('provider')}. "
                                     "Paste your API key in the field.")
        config["api_key"] = resolved

    elif engine_name == "OpenWebUI":
        base_url = str(config.get("base_url") or "").strip().rstrip("/")
        if not base_url:
            raise HTTPException(400, "No OpenWebUI base URL. "
                                     "Enter your own OpenWebUI API base URL.")
        config["base_url"] = base_url
        raw_key = config.get("api_key", "")
        resolved = _resolve_api_key("openwebui", raw_key)
        if not resolved:
            raise HTTPException(400, "No API key for OpenWebUI. "
                                     "Paste your API key in the field.")
        config["api_key"] = resolved

    # Strip empty custom_prompt for API engines (use engine default)
    if engine_name in ("Commercial APIs", "OpenWebUI"):
        prompt = config.get("custom_prompt")
        if not (isinstance(prompt, str) and prompt.strip()):
            config["custom_prompt"] = None

    return config


@app.post("/api/engine/load")
async def load_engine(request: Request, req: EngineLoadRequest):
    """Load (or reuse) an engine for the calling session.

    Steps: validate the engine, resolve the config, release the session's
    previous pool reference, then either reuse an identical pooled engine or
    load a new one outside the pool lock and register it.

    Returns:
        ``{"success", "load_time_s", "engine_name", "reused"}``.

    Raises:
        HTTPException: 404 for an unknown engine, 400 for an unavailable
            engine or invalid config, 500 when the engine cannot be created
            or its model fails to load.
    """
    global loaded_engine, loaded_engine_name, loaded_config
    session = _get_session(request)

    registry = get_global_registry()
    reg_engine = registry.get_engine_by_name(req.engine_name)
    if not reg_engine:
        raise HTTPException(404, f"Engine '{req.engine_name}' not found")
    if not reg_engine.is_available():
        raise HTTPException(400, f"Engine not available: {reg_engine.get_unavailable_reason()}")

    config = _resolve_load_request_config(req.engine_name, req.config)

    # --- Engine pool logic ---
    pool_key = _make_pool_key(req.engine_name, config)

    async with pool_lock:
        # Release previous engine reference for this session
        if session.pool_key and session.pool_key in engine_pool:
            prev_slot = engine_pool[session.pool_key]
            prev_slot.ref_count = max(0, prev_slot.ref_count - 1)
            if prev_slot.ref_count == 0:
                log.info(f"Immediate eviction (engine switch): '{prev_slot.engine_name}'")
                try:
                    prev_slot.engine.unload_model()
                except Exception as e:
                    log.warning(f"unload_model() failed for '{prev_slot.engine_name}': {e}")
                if session.pool_key in engine_pool:
                    del engine_pool[session.pool_key]
        # The session no longer holds a reference. Forget the key so that a
        # failed load below cannot lead to the same slot being released twice.
        session.pool_key = None

        # Check if this exact engine+model is already loaded
        if pool_key in engine_pool:
            slot = engine_pool[pool_key]
            slot.ref_count += 1
            slot.last_used = time.time()
            session.pool_key = pool_key
            # Update compat shims
            loaded_engine = slot.engine
            loaded_engine_name = slot.engine_name
            loaded_config = slot.config
            log.info(f"Pool hit: reusing '{pool_key}' (ref_count={slot.ref_count})")
            return {"success": True, "load_time_s": 0.0,
                    "engine_name": req.engine_name, "reused": True}

        # Need new slot — evict if VRAM tight
        # NOTE(review): assumed to run with pool_lock held, as in the original layout — confirm.
        await _maybe_evict(req.engine_name)

    # Load model OUTSIDE pool_lock (blocking I/O)
    engine = _create_engine_instance(req.engine_name)
    if not engine:
        raise HTTPException(500, f"Cannot create engine instance for '{req.engine_name}'")

    start = time.time()
    success = await asyncio.to_thread(engine.load_model, config)
    elapsed = time.time() - start

    if not success:
        raise HTTPException(500, "Failed to load model")

    slot = EngineSlot(
        engine=engine,
        engine_name=req.engine_name,
        config=config,
        pool_key=pool_key,
        ref_count=1,
        last_used=time.time(),
    )

    async with pool_lock:
        # Double-check: another request may have loaded the same key concurrently
        if pool_key in engine_pool:
            try:
                engine.unload_model()
            except Exception as e:
                # The duplicate is discarded either way; do not fail the request.
                log.warning(f"unload_model() failed for duplicate '{req.engine_name}': {e}")
            slot = engine_pool[pool_key]
            slot.ref_count += 1
            slot.last_used = time.time()
        else:
            engine_pool[pool_key] = slot

        session.pool_key = pool_key
        # Update compat shims
        loaded_engine = slot.engine
        loaded_engine_name = slot.engine_name
        loaded_config = slot.config

    log.info(f"Pool miss: loaded '{pool_key}' in {elapsed:.1f}s (pool size={len(engine_pool)})")
    return {"success": True, "load_time_s": round(elapsed, 2),
            "engine_name": req.engine_name, "reused": False}
1280
+
1281
+
1282
@app.get("/api/keys")
async def list_keys():
    """Keys are stored in browser localStorage only. Server has no key info.

    This endpoint always returns an empty dict — it exists for backwards compatibility.
    """
    return {}
1289
+
1290
+
1291
@app.post("/api/admin/evict-all")
async def admin_evict_all(request: Request):
    """Force-evict all engine slots from VRAM (localhost admin only).

    Every pooled engine is unloaded (failures are logged and the slot is
    dropped anyway), all sessions lose their pool key and the compat shims
    are reset.

    Returns:
        ``{"evicted": [...]}`` with the pool keys that were removed.

    Raises:
        HTTPException(403): when the caller is not 127.0.0.1 / ::1, or when
            the client address is unknown.
    """
    global loaded_engine, loaded_engine_name, loaded_config

    # Fail closed: a request without a known client address is not localhost.
    client = request.client
    if client is None or client.host not in ("127.0.0.1", "::1"):
        raise HTTPException(status_code=403, detail="localhost only")

    async with pool_lock:
        evicted = []
        # Iterate over a snapshot because slots are deleted inside the loop.
        for key, slot in list(engine_pool.items()):
            try:
                slot.engine.unload_model()
            except Exception as e:
                log.warning(f"admin evict failed for '{key}': {e}")
            del engine_pool[key]
            evicted.append(key)
        for session in sessions.values():
            session.pool_key = None
        loaded_engine = None
        loaded_engine_name = ""
        loaded_config = {}
    log.info(f"Admin force-evict: cleared {len(evicted)} slot(s): {evicted}")
    return {"evicted": evicted}
1314
+
1315
+
1316
@app.post("/api/engine/unload")
async def unload_engine(request: Request):
    """Release the calling session's engine slot.

    The slot's reference count is decremented (never below zero); when it
    reaches zero the model is unloaded and the slot is dropped from the pool.
    The session's ``pool_key`` and the compat shims are cleared in every case.
    """
    global loaded_engine, loaded_engine_name, loaded_config
    session = _get_session(request)

    async with pool_lock:
        key = session.pool_key
        held = engine_pool[key] if (key and key in engine_pool) else None
        if held is not None:
            held.ref_count = max(0, held.ref_count - 1)
            if held.ref_count == 0:
                log.info(f"Immediate eviction (explicit unload): '{held.engine_name}'")
                try:
                    held.engine.unload_model()
                except Exception as exc:
                    log.warning(f"unload_model() failed for '{held.engine_name}': {exc}")
                # Tolerate the key having vanished in the meantime.
                engine_pool.pop(key, None)
        session.pool_key = None
        # Reset the compat shims
        loaded_engine = None
        loaded_engine_name = ""
        loaded_config = {}

    return {"success": True}
1340
+
1341
+
1342
def _register_image(session: UserSession, pil_image: Image.Image, filename: str, save_path: Path) -> str:
    """Record an uploaded image under a fresh UUID and return that id.

    The same record dict is stored in both the session cache and the
    process-wide cache, so later mutations are visible through either.
    """
    new_id = str(uuid.uuid4())
    record = dict(
        path=save_path,
        xml_path=None,
        pil_image=pil_image,
        width=pil_image.width,
        height=pil_image.height,
        filename=filename,
        lines=None,
    )
    for cache in (session.image_cache, global_image_cache):
        cache[new_id] = record
    return new_id
1357
+
1358
+
1359
def _get_image_data(session: UserSession, image_id: str) -> Optional[dict]:
    """Look up an image record, tolerating missing cookies in embedded Space contexts.

    The session cache is consulted first. On a miss the process-wide cache is
    tried, and a hit there is copied into the session cache. Returns ``None``
    when the id is unknown to both.
    """
    own_cache = session.image_cache
    if image_id not in own_cache:
        shared = global_image_cache.get(image_id)
        if shared is None:
            return None
        own_cache[image_id] = shared
        return shared
    return own_cache[image_id]
1367
+
1368
+
1369
@app.post("/api/image/upload")
async def upload_image(
    request: Request,
    file: UploadFile = File(...),
    max_dim: Optional[int] = Query(default=None, ge=100, description="Resize long edge to this many pixels (mobile upload only)"),
):
    """Accept an image or PDF upload and register it with the caller's session.

    PDFs are rendered page by page (150 dpi) in a worker thread; each page is
    saved as a PNG and registered as its own image. Regular images are written
    to ``UPLOAD_DIR``, EXIF-transposed, converted to RGB and, when ``max_dim``
    is given and exceeded, shrunk in place and re-saved.

    Returns:
        For PDFs: ``{"is_pdf", "filename", "num_pages", "pages"}``.
        For images: ``{"image_id", "width", "height", "filename"}``.

    Raises:
        HTTPException: 400 for oversize files, unsupported types, missing
            PyMuPDF, PDF rendering failures or undecodable images.
    """
    session = _get_session(request)
    filename = file.filename or "upload"
    is_pdf = (
        filename.lower().endswith(".pdf") or
        (file.content_type or "").startswith("application/pdf")
    )
    image_exts = {
        ".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp", ".gif", ".webp"
    }
    is_image = (
        (file.content_type or "").startswith("image/") or
        Path(filename).suffix.lower() in image_exts
    )

    content = await file.read()
    if len(content) > 200 * 1024 * 1024:
        raise HTTPException(400, "File too large (max 200MB)")

    # ── PDF: render each page as a separate image ──────────────────────────
    if is_pdf:
        if not PDF_AVAILABLE:
            raise HTTPException(400, "PDF support requires PyMuPDF. Install with: pip install pymupdf")
        try:
            import asyncio

            def _render_pdf(data: bytes, stem: str, sess: UserSession) -> list:
                mat = _fitz.Matrix(150 / 72, 150 / 72)
                doc = _fitz.open(stream=data, filetype="pdf")
                try:
                    results = []
                    for i, page in enumerate(doc):
                        pix = page.get_pixmap(matrix=mat, colorspace=_fitz.csRGB)
                        pil_page = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                        page_filename = f"{stem}_page{i+1:03d}.png"
                        save_path = UPLOAD_DIR / f"{uuid.uuid4()}.png"
                        pil_page.save(save_path)
                        pid = _register_image(sess, pil_page, page_filename, save_path)
                        results.append({
                            "image_id": pid,
                            "filename": page_filename,
                            "width": pil_page.width,
                            "height": pil_page.height,
                            "page": i + 1,
                        })
                    return results
                finally:
                    # Release the document even when a page fails to render.
                    doc.close()

            stem = Path(filename).stem
            # Rendering is CPU-bound; keep it off the event loop.
            pages_out = await asyncio.to_thread(_render_pdf, content, stem, session)
            return {
                "is_pdf": True,
                "filename": filename,
                "num_pages": len(pages_out),
                "pages": pages_out,
            }
        except Exception as e:
            raise HTTPException(400, f"Failed to render PDF: {e}")

    # ── Regular image ───────────────────────────────────────────────────────
    if not is_image:
        raise HTTPException(400, "File must be an image or PDF")

    ext = Path(filename).suffix or ".jpg"
    save_path = UPLOAD_DIR / f"{uuid.uuid4()}{ext}"
    save_path.write_bytes(content)

    try:
        # The context manager closes the underlying file handle; the converted
        # image is an independent in-memory copy and stays usable afterwards.
        with Image.open(save_path) as opened:
            pil_image = ImageOps.exif_transpose(opened)
            pil_image = pil_image.convert("RGB")
        if max_dim and max(pil_image.width, pil_image.height) > max_dim:
            pil_image.thumbnail((max_dim, max_dim), Image.LANCZOS)
            pil_image.save(save_path)
    except Exception as e:
        # Do not leave an undecodable upload on disk.
        save_path.unlink(missing_ok=True)
        raise HTTPException(400, f"Invalid image: {e}")

    image_id = _register_image(session, pil_image, filename, save_path)
    return {
        "image_id": image_id,
        "width": pil_image.width,
        "height": pil_image.height,
        "filename": filename,
    }
1461
+
1462
+
1463
@app.post("/api/image/{image_id}/xml")
async def upload_xml(request: Request, image_id: str, file: UploadFile = File(...)):
    """Store a PAGE XML upload next to an already-uploaded image.

    The XML is written to ``UPLOAD_DIR/<image_id>.xml`` and its path is
    recorded on the image record. Rejects unknown images (404) and payloads
    larger than 10 MB (400).
    """
    record = _get_image_data(_get_session(request), image_id)
    if record is None:
        raise HTTPException(404, "Image not found — upload image first")

    payload = await file.read()
    size_limit = 10 * 1024 * 1024
    if len(payload) > size_limit:
        raise HTTPException(400, "XML too large (max 10MB)")

    target = UPLOAD_DIR / f"{image_id}.xml"
    target.write_bytes(payload)
    record["xml_path"] = target
    return {"success": True, "filename": file.filename}
1477
+
1478
+
1479
@app.get("/api/image/{image_id}")
async def get_image(request: Request, image_id: str):
    """Serve the stored file of an uploaded image, or 404 when the id is unknown."""
    record = _get_image_data(_get_session(request), image_id)
    if record is not None:
        return FileResponse(str(record["path"]))
    raise HTTPException(404, "Image not found")
1486
+
1487
+
1488
@app.get("/api/image/{image_id}/info")
async def image_info(request: Request, image_id: str):
    """Return filename, pixel size and PAGE-XML presence for an uploaded image."""
    record = _get_image_data(_get_session(request), image_id)
    if record is None:
        raise HTTPException(404, "Image not found")

    info = {"image_id": image_id}
    for field in ("filename", "width", "height"):
        info[field] = record[field]
    info["has_xml"] = record["xml_path"] is not None
    return info
1501
+
1502
+
1503
async def _run_segmentation(img_data: dict, method: str, device: str = "cpu",
                            max_columns: int = 6,
                            split_width_fraction: float = 0.40,
                            text_direction: str = "horizontal-lr") -> dict:
    """
    Shared segmentation helper. Runs the appropriate segmenter, stores
    results in img_data, and returns a serialisable dict ready for SSE or JSON.
    Also populates img_data["line_regions"] with a per-line region index list
    so the transcription loop can tag each line with its column.

    Segmenter selection:
      * an attached PAGE XML (img_data["xml_path"]) always wins over ``method``;
      * "kraken-blla" and "kraken" use KrakenLineSegmenter;
      * anything else falls through to the HPP LineSegmenter.
    In the hosted demo (DEMO_MODE == "hf_space") "kraken-blla" is downgraded to
    CPU "kraken", and Kraken failures or empty results fall back to the built-in
    HPP demo segmenter.

    Returns:
        {"num_lines", "bboxes", "source"} plus "regions" when any were found.
    """
    if DEMO_MODE == "hf_space" and method == "kraken-blla":
        method = "kraken"
        device = "cpu"
    pil_image = img_data["pil_image"]
    xml_path = img_data.get("xml_path")

    if DEMO_MODE == "hf_space" and xml_path is None and method == "hpp":
        return await asyncio.to_thread(_run_demo_hpp_segmentation, img_data)

    _import_segmenters()

    regions: list = []
    lines: list = []

    xml_region_data: list = []  # TextRegion bboxes from PAGE XML (for visualization)
    if xml_path is not None:
        from inference_page import PageXMLSegmenter as _PXSeg
        segmenter = _PXSeg(str(xml_path))
        lines = await asyncio.to_thread(segmenter.segment_lines, pil_image)
        source = "pagexml"
        xml_region_data = getattr(segmenter, 'region_data', []) or []

    elif method == "kraken-blla":
        segmenter = KrakenLineSegmenter(device=device)
        regions, lines = await asyncio.to_thread(
            segmenter.segment_with_regions, pil_image,
            device=device,
            max_columns=max_columns,
            split_width_fraction=split_width_fraction,
            text_direction=text_direction,
        )
        source = "kraken-blla"

    elif method == "kraken":
        try:
            segmenter = KrakenLineSegmenter()
            # Use column-aware segmentation so multi-column pages read correctly
            regions, lines = await asyncio.to_thread(
                segmenter.segment_classical_with_regions, pil_image,
                max_columns=max_columns,
            )
            source = "kraken"
        except Exception as exc:
            if DEMO_MODE == "hf_space":
                log.warning("Kraken segmentation failed in HF Space; falling back to HPP: %s", exc)
                return await asyncio.to_thread(_run_demo_hpp_segmentation, img_data, "hpp-fallback")
            raise

    else:  # hpp
        segmenter = LineSegmenter()
        lines = await asyncio.to_thread(segmenter.segment_lines, pil_image)
        source = "hpp"

    if DEMO_MODE == "hf_space" and method == "kraken" and not lines:
        log.warning("Kraken returned no lines in HF Space; falling back to HPP")
        return await asyncio.to_thread(_run_demo_hpp_segmentation, img_data, "hpp-fallback")

    # Build per-line region index (used by transcription loop for column view):
    # one entry per line id, in region order; a single region 0 when the
    # segmenter reported no regions.
    line_regions: list[int]
    if regions:
        line_regions = [ri for ri, r in enumerate(regions) for _ in r.line_ids]
    else:
        line_regions = [0] * len(lines)

    img_data["lines"] = lines
    img_data["line_regions"] = line_regions
    img_data["seg_source"] = source
    # PAGE XML provides region bboxes directly; Kraken/blla provide SegRegion objects
    if xml_region_data:
        img_data["seg_regions"] = xml_region_data
    elif regions:
        img_data["seg_regions"] = [
            {"id": r.id, "bbox": list(r.bbox), "num_lines": len(r.line_ids)}
            for r in regions
        ]
    else:
        img_data["seg_regions"] = []

    result: dict = {
        "num_lines": len(lines),
        "bboxes": [list(l.bbox) for l in lines],
        "source": source,
    }
    if img_data["seg_regions"]:
        result["regions"] = img_data["seg_regions"]
    return result
1604
+
1605
+
1606
+ def _run_demo_hpp_segmentation(img_data: dict, source: str = "hpp") -> dict:
1607
+ """Small dependency-light line segmenter for the hosted CPU demo fallback."""
1608
+ pil_image = img_data["pil_image"]
1609
+ gray = np.array(pil_image.convert("L"))
1610
+ if gray.size == 0:
1611
+ lines = []
1612
+ else:
1613
+ threshold = min(220, max(90, float(np.percentile(gray, 42))))
1614
+ ink = gray < threshold
1615
+ row_density = ink.mean(axis=1)
1616
+ kernel = np.ones(9, dtype=np.float32) / 9.0
1617
+ smooth = np.convolve(row_density, kernel, mode="same")
1618
+ active_threshold = max(0.01, float(smooth.max()) * 0.13)
1619
+ min_height = max(10, int(pil_image.height * 0.008))
1620
+
1621
+ bands = []
1622
+ start = None
1623
+ for y, value in enumerate(smooth):
1624
+ if value > active_threshold and start is None:
1625
+ start = y
1626
+ elif (value <= active_threshold or y == len(smooth) - 1) and start is not None:
1627
+ end = y if y == len(smooth) - 1 else y - 1
1628
+ if end - start + 1 >= min_height:
1629
+ bands.append((start, end))
1630
+ start = None
1631
+
1632
+ lines = []
1633
+ for y1, y2 in bands[:100]:
1634
+ pad_y = max(3, int((y2 - y1 + 1) * 0.25))
1635
+ top = max(0, y1 - pad_y)
1636
+ bottom = min(pil_image.height, y2 + pad_y + 1)
1637
+ band_ink = ink[top:bottom, :]
1638
+ cols = np.where(band_ink.any(axis=0))[0]
1639
+ if cols.size:
1640
+ left = max(0, int(cols[0]) - 8)
1641
+ right = min(pil_image.width, int(cols[-1]) + 9)
1642
+ else:
1643
+ left = 0
1644
+ right = pil_image.width
1645
+ bbox = (left, top, right, bottom)
1646
+ lines.append(SimpleNamespace(
1647
+ image=pil_image.crop(bbox),
1648
+ bbox=bbox,
1649
+ coords=None,
1650
+ ))
1651
+
1652
+ img_data["lines"] = lines
1653
+ img_data["line_regions"] = [0] * len(lines)
1654
+ img_data["seg_source"] = source
1655
+ img_data["seg_regions"] = []
1656
+ return {
1657
+ "num_lines": len(lines),
1658
+ "bboxes": [list(line.bbox) for line in lines],
1659
+ "source": source,
1660
+ }
1661
+
1662
+
1663
@app.delete("/api/image/{image_id}/region/{region_index}")
async def delete_region(request: Request, image_id: str, region_index: int):
    """
    Drop one detected region, together with its lines, from the cached
    segmentation and renumber the regions that follow it.
    The response has the same shape as /segment so the client can redraw
    the canvas.
    """
    img_data = _get_image_data(_get_session(request), image_id)
    if img_data is None:
        raise HTTPException(404, "Image not found")

    seg_regions = img_data.get("seg_regions") or []
    if not seg_regions:
        raise HTTPException(400, "No segmentation data — run Segment first")
    if not (0 <= region_index < len(seg_regions)):
        raise HTTPException(400, f"Region index out of range (0–{len(seg_regions)-1})")

    lines = img_data.get("lines") or []
    line_regions = img_data.get("line_regions") or ([0] * len(lines))

    # Survivors keep their order; regions after the deleted one shift down by one.
    survivors = [
        (line, region)
        for line, region in zip(lines, line_regions)
        if region != region_index
    ]
    new_lines = [line for line, _ in survivors]
    new_line_regions = [
        region if region < region_index else region - 1
        for _, region in survivors
    ]
    new_regions = [
        region for position, region in enumerate(seg_regions)
        if position != region_index
    ]

    img_data.update(lines=new_lines, line_regions=new_line_regions, seg_regions=new_regions)

    result: dict = {
        "num_lines": len(new_lines),
        "bboxes": [list(line.bbox) for line in new_lines],
        "source": img_data.get("seg_source", "modified"),
    }
    if new_regions:
        result["regions"] = new_regions
    return result
1707
+
1708
+
1709
@app.get("/api/image/{image_id}/segment")
async def segment_image(
    request: Request,
    image_id: str,
    method: str = "kraken",
    device: str = "cpu",
    max_columns: int = 6,
    split_width_fraction: float = 0.40,
    text_direction: str = "horizontal-lr",
):
    """
    Segment an uploaded image without transcribing it and return the line
    bounding boxes as JSON, e.g. to preview the layout before transcription.
    Unknown images yield 404; any segmentation error is reported as 500.
    """
    img_data = _get_image_data(_get_session(request), image_id)
    if img_data is None:
        raise HTTPException(404, "Image not found — upload first")

    try:
        layout = await _run_segmentation(
            img_data, method, device,
            max_columns, split_width_fraction, text_direction,
        )
    except Exception as e:
        raise HTTPException(500, f"Segmentation failed: {e}")
    return layout
1733
+
1734
+
1735
@app.post("/api/transcribe")
async def transcribe(request: Request, req: TranscribeRequest):
    """Transcribe an uploaded image and stream progress as Server-Sent Events.

    The engine is taken from the session's pool slot, or from the module-level
    compat shims when the session holds no slot. Segmentation is reused from
    the image record when its source matches the request, otherwise it is run
    first. Lines are then transcribed one at a time.

    Events emitted on the stream: "status", "segmentation", "progress" (one per
    line), then exactly one of "complete", "cancelled" or "error".

    Raises:
        HTTPException: 400 when no loaded engine is available, 404 when the
            image id is unknown. Both are raised before streaming starts.
    """
    session = _get_session(request)

    # Resolve engine from session's pool slot
    if not session.pool_key or session.pool_key not in engine_pool:
        # Fallback: check compat shims (e.g. auto-loaded engine, no session yet)
        if not loaded_engine or not loaded_engine.is_model_loaded():
            raise HTTPException(400, "No engine loaded")
    slot = engine_pool.get(session.pool_key) if session.pool_key else None
    # Build effective engine/config references
    eff_engine = slot.engine if slot else loaded_engine
    _base_config = slot.config if slot else loaded_config
    # Merge live form overrides into a copy of the stored config so changes to
    # runtime-only fields (custom_prompt, thinking_mode, temperature, …) take
    # effect without requiring a model reload. Never overwrite security-sensitive
    # keys that were set during load (api_key, provider, model, model_path, …).
    _RELOAD_ONLY_KEYS = {"api_key", "provider", "model", "model_path", "model_source",
                         "base_model", "adapter", "model_name", "preset_id", "lang",
                         "use_gpu", "venv_path"}
    if req.engine_config_overrides:
        eff_config = dict(_base_config)
        for k, v in req.engine_config_overrides.items():
            if k not in _RELOAD_ONLY_KEYS:
                eff_config[k] = v
    else:
        # No overrides: the stored config object itself is passed to the engine.
        eff_config = _base_config
    eff_engine_name = slot.engine_name if slot else loaded_engine_name

    if not eff_engine or not eff_engine.is_model_loaded():
        raise HTTPException(400, "No engine loaded")

    img_data = _get_image_data(session, req.image_id)
    if img_data is None:
        raise HTTPException(404, "Image not found — upload first")

    pil_image = img_data["pil_image"]

    # Per-request cancel event (replaces global cancel_event)
    request_id = str(uuid.uuid4())
    cancel_evt = asyncio.Event()
    session.cancel_events[request_id] = cancel_evt

    async def event_stream():
        """Yield the SSE frames for this request (segmentation, then one per line)."""
        _import_segmenters()

        try:
            # --- Segmentation ---
            # The attached PAGE XML is only considered when the request asks for it.
            xml_path = img_data.get("xml_path") if req.use_pagexml else None

            if not eff_engine.requires_line_segmentation() and not xml_path:
                # Page-level engine with no PAGE XML — send whole page as single line
                from inference_page import LineSegment
                lines = [LineSegment(
                    image=pil_image,
                    bbox=(0, 0, pil_image.width, pil_image.height),
                    coords=None,
                )]
                img_data["lines"] = lines
                img_data["line_regions"] = [0]
                img_data["seg_source"] = "page"
                img_data["seg_regions"] = []
                yield _sse("segmentation", {
                    "num_lines": 1,
                    "bboxes": [[0, 0, pil_image.width, pil_image.height]],
                    "source": "page",
                })
            else:
                # Reuse cached segmentation if method matches (e.g. user clicked Segment first)
                cached_lines = img_data.get("lines")
                cached_source = img_data.get("seg_source")
                desired_source = "pagexml" if (xml_path and req.use_pagexml) else req.seg_method

                if cached_lines and cached_source == desired_source:
                    lines = cached_lines
                    yield _sse("status", {"message": "Using cached segmentation..."})
                    seg_event: dict = {
                        "num_lines": len(lines),
                        "bboxes": [list(l.bbox) for l in lines],
                        "source": cached_source,
                    }
                    if img_data.get("seg_regions"):
                        seg_event["regions"] = img_data["seg_regions"]
                    yield _sse("segmentation", seg_event)
                elif xml_path is not None:
                    yield _sse("status", {"message": "Reading line layout from PAGE XML..."})
                    seg_result = await _run_segmentation(img_data, "pagexml",
                                                         req.seg_device, req.max_columns,
                                                         req.split_width_fraction,
                                                         req.text_direction)
                    lines = img_data["lines"]
                    yield _sse("segmentation", seg_result)
                else:
                    yield _sse("status", {"message": f"Segmenting with {req.seg_method}..."})
                    seg_result = await _run_segmentation(img_data, req.seg_method,
                                                         req.seg_device, req.max_columns,
                                                         req.split_width_fraction,
                                                         req.text_direction)
                    lines = img_data["lines"]
                    yield _sse("segmentation", seg_result)

            # --- Transcription ---
            results = []
            token_usage: Dict[str, Any] = {}
            start_time = time.time()
            line_regions = img_data.get("line_regions") or ([0] * len(lines))

            for i, line in enumerate(lines):
                # Check for cancellation before each line
                if cancel_evt.is_set():
                    yield _sse("cancelled", {})
                    return

                # Lines without a pre-cut image are cropped from the page by bbox.
                line_img = line.image if line.image is not None else pil_image.crop(line.bbox)
                img_array = np.array(line_img.convert("RGB"))

                # Use slot lock to serialize access to this engine instance
                if slot:
                    async with slot.lock:
                        slot.last_used = time.time()
                        result = await asyncio.to_thread(
                            eff_engine.transcribe_line, img_array, eff_config
                        )
                else:
                    result = await asyncio.to_thread(
                        eff_engine.transcribe_line, img_array, eff_config
                    )

                text = str(result.text) if hasattr(result, "text") else str(result)
                confidence = None
                if hasattr(result, "confidence") and result.confidence is not None:
                    confidence = float(result.confidence)
                    # Values above 1 are divided by 100 — presumably percentages
                    # reported by some engines; confirm against the engine code.
                    if confidence > 1:
                        confidence = confidence / 100.0
                # Accumulate token usage and extract thinking text from API engines (e.g. Gemini)
                thinking_text = None
                if hasattr(result, "metadata") and isinstance(result.metadata, dict):
                    tu = result.metadata.get("token_usage")
                    if tu:
                        for k, v in tu.items():
                            if v is not None:
                                token_usage[k] = token_usage.get(k, 0) + v
                    thinking_text = result.metadata.get("thinking_text")

                line_data = {
                    "index": i,
                    "text": text,
                    "confidence": confidence,
                    "bbox": list(line.bbox),
                    "region": line_regions[i] if i < len(line_regions) else 0,
                }
                if thinking_text:
                    line_data["thinking_text"] = thinking_text
                results.append(line_data)
                progress_data: Dict[str, Any] = {
                    "current": i + 1,
                    "total": len(lines),
                    "line": line_data,
                }
                if token_usage:
                    # Copy so the emitted totals are a snapshot of the running sum.
                    progress_data["token_usage"] = dict(token_usage)
                yield _sse("progress", progress_data)

                # Check for cancellation after each line's progress event
                if cancel_evt.is_set():
                    yield _sse("cancelled", {})
                    return

            # Store completed results in session image_cache for export
            img_data["results"] = results

            elapsed = time.time() - start_time
            complete_data: Dict[str, Any] = {
                "lines": results,
                "total_time_s": round(elapsed, 2),
                "engine": eff_engine_name,
            }
            if token_usage:
                complete_data["token_usage"] = token_usage
            yield _sse("complete", complete_data)

        except Exception as e:
            # Failures after streaming has begun are reported in-band as an "error" event.
            log.exception("Transcription error")
            yield _sse("error", {"message": str(e)})
        finally:
            # Clean up this request's cancel event
            session.cancel_events.pop(request_id, None)

    return StreamingResponse(
        event_stream(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "X-Accel-Buffering": "no",  # Disable nginx buffering if behind proxy
        },
    )
1931
+
1932
+
1933
@app.post("/api/transcribe/cancel")
async def cancel_transcription(request: Request):
    """Ask every transcription currently running for this session to stop."""
    pending = _get_session(request).cancel_events
    for request_id in pending:
        pending[request_id].set()
    return {"success": True}
1940
+
1941
+
1942
@app.post("/api/image/{image_id}/export-xml")
async def export_xml(request: Request, image_id: str):
    """Export transcription results for image_id as PAGE XML.

    The download name is derived from the uploaded file's stem. Because that
    name is user-controlled, the Content-Disposition header carries a sanitised
    ASCII ``filename`` plus a percent-encoded UTF-8 ``filename*`` parameter.
    Response headers are Latin-1 encoded, so a raw non-Latin-1 stem (e.g. a
    Cyrillic filename) would otherwise fail, and an embedded quote would
    corrupt the header.

    Raises:
        HTTPException: propagated from ``_build_xml_bytes`` (unknown image or
            no results yet).
    """
    from urllib.parse import quote

    session = _get_session(request)
    pretty, stem = _build_xml_bytes(session, image_id)

    download_name = f"{stem}.xml"
    # Printable ASCII only, without the characters that would end or escape
    # the quoted-string.
    ascii_fallback = "".join(
        ch if (" " <= ch <= "~" and ch not in '"\\') else "_"
        for ch in download_name
    )
    encoded_name = quote(download_name, safe="")
    disposition = 'attachment; filename="{}"; filename*=UTF-8\'\'{}'.format(
        ascii_fallback, encoded_name
    )
    return Response(
        content=pretty,
        media_type="application/xml",
        headers={"Content-Disposition": disposition},
    )
1952
+
1953
+
1954
def _build_xml_bytes(session: UserSession, image_id: str) -> tuple[bytes, str]:
    """Render the stored transcription of an image as pretty-printed PAGE XML.

    All lines are placed in a single TextRegion whose Coords are the bounding
    box of every line. Returns ``(xml_bytes, stem)`` where ``stem`` is the
    filename without its extension.

    Raises:
        HTTPException: 404 for an unknown image, 400 when it has no results.
    """
    import xml.etree.ElementTree as ET
    from xml.dom import minidom
    from page_xml_exporter import PageXMLExporter

    img_data = _get_image_data(session, image_id)
    if img_data is None:
        raise HTTPException(404, f"Image {image_id} not found")
    results = img_data.get("results")
    if not results:
        raise HTTPException(400, f"No results for {image_id}")

    filename = img_data.get("filename", img_data["path"].name)
    width = img_data["width"]
    height = img_data["height"]

    class _SegProxy:
        """Minimal line object handed to the exporter's line writer."""
        __slots__ = ("bbox", "coords", "text", "confidence")

        def __init__(self, row):
            box = row.get("bbox")
            # Lines without a bbox span the whole page.
            self.bbox = tuple(box) if box else (0, 0, width, height)
            self.coords = None
            self.text = row.get("text", "")
            self.confidence = row.get("confidence")

    segments = [_SegProxy(row) for row in results]
    exporter = PageXMLExporter(str(filename), width, height)
    root, page = exporter._make_root("Polyscriptor Web UI", None)

    reading_order = ET.SubElement(page, 'ReadingOrder')
    ordered_group = ET.SubElement(
        reading_order, 'OrderedGroup',
        {'id': 'ro_1', 'caption': 'Regions reading order'},
    )
    ET.SubElement(ordered_group, 'RegionRefIndexed', {'index': '0', 'regionRef': 'region_1'})

    text_region = ET.SubElement(
        page, 'TextRegion',
        {'id': 'region_1', 'type': 'paragraph', 'custom': 'readingOrder {index:0;}'},
    )
    if segments:
        boxes = [seg.bbox for seg in segments]
        x1 = min(b[0] for b in boxes)
        y1 = min(b[1] for b in boxes)
        x2 = max(b[2] for b in boxes)
        y2 = max(b[3] for b in boxes)
        coords = ET.SubElement(text_region, 'Coords')
        coords.set('points', f'{x1},{y1} {x2},{y1} {x2},{y2} {x1},{y2}')
    for idx, seg in enumerate(segments):
        exporter._add_text_line(text_region, f'line_{idx + 1}', seg, seg.text, idx)

    raw_xml = ET.tostring(root, encoding='utf-8', method='xml')
    pretty = minidom.parseString(raw_xml).toprettyxml(indent=' ', encoding='utf-8')
    return pretty, Path(filename).stem
2003
+
2004
+
2005
def _build_thinking_bytes(session: UserSession, image_id: str) -> tuple[bytes, str]:
    """Collect the per-line thinking text of an image as UTF-8 bytes.

    Returns ``(thinking_bytes, stem)``. When the image has more than one result
    line, each block is prefixed with a ``=== Line N ===`` header.

    Raises:
        HTTPException: 404 for an unknown image or when no line carries
            thinking text, 400 when the image has no results.
    """
    img_data = _get_image_data(session, image_id)
    if img_data is None:
        raise HTTPException(404, f"Image {image_id} not found")
    results = img_data.get("results")
    if not results:
        raise HTTPException(400, f"No results for {image_id}")

    filename = img_data.get("filename", img_data["path"].name)
    stem = Path(filename).stem
    multi_line = len(results) > 1

    blocks = []
    for number, row in enumerate(results, start=1):
        thought = row.get("thinking_text", "")
        if not thought:
            continue
        blocks.append(f"=== Line {number} ===\n{thought}" if multi_line else thought)

    if not blocks:
        raise HTTPException(404, f"No thinking text for {image_id}")
    return "\n\n".join(blocks).encode("utf-8"), stem
2026
+
2027
+
2028
def _build_txt_bytes(session: UserSession, image_id: str) -> tuple[bytes, str]:
    """Join the transcribed lines of an image into newline-separated UTF-8 text.

    Returns ``(txt_bytes, stem)``.

    Raises:
        HTTPException: 404 for an unknown image, 400 when it has no results.
    """
    img_data = _get_image_data(session, image_id)
    if img_data is None:
        raise HTTPException(404, f"Image {image_id} not found")
    results = img_data.get("results")
    if not results:
        raise HTTPException(400, f"No results for {image_id}")

    filename = img_data.get("filename", img_data["path"].name)
    line_texts = [row.get("text", "") for row in results]
    encoded = "\n".join(line_texts).encode("utf-8")
    return encoded, Path(filename).stem
2039
+
2040
+
2041
class BatchXMLRequest(BaseModel):
    """Request body shared by the /api/batch/export-* endpoints."""
    # Ids of the uploaded images whose results should go into the archive.
    image_ids: list[str]
2043
+
2044
+
2045
def _zip_export_response(session, image_ids, build_bytes, entry_suffix, archive_name):
    """Build a ZIP download with one entry per image that ``build_bytes`` can export.

    Args:
        session: the caller's session, passed through to ``build_bytes``.
        image_ids: ids to export, in archive order.
        build_bytes: callable ``(session, image_id) -> (payload_bytes, stem)``
            that raises HTTPException for images it cannot export.
        entry_suffix: appended to each stem to form the entry name.
        archive_name: filename announced in the Content-Disposition header.

    Images that cannot be exported are skipped on purpose (best-effort batch).
    Entry names are made unique: two uploads with the same filename would
    otherwise create duplicate ZIP entries, one of which is lost on extraction.
    """
    import io
    import zipfile

    buf = io.BytesIO()
    used_names = set()
    with zipfile.ZipFile(buf, 'w', zipfile.ZIP_DEFLATED) as zf:
        for image_id in image_ids:
            try:
                payload, stem = build_bytes(session, image_id)
            except HTTPException:
                continue  # nothing to export for this image
            entry_name = f"{stem}{entry_suffix}"
            attempt = 1
            while entry_name in used_names:
                attempt += 1
                entry_name = f"{stem}_{attempt}{entry_suffix}"
            used_names.add(entry_name)
            zf.writestr(entry_name, payload)
    return Response(
        content=buf.getvalue(),
        media_type="application/zip",
        headers={"Content-Disposition": f'attachment; filename="{archive_name}"'},
    )


@app.post("/api/batch/export-thinking")
async def batch_export_thinking(request: Request, req: BatchXMLRequest):
    """Return a ZIP archive containing one thinking-text file per image (skips pages without thinking)."""
    session = _get_session(request)
    return _zip_export_response(
        session, req.image_ids, _build_thinking_bytes,
        "_thinking.txt", "batch_thinking.zip",
    )


@app.post("/api/batch/export-txt")
async def batch_export_txt(request: Request, req: BatchXMLRequest):
    """Return a ZIP archive containing one plain-text file per image (skips images without results)."""
    session = _get_session(request)
    return _zip_export_response(
        session, req.image_ids, _build_txt_bytes,
        ".txt", "batch_export_txt.zip",
    )


@app.post("/api/batch/export-xml")
async def batch_export_xml(request: Request, req: BatchXMLRequest):
    """Return a ZIP archive containing one PAGE XML file per image (skips images without results)."""
    session = _get_session(request)
    return _zip_export_response(
        session, req.image_ids, _build_xml_bytes,
        ".xml", "batch_export.zip",
    )
2106
+
2107
+
2108
@app.get("/api/session")
async def session_info(request: Request):
    """Summarise the caller's session for debugging; the session id is truncated."""
    session = _get_session(request)
    masked_id = session.session_id[:8] + "..."
    summary = {
        "session_id": masked_id,
        "images": len(session.image_cache),
        "active_transcriptions": len(session.cancel_events),
        "pool_key": session.pool_key,
    }
    summary["created_at"] = session.created_at
    summary["last_active"] = session.last_active
    summary["total_sessions"] = len(sessions)
    return summary
2121
+
2122
+
2123
@app.get("/api/engine/pool")
async def pool_status():
    """Report every slot in the engine pool plus session count (admin/debug endpoint)."""
    slots = [
        {
            "pool_key": key,
            "engine_name": slot.engine_name,
            "ref_count": slot.ref_count,
            "loaded": slot.engine.is_model_loaded(),
            "last_used": slot.last_used,
            # Whole seconds since the slot was last used.
            "age_s": round(time.time() - slot.last_used, 0),
        }
        for key, slot in engine_pool.items()
    ]
    return {
        "pool_size": len(engine_pool),
        "slots": slots,
        "total_sessions": len(sessions),
    }
2141
+
2142
+
2143
@app.get("/api/kraken/presets")
async def kraken_presets():
    """List the Kraken model presets known to the Kraken engine module.

    Answers with an empty list when that module cannot be imported.
    """
    try:
        from engines.kraken_engine import KRAKEN_MODELS
    except ImportError:
        return {"presets": []}

    def _describe(model_id, info):
        return {
            "id": model_id,
            "label": info.get("description", model_id),
            "language": info.get("language", ""),
            "source": info.get("source", ""),
        }

    return {"presets": [_describe(mid, meta) for mid, meta in KRAKEN_MODELS.items()]}
2159
+
2160
+
2161
@app.post("/api/models/upload")
async def upload_model(file: UploadFile = File(...)):
    """Save an uploaded Kraken .mlmodel file under models/kraken_uploads/.

    Only names ending in ".mlmodel" and payloads up to 500 MB are accepted.
    The response carries the stored path relative to the project root, the
    sanitised filename, the size, and a refreshed model list.
    """
    filename = file.filename or "model.mlmodel"
    if not filename.lower().endswith(".mlmodel"):
        raise HTTPException(400, "Only .mlmodel files are accepted")

    content = await file.read()
    size = len(content)
    if size > 500 * 1024 * 1024:
        raise HTTPException(400, "File too large (max 500 MB)")

    upload_dir = PROJECT_ROOT / "models" / "kraken_uploads"
    upload_dir.mkdir(parents=True, exist_ok=True)

    # Sanitize filename: drop any directory part, then keep only
    # alphanumerics and ". _ -" plus spaces.
    allowed_extra = "._- "
    base_name = Path(filename).name
    kept = [ch for ch in base_name if ch.isalnum() or ch in allowed_extra]
    safe_name = "".join(kept).strip() or "uploaded.mlmodel"

    dest = upload_dir / safe_name
    dest.write_bytes(content)
    log.info(f"Uploaded Kraken model: {dest} ({len(content)} bytes)")

    # e.g. models/kraken_uploads/foo.mlmodel
    rel_path = str(dest.relative_to(PROJECT_ROOT))
    return {
        "path": rel_path,
        "filename": safe_name,
        "size": size,
        # refreshed list for frontend to repopulate select
        "options": _scan_kraken_models(),
    }
2191
+
2192
+
2193
@app.get("/api/gpu")
async def gpu_status():
    """Report CUDA availability and per-GPU memory, plus utilisation when NVML is usable.

    Returns ``{"available": False, "gpus": []}`` when torch is missing, CUDA
    is unavailable, or anything fails while querying. Otherwise ``gpus`` holds
    one entry per CUDA device with name and memory figures in MB
    (bytes / 1e6); utilisation percentages are added only for device indices
    that NVML reported.
    """
    try:
        import torch
        if not torch.cuda.is_available():
            return {"available": False, "gpus": []}

        # pynvml (nvidia-ml-py) for utilization %; graceful fallback if missing
        nvml_utils: dict[int, dict] = {}
        try:
            import pynvml
            pynvml.nvmlInit()
            try:
                for _i in range(pynvml.nvmlDeviceGetCount()):
                    h = pynvml.nvmlDeviceGetHandleByIndex(_i)
                    u = pynvml.nvmlDeviceGetUtilizationRates(h)
                    nvml_utils[_i] = {"gpu_pct": u.gpu, "mem_pct": u.memory}
            finally:
                # Pair every nvmlInit() with a shutdown so repeated polling of
                # this endpoint does not keep accumulating NVML initialisations.
                pynvml.nvmlShutdown()
        except Exception:
            pass  # pynvml unavailable — utilization fields omitted

        gpus = []
        for i in range(torch.cuda.device_count()):
            free, total = torch.cuda.mem_get_info(i)
            entry: dict = {
                "index": i,
                "name": torch.cuda.get_device_name(i),
                "memory_total_mb": round(total / 1e6),
                "memory_used_mb": round((total - free) / 1e6),
                "memory_free_mb": round(free / 1e6),
            }
            # NOTE(review): assumes NVML and CUDA enumerate devices in the same
            # order — confirm for setups that restrict visible devices.
            if i in nvml_utils:
                entry["utilization_gpu_pct"] = nvml_utils[i]["gpu_pct"]
                entry["utilization_mem_pct"] = nvml_utils[i]["mem_pct"]
            gpus.append(entry)
        return {"available": True, "gpus": gpus}
    except Exception:
        return {"available": False, "gpus": []}
2229
+
2230
+
2231
+ # ---------------------------------------------------------------------------
2232
+ # Helpers
2233
+ # ---------------------------------------------------------------------------
2234
+
2235
+ def _sse(event: str, data: dict) -> str:
2236
+ """Format a Server-Sent Event."""
2237
+ return f"event: {event}\ndata: {json.dumps(data, ensure_ascii=False)}\n\n"
web/server_config.yaml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Polyscriptor Web UI — server startup configuration
2
+ # Uncomment and adjust to auto-load an engine on server start.
3
+ #
4
+ # Usage:
5
+ # python -m uvicorn web.polyscriptor_server:app --host 0.0.0.0 --port 8765
6
+ #
7
+ # The server will load the specified engine at startup so the first
8
+ # transcription request doesn't need to wait for model loading.
9
+
10
+ # --- Auto-load (optional) ---
11
+ # Set default_engine to the engine name shown in the UI dropdown.
12
+ # Leave blank or comment out to start without a loaded model.
13
+
14
+ # Example: auto-load Church Slavonic CRNN-CTC model
15
+ # default_engine: "CRNN-CTC (PyLaia-inspired)"
16
+ # default_config:
17
+ # model_path: "Church Slavonic (2.89% CER)"
18
+ # enable_spaces: true
19
+
20
+ # Example: auto-load TrOCR from HuggingFace
21
+ # default_engine: "TrOCR"
22
+ # default_config:
23
+ # model_path: "kazars24/trocr-base-handwritten-ru"
24
+ # num_beams: 4
25
+ # normalize_background: false
web/static/app.css ADDED
@@ -0,0 +1,1269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* ── Self-hosted fonts ─── */
2
+ @font-face {
3
+ font-family: 'Monomakh';
4
+ src: url('fonts/MonomakhUnicode-Regular.woff2') format('woff2');
5
+ font-weight: normal;
6
+ font-style: normal;
7
+ font-display: swap;
8
+ unicode-range: U+0000-007F, U+0080-00FF, U+0300-036F, U+0400-04FF,
9
+ U+0500-052F, U+1C80-1C8F, U+2DE0-2DFF, U+A640-A69F, U+FE2E-FE2F;
10
+ }
11
+
12
+ /* ── Design tokens ───────────────────────────────────────────────────── */
13
+ :root {
14
+ --bg: #111827;
15
+ --bg-panel: #1f2937;
16
+ --bg-section: #1a2333;
17
+ --bg-input: #111827;
18
+ --bg-hover: #2a3a52;
19
+ --text: #e2e8f0;
20
+ --text-muted: #64748b;
21
+ --text-dim: #94a3b8;
22
+ --accent: #e94560;
23
+ --accent-hover:#ff6b81;
24
+ --primary: #3b82f6;
25
+ --primary-hover:#60a5fa;
26
+ --success: #22c55e;
27
+ --warning: #f59e0b;
28
+ --danger: #ef4444;
29
+ --border: #2d3f59;
30
+ --border-light:#3a4f6e;
31
+ --radius: 6px;
32
+ --radius-lg: 10px;
33
+ --font: 'Segoe UI', system-ui, -apple-system, sans-serif;
34
+ --font-mono: 'Consolas', 'Fira Code', 'Cascadia Code', monospace;
35
+ --header-h: 44px;
36
+ --tabs-h: 56px;
37
+ --shadow: 0 4px 20px rgba(0,0,0,0.4);
38
+ }
39
+
40
+ /* ── Reset & base ───────────────────────────────────────────────────── */
41
+ *, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
42
+
43
+ html { height: 100%; }
44
+
45
+ body {
46
+ font-family: var(--font);
47
+ background: var(--bg);
48
+ color: var(--text);
49
+ height: 100%;
50
+ display: flex;
51
+ flex-direction: column;
52
+ overflow: hidden;
53
+ }
54
+
55
+ /* ── Header ─────────────────────────────────────────────────────────── */
56
+ #header {
57
+ height: var(--header-h);
58
+ display: flex;
59
+ align-items: center;
60
+ justify-content: space-between;
61
+ padding: 0 14px;
62
+ background: var(--bg-panel);
63
+ border-bottom: 1px solid var(--border);
64
+ flex-shrink: 0;
65
+ gap: 12px;
66
+ }
67
+
68
+ .header-left {
69
+ display: flex;
70
+ align-items: center;
71
+ gap: 8px;
72
+ min-width: 0;
73
+ }
74
+
75
+ .header-logo {
76
+ font-size: 1.3rem;
77
+ color: var(--primary);
78
+ line-height: 1;
79
+ }
80
+
81
+ #header h1 {
82
+ font-size: 1rem;
83
+ font-weight: 700;
84
+ letter-spacing: 0.3px;
85
+ white-space: nowrap;
86
+ }
87
+
88
+ .header-sub {
89
+ font-weight: 400;
90
+ color: var(--text-muted);
91
+ font-size: 0.9rem;
92
+ letter-spacing: 2px;
93
+ margin-left: 2px;
94
+ }
95
+
96
+ .header-right {
97
+ display: flex;
98
+ align-items: center;
99
+ gap: 8px;
100
+ flex-shrink: 0;
101
+ }
102
+
103
+ .gpu-badge {
104
+ font-size: 0.75rem;
105
+ padding: 3px 10px;
106
+ border-radius: 12px;
107
+ background: var(--bg-input);
108
+ color: var(--text-muted);
109
+ border: 1px solid var(--border);
110
+ white-space: nowrap;
111
+ max-width: 280px;
112
+ overflow: hidden;
113
+ text-overflow: ellipsis;
114
+ }
115
+
116
+ .btn-icon {
117
+ width: 30px;
118
+ height: 30px;
119
+ border: 1px solid var(--border);
120
+ border-radius: 50%;
121
+ background: var(--bg-input);
122
+ color: var(--text-muted);
123
+ font-size: 0.85rem;
124
+ font-weight: 700;
125
+ cursor: pointer;
126
+ display: flex;
127
+ align-items: center;
128
+ justify-content: center;
129
+ flex-shrink: 0;
130
+ transition: border-color 0.15s, color 0.15s;
131
+ }
132
+ .btn-icon:hover { border-color: var(--primary); color: var(--primary); }
133
+
134
+ /* ── GPU widget ─────────────────────────────────────────────────────── */
135
+ .gpu-widget {
136
+ display: flex;
137
+ gap: 8px;
138
+ align-items: center;
139
+ }
140
+ .gpu-card {
141
+ display: flex;
142
+ flex-direction: column;
143
+ gap: 3px;
144
+ font-size: 0.7rem;
145
+ color: var(--text-muted);
146
+ min-width: 90px;
147
+ max-width: 160px;
148
+ }
149
+ .gpu-card-name {
150
+ display: flex;
151
+ justify-content: space-between;
152
+ align-items: center;
153
+ gap: 4px;
154
+ white-space: nowrap;
155
+ overflow: hidden;
156
+ }
157
+ .gpu-card-name span { overflow: hidden; text-overflow: ellipsis; }
158
+ .gpu-util-pct {
159
+ font-size: 0.68rem;
160
+ color: var(--text-dim);
161
+ flex-shrink: 0;
162
+ }
163
+ .gpu-mem-bar {
164
+ height: 4px;
165
+ background: var(--bg-input);
166
+ border-radius: 2px;
167
+ overflow: hidden;
168
+ }
169
+ .gpu-mem-fill {
170
+ height: 100%;
171
+ border-radius: 2px;
172
+ background: var(--primary);
173
+ transition: width 0.5s ease;
174
+ }
175
+ .gpu-mem-fill.warm { background: var(--warning); }
176
+ .gpu-mem-fill.hot { background: var(--danger); }
177
+ .gpu-mem-label {
178
+ font-size: 0.65rem;
179
+ color: var(--text-muted);
180
+ }
181
+
182
+ /* ── Toast notifications ─────────────────────────────────────────────── */
183
+ #toast-container {
184
+ position: fixed;
185
+ bottom: 20px;
186
+ right: 20px;
187
+ display: flex;
188
+ flex-direction: column;
189
+ gap: 8px;
190
+ z-index: 9999;
191
+ pointer-events: none;
192
+ }
193
+ .toast {
194
+ padding: 10px 16px;
195
+ border-radius: var(--radius);
196
+ font-size: 0.85rem;
197
+ box-shadow: var(--shadow);
198
+ pointer-events: auto;
199
+ animation: toast-in 0.2s ease;
200
+ max-width: 320px;
201
+ }
202
+ .toast-error { background: #7f1d1d; color: #fca5a5; border: 1px solid #991b1b; }
203
+ .toast-success { background: #14532d; color: #86efac; border: 1px solid #15803d; }
204
+ .toast-info { background: #1e3a5f; color: #93c5fd; border: 1px solid #1d4ed8; }
205
+ @keyframes toast-in { from { opacity: 0; transform: translateY(8px); } to { opacity: 1; transform: none; } }
206
+
207
+ /* ── Main layout (3 columns) ────────────────────────────────────────── */
208
+ #app {
209
+ display: grid;
210
+ grid-template-columns: var(--panel-left, 260px) 5px 1fr 5px var(--panel-right, 360px);
211
+ grid-template-rows: 1fr;
212
+ gap: 0;
213
+ flex: 1;
214
+ min-height: 0;
215
+ background: var(--border);
216
+ }
217
+ .panel-resize-handle {
218
+ background: var(--border);
219
+ cursor: col-resize;
220
+ transition: background 0.15s;
221
+ z-index: 10;
222
+ position: relative;
223
+ }
224
+ .panel-resize-handle:hover,
225
+ .panel-resize-handle.dragging {
226
+ background: var(--primary);
227
+ }
228
+ .panel-resize-handle::after {
229
+ content: '';
230
+ position: absolute;
231
+ inset: 0 -4px; /* wider hit area */
232
+ }
233
+
234
+ .panel {
235
+ background: var(--bg-panel);
236
+ overflow-y: auto;
237
+ overflow-x: hidden;
238
+ min-height: 0;
239
+ }
240
+
241
+ /* Left panel flex column */
242
+ #engine-panel {
243
+ display: flex;
244
+ flex-direction: column;
245
+ gap: 0;
246
+ padding: 0;
247
+ }
248
+
249
+ .panel-section {
250
+ padding: 12px 12px 8px;
251
+ display: flex;
252
+ flex-direction: column;
253
+ gap: 7px;
254
+ }
255
+
256
+ .panel-footer {
257
+ padding: 10px 12px;
258
+ border-top: 1px solid var(--border);
259
+ margin-top: auto;
260
+ }
261
+ .footer-btn-row {
262
+ display: flex;
263
+ gap: 6px;
264
+ }
265
+ .footer-btn-row .btn {
266
+ flex: 1;
267
+ }
268
+
269
+ .panel h2 {
270
+ font-size: 0.7rem;
271
+ text-transform: uppercase;
272
+ letter-spacing: 1.2px;
273
+ color: var(--text-muted);
274
+ margin-bottom: 2px;
275
+ }
276
+
277
+ #engine-panel hr {
278
+ border: none;
279
+ border-top: 1px solid var(--border);
280
+ flex-shrink: 0;
281
+ }
282
+
283
+ /* ── Form elements ──────────────────────────────────────────────────── */
284
+ label {
285
+ font-size: 0.78rem;
286
+ color: var(--text-dim);
287
+ }
288
+
289
+ select,
290
+ input[type="text"],
291
+ input[type="number"],
292
+ input[type="password"] {
293
+ width: 100%;
294
+ padding: 6px 9px;
295
+ background: var(--bg-input);
296
+ color: var(--text);
297
+ border: 1px solid var(--border);
298
+ border-radius: var(--radius);
299
+ font-size: 0.83rem;
300
+ font-family: var(--font);
301
+ transition: border-color 0.15s;
302
+ }
303
+
304
+ select:focus, input:focus, textarea:focus {
305
+ outline: none;
306
+ border-color: var(--primary);
307
+ box-shadow: 0 0 0 2px rgba(59,130,246,0.12);
308
+ }
309
+
310
+ textarea {
311
+ width: 100%;
312
+ padding: 6px 9px;
313
+ background: var(--bg-input);
314
+ color: var(--text);
315
+ border: 1px solid var(--border);
316
+ border-radius: var(--radius);
317
+ font-size: 0.83rem;
318
+ font-family: var(--font);
319
+ transition: border-color 0.15s;
320
+ box-sizing: border-box;
321
+ }
322
+
323
+ input::placeholder,
324
+ textarea::placeholder {
325
+ color: var(--text-dim);
326
+ font-style: italic;
327
+ opacity: 0.65;
328
+ }
329
+
330
+ select option { background: var(--bg-panel); color: var(--text); }
331
+
332
+ /* Config form fields */
333
+ .config-field {
334
+ display: flex;
335
+ flex-direction: column;
336
+ gap: 3px;
337
+ }
338
+
339
+ /* Select + refresh button row */
340
+ .select-row {
341
+ display: flex;
342
+ gap: 4px;
343
+ align-items: center;
344
+ }
345
+ .select-row select { flex: 1; min-width: 0; width: auto; }
346
+
347
+ .btn-refresh {
348
+ flex-shrink: 0;
349
+ width: 28px;
350
+ height: 28px;
351
+ border: 1px solid var(--border);
352
+ border-radius: var(--radius);
353
+ background: var(--bg-input);
354
+ color: var(--text-muted);
355
+ font-size: 1rem;
356
+ cursor: pointer;
357
+ display: flex;
358
+ align-items: center;
359
+ justify-content: center;
360
+ transition: background 0.15s, border-color 0.15s, color 0.15s;
361
+ }
362
+ .btn-refresh:hover:not(:disabled) { background: var(--bg-hover); border-color: var(--primary); color: var(--primary); }
363
+ .btn-refresh:disabled { opacity: 0.5; cursor: not-allowed; }
364
+
365
+ .dynamic-hint {
366
+ font-size: 0.7rem;
367
+ color: var(--text-muted);
368
+ min-height: 1em;
369
+ }
370
+
371
+ .config-field label { font-size: 0.75rem; color: var(--text-muted); }
372
+
373
+ .config-field-checkbox {
374
+ flex-direction: row;
375
+ align-items: center;
376
+ gap: 7px;
377
+ }
378
+ .config-field-checkbox input[type="checkbox"] {
379
+ width: auto;
380
+ accent-color: var(--primary);
381
+ cursor: pointer;
382
+ }
383
+ .config-field-checkbox label { font-size: 0.82rem; color: var(--text); cursor: pointer; }
384
+
385
+ #blla-options {
386
+ display: flex;
387
+ align-items: center;
388
+ gap: 8px;
389
+ }
390
+ #blla-options label { flex-shrink: 0; }
391
+ #blla-options input { width: 64px; }
392
+
393
+ /* ── Buttons ────────────────────────────────────────────────────────── */
394
+ .btn {
395
+ padding: 8px 14px;
396
+ border: none;
397
+ border-radius: var(--radius);
398
+ font-size: 0.83rem;
399
+ font-family: var(--font);
400
+ cursor: pointer;
401
+ transition: background 0.15s, transform 0.1s;
402
+ white-space: nowrap;
403
+ }
404
+ .btn:active:not(:disabled) { transform: translateY(1px); }
405
+ .btn:disabled { opacity: 0.38; cursor: not-allowed; }
406
+
407
+ .btn-full { width: 100%; }
408
+
409
+ .btn-primary { background: var(--primary); color: white; }
410
+ .btn-primary:hover:not(:disabled) { background: var(--primary-hover); }
411
+
412
+ .btn-accent { background: var(--accent); color: white; }
413
+ .btn-accent:hover:not(:disabled) { background: var(--accent-hover); }
414
+
415
+ .btn-small {
416
+ padding: 5px 10px;
417
+ font-size: 0.78rem;
418
+ background: var(--bg-input);
419
+ color: var(--text-dim);
420
+ border: 1px solid var(--border);
421
+ }
422
+ .btn-small:hover:not(:disabled) { background: var(--bg-hover); border-color: var(--border-light); color: var(--text); }
423
+
424
+ .btn-outline {
425
+ background: transparent;
426
+ border: 1px solid var(--border);
427
+ cursor: pointer;
428
+ border-radius: var(--radius);
429
+ display: inline-flex;
430
+ align-items: center;
431
+ font-size: 0.78rem;
432
+ color: var(--text-dim);
433
+ padding: 4px 8px;
434
+ transition: background 0.15s, border-color 0.15s;
435
+ }
436
+ .btn-outline:hover { background: var(--bg-hover); border-color: var(--primary); color: var(--text); }
437
+
438
+ .btn-row {
439
+ display: flex;
440
+ gap: 6px;
441
+ flex-wrap: wrap;
442
+ margin-top: 6px;
443
+ }
444
+
445
+ /* Save key row */
446
+ .key-save-row {
447
+ display: flex;
448
+ align-items: center;
449
+ gap: 6px;
450
+ margin-top: 4px;
451
+ font-size: 0.76rem;
452
+ color: var(--text-muted);
453
+ }
454
+ .key-save-row input[type="checkbox"] { width: auto; margin: 0; accent-color: var(--primary); }
455
+ .key-save-row label { cursor: pointer; }
456
+
457
+ input[disabled] { opacity: 0.45; cursor: not-allowed; }
458
+
459
+ /* ── Upload area ────────────────────────────────────────────────────── */
460
+ .upload-area {
461
+ border: 2px dashed var(--border);
462
+ border-radius: var(--radius-lg);
463
+ padding: 18px 12px;
464
+ text-align: center;
465
+ cursor: pointer;
466
+ transition: border-color 0.2s, background 0.2s;
467
+ font-size: 0.83rem;
468
+ color: var(--text-muted);
469
+ display: flex;
470
+ flex-direction: column;
471
+ align-items: center;
472
+ gap: 8px;
473
+ }
474
+ .upload-area:hover, .upload-area.dragover {
475
+ border-color: var(--primary);
476
+ background: rgba(59,130,246,0.06);
477
+ color: var(--text-dim);
478
+ }
479
+ .upload-icon {
480
+ width: 28px;
481
+ height: 28px;
482
+ opacity: 0.5;
483
+ }
484
+ .upload-area:hover .upload-icon,
485
+ .upload-area.dragover .upload-icon { opacity: 0.8; }
486
+
487
+ /* XML row */
488
+ .xml-row {
489
+ display: flex;
490
+ align-items: center;
491
+ gap: 8px;
492
+ }
493
+ .xml-row .muted {
494
+ flex: 1;
495
+ font-size: 0.78rem;
496
+ overflow: hidden;
497
+ text-overflow: ellipsis;
498
+ white-space: nowrap;
499
+ }
500
+ .xml-ok { color: var(--success) !important; }
501
+
502
+ /* ── Image viewer (center) ──────────────────────────────────────────── */
503
+ #viewer-panel {
504
+ padding: 0;
505
+ position: relative;
506
+ overflow: hidden;
507
+ display: flex;
508
+ flex-direction: column;
509
+ }
510
+
511
+ /* Zoom toolbar */
512
+ .zoom-toolbar {
513
+ display: flex;
514
+ align-items: center;
515
+ gap: 4px;
516
+ padding: 5px 8px;
517
+ background: var(--bg-panel);
518
+ border-bottom: 1px solid var(--border);
519
+ flex-shrink: 0;
520
+ z-index: 2;
521
+ }
522
+ .zoom-btn {
523
+ width: 26px;
524
+ height: 26px;
525
+ border: 1px solid var(--border);
526
+ border-radius: var(--radius);
527
+ background: var(--bg-input);
528
+ color: var(--text-dim);
529
+ font-size: 1rem;
530
+ line-height: 1;
531
+ cursor: pointer;
532
+ display: flex;
533
+ align-items: center;
534
+ justify-content: center;
535
+ transition: background 0.15s, border-color 0.15s;
536
+ }
537
+ .zoom-btn:hover { background: var(--bg-hover); border-color: var(--border-light); color: var(--text); }
538
+ .zoom-fit { font-size: 0.8rem; width: auto; padding: 0 7px; }
539
+ .zoom-toolbar-sep { width: 1px; background: var(--border); margin: 0 4px; align-self: stretch; }
540
+ .nav-btn { padding: 2px 8px; font-size: .8rem; line-height: 1.6; }
541
+ .nav-btn:disabled { opacity: 0.3; cursor: default; }
542
+ .batch-nav-label-toolbar { font-size: .78rem; color: var(--text-muted); min-width: 36px; text-align: center; }
543
+ .zoom-level {
544
+ font-size: 0.75rem;
545
+ color: var(--text-muted);
546
+ min-width: 3.5em;
547
+ text-align: center;
548
+ font-family: var(--font-mono);
549
+ }
550
+
551
+ /* Scrollable image area */
552
+ #viewer-scroll {
553
+ flex: 1;
554
+ overflow: auto;
555
+ display: flex;
556
+ align-items: flex-start;
557
+ justify-content: flex-start;
558
+ min-height: 0;
559
+ position: relative;
560
+ }
561
+
562
+ /* Placeholder — fills scroll area and centers content */
563
+ .viewer-placeholder {
564
+ width: 100%;
565
+ height: 100%;
566
+ min-height: 200px;
567
+ display: flex;
568
+ flex-direction: column;
569
+ align-items: center;
570
+ justify-content: center;
571
+ gap: 14px;
572
+ color: var(--text-muted);
573
+ font-size: 0.9rem;
574
+ user-select: none;
575
+ }
576
+ .viewer-placeholder.dragover {
577
+ color: var(--primary);
578
+ background: rgba(59, 130, 246, 0.08);
579
+ }
580
+ .viewer-placeholder svg {
581
+ width: 56px;
582
+ height: 56px;
583
+ opacity: 0.25;
584
+ }
585
+ .viewer-placeholder p { opacity: 0.6; }
586
+
587
+ /* Image container — only shows when image is loaded */
588
+ #image-container {
589
+ position: relative;
590
+ flex-shrink: 0;
591
+ line-height: 0;
592
+ }
593
+
594
+ #page-image {
595
+ display: block;
596
+ /* width controlled by zoom JS; height auto */
597
+ transition: width 0.08s ease-out, height 0.08s ease-out;
598
+ }
599
+
600
+ #overlay-canvas {
601
+ position: absolute;
602
+ top: 0;
603
+ left: 0;
604
+ pointer-events: auto;
605
+ cursor: crosshair;
606
+ transition: width 0.08s ease-out, height 0.08s ease-out;
607
+ }
608
+
609
+ /* ── Results panel (right) ──────────────────────────────────────────── */
610
+ #results-panel {
611
+ display: flex;
612
+ flex-direction: column;
613
+ }
614
+
615
+ .results-header {
616
+ padding: 12px 12px 8px;
617
+ border-bottom: 1px solid var(--border);
618
+ flex-shrink: 0;
619
+ }
620
+ .results-header-row {
621
+ display: flex;
622
+ align-items: center;
623
+ justify-content: space-between;
624
+ margin-bottom: 0;
625
+ }
626
+ .results-header-row h2 { margin-bottom: 0; }
627
+ .results-header-controls {
628
+ display: flex;
629
+ align-items: center;
630
+ gap: 5px;
631
+ }
632
+ .btn-icon.active { border-color: var(--primary); color: var(--primary); background: rgba(59,130,246,0.1); }
633
+
634
+ /* Font selector in results header */
635
+ .font-select {
636
+ width: auto !important;
637
+ padding: 3px 5px !important;
638
+ font-size: 0.72rem !important;
639
+ height: 26px;
640
+ border-radius: var(--radius);
641
+ color: var(--text-muted);
642
+ max-width: 140px;
643
+ }
644
+
645
+ #transcription-lines {
646
+ flex: 1;
647
+ overflow-y: auto;
648
+ font-family: var(--font-results, var(--font-mono));
649
+ font-size: 0.83rem;
650
+ line-height: 1.5;
651
+ padding: 4px 0;
652
+ }
653
+
654
+ .line-result {
655
+ padding: 5px 10px;
656
+ border-bottom: 1px solid rgba(45,63,89,0.5);
657
+ cursor: pointer;
658
+ transition: background 0.1s;
659
+ }
660
+ .line-result:last-child { border-bottom: none; }
661
+ .line-result:hover { background: var(--bg-hover); }
662
+
663
+ .line-num {
664
+ color: var(--text-muted);
665
+ font-size: 0.68rem;
666
+ margin-right: 7px;
667
+ user-select: none;
668
+ display: inline-block;
669
+ min-width: 2.2em;
670
+ text-align: right;
671
+ }
672
+
673
+ .confidence {
674
+ float: right;
675
+ font-size: 0.68rem;
676
+ padding: 1px 6px;
677
+ border-radius: 8px;
678
+ margin-left: 6px;
679
+ margin-top: 2px;
680
+ }
681
+ .conf-high { background: rgba(34,197,94,0.15); color: var(--success); }
682
+ .conf-mid { background: rgba(245,158,11,0.15); color: var(--warning); }
683
+ .conf-low { background: rgba(239,68,68,0.15); color: var(--danger); }
684
+
685
+ .line-result.line-active {
686
+ background: rgba(233,69,96,0.12);
687
+ border-left: 3px solid var(--accent);
688
+ }
689
+ .line-result.highlight {
690
+ background: rgba(59,130,246,0.12);
691
+ border-left: 3px solid var(--primary);
692
+ }
693
+
694
+ /* Dimmed lines (below confidence threshold) */
695
+ .line-result.line-dimmed {
696
+ opacity: 0.28;
697
+ }
698
+
699
+ /* Inline editing */
700
+ .line-text {
701
+ display: inline;
702
+ outline: none;
703
+ border-radius: 2px;
704
+ }
705
+ .line-text[contenteditable="true"] {
706
+ background: rgba(58, 134, 255, 0.08);
707
+ outline: 1px dashed var(--primary);
708
+ padding: 0 3px;
709
+ cursor: text;
710
+ }
711
+ /* Gemini thinking/reasoning block */
712
+ .thinking-block {
713
+ display: block;
714
+ width: 100%;
715
+ margin-top: 4px;
716
+ }
717
+ .thinking-toggle {
718
+ font-size: 0.7rem;
719
+ color: var(--text-dim);
720
+ cursor: pointer;
721
+ user-select: none;
722
+ letter-spacing: 0.04em;
723
+ text-transform: uppercase;
724
+ }
725
+ .thinking-toggle:hover { color: var(--primary); }
726
+ .thinking-text {
727
+ margin: 4px 0 0 0;
728
+ padding: 6px 10px;
729
+ font-size: 0.72rem;
730
+ font-family: monospace;
731
+ white-space: pre-wrap;
732
+ word-break: break-word;
733
+ background: var(--bg-input);
734
+ border-left: 2px solid var(--border);
735
+ color: var(--text-dim);
736
+ border-radius: 0 3px 3px 0;
737
+ max-height: 300px;
738
+ overflow-y: auto;
739
+ }
740
+
741
+ .line-result.line-edited .line-num::after {
742
+ content: '✎';
743
+ color: var(--primary);
744
+ font-size: 0.6rem;
745
+ margin-left: 2px;
746
+ }
747
+
748
+ /* Results search row */
749
+ .results-search-row {
750
+ display: flex;
751
+ align-items: center;
752
+ gap: 6px;
753
+ padding: 4px 0;
754
+ }
755
+ .results-search-row input[type="search"] {
756
+ flex: 1;
757
+ min-width: 0;
758
+ background: var(--bg-input);
759
+ border: 1px solid var(--border);
760
+ border-radius: var(--radius);
761
+ color: var(--text);
762
+ font-size: 0.8rem;
763
+ padding: 3px 8px;
764
+ }
765
+ .results-search-row input[type="search"]:focus {
766
+ outline: none;
767
+ border-color: var(--primary);
768
+ }
769
+ #results-search-count {
770
+ font-size: 0.72rem;
771
+ white-space: nowrap;
772
+ }
773
+ .line-result.line-hidden { display: none; }
774
+ .line-result mark { background: color-mix(in srgb, var(--accent) 35%, transparent); border-radius: 2px; }
775
+
776
/* Thinking / reasoning block (Gemini, Claude)
 *
 * These rules share selectors with the earlier "Gemini thinking/reasoning
 * block" rules in this stylesheet and, coming later, win for every property
 * set by both.
 *
 * --radius-sm, --bg-secondary and --text-secondary are not declared in the
 * :root design tokens, so each var() carries a fallback. Without one the
 * declaration would be invalid at computed-value time and the property would
 * fall back to its inherited/initial value.
 */
.thinking-block {
  margin-top: 4px;
  border-left: 2px solid var(--accent);
  border-radius: 0 var(--radius-sm, 3px) var(--radius-sm, 3px) 0;
  background: color-mix(in srgb, var(--accent) 6%, var(--bg-secondary, var(--bg-section)));
  font-size: 0.75rem;
}
.thinking-toggle {
  cursor: pointer;
  padding: 2px 6px;
  color: var(--accent);
  user-select: none;
  font-style: italic;
  list-style: none; /* hide the default <summary> disclosure marker */
}
.thinking-toggle::marker,
.thinking-toggle::-webkit-details-marker { display: none; }
/* Custom disclosure arrow, swapped when the parent <details> is open */
.thinking-toggle::before {
  content: '▶ ';
  font-style: normal;
  font-size: 0.65rem;
  transition: transform 0.15s;
}
details[open] > .thinking-toggle::before { content: '▼ '; }
.thinking-text {
  margin: 0;
  padding: 4px 8px 6px;
  white-space: pre-wrap;
  word-break: break-word;
  color: var(--text-secondary, var(--text-dim));
  font-family: inherit;
  line-height: 1.45;
  max-height: 200px; /* long reasoning scrolls inside the block */
  overflow-y: auto;
}
812
+
813
+ .conf-filter-row {
814
+ display: flex;
815
+ align-items: center;
816
+ gap: 8px;
817
+ padding: 4px 0 6px;
818
+ font-size: 0.75rem;
819
+ color: var(--text-muted);
820
+ }
821
+ .conf-filter-row input[type="range"] {
822
+ flex: 1;
823
+ width: auto;
824
+ height: 3px;
825
+ cursor: pointer;
826
+ accent-color: var(--primary);
827
+ padding: 0;
828
+ background: none;
829
+ border: none;
830
+ }
831
+
832
+ /* Batch queue */
833
+ #batch-queue-section {
834
+ margin-top: 6px;
835
+ border-top: 1px solid var(--border);
836
+ padding-top: 8px;
837
+ }
838
+ .batch-queue-header {
839
+ display: flex;
840
+ align-items: center;
841
+ justify-content: space-between;
842
+ margin-bottom: 4px;
843
+ }
844
+ .section-label {
845
+ font-size: 0.7rem;
846
+ font-weight: 600;
847
+ text-transform: uppercase;
848
+ letter-spacing: .06em;
849
+ color: var(--text-muted);
850
+ }
851
+ .batch-overall-progress {
852
+ font-size: 0.72rem;
853
+ font-family: var(--font-mono);
854
+ color: var(--accent);
855
+ background: color-mix(in srgb, var(--accent) 12%, transparent);
856
+ padding: 1px 7px;
857
+ border-radius: 10px;
858
+ }
859
+ .batch-item {
860
+ display: flex;
861
+ align-items: center;
862
+ gap: 6px;
863
+ padding: 4px 2px;
864
+ font-size: 0.78rem;
865
+ border-bottom: 1px solid rgba(45,63,89,0.4);
866
+ }
867
+ .batch-item:last-child { border-bottom: none; }
868
+ .batch-drag-handle {
869
+ cursor: grab;
870
+ color: var(--text-muted);
871
+ font-size: 0.9rem;
872
+ line-height: 1;
873
+ padding: 0 2px;
874
+ flex-shrink: 0;
875
+ user-select: none;
876
+ }
877
+ .batch-drag-handle:active { cursor: grabbing; }
878
+ .batch-item.batch-dragging { opacity: 0.4; }
879
+ .batch-item.batch-drag-over {
880
+ border-top: 2px solid var(--accent);
881
+ margin-top: -1px;
882
+ }
883
+ .batch-item-name {
884
+ flex: 1;
885
+ overflow: hidden;
886
+ text-overflow: ellipsis;
887
+ white-space: nowrap;
888
+ color: var(--text);
889
+ }
890
+ .batch-status {
891
+ font-size: 0.68rem;
892
+ flex-shrink: 0;
893
+ min-width: 56px;
894
+ text-align: right;
895
+ color: var(--text-muted);
896
+ }
897
+ .batch-status.done { color: var(--success); }
898
+ .batch-status.error { color: var(--danger); }
899
+ .batch-status.active { color: var(--primary); }
900
+
901
+ .batch-nav-row {
902
+ display: flex;
903
+ align-items: center;
904
+ gap: 4px;
905
+ margin-top: 6px;
906
+ }
907
+ .batch-options-row {
908
+ display: flex;
909
+ gap: 14px;
910
+ align-items: center;
911
+ margin-top: 6px;
912
+ flex-wrap: wrap;
913
+ }
914
+ .checkbox-label {
915
+ display: flex;
916
+ align-items: center;
917
+ gap: 5px;
918
+ font-size: .8rem;
919
+ color: var(--text-muted);
920
+ cursor: pointer;
921
+ user-select: none;
922
+ }
923
+ .checkbox-label input[type="checkbox"] { cursor: pointer; }
924
+
925
+ /* Column layout (multi-region side-by-side) */
926
+ #transcription-lines.col-layout {
927
+ display: flex;
928
+ flex-direction: row;
929
+ align-items: flex-start; /* columns grow to their content height */
930
+ overflow-x: auto;
931
+ /* overflow-y stays 'auto' from the base rule — unified scroll */
932
+ padding: 0;
933
+ gap: 0;
934
+ }
935
+ .region-column {
936
+ flex: 0 0 auto; /* don't shrink; grow to content */
937
+ min-width: 220px;
938
+ width: max-content; /* each column as wide as its widest line */
939
+ max-width: min(520px, 75vw);
940
+ display: flex;
941
+ flex-direction: column;
942
+ border-right: 1px solid var(--border);
943
+ /* No overflow-y — parent handles the single scrollbar */
944
+ }
945
+ .region-column:last-child { border-right: none; }
946
+ /* Prevent line text from wrapping inside column cells */
947
+ .region-column .line-result { white-space: nowrap; }
948
+ .region-col-header {
949
+ display: flex;
950
+ align-items: center;
951
+ justify-content: space-between;
952
+ gap: 4px;
953
+ padding: 4px 8px;
954
+ font-size: 0.68rem;
955
+ font-weight: 600;
956
+ text-transform: uppercase;
957
+ letter-spacing: 0.8px;
958
+ color: var(--text-muted);
959
+ background: var(--bg-section);
960
+ border-bottom: 1px solid var(--border);
961
+ position: sticky;
962
+ top: 0;
963
+ z-index: 1;
964
+ }
965
+ .region-col-close {
966
+ flex-shrink: 0;
967
+ background: none;
968
+ border: none;
969
+ color: var(--text-muted);
970
+ cursor: pointer;
971
+ font-size: 1rem;
972
+ line-height: 1;
973
+ padding: 0 2px;
974
+ border-radius: 3px;
975
+ transition: color 0.1s, background 0.1s;
976
+ }
977
+ .region-col-close:hover { color: var(--danger); background: rgba(239,68,68,0.1); }
978
+
979
+ /* Detected region list (below segmentation controls) */
980
+ #seg-regions-list {
981
+ margin: 0 12px 8px;
982
+ border: 1px solid var(--border);
983
+ border-radius: var(--radius);
984
+ background: var(--bg-section);
985
+ overflow: hidden;
986
+ }
987
+ .seg-regions-header {
988
+ padding: 5px 10px;
989
+ font-size: 0.68rem;
990
+ font-weight: 600;
991
+ text-transform: uppercase;
992
+ letter-spacing: 0.8px;
993
+ color: var(--text-muted);
994
+ border-bottom: 1px solid var(--border);
995
+ }
996
+ .seg-region-row {
997
+ display: flex;
998
+ align-items: center;
999
+ gap: 7px;
1000
+ padding: 5px 10px;
1001
+ border-bottom: 1px solid rgba(45,63,89,0.4);
1002
+ font-size: 0.78rem;
1003
+ }
1004
+ .seg-region-row:last-child { border-bottom: none; }
1005
+ .seg-region-dot {
1006
+ width: 10px;
1007
+ height: 10px;
1008
+ border-radius: 50%;
1009
+ flex-shrink: 0;
1010
+ }
1011
+ .seg-region-label { font-weight: 600; color: var(--text); min-width: 2em; }
1012
+ .seg-region-count { flex: 1; color: var(--text-muted); }
1013
+ .seg-region-del {
1014
+ width: 22px !important;
1015
+ height: 22px !important;
1016
+ font-size: 0.9rem !important;
1017
+ flex-shrink: 0;
1018
+ }
1019
+
1020
+ /* Region separator */
1021
+ .region-separator {
1022
+ padding: 4px 10px;
1023
+ font-size: 0.68rem;
1024
+ color: var(--text-muted);
1025
+ background: var(--bg-section);
1026
+ border-bottom: 1px solid var(--border);
1027
+ letter-spacing: 0.5px;
1028
+ text-transform: uppercase;
1029
+ }
1030
+
1031
+ #results-footer {
1032
+ padding: 8px 12px;
1033
+ border-top: 1px solid var(--border);
1034
+ flex-shrink: 0;
1035
+ }
1036
+
1037
+ /* ── Progress bar ───────────────────────────────────────────────────── */
1038
+ .progress-row {
1039
+ display: flex;
1040
+ align-items: center;
1041
+ justify-content: space-between;
1042
+ gap: 8px;
1043
+ margin-top: 4px;
1044
+ }
1045
+
1046
+ #progress-bar {
1047
+ height: 4px;
1048
+ background: var(--bg-input);
1049
+ border-radius: 2px;
1050
+ overflow: hidden;
1051
+ margin-top: 8px;
1052
+ }
1053
+ #progress-fill {
1054
+ height: 100%;
1055
+ width: 0%;
1056
+ background: linear-gradient(90deg, var(--primary), var(--accent));
1057
+ transition: width 0.25s ease;
1058
+ border-radius: 2px;
1059
+ }
1060
+
1061
+ /* ── Status badges ──────────────────────────────────────────────────── */
1062
+ .status-badge {
1063
+ font-size: 0.78rem;
1064
+ padding: 4px 10px;
1065
+ border-radius: var(--radius);
1066
+ text-align: center;
1067
+ }
1068
+ .status-loaded { background: rgba(34,197,94,0.12); color: var(--success); border: 1px solid rgba(34,197,94,0.25); }
1069
+ .status-loading { background: rgba(59,130,246,0.12); color: var(--primary); border: 1px solid rgba(59,130,246,0.25); }
1070
+
1071
+ /* ── Spinner on buttons ─────────────────────────────────────────────── */
1072
+ .btn.loading { pointer-events: none; opacity: 0.7; }
1073
+ .btn.loading::after {
1074
+ content: '';
1075
+ display: inline-block;
1076
+ width: 11px;
1077
+ height: 11px;
1078
+ margin-left: 7px;
1079
+ border: 2px solid transparent;
1080
+ border-top-color: currentColor;
1081
+ border-radius: 50%;
1082
+ animation: spin 0.65s linear infinite;
1083
+ vertical-align: middle;
1084
+ }
1085
+ @keyframes spin { to { transform: rotate(360deg); } }
1086
+
1087
+ /* ── Utilities ──────────────────────────────────────────────────────── */
1088
+ .muted { color: var(--text-muted); font-size: 0.8rem; }
1089
+ .hidden { display: none !important; }
1090
+
1091
+ /* ── Help modal ─────────────────────────────────────────────────────── */
1092
+ #help-modal {
1093
+ background: var(--bg-panel);
1094
+ color: var(--text);
1095
+ border: 1px solid var(--border);
1096
+ border-radius: var(--radius-lg);
1097
+ box-shadow: var(--shadow);
1098
+ padding: 0;
1099
+ width: min(680px, 96vw);
1100
+ max-height: 82vh;
1101
+ overflow: hidden;
1102
+ }
1103
+ #help-modal[open] {
1104
+ display: flex;
1105
+ flex-direction: column;
1106
+ }
1107
+ #help-modal::backdrop {
1108
+ background: rgba(0,0,0,0.65);
1109
+ backdrop-filter: blur(2px);
1110
+ }
1111
+
1112
+ .modal-header {
1113
+ display: flex;
1114
+ align-items: center;
1115
+ justify-content: space-between;
1116
+ padding: 14px 18px;
1117
+ border-bottom: 1px solid var(--border);
1118
+ flex-shrink: 0;
1119
+ }
1120
+ .modal-header h2 { font-size: 1rem; font-weight: 600; }
1121
+
1122
+ .modal-body {
1123
+ overflow-y: auto;
1124
+ padding: 18px;
1125
+ display: flex;
1126
+ flex-direction: column;
1127
+ gap: 16px;
1128
+ font-size: 0.88rem;
1129
+ line-height: 1.6;
1130
+ }
1131
+
1132
+ .modal-body h3 {
1133
+ font-size: 0.8rem;
1134
+ text-transform: uppercase;
1135
+ letter-spacing: 0.8px;
1136
+ color: var(--text-muted);
1137
+ border-bottom: 1px solid var(--border);
1138
+ padding-bottom: 4px;
1139
+ margin-top: 4px;
1140
+ }
1141
+
1142
+ .modal-body ol, .modal-body ul { padding-left: 1.4em; display: flex; flex-direction: column; gap: 5px; }
1143
+ .modal-body li { color: var(--text-dim); }
1144
+ .modal-body strong { color: var(--text); font-weight: 600; }
1145
+
1146
+ .modal-body table {
1147
+ width: 100%;
1148
+ border-collapse: collapse;
1149
+ font-size: 0.83rem;
1150
+ }
1151
+ .modal-body th, .modal-body td {
1152
+ padding: 5px 10px;
1153
+ text-align: left;
1154
+ border-bottom: 1px solid var(--border);
1155
+ }
1156
+ .modal-body th { color: var(--text-muted); font-weight: 600; font-size: 0.75rem; text-transform: uppercase; letter-spacing: 0.5px; }
1157
+ .modal-body td:first-child { color: var(--text); font-weight: 500; white-space: nowrap; }
1158
+ .modal-body tr:last-child td { border-bottom: none; }
1159
+ .modal-body tr:hover td { background: var(--bg-hover); }
1160
+
1161
+ .modal-body kbd {
1162
+ display: inline-block;
1163
+ padding: 1px 6px;
1164
+ background: var(--bg-input);
1165
+ border: 1px solid var(--border);
1166
+ border-radius: 4px;
1167
+ font-family: var(--font-mono);
1168
+ font-size: 0.78rem;
1169
+ color: var(--text-dim);
1170
+ }
1171
+
1172
+ .demo-badge {
1173
+ font-size: 0.72rem;
1174
+ padding: 1px 6px;
1175
+ border-radius: 8px;
1176
+ display: inline-block;
1177
+ margin: 0 2px;
1178
+ }
1179
+
1180
+ /* ── Mobile tab bar ─────────────────────────────────────────────────── */
1181
+ #mobile-tabs {
1182
+ display: none; /* hidden on desktop */
1183
+ height: var(--tabs-h);
1184
+ background: var(--bg-panel);
1185
+ border-top: 1px solid var(--border);
1186
+ flex-shrink: 0;
1187
+ }
1188
+
1189
+ .tab-btn {
1190
+ flex: 1;
1191
+ display: flex;
1192
+ flex-direction: column;
1193
+ align-items: center;
1194
+ justify-content: center;
1195
+ gap: 3px;
1196
+ background: none;
1197
+ border: none;
1198
+ color: var(--text-muted);
1199
+ font-size: 0.68rem;
1200
+ cursor: pointer;
1201
+ padding: 6px 4px;
1202
+ transition: color 0.15s;
1203
+ }
1204
+ .tab-btn svg { width: 20px; height: 20px; }
1205
+ .tab-btn.active { color: var(--primary); }
1206
+ .tab-btn:hover { color: var(--text-dim); }
1207
+ .tab-btn.active:hover { color: var(--primary-hover); }
1208
+
1209
+ /* ── Responsive — tablet (≤ 960px) ─────────────────────────────────── */
1210
+ @media (max-width: 960px) {
1211
+ #app { grid-template-columns: var(--panel-left, 240px) 5px 1fr 5px var(--panel-right, 300px); }
1212
+ .gpu-badge { max-width: 160px; font-size: 0.7rem; }
1213
+ }
1214
+
1215
+ @media (max-width: 780px) and (min-width: 641px) {
1216
+ #app { grid-template-columns: var(--panel-left, 200px) 5px 1fr 5px var(--panel-right, 240px); }
1217
+ .gpu-badge { max-width: 120px; font-size: 0.68rem; }
1218
+ }
1219
+
1220
+ /* ── Responsive — mobile (≤ 640px) ─────────────────────────────────── */
1221
@media (max-width: 640px) {
  :root { --header-h: 48px; }

  #header h1 { font-size: 0.9rem; }
  .gpu-badge { display: none; } /* too little space */

  /* Single-column; tab bar controls which panel is visible */
  #app {
    grid-template-columns: 1fr;
    grid-template-rows: 1fr;
  }
  .panel-resize-handle { display: none; }

  #mobile-tabs { display: flex; }

  /* All panels are hidden by default; JS adds panel-active */
  [data-panel] {
    display: none;
  }
  [data-panel].panel-active {
    display: flex;
    flex-direction: column;
  }
  /* Settings panel scrolls on its own when it is the active tab */
  [data-panel="settings"].panel-active {
    overflow-y: auto;
  }

  body { overflow: hidden; }
  /* Account for header and tab bar height.
     100vh ignores collapsible mobile browser chrome, so the tab bar can end up
     below the visible area; the dvh declaration wins where it is supported and
     the vh one remains as the fallback. */
  #app {
    height: calc(100vh - var(--header-h) - var(--tabs-h));
    height: calc(100dvh - var(--header-h) - var(--tabs-h));
  }

  /* Results panel: stack vertically */
  #results-panel.panel-active { gap: 0; }

  /* Larger touch targets */
  .btn { padding: 10px 16px; font-size: 0.9rem; }
  .btn-small { padding: 7px 12px; font-size: 0.82rem; }
  select, input[type="text"], input[type="number"], input[type="password"] {
    padding: 8px 10px;
    font-size: 0.9rem;
  }

  /* Upload area takes less vertical space */
  .upload-area { padding: 14px 10px; }

  /* Full-width help modal.
     max-width is set explicitly because the user-agent stylesheet caps modal
     dialogs at calc(100% - 6px - 2em), which would otherwise clamp the width. */
  #help-modal {
    width: 100vw;
    max-width: 100vw;
    max-height: 90vh;
    border-radius: var(--radius-lg) var(--radius-lg) 0 0;
  }
}
web/static/app.js ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * Polyscriptor Web UI — Main application entry point
3
+ *
4
+ * Central state + event bus, wires up components.
5
+ * No framework, no build step — native ES modules.
6
+ */
7
+
8
+ import { initEnginePanel } from './components/engine-panel.js';
9
+ import { initImageViewer } from './components/image-viewer.js';
10
+ import { initTranscriptionPanel } from './components/transcription-panel.js';
11
+ import { initBatchPanel } from './components/batch-panel.js';
12
+
13
+ // ── Global state ───────────────────────────────────────────────────────
14
/**
 * Single shared state object exported to the component modules.
 */
export const state = {
  engines: [],
  currentEngine: null,
  engineLoaded: false,
  imageId: null,
  imageInfo: null,
  // Transcribed lines: [{index, text, confidence, bbox, region}]
  lines: [],
  // Regions from the latest segmentation: [{id, bbox, num_lines}]
  regions: [],
  isProcessing: false,
};
24
+
25
+ // ── Event bus ──────────────────────────────────────────────────────────
26
+ export const events = new EventTarget();
27
+ export function emit(name, detail) {
28
+ events.dispatchEvent(new CustomEvent(name, { detail }));
29
+ }
30
+ export function on(name, fn) {
31
+ events.addEventListener(name, e => fn(e.detail));
32
+ }
33
+
34
+ // ── API helper ─────────────────────────────────────────────────────────
35
/**
 * Thin wrapper around `fetch` that defaults to JSON requests and turns
 * non-2xx responses into thrown errors.
 *
 * @param {string} path - Request URL.
 * @param {RequestInit} [options] - Passed through to `fetch`. `headers` (a plain
 *   object) is merged on top of the default Content-Type instead of replacing it.
 * @returns {Promise<Response>} The response, guaranteed to have `ok === true`.
 * @throws {Error} With the server's `detail`/`message` field when available,
 *   otherwise the HTTP status text, otherwise 'API error'.
 */
export async function api(path, options = {}) {
  const { headers: callerHeaders, ...rest } = options;

  // For FormData the browser must generate the multipart Content-Type (with its
  // boundary) itself, so the JSON default is only applied to other bodies.
  const isFormData = typeof FormData !== 'undefined' && rest.body instanceof FormData;
  const headers = {
    ...(isFormData ? {} : { 'Content-Type': 'application/json' }),
    ...callerHeaders,
  };

  // `headers` goes last so that the merged object is what fetch receives.
  const resp = await fetch(path, { ...rest, headers });
  if (!resp.ok) {
    const err = await resp.json().catch(() => ({ detail: resp.statusText }));
    const reason = err?.detail || err?.message || 'API error';
    // Structured error payloads (arrays/objects) would otherwise stringify as
    // "[object Object]".
    throw new Error(typeof reason === 'string' ? reason : JSON.stringify(reason));
  }
  return resp;
}
46
+
47
+ // ── Toast notifications ────────────────────────────────────────────────
48
/**
 * Show a transient notification inside `#toast-container`.
 *
 * @param {string} message - Text to display (assigned via textContent).
 * @param {string} [type='info'] - Suffix of the `toast-<type>` CSS class.
 * @param {number} [durationMs=4000] - Delay before the element is removed.
 */
export function toast(message, type = 'info', durationMs = 4000) {
  const node = document.createElement('div');
  node.className = `toast toast-${type}`;
  node.textContent = message;

  document.getElementById('toast-container').appendChild(node);

  setTimeout(() => {
    node.remove();
  }, durationMs);
}
56
+
57
+ // ── GPU status widget ──────────────────────────────────────────────────
58
/**
 * Shorten a GPU name for the header by dropping the first occurrence of each
 * well-known vendor/series word (including its trailing space).
 *
 * @param {string} name - Full GPU name.
 * @returns {string} Abbreviated name.
 */
function shortName(name) {
  const droppedWords = ['NVIDIA ', 'GeForce ', 'Tesla ', 'Quadro '];
  return droppedWords.reduce((label, word) => label.replace(word, ''), name);
}
66
+
67
/**
 * Fetch `/api/gpu` and render one card per GPU into `#gpu-status`.
 *
 * Shows "GPU: N/A" when the server reports no GPU and "GPU: error" when the
 * request or rendering fails. Values taken from the response are HTML-escaped
 * before being placed into markup.
 */
async function updateGpuStatus() {
  const widget = document.getElementById('gpu-status');
  if (!widget) return;

  const HTML_ENTITIES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
  const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => HTML_ENTITIES[ch]);

  try {
    const resp = await api('/api/gpu');
    const data = await resp.json();

    if (!data.available || data.gpus.length === 0) {
      widget.innerHTML = '<span class="gpu-card-name"><span>GPU: N/A</span></span>';
      return;
    }

    widget.innerHTML = data.gpus.map((g) => {
      // A zero or missing total would give NaN/Infinity as the bar width.
      const ratio = g.memory_total_mb > 0 ? g.memory_used_mb / g.memory_total_mb : 0;
      const usedPct = Number.isFinite(ratio)
        ? Math.min(100, Math.max(0, Math.round(ratio * 100)))
        : 0;
      const fillClass = usedPct >= 85 ? 'hot' : usedPct >= 60 ? 'warm' : '';
      const usedGb = (g.memory_used_mb / 1000).toFixed(1);
      const totalGb = (g.memory_total_mb / 1000).toFixed(0);
      const utilHtml = g.utilization_gpu_pct != null
        ? `<span class="gpu-util-pct">${escapeHtml(g.utilization_gpu_pct)}%</span>`
        : '';

      return `<div class="gpu-card">
        <div class="gpu-card-name">
          <span title="${escapeHtml(g.name)}">${escapeHtml(shortName(g.name))}</span>${utilHtml}
        </div>
        <div class="gpu-mem-bar">
          <div class="gpu-mem-fill ${fillClass}" style="width:${usedPct}%"></div>
        </div>
        <div class="gpu-mem-label">${usedGb}/${totalGb} GB VRAM</div>
      </div>`;
    }).join('');
  } catch {
    // Best effort: the header widget must never break the rest of the UI.
    widget.innerHTML = '<span style="font-size:.75rem;color:var(--text-muted)">GPU: error</span>';
  }
}
100
+
101
+ // ── Zoom controls ──────────────────────────────────────────────────────
102
// Current zoom factor relative to the image's natural size (1.0 = 100%).
let zoomLevel = 1.0;
const ZOOM_STEP = 0.25;
let ZOOM_MIN = 0.25; // lowered per image in fitZoom() so large images are always reachable
const ZOOM_MAX = 4.0;

/**
 * Set the zoom factor, clamped to [ZOOM_MIN, ZOOM_MAX], and resize the page
 * image, the overlay canvas and the percentage label accordingly.
 *
 * Does nothing when the image is missing or not yet loaded, or when `level`
 * is not a finite number (which would otherwise leave zoomLevel as NaN).
 *
 * @param {number} level - Requested zoom factor.
 */
function applyZoom(level) {
  const img = document.getElementById('page-image');
  if (!img || !img.naturalWidth) return;
  if (!Number.isFinite(level)) return;

  zoomLevel = Math.max(ZOOM_MIN, Math.min(ZOOM_MAX, level));
  const width = `${Math.round(img.naturalWidth * zoomLevel)}px`;
  const height = `${Math.round(img.naturalHeight * zoomLevel)}px`;

  img.style.width = width;
  img.style.height = height;

  // The overlay and the label are optional: a missing one must not prevent
  // the image itself from being resized.
  const canvas = document.getElementById('overlay-canvas');
  if (canvas) {
    canvas.style.width = width;
    canvas.style.height = height;
  }

  const label = document.getElementById('zoom-level');
  if (label) label.textContent = `${Math.round(zoomLevel * 100)}%`;
}

/**
 * Zoom so the whole image fits inside `#viewer-scroll`, never above 100%.
 *
 * Also lowers ZOOM_MIN (down to 5%) when the fit level is below the default
 * minimum, so the fit level stays reachable with the zoom buttons.
 * Skipped while the viewer has no layout size (e.g. its panel is hidden),
 * because fitting against a 0x0 box would shrink the image to the 5% floor.
 */
export function fitZoom() {
  const img = document.getElementById('page-image');
  const scroll = document.getElementById('viewer-scroll');
  if (!img || !img.naturalWidth || !scroll) return;
  if (scroll.clientWidth <= 0 || scroll.clientHeight <= 0) return;

  const scaleW = scroll.clientWidth / img.naturalWidth;
  const scaleH = scroll.clientHeight / img.naturalHeight;
  const fit = Math.min(scaleW, scaleH, 1.0); // never zoom in beyond 100% on fit
  ZOOM_MIN = Math.min(0.25, Math.max(0.05, fit));
  applyZoom(fit);
}
136
+
137
/**
 * Wire the zoom toolbar buttons, Ctrl/Cmd + mouse-wheel zooming in the viewer,
 * and the bus events that reveal the toolbar.
 */
function initZoomControls() {
  const byId = (id) => document.getElementById(id);
  const revealToolbar = () => {
    byId('zoom-toolbar').classList.remove('hidden');
  };

  byId('btn-zoom-in').addEventListener('click', () => {
    applyZoom(zoomLevel + ZOOM_STEP);
  });
  byId('btn-zoom-out').addEventListener('click', () => {
    applyZoom(zoomLevel - ZOOM_STEP);
  });
  byId('btn-zoom-fit').addEventListener('click', fitZoom);

  // Wheel zoom is multiplicative and only active while Ctrl or Cmd is held.
  // The listener is non-passive because it calls preventDefault().
  byId('viewer-scroll').addEventListener('wheel', (event) => {
    const modifierHeld = event.ctrlKey || event.metaKey;
    if (!modifierHeld) return;
    event.preventDefault();
    const factor = event.deltaY < 0 ? 1.10 : 1 / 1.10;
    applyZoom(zoomLevel * factor);
  }, { passive: false });

  on('image-uploaded', () => {
    revealToolbar();
    // Fit after a short delay to let the image render.
    setTimeout(fitZoom, 80);
  });

  // A batch item shown in the viewer also needs the toolbar.
  on('batch-item-start', () => {
    revealToolbar();
  });
}
161
+
162
+ // ── Sticky engine config (localStorage) ───────────────────────────────
163
// localStorage key holding the name of the most recently saved engine.
const LS_ENGINE = 'polyscriptor_last_engine';
// localStorage key holding the saved config of one engine.
const LS_CONFIG = (name) => `polyscriptor_config_${name}`;

/**
 * Remember `engineName` as the last-used engine and store its config as JSON.
 * Storage failures (full storage, private mode) are ignored.
 *
 * @param {string} engineName
 * @param {*} configObj - JSON-serialisable configuration.
 */
export function saveEngineConfig(engineName, configObj) {
  try {
    localStorage.setItem(LS_ENGINE, engineName);
    const serialized = JSON.stringify(configObj);
    localStorage.setItem(LS_CONFIG(engineName), serialized);
  } catch {
    /* storage full or private mode */
  }
}

/**
 * @returns {string|null} Name of the last saved engine, or null when none is
 *   stored or storage is unavailable.
 */
export function loadSavedEngineName() {
  let saved = null;
  try {
    saved = localStorage.getItem(LS_ENGINE);
  } catch {
    saved = null;
  }
  return saved;
}

/**
 * @param {string} engineName
 * @returns {*} Parsed config for `engineName`, or null when nothing is stored,
 *   the stored text is not valid JSON, or storage is unavailable.
 */
export function loadSavedEngineConfig(engineName) {
  try {
    const raw = localStorage.getItem(LS_CONFIG(engineName));
    if (!raw) return null;
    return JSON.parse(raw);
  } catch {
    return null;
  }
}
183
+
184
+ // ── Mobile tab helper ───────────────────────────────────────────────────
185
/**
 * Mark the tab button and the panel that match `target` as active and clear
 * the marker on all others. Does nothing when the page has no `.tab-btn`.
 *
 * @param {string} target - Compared with `data-target` on tab buttons and
 *   `data-panel` on panels.
 */
function mobileActivateTab(target) {
  const buttons = document.querySelectorAll('.tab-btn');
  const panelEls = document.querySelectorAll('[data-panel]');
  if (buttons.length === 0) return;

  for (const button of buttons) {
    button.classList.toggle('active', button.dataset.target === target);
  }
  for (const panel of panelEls) {
    panel.classList.toggle('panel-active', panel.dataset.panel === target);
  }
}
192
+
193
+ // ── Resizable panels ───────────────────────────────────────────────────
194
+ const LS_PANEL_LEFT = 'polyscriptor_panel_left';
195
+ const LS_PANEL_RIGHT = 'polyscriptor_panel_right';
196
+
197
/**
 * Make the left and right side panels resizable by dragging `#resize-left`
 * and `#resize-right`.
 *
 * Widths are applied through the `--panel-left` / `--panel-right` custom
 * properties on the root element and persisted in localStorage. Storage is
 * accessed defensively so an unavailable store cannot abort initialisation.
 */
function initResizablePanels() {
  const app = document.getElementById('app');
  const handleLeft = document.getElementById('resize-left');
  const handleRight = document.getElementById('resize-right');
  if (!app || !handleLeft || !handleRight) return;

  const rootStyle = document.documentElement.style;

  const readStored = (key) => {
    try {
      return localStorage.getItem(key);
    } catch {
      return null; // storage unavailable
    }
  };
  const writeStored = (key, value) => {
    try {
      localStorage.setItem(key, value);
    } catch {
      /* storage full or unavailable — the width still applies for this session */
    }
  };

  // Only values in the "<integer>px" form this function writes are restored.
  const isPxWidth = (value) => typeof value === 'string' && /^\d+px$/.test(value);

  const savedLeft = readStored(LS_PANEL_LEFT);
  const savedRight = readStored(LS_PANEL_RIGHT);
  if (isPxWidth(savedLeft)) rootStyle.setProperty('--panel-left', savedLeft);
  if (isPxWidth(savedRight)) rootStyle.setProperty('--panel-right', savedRight);

  function startDrag(handle, isLeft) {
    handle.classList.add('dragging');
    document.body.style.cursor = 'col-resize';
    document.body.style.userSelect = 'none';

    const cssVar = isLeft ? '--panel-left' : '--panel-right';
    const storageKey = isLeft ? LS_PANEL_LEFT : LS_PANEL_RIGHT;
    let latest = null; // most recent width; persisted once when the drag ends

    const onMove = (e) => {
      const point = e.touches ? e.touches[0] : e;
      if (!point) return;

      const appRect = app.getBoundingClientRect();
      const x = point.clientX - appRect.left;
      const totalW = appRect.width;

      // Left panel: 160px .. 40% of the app; right panel: 200px .. 50%.
      const w = isLeft
        ? Math.max(160, Math.min(x, totalW * 0.4))
        : Math.max(200, Math.min(totalW - x, totalW * 0.5));
      latest = `${Math.round(w)}px`;
      rootStyle.setProperty(cssVar, latest);
    };

    const onUp = () => {
      handle.classList.remove('dragging');
      document.body.style.cursor = '';
      document.body.style.userSelect = '';
      document.removeEventListener('mousemove', onMove);
      document.removeEventListener('mouseup', onUp);
      document.removeEventListener('touchmove', onMove);
      document.removeEventListener('touchend', onUp);
      document.removeEventListener('touchcancel', onUp);
      if (latest !== null) writeStored(storageKey, latest);
    };

    document.addEventListener('mousemove', onMove);
    document.addEventListener('mouseup', onUp);
    document.addEventListener('touchmove', onMove, { passive: true });
    document.addEventListener('touchend', onUp);
    // An interrupted touch must also end the drag, otherwise the listeners
    // and the body cursor/selection styles would stay in place.
    document.addEventListener('touchcancel', onUp);
  }

  handleLeft.addEventListener('mousedown', (e) => { e.preventDefault(); startDrag(handleLeft, true); });
  handleRight.addEventListener('mousedown', (e) => { e.preventDefault(); startDrag(handleRight, false); });
  handleLeft.addEventListener('touchstart', () => startDrag(handleLeft, true), { passive: true });
  handleRight.addEventListener('touchstart', () => startDrag(handleRight, false), { passive: true });
}
253
+
254
+ // ── Keyboard shortcuts ─────────────────────────────────────────────────
255
/**
 * Install the global keyboard shortcuts:
 *   - Ctrl/Cmd + Enter: click `#btn-transcribe`
 *   - ArrowLeft / ArrowRight (no modifier): click `#btn-nav-prev` / `#btn-nav-next`
 *
 * Shortcuts are ignored while focus is in an input, textarea, select or
 * contenteditable element. Arrow keys are only consumed when the matching
 * navigation button exists and is enabled, so browser defaults (scrolling,
 * Alt+Arrow history navigation, Shift+Arrow selection) keep working otherwise.
 */
function initKeyboardShortcuts() {
  // Clicks the button and reports whether there was an enabled button to click.
  const clickIfEnabled = (id) => {
    const button = document.getElementById(id);
    if (!button || button.disabled) return false;
    button.click();
    return true;
  };

  document.addEventListener('keydown', (e) => {
    // Ignore when typing in an input / textarea / select / contenteditable
    const tag = e.target.tagName;
    const editable = e.target.isContentEditable;
    if (tag === 'INPUT' || tag === 'TEXTAREA' || tag === 'SELECT' || editable) return;

    // Ctrl+Enter — transcribe
    if ((e.ctrlKey || e.metaKey) && e.key === 'Enter') {
      e.preventDefault();
      document.getElementById('btn-transcribe')?.click();
      return;
    }

    // Modified arrow keys belong to the browser / OS.
    if (e.altKey || e.ctrlKey || e.metaKey || e.shiftKey) return;

    // ArrowLeft / ArrowRight — batch prev/next
    if (e.key === 'ArrowLeft' && clickIfEnabled('btn-nav-prev')) e.preventDefault();
    if (e.key === 'ArrowRight' && clickIfEnabled('btn-nav-next')) e.preventDefault();
  });
}
274
+
275
+ // ── Prevent browser from opening dropped files in a new tab ────────────
276
/**
 * Cancel the default action of document-level `dragover` and `drop` events so
 * a file dropped outside a drop zone is not opened by the browser.
 */
function initGlobalDropBlocker() {
  const cancelDefault = (event) => {
    event.preventDefault();
  };
  for (const type of ['dragover', 'drop']) {
    document.addEventListener(type, cancelDefault);
  }
}
280
+
281
+ // ── Init ───────────────────────────────────────────────────────────────
282
// Application bootstrap: initialise every component once the DOM is parsed,
// start GPU status polling, and register the mobile auto tab switching.
document.addEventListener('DOMContentLoaded', () => {
  const GPU_REFRESH_MS = 15000; // refresh every 15s

  // Bus event -> mobile tab that should come to the front when it fires.
  const tabForEvent = {
    'image-uploaded': 'image',
    'segment-preview': 'image',
    'transcription-start': 'results',
  };

  initEnginePanel();
  initImageViewer();
  initTranscriptionPanel();
  initBatchPanel();
  initZoomControls();
  initResizablePanels();
  initKeyboardShortcuts();
  initGlobalDropBlocker();

  updateGpuStatus();
  setInterval(updateGpuStatus, GPU_REFRESH_MS);

  for (const [eventName, tab] of Object.entries(tabForEvent)) {
    on(eventName, () => mobileActivateTab(tab));
  }
});
web/static/components/batch-panel.js ADDED
@@ -0,0 +1,735 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * Batch Panel — multi-image queue, sequential processing, combined export
3
+ *
4
+ * Activated when the user selects/drops multiple images.
5
+ * Each item is processed using the existing upload + transcribe flow.
6
+ * Results are stored per-item and can be exported as combined TXT or CSV.
7
+ */
8
+
9
+ import { state, emit, on, api, toast } from '../app.js';
10
+
11
+ const $ = id => document.getElementById(id);
12
+
13
+ // Batch state (separate from state.lines which tracks the current single image)
14
/**
 * Queue state of the batch panel. Kept apart from `state.lines`, which tracks
 * the single image currently shown.
 */
const batch = {
  // Queue entries: { file, imageId, status, lines, filename }
  items: [],
  running: false,
  cancelled: false,
  // Index of the item currently shown in the viewer
  currentIndex: -1,
  // Index of the item currently being transcribed (may differ when the user navigates away)
  processingIndex: -1,
  // True once the user manually navigated away from auto-advance
  userNavigated: false,
  abortController: null,
};
23
+
24
/**
 * Wire up the batch panel: file/XML input interception, drag-and-drop,
 * PDF page intake, export/navigation buttons, persisted checkboxes and
 * batch cancellation.
 *
 * The input and drop listeners are registered in the capture phase so they run
 * before the image viewer's listeners and can call stopImmediatePropagation()
 * when the batch panel takes over an upload.
 */
export function initBatchPanel() {
  const hasExt = (file, ext) => file.name.toLowerCase().endsWith(ext);
  const isPdf = (file) => hasExt(file, '.pdf');
  const isXml = (file) => hasExt(file, '.xml');

  // localStorage may be unavailable (e.g. private mode); a failure here must
  // not abort the rest of the wiring below.
  const readPref = (key) => {
    try {
      return localStorage.getItem(key);
    } catch {
      return null;
    }
  };
  const writePref = (key, value) => {
    try {
      localStorage.setItem(key, String(value));
    } catch {
      /* preference simply is not persisted */
    }
  };

  // ── File input: multiple files, PDFs, or a second image ────────────────
  const fileInput = $('file-input');
  fileInput.addEventListener('change', (e) => {
    const files = Array.from(fileInput.files);
    const hasPdf = files.some(isPdf);
    // Intercept: multiple files, a PDF, or a single image when one is already loaded.
    // A single non-PDF image with no existing image is left to image-viewer.js.
    if (files.length > 1 || hasPdf || (files.length === 1 && state.imageId)) {
      e.stopImmediatePropagation(); // keep image-viewer from uploading it as well
      handleMultipleFiles(files);
      fileInput.value = '';
    }
  }, true);

  // ── XML input: multiple files from the Upload XML button ───────────────
  const xmlInput = $('xml-input');
  xmlInput.addEventListener('change', (e) => {
    if (xmlInput.files.length <= 1) return; // single XML → image-viewer handles normally
    e.stopImmediatePropagation();
    uploadXmlFiles(Array.from(xmlInput.files));
    xmlInput.value = '';
  }, true);

  // ── Drag-drop on the upload area ───────────────────────────────────────
  const uploadArea = $('upload-area');
  uploadArea.addEventListener('drop', (e) => {
    const files = Array.from(e.dataTransfer.files);
    const xmlFiles = files.filter(isXml);
    const nonXml = files.filter((f) => !isXml(f));
    const hasPdf = nonXml.some(isPdf);

    // Take over if: multiple images, a PDF, a second image on top of an existing
    // one, multiple XMLs, or a single XML while a batch queue exists.
    const takeBatch = nonXml.length > 1 || hasPdf || (nonXml.length === 1 && state.imageId);
    const takeXml = xmlFiles.length > 1 || (xmlFiles.length === 1 && batch.items.length > 0);

    if (takeBatch || takeXml) {
      e.preventDefault();
      e.stopImmediatePropagation();
      // Images are queued first so the XML files can be matched against them.
      if (nonXml.length > 0) handleMultipleFiles(nonXml);
      if (xmlFiles.length > 0) uploadXmlFiles(xmlFiles);
    }
  }, true);

  // ── PDF pages announced on the bus: add them to the batch ──────────────
  on('pdf-pages-ready', (data) => {
    const existing = new Set(batch.items.map((i) => i.filename));
    for (const page of data.pages) {
      if (!existing.has(page.filename)) {
        batch.items.push({
          file: null,
          imageId: page.image_id,
          status: 'pending',
          lines: [],
          filename: page.filename,
          preUploaded: true,
        });
        existing.add(page.filename);
      }
    }
    if (batch.items.length > 0) {
      renderQueue();
      // PDF pages are already uploaded — always preview the first one directly,
      // bypassing the state.imageId guard in previewFirstBatchItem().
      const first = batch.items[0];
      if (first && first.preUploaded && first.imageId) {
        batch.currentIndex = 0;
        emit('batch-item-start', { imageId: first.imageId, filename: first.filename });
        updateNavButtons();
      }
    }
  });

  // ── Buttons ────────────────────────────────────────────────────────────
  $('btn-process-batch').addEventListener('click', processBatch);
  $('btn-clear-batch').addEventListener('click', clearBatch);
  $('btn-export-batch-txt').addEventListener('click', exportAllTxt);
  $('btn-export-batch-csv').addEventListener('click', exportAllCsv);
  $('btn-export-batch-txt-zip').addEventListener('click', exportAllTxtZip);
  $('btn-export-batch-thinking-zip').addEventListener('click', exportAllThinkingZip);
  $('btn-export-batch-xml').addEventListener('click', exportAllXml);

  $('btn-nav-prev').addEventListener('click', () => navigate(-1));
  $('btn-nav-next').addEventListener('click', () => navigate(+1));

  // ── Persist PAGE XML and resume checkboxes across sessions ─────────────
  const usePageXmlEl = $('batch-use-pagexml');
  const resumeEl = $('batch-resume');
  const savedPageXml = readPref('batch_use_pagexml');
  const savedResume = readPref('batch_resume');
  if (savedPageXml !== null) usePageXmlEl.checked = savedPageXml === 'true';
  if (savedResume !== null) resumeEl.checked = savedResume === 'true';
  usePageXmlEl.addEventListener('change', () => writePref('batch_use_pagexml', usePageXmlEl.checked));
  resumeEl.addEventListener('change', () => writePref('batch_resume', resumeEl.checked));

  // ── Cancel during batch: abort the in-flight request and stop the queue ─
  $('btn-cancel').addEventListener('click', () => {
    if (!batch.running) return;
    batch.cancelled = true;
    batch.abortController?.abort();
  }, { capture: true });
}
127
+
128
+ // ── XML matching for batch ────────────────────────────────────────────────────
129
+
130
/**
 * Match XML files to batch items by filename stem (e.g. page001.xml → page001.jpg).
 *
 * For an item that already has an image id the XML is POSTed to
 * `/api/image/<id>/xml` right away; otherwise the file is stored on the item
 * as `xmlFile`. A summary toast reports how many files were uploaded, queued,
 * unmatched and failed.
 *
 * @param {File[]} xmlFiles - XML files selected or dropped by the user.
 */
async function uploadXmlFiles(xmlFiles) {
  if (!xmlFiles.length) return;
  const stem = (name) => name.replace(/\.[^/.]+$/, '').toLowerCase();

  let matched = 0;
  let deferred = 0;
  let skipped = 0;
  let failed = 0;

  for (const xml of xmlFiles) {
    const xmlStem = stem(xml.name);
    const item = batch.items.find((it) => stem(it.filename) === xmlStem);
    if (!item) {
      skipped++;
      continue;
    }

    if (!item.imageId) {
      // Image not yet uploaded — keep the XML on the item for later.
      item.xmlFile = xml;
      deferred++;
      continue;
    }

    // Already uploaded → send to server immediately
    try {
      const fd = new FormData();
      fd.append('file', xml);
      const resp = await fetch(`/api/image/${item.imageId}/xml`, { method: 'POST', body: fd });
      if (!resp.ok) {
        // The error body is not guaranteed to be JSON or to carry `detail`.
        const body = await resp.json().catch(() => null);
        const detail = body?.detail;
        const reason = typeof detail === 'string' && detail
          ? detail
          : detail != null
            ? JSON.stringify(detail)
            : resp.statusText || `HTTP ${resp.status}`;
        throw new Error(reason);
      }
      item.xmlUploaded = true;
      matched++;
    } catch (err) {
      failed++;
      toast(`XML ${xml.name}: ${err.message}`, 'error');
    }
  }

  const parts = [];
  if (matched > 0) parts.push(`${matched} uploaded`);
  if (deferred > 0) parts.push(`${deferred} queued for batch`);
  if (skipped > 0) parts.push(`${skipped} unmatched`);
  // Without this the summary would be empty when every upload failed.
  if (failed > 0) parts.push(`${failed} failed`);
  toast(`XML files: ${parts.join(', ')}`, matched + deferred > 0 ? 'success' : 'error');
}
167
+
168
+ // ── Queue management ─────────────────────────────────────────────────────────
169
+
170
/**
 * Add files to the batch queue, render the queue and start pre-uploading.
 *
 * When the queue is empty and a single image is already loaded, that image is
 * added first as a pre-uploaded item. Files whose name is already queued —
 * including an earlier file of the same call — are skipped.
 *
 * @param {File[]} files - Image/PDF files selected or dropped by the user.
 */
function handleMultipleFiles(files) {
  // If a single image is already loaded (not yet in batch), add it first
  if (batch.items.length === 0 && state.imageId) {
    batch.items.push({
      file: null,
      imageId: state.imageId,
      status: 'pending',
      lines: state.lines.length ? state.lines : [],
      filename: (state.imageInfo && state.imageInfo.filename) || 'current image',
      preUploaded: true,
    });
  }

  // Add new files, skipping duplicates by name. The set is updated as items
  // are added so duplicates inside `files` itself are caught as well.
  const queuedNames = new Set(batch.items.map((i) => i.filename));
  for (const f of files) {
    if (queuedNames.has(f.name)) continue;
    queuedNames.add(f.name);
    batch.items.push({ file: f, imageId: null, status: 'pending', lines: [], filename: f.name });
  }

  if (batch.items.length > 0) {
    renderQueue();
    previewFirstBatchItem();
  }
}
190
+
191
/**
 * Pre-upload every queued item that still has a local file, expanding PDFs
 * into one pre-uploaded item per page, and show the first queue entry in the
 * viewer when no image is loaded yet.
 *
 * Does nothing while a batch run is in progress. Upload failures are logged
 * and the item is left as it was.
 */
async function previewFirstBatchItem() {
  if (batch.running) return;

  const showInViewer = (entry) => {
    batch.currentIndex = 0;
    emit('batch-item-start', { imageId: entry.imageId, filename: entry.filename });
    updateNavButtons();
  };

  // Every pass either advances `i` or turns the item at `i` into pre-uploaded
  // entries (skipped on the next pass), so the loop ends without an iteration
  // cap — a cap would silently stop pre-uploading on large batches.
  let i = 0;
  while (i < batch.items.length) {
    const item = batch.items[i];

    // Already on the server, or nothing local to upload.
    if (!item.file || (item.preUploaded && item.imageId)) {
      i++;
      continue;
    }

    try {
      const fd = new FormData();
      fd.append('file', item.file);
      const resp = await fetch('/api/image/upload', { method: 'POST', body: fd });
      if (!resp.ok) {
        console.error(`Pre-upload of ${item.filename} failed: HTTP ${resp.status}`);
        i++;
        continue;
      }
      const data = await resp.json();

      // The queue may have changed while the upload was in flight; locate the
      // item again instead of trusting the index from before the await.
      const at = batch.items.indexOf(item);
      if (at === -1) continue; // removed or queue cleared — re-examine index i

      if (data.is_pdf) {
        const pages = data.pages.map((p) => ({
          file: null,
          imageId: p.image_id,
          status: 'pending',
          lines: [],
          filename: p.filename,
          preUploaded: true,
        }));
        batch.items.splice(at, 1, ...pages);
        renderQueue();
        // A PDF at the head of the queue: preview its first page, like any
        // other first item.
        if (at === 0 && !state.imageId && pages.length > 0) showInViewer(pages[0]);
        continue;
      }

      item.imageId = data.image_id;
      item.preUploaded = true;
      renderQueue();

      if (at === 0 && !state.imageId) showInViewer(item);
      // No increment: the item is now pre-uploaded and is skipped next pass.
    } catch (err) {
      console.error('Error pre-uploading batch item:', err);
      i++;
    }
  }
}
243
+
244
/**
 * Empty the batch queue and hide the queue and export sections.
 * Ignored while a batch run is in progress.
 */
function clearBatch() {
  if (!batch.running) {
    batch.items = [];
    batch.currentIndex = -1;
    for (const sectionId of ['batch-queue-section', 'batch-export-row']) {
      $(sectionId).classList.add('hidden');
    }
    updateNavButtons();
  }
}
252
+
253
// Index of the queue row currently being dragged; null when no row drag is in progress.
let _dragSrcIndex = null;

/**
 * Move one queue entry from index `from` to index `to` and keep
 * batch.currentIndex pointing at the same item it pointed at before.
 */
function _moveBatchItem(from, to) {
  const [moved] = batch.items.splice(from, 1);
  batch.items.splice(to, 0, moved);

  const cur = batch.currentIndex;
  if (cur === from) {
    batch.currentIndex = to;
  } else if (from < to) {
    if (cur > from && cur <= to) batch.currentIndex--;
  } else if (cur >= to && cur < from) {
    batch.currentIndex++;
  }
}

/**
 * Rebuild the batch queue list from batch.items.
 *
 * Each row shows a drag handle, the filename and a status badge. Rows for
 * done items reload the item on click; rows for pending items that already
 * have an imageId load the image into the viewer. When no batch is running at
 * render time, rows are draggable to reorder the queue. Finally the export row
 * is shown only if some item is done and the nav buttons are refreshed.
 */
function renderQueue() {
  const section = $('batch-queue-section');
  const list = $('batch-list');
  section.classList.remove('hidden');
  list.innerHTML = '';

  const clearDropMarkers = () =>
    list.querySelectorAll('.batch-item').forEach(r => r.classList.remove('batch-drag-over'));

  batch.items.forEach((item, i) => {
    const row = document.createElement('div');
    row.className = 'batch-item';
    row.id = `batch-item-${i}`;
    row.dataset.index = i;

    // Drag handle
    const handle = document.createElement('span');
    handle.className = 'batch-drag-handle';
    handle.textContent = '⠿';
    handle.title = 'Drag to reorder';

    const name = document.createElement('span');
    name.className = 'batch-item-name';
    name.title = item.filename;
    name.textContent = item.filename;

    const status = document.createElement('span');
    status.className = 'batch-status';
    status.id = `batch-status-${i}`;
    _setStatusEl(status, item.status, item.lines.length);

    row.appendChild(handle);
    row.appendChild(name);
    row.appendChild(status);

    // Click a done item to reload it, or a preUploaded pending item to load it for manual transcription
    const canPreview = item.status === 'done' || (item.preUploaded && item.imageId);
    if (canPreview) {
      row.style.cursor = 'pointer';
      row.addEventListener('click', e => {
        if (e.target === handle) return; // don't trigger on drag handle click
        if (item.status === 'done') {
          loadBatchItem(i);
        } else {
          // Load preUploaded pending page so the user can manually segment/transcribe it
          batch.currentIndex = i;
          emit('batch-item-start', { imageId: item.imageId, filename: item.filename });
          updateNavButtons();
        }
      });
    }

    // Drag-to-reorder (only when not running)
    if (!batch.running) {
      row.draggable = true;
      row.addEventListener('dragstart', e => {
        _dragSrcIndex = i;
        e.dataTransfer.effectAllowed = 'move';
        row.classList.add('batch-dragging');
      });
      row.addEventListener('dragend', () => {
        row.classList.remove('batch-dragging');
        clearDropMarkers();
        // A drag that ended without a drop on a row must not leave a stale
        // source index behind for the next, unrelated drop.
        _dragSrcIndex = null;
      });
      row.addEventListener('dragover', e => {
        e.preventDefault();
        e.dataTransfer.dropEffect = 'move';
        clearDropMarkers();
        row.classList.add('batch-drag-over');
      });
      row.addEventListener('dragleave', () => row.classList.remove('batch-drag-over'));
      row.addEventListener('drop', e => {
        e.preventDefault();
        row.classList.remove('batch-drag-over');
        const srcIndex = _dragSrcIndex;
        _dragSrcIndex = null;
        const destIndex = parseInt(row.dataset.index, 10);
        // Rows rendered before a run started keep their listeners, so re-check
        // the running flag here: reordering would invalidate the run's indices.
        if (batch.running || srcIndex == null || srcIndex === destIndex) return;

        _moveBatchItem(srcIndex, destIndex);
        renderQueue();
      });
    }

    list.appendChild(row);
  });

  // Show export row if any item is done
  const anyDone = batch.items.some(i => i.status === 'done');
  $('batch-export-row').classList.toggle('hidden', !anyDone);
  updateNavButtons();
}
354
+
355
/**
 * Render a queue item's status into its badge element.
 *
 * Resets the element's class to `batch-status`, then sets the label and adds a
 * modifier class for the `active`, `done` and `error` states. An unrecognised
 * status only resets the class.
 *
 * @param {HTMLElement} el - badge element to update
 * @param {string} status - 'pending' | 'active' | 'done' | 'error'
 * @param {number} lineCount - shown in the label of a done item
 */
function _setStatusEl(el, status, lineCount) {
  el.className = 'batch-status';
  switch (status) {
    case 'pending':
      el.textContent = 'pending';
      break;
    case 'active':
      el.textContent = 'running…';
      el.classList.add('active');
      break;
    case 'done':
      el.textContent = `✓ ${lineCount} lines`;
      el.classList.add('done');
      break;
    case 'error':
      el.textContent = 'error';
      el.classList.add('error');
      break;
    default:
      break;
  }
}
362
+
363
/**
 * Record a new status on a queue item and refresh its badge if one is rendered.
 *
 * @param {number} index - position of the item in batch.items
 * @param {string} status - new status value
 * @param {number} [lineCount=0] - line count passed on to the badge
 */
function updateItemStatus(index, status, lineCount = 0) {
  const entry = batch.items[index];
  entry.status = status;

  const badge = $(`batch-status-${index}`);
  if (!badge) return;
  _setStatusEl(badge, status, lineCount);
}
368
+
369
/**
 * Show "current / total" in the overall progress element, or hide and clear
 * it when called without a current value.
 *
 * @param {?number} [current=null] - items finished so far; null/undefined hides the element
 * @param {?number} [total=null] - total shown after the slash
 */
function updateOverallProgress(current = null, total = null) {
  const counter = $('batch-overall-progress');
  if (current != null) {
    counter.textContent = `${current} / ${total}`;
    counter.classList.remove('hidden');
    return;
  }
  counter.classList.add('hidden');
  counter.textContent = '';
}
379
+
380
/**
 * Enable/disable the previous/next buttons and refresh the "pos/total" label.
 *
 * A button is enabled when a done item exists on that side of
 * batch.currentIndex. The label shows the current item's 1-based position
 * among done items and is empty when there is no current item or no done item.
 */
function updateNavButtons() {
  const isDone = item => item.status === 'done';
  const done = batch.items.filter(isDone);
  const hasBatch = done.length > 0;
  const idx = batch.currentIndex;
  // Allow navigation to done items even while batch is running.
  // idx can be -1 (no current item); slice(0, -1) would then mean "all but the
  // last item", so "previous" is only considered for idx > 0.
  const prevDone = hasBatch && idx > 0 && batch.items.slice(0, idx).some(isDone);
  const nextDone = hasBatch && batch.items.slice(idx + 1).some(isDone);
  $('btn-nav-prev').disabled = !prevDone;
  $('btn-nav-next').disabled = !nextDone;
  const label = $('batch-nav-label');
  if (hasBatch && idx >= 0) {
    const pos = done.indexOf(batch.items[idx]) + 1;
    label.textContent = `${pos}/${done.length}`;
  } else {
    label.textContent = '';
  }
}
397
+
398
/**
 * Step through done items relative to the item currently shown.
 *
 * @param {number} delta - positive to move forward, negative to move back
 *   (number of done items to step over)
 *
 * When the current item is itself done, this moves `delta` positions within
 * the list of done items. When it is not done (or nothing is current), the
 * step is taken from the current item's position in the queue, so "next"
 * lands on the first done item after it and "previous" on the last done item
 * before it. Does nothing if there is no such item.
 */
function navigate(delta) {
  const indices = batch.items
    .map((item, i) => item.status === 'done' ? i : -1)
    .filter(i => i >= 0);
  if (indices.length === 0) return;

  const here = batch.currentIndex;
  const cur = indices.indexOf(here);
  let next;
  if (cur >= 0) {
    next = indices[cur + delta];
  } else {
    // Current item is not a done item: find where it would sit among them.
    const after = indices.findIndex(i => i > here);
    const slot = after === -1 ? indices.length : after;
    next = delta > 0 ? indices[slot + delta - 1] : indices[slot + delta];
  }
  if (next != null) loadBatchItem(next);
}
407
+
408
+ // ── Processing ───────────────────────────────────────────────────────────────
409
+
410
/**
 * Run the batch: upload (if needed) and transcribe every queue item that is
 * not already done, in queue order.
 *
 * Requires a loaded engine (shows a toast otherwise) and is a no-op while a
 * run is already in progress. Segmentation options are read from the form
 * controls once, at the start of the run. Per item:
 *   1. upload the file unless it already has a server-side imageId; a PDF
 *      response replaces the item with one pre-uploaded item per page,
 *   2. upload a deferred XML file if one is attached (best effort),
 *   3. show the item in the viewer unless the user navigated elsewhere,
 *   4. transcribe via transcribeSSE and store lines, timing and token usage.
 * A failure marks the item 'error' and the run continues; an abort or a
 * cancelled transcription puts the item back to 'pending' and stops the run.
 * UI state is restored in a `finally`, then a summary toast and a
 * 'batch-complete' event are emitted.
 *
 * @returns {Promise<void>}
 */
async function processBatch() {
  if (batch.running || !state.engineLoaded) {
    if (!state.engineLoaded) toast('Load an engine first', 'error');
    return;
  }
  batch.running = true;
  batch.cancelled = false;
  batch.userNavigated = false; // reset: auto-advance viewer from scratch
  $('btn-process-batch').disabled = true;
  $('btn-cancel').classList.remove('hidden');

  try {
    const segMethod = $('seg-method').value;
    const segDevice = $('seg-device').value;
    const maxColumns = parseInt($('seg-max-columns')?.value || '6', 10);
    const splitWidth = parseFloat($('seg-split-width')?.value || '40') / 100;
    const textDirection = $('seg-text-direction')?.value || 'horizontal-lr';
    const usePageXml = $('batch-use-pagexml').checked;

    // The loop below processes every item that is not done (including earlier
    // errors), so the progress total counts exactly those.
    let total = batch.items.filter(i => i.status !== 'done').length;
    let doneThisRun = 0;
    updateOverallProgress(0, total);

    for (let i = 0; i < batch.items.length; i++) {
      if (batch.cancelled) break; // remaining items simply stay pending

      const item = batch.items[i];
      if (item.status === 'done') continue;

      batch.processingIndex = i;
      updateItemStatus(i, 'active');
      updateNavButtons();

      try {
        // 1. Upload image (skip if already uploaded, e.g. PDF page pre-rendered by server)
        if (!(item.preUploaded && item.imageId)) {
          if (!item.file) throw new Error('No file to upload');
          const fd = new FormData();
          fd.append('file', item.file);
          const upResp = await fetch('/api/image/upload', { method: 'POST', body: fd });
          if (!upResp.ok) throw new Error(`Upload failed: ${upResp.statusText}`);
          const upData = await upResp.json();
          // PDF uploaded directly: replace the placeholder with its pages
          // (same expansion previewFirstBatchItem performs).
          if (upData.is_pdf) {
            const newItems = (upData.pages || []).map(p => ({
              file: null, imageId: p.image_id, status: 'pending',
              lines: [], filename: p.filename, preUploaded: true,
            }));
            batch.items.splice(i, 1, ...newItems);
            if (batch.currentIndex > i) batch.currentIndex += newItems.length - 1;
            total += newItems.length - 1;
            updateOverallProgress(doneThisRun, total);
            renderQueue();
            i--; // revisit index i, which now holds the first page (or the next item)
            continue;
          }
          item.imageId = upData.image_id;
        }

        // Upload deferred XML if one was matched earlier (non-fatal on failure)
        if (item.xmlFile && item.imageId) {
          try {
            const fd = new FormData();
            fd.append('file', item.xmlFile);
            const xmlResp = await fetch(`/api/image/${item.imageId}/xml`, { method: 'POST', body: fd });
            item.xmlUploaded = xmlResp.ok;
          } catch (xmlErr) {
            console.warn('Deferred XML upload failed:', xmlErr);
          }
        }

        // Show in viewer — skip if user manually navigated to a different item
        if (!batch.userNavigated) {
          batch.currentIndex = i;
          emit('batch-item-start', { imageId: item.imageId, filename: item.filename });
        }

        // 2. Transcribe via SSE (abortable)
        batch.abortController = new AbortController();
        const result = await transcribeSSE(
          item.imageId, segMethod, segDevice, maxColumns, splitWidth, usePageXml, batch.abortController.signal, textDirection
        );

        if (result.cancelled) {
          // The stream ended with a 'cancelled' event: the lines are partial,
          // so the item is not done. Stop the run like a client-side abort.
          batch.cancelled = true;
          updateItemStatus(i, 'pending');
        } else {
          item.lines = result.lines;
          item.time_s = result.time_s;
          item.token_usage = result.token_usage;
          updateItemStatus(i, 'done', result.lines.length);
          doneThisRun++;
          updateOverallProgress(doneThisRun, total);
          // Fire sse-complete so the panel shows footer, column toggle, confidence filter, etc.
          if (batch.currentIndex === i) {
            emit('sse-complete', { lines: item.lines, total_time_s: item.time_s, engine: '(batch)', token_usage: item.token_usage });
          }
        }
      } catch (err) {
        if (err.name === 'AbortError' || batch.cancelled) {
          updateItemStatus(i, 'pending');
        } else {
          updateItemStatus(i, 'error');
          toast(`${item.filename}: ${err.message}`, 'error');
        }
      }

      // Re-render to make done items clickable
      renderQueue();
    }
  } finally {
    // Always leave the UI usable, even if something above threw unexpectedly.
    batch.running = false;
    batch.processingIndex = -1;
    batch.userNavigated = false;
    batch.abortController = null;
    $('btn-process-batch').disabled = false;
    $('btn-cancel').classList.add('hidden');
    updateOverallProgress(null);
  }

  const doneCount = batch.items.filter(i => i.status === 'done').length;
  // Same rule renderQueue applies: export controls only when something is done.
  $('batch-export-row').classList.toggle('hidden', doneCount === 0);
  updateNavButtons();

  if (batch.cancelled) {
    toast(`Batch cancelled — ${doneCount} image(s) done`, 'info', 4000);
  } else {
    toast(`Batch complete: ${doneCount}/${batch.items.length} images`, 'success', 5000);
  }
  emit('batch-complete', { items: batch.items });
}
535
+
536
/**
 * Read the current values of the engine config form.
 *
 * Every element with a `data-key` attribute inside #config-form contributes
 * one entry, except elements marked with `data-save-for` or
 * `data-password-field`. Checkboxes yield booleans, number inputs are passed
 * through Number(), everything else yields the raw value string.
 *
 * @returns {Object} key → value map; empty when the form is not in the page
 */
function _collectLiveOverrides() {
  const overrides = {};
  const form = document.getElementById('config-form');
  if (form) {
    form.querySelectorAll('[data-key]').forEach(field => {
      const { key, saveFor, passwordField } = field.dataset;
      if (saveFor || passwordField) return;
      switch (field.type) {
        case 'checkbox':
          overrides[key] = field.checked;
          break;
        case 'number':
          overrides[key] = Number(field.value);
          break;
        default:
          overrides[key] = field.value;
      }
    });
  }
  return overrides;
}
550
+
551
/**
 * POST a transcription request and consume the server-sent event stream.
 *
 * Events handled (each needs an `event:` and a `data:` line, data being JSON):
 *   - progress:     collects `data.line`; forwarded as 'sse-progress' when the
 *                   item being processed is the one shown in the viewer
 *   - segmentation: stores bboxes/regions on the item being processed and
 *                   forwards 'sse-segmentation' under the same condition
 *   - complete:     resolves with the collected lines
 *   - cancelled:    resolves with `cancelled: true`
 *   - error:        rejects with `data.message`
 * The stream ending without a terminal event also resolves.
 *
 * @param {string} imageId
 * @param {string} segMethod
 * @param {string} segDevice
 * @param {number} maxColumns
 * @param {number} [splitWidthFraction=0.4]
 * @param {boolean} [usePageXml=true]
 * @param {?AbortSignal} [signal=null] - aborts the request when triggered
 * @param {string} [textDirection='horizontal-lr']
 * @returns {Promise<{lines: Array, time_s: number, token_usage: *, cancelled: boolean}>}
 *   `time_s` is measured from the first progress event, rounded to 0.1 s.
 * @throws {Error} on a non-OK response, an 'error' event, or malformed JSON
 */
async function transcribeSSE(imageId, segMethod, segDevice, maxColumns, splitWidthFraction = 0.4, usePageXml = true, signal = null, textDirection = 'horizontal-lr') {
  const lines = [];
  let startTime = null;
  let lastTokenUsage = null;
  const body = JSON.stringify({
    image_id: imageId, seg_method: segMethod,
    seg_device: segDevice, max_columns: maxColumns,
    split_width_fraction: splitWidthFraction,
    text_direction: textDirection,
    use_pagexml: usePageXml,
    engine_config_overrides: _collectLiveOverrides(),
  });

  const summary = (cancelled = false) => {
    const time_s = startTime ? Math.round((Date.now() - startTime) / 100) / 10 : 0;
    return { lines, time_s, token_usage: lastTokenUsage, cancelled };
  };

  const resp = await fetch('/api/transcribe', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body,
    signal,
  });
  if (!resp.ok) throw new Error(resp.statusText);

  const reader = resp.body.getReader();
  const decoder = new TextDecoder();
  let buf = '';

  try {
    for (;;) {
      const { done, value } = await reader.read();
      if (done) return summary();

      // Normalise CRLF on the whole buffer so a pair split across two network
      // chunks is still recognised once both halves have arrived.
      buf = (buf + decoder.decode(value, { stream: true })).replace(/\r\n/g, '\n');
      const parts = buf.split('\n\n');
      buf = parts.pop(); // possibly incomplete trailing event

      for (const chunk of parts) {
        const fields = chunk.split('\n');
        const evLine = fields.find(l => l.startsWith('event:'));
        const dataLine = fields.find(l => l.startsWith('data:'));
        if (!evLine || !dataLine) continue;
        // The space after the colon is optional, so cut at the colon and trim.
        const event = evLine.slice('event:'.length).trim();
        const data = JSON.parse(dataLine.slice('data:'.length).trim());

        if (event === 'progress') {
          if (!startTime) startTime = Date.now();
          if (data.token_usage) lastTokenUsage = data.token_usage;
          lines.push(data.line);
          // Only stream to panel when user is watching this item
          if (batch.currentIndex === batch.processingIndex) emit('sse-progress', data);
        } else if (event === 'segmentation') {
          // Store bboxes/regions so loadBatchItem can restore them later
          const target = batch.items[batch.processingIndex];
          if (target) {
            target.bboxes = data.bboxes || [];
            target.regions = data.regions || [];
          }
          if (batch.currentIndex === batch.processingIndex) emit('sse-segmentation', data);
        } else if (event === 'complete') {
          if (data.token_usage) lastTokenUsage = data.token_usage;
          return summary();
        } else if (event === 'error') {
          throw new Error(data.message);
        } else if (event === 'cancelled') {
          return summary(true);
        }
      }
    }
  } finally {
    // Best effort: stop reading once the outcome is known. A failure here
    // (e.g. the request was already aborted) does not change the result.
    reader.cancel().catch(() => {});
  }
}
620
+
621
+ // Load a completed batch item back into the viewer / results panel
622
/**
 * Show a finished batch item in the viewer and rebuild the results panel for it.
 *
 * Ignored unless the item's status is 'done'. Marks the item as current, flags
 * that the user navigated manually, then replays the stored result through the
 * same events a live transcription emits: 'batch-item-start',
 * 'sse-segmentation', one 'sse-progress' per line and finally 'sse-complete'.
 * state.lines is replaced with copies of the item's lines carrying their
 * position as `index`.
 *
 * @param {number} index - position of the item in batch.items
 */
function loadBatchItem(index) {
  const item = batch.items[index];
  if (item.status !== 'done') return;

  const { imageId, filename, lines } = item;
  batch.currentIndex = index;
  batch.userNavigated = true; // user left auto-advance mode
  emit('batch-item-start', { imageId, filename });
  updateNavButtons();

  // batch-item-start clears the viewer's boxes; replay the stored segmentation
  // so clicking a line can highlight it again.
  emit('sse-segmentation', {
    num_lines: lines.length,
    bboxes: item.bboxes || [],
    regions: item.regions || [],
    source: 'batch-restore',
  });

  // Exports and the confidence filter read from state.lines
  state.lines = lines.map((line, position) => ({ ...line, index: position }));

  // Start from an empty panel, then re-emit each line to rebuild it
  $('transcription-lines').innerHTML = '';
  $('conf-filter-row').classList.add('hidden');
  const total = state.lines.length;
  for (const line of state.lines) {
    emit('sse-progress', { current: line.index + 1, total, line });
  }

  emit('sse-complete', {
    lines: state.lines,
    total_time_s: item.time_s || 0,
    engine: '(batch)',
    token_usage: item.token_usage || null,
  });
}
644
+
645
+ // ── Export ────────────────────────────────────────────────────────────────────
646
+
647
/**
 * Download the text of all done items as one plain-text file.
 * Each item becomes a "=== filename ===" header followed by its lines; items
 * are separated by a blank line. Does nothing when no item is done.
 */
function exportAllTxt() {
  const sections = [];
  for (const item of batch.items) {
    if (item.status !== 'done') continue;
    const lineTexts = item.lines.map(l => l.text);
    sections.push(`=== ${item.filename} ===\n${lineTexts.join('\n')}`);
  }
  if (sections.length === 0) return;
  downloadFile('batch_transcription.txt', sections.join('\n\n'), 'text/plain');
}
655
+
656
/**
 * Download all done items as one CSV file with the columns
 * File, Line, Text, Confidence.
 *
 * The line number is the line's `index` + 1 when the line carries an index,
 * otherwise its position within the item. Text and filename are quoted with
 * embedded quotes doubled; a missing text becomes an empty field. Confidence
 * is written with four decimals, or left empty when absent.
 * Does nothing when no item is done.
 */
function exportAllCsv() {
  const done = batch.items.filter(i => i.status === 'done');
  if (!done.length) return;

  const quote = value => `"${String(value ?? '').replace(/"/g, '""')}"`;
  const header = 'File,Line,Text,Confidence\n';
  const rows = done.flatMap(item =>
    item.lines.map((l, position) => {
      const conf = l.confidence != null ? l.confidence.toFixed(4) : '';
      // Lines are not guaranteed to carry `index` (it is re-added when an item
      // is loaded back into the viewer), so fall back to the array position.
      const lineNo = (l.index ?? position) + 1;
      return `${quote(item.filename)},${lineNo},${quote(l.text)},${conf}`;
    })
  );
  downloadFile('batch_transcription.csv', header + rows.join('\n'), 'text/csv');
}
668
+
669
/**
 * Shared implementation of the server-side batch exports.
 *
 * POSTs the image ids of all done items that have one to `endpoint`, then
 * triggers a browser download of the response body under `downloadName`.
 * Does nothing when there is no such item. Any failure (network error or
 * non-OK response, whose body text becomes the message) is reported as a toast
 * reading "<label> export failed: …".
 */
async function _downloadBatchExport(endpoint, downloadName, label) {
  const done = batch.items.filter(i => i.status === 'done' && i.imageId);
  if (!done.length) return;
  try {
    const resp = await fetch(endpoint, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ image_ids: done.map(i => i.imageId) }),
    });
    if (!resp.ok) throw new Error(await resp.text());
    const blob = await resp.blob();
    const url = URL.createObjectURL(blob);
    try {
      const a = document.createElement('a');
      a.href = url;
      a.download = downloadName;
      a.click();
    } finally {
      // Release the object URL even if triggering the download threw.
      URL.revokeObjectURL(url);
    }
  } catch (err) {
    toast(`${label} export failed: ${err.message}`, 'error');
  }
}

/** Download the "thinking" export of all done items as batch_thinking.zip. */
async function exportAllThinkingZip() {
  await _downloadBatchExport('/api/batch/export-thinking', 'batch_thinking.zip', 'Thinking');
}

/** Download the per-image text export of all done items as batch_export_txt.zip. */
async function exportAllTxtZip() {
  await _downloadBatchExport('/api/batch/export-txt', 'batch_export_txt.zip', 'TXT ZIP');
}

/** Download the XML export of all done items as batch_export.zip. */
async function exportAllXml() {
  await _downloadBatchExport('/api/batch/export-xml', 'batch_export.zip', 'XML');
}
728
+
729
/**
 * Offer in-memory content to the user as a file download.
 *
 * @param {string} filename - suggested name for the saved file
 * @param {*} content - data wrapped in a Blob
 * @param {string} mime - MIME type of the Blob
 */
function downloadFile(filename, content, mime) {
  const url = URL.createObjectURL(new Blob([content], { type: mime }));
  const link = document.createElement('a');
  link.href = url;
  link.download = filename;
  link.click();
  URL.revokeObjectURL(url);
}
web/static/components/engine-panel.js ADDED
@@ -0,0 +1,1091 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * Engine Panel — engine selection, dynamic config form, model loading
3
+ */
4
+
5
+ import { state, emit, on, api, saveEngineConfig, loadSavedEngineName, loadSavedEngineConfig, toast } from '../app.js';
6
+
7
// Shorthand for document.getElementById
const $ = id => document.getElementById(id);

// --- API Key localStorage helpers (keys never stored on server) ---
const _KEY_PREFIX = 'polyscriptor_key_';
let _browserOpenWebUIConfig = null;
let _browserOpenWebUIAbort = null;

/**
 * Read the API key stored in localStorage for a slot.
 *
 * @param {string} slot - suffix appended to the storage key prefix
 * @returns {string} the stored key, or '' when none is stored or localStorage
 *   cannot be read
 */
function _loadBrowserKey(slot) {
  let stored;
  try {
    stored = localStorage.getItem(_KEY_PREFIX + slot);
  } catch (_) {
    return '';
  }
  return stored || '';
}
18
+
19
/**
 * Store an API key in localStorage for a slot, or remove the stored key when
 * `key` is empty.
 *
 * @param {string} slot - suffix appended to the storage key prefix
 * @param {string} key - key to store; a falsy value removes the entry
 * @returns {boolean} true when the storage call succeeded, false when it threw
 *   (e.g. private browsing)
 */
function _saveBrowserKey(slot, key) {
  try {
    const storageKey = _KEY_PREFIX + slot;
    if (!key) {
      localStorage.removeItem(storageKey);
    } else {
      localStorage.setItem(storageKey, key);
    }
  } catch (_) {
    return false;
  }
  return true;
}
27
+
28
/**
 * Tell whether a non-empty API key is stored in the browser for a slot.
 *
 * @param {string} slot
 * @returns {boolean}
 */
function _hasBrowserKey(slot) {
  const stored = _loadBrowserKey(slot);
  return Boolean(stored);
}
31
+
32
/**
 * Trim surrounding whitespace and all trailing slashes from a base URL.
 *
 * @param {?string} baseUrl - may be empty, null or undefined
 * @returns {string} the cleaned URL, '' for a missing value
 */
function _normalizeBaseUrl(baseUrl) {
  const trimmed = (baseUrl || '').trim();
  let end = trimmed.length;
  while (end > 0 && trimmed[end - 1] === '/') end--;
  return trimmed.slice(0, end);
}
35
+
36
/**
 * List the candidate "models" endpoints to try for a base URL.
 *
 * The first candidate is always `<base>/models`. Further candidates depend on
 * whether the normalized base ends in `/api`, `/api/v1`, `/v1` or none of
 * these. Duplicates are removed; order is the order in which to try them.
 *
 * @param {?string} baseUrl
 * @returns {string[]} empty when the base URL is empty
 */
function _openWebUIModelUrls(baseUrl) {
  const root = _normalizeBaseUrl(baseUrl);
  if (!root) return [];

  const candidates = [`${root}/models`];
  if (root.endsWith('/api')) {
    const withoutApi = root.slice(0, -4);
    candidates.push(`${root}/v1/models`, `${withoutApi}/v1/models`);
  } else if (root.endsWith('/api/v1')) {
    // Strip "/v1" to reach the plain /api endpoint
    candidates.push(`${root.slice(0, -3)}/models`);
  } else if (root.endsWith('/v1')) {
    candidates.push(`${root.slice(0, -3)}/api/models`);
  } else {
    for (const suffix of ['/api/models', '/api/v1/models', '/v1/models']) {
      candidates.push(root + suffix);
    }
  }
  return Array.from(new Set(candidates));
}
55
+
56
/**
 * Pull model identifiers out of a models-endpoint response of unknown shape.
 *
 * - Array: strings are taken as-is; objects contribute their `id`, `name` or
 *   `model` (first truthy one). The result is de-duplicated and sorted.
 * - Object: the first of `data` / `models` that is an array is used; otherwise
 *   the object's own values are treated as the list.
 * - Anything else yields an empty list.
 *
 * @param {*} payload - parsed JSON response
 * @returns {string[]}
 */
function _extractModelIds(payload) {
  if (Array.isArray(payload)) {
    const ids = new Set();
    for (const entry of payload) {
      let id = null;
      if (typeof entry === 'string') {
        id = entry;
      } else if (entry && typeof entry === 'object') {
        id = entry.id || entry.name || entry.model;
      }
      if (id) ids.add(id);
    }
    return [...ids].sort();
  }

  if (!payload || typeof payload !== 'object') return [];

  const listKey = ['data', 'models'].find(key => Array.isArray(payload[key]));
  return _extractModelIds(listKey ? payload[listKey] : Object.values(payload));
}
72
+
73
/**
 * Fetch the model list directly from the browser, trying each candidate
 * endpoint for the base URL until one returns model ids.
 *
 * An endpoint is skipped (and its reason recorded) when the request fails, the
 * status is not OK, the content type is not JSON, or no ids can be extracted.
 *
 * @param {string} baseUrl
 * @param {string} apiKey - sent as a Bearer token
 * @returns {Promise<string[]>} ids from the first endpoint that yields any
 * @throws {Error} when no endpoint yields ids; the message joins the
 *   per-endpoint reasons with '; '
 */
async function _fetchOpenWebUIModelsInBrowser(baseUrl, apiKey) {
  const failures = [];
  const headers = {
    'Authorization': `Bearer ${apiKey}`,
    'Content-Type': 'application/json',
    'Accept': 'application/json',
  };

  for (const endpoint of _openWebUIModelUrls(baseUrl)) {
    try {
      const response = await fetch(endpoint, { headers });
      const contentType = response.headers.get('content-type') || '';
      const bodyText = await response.text();

      if (!response.ok) {
        failures.push(`${endpoint}: HTTP ${response.status}`);
      } else if (!contentType.includes('json')) {
        // Include the start of the body so the message hints at what answered
        const sample = bodyText.trim().replace(/\s+/g, ' ').slice(0, 120) || '<empty response>';
        failures.push(`${endpoint}: non-JSON response: ${sample}`);
      } else {
        const models = _extractModelIds(JSON.parse(bodyText));
        if (models.length) return models;
        failures.push(`${endpoint}: no model ids in response`);
      }
    } catch (err) {
      failures.push(`${endpoint}: ${err.message}`);
    }
  }

  throw new Error(failures.join('; ') || 'No OpenWebUI model endpoint tried');
}
104
+
105
/**
 * Read a Blob into a data: URL using FileReader.
 *
 * @param {Blob} blob
 * @returns {Promise<string>} resolves with the reader's result
 * @throws rejects with the reader's error, or a generic "Could not read image"
 *   error when the reader reports none
 */
async function _blobToDataUrl(blob) {
  const reader = new FileReader();
  const finished = new Promise((resolve, reject) => {
    reader.onload = () => { resolve(reader.result); };
    reader.onerror = () => { reject(reader.error || new Error('Could not read image')); };
  });
  reader.readAsDataURL(blob);
  return await finished;
}
113
+
114
/**
 * Determine the model name to use from an engine config.
 * When `model` is the '__custom__' sentinel the free-text `model_custom` value
 * is used instead. The result is trimmed; a missing value yields ''.
 *
 * @param {{model?: string, model_custom?: string}} config
 * @returns {string}
 */
function _resolveOpenWebUIModel(config) {
  const chosen = config.model === '__custom__' ? config.model_custom : config.model;
  return (chosen || '').trim();
}
118
+
119
/**
 * Wire up the engine panel: start loading the engine list and attach all
 * control and app-event listeners.
 */
export function initEnginePanel() {
  loadEngines();

  $('engine-select').addEventListener('change', onEngineSelected);
  const buttonHandlers = [
    ['btn-load-model', onLoadModel],
    ['btn-transcribe', onTranscribe],
    ['btn-segment', onSegment],
  ];
  for (const [buttonId, handler] of buttonHandlers) {
    $(buttonId).addEventListener('click', handler);
  }

  // The blla-specific options are only shown for the kraken-blla segmenter
  const segMethodSel = $('seg-method');
  const bllaopts = $('blla-options');
  const syncBllaOpts = () => {
    if (!bllaopts) return;
    const isBlla = segMethodSel.value === 'kraken-blla';
    bllaopts.style.display = isBlla ? '' : 'none';
  };
  segMethodSel.addEventListener('change', syncBllaOpts);
  syncBllaOpts();

  // Cancel button — visible during transcription. An in-browser OpenWebUI run
  // is aborted locally; anything else is cancelled through the server.
  $('btn-cancel').addEventListener('click', async () => {
    const browserRunActive = $('engine-select')?.value === 'OpenWebUI' && _browserOpenWebUIAbort;
    if (browserRunActive) {
      _browserOpenWebUIAbort.abort();
      return;
    }
    try {
      await fetch('/api/transcribe/cancel', { method: 'POST' });
    } catch (_) { /* ignore */ }
  });

  // Re-evaluate the transcribe/segment buttons whenever their inputs change
  const refreshActionButtons = () => {
    updateTranscribeBtn();
    updateSegmentBtn();
  };
  for (const eventName of ['engine-loaded', 'image-uploaded', 'batch-item-start']) {
    on(eventName, () => { refreshActionButtons(); });
  }
  on('transcription-complete', () => {
    state.isProcessing = false;
    const transcribeBtn = $('btn-transcribe');
    transcribeBtn.classList.remove('loading');
    transcribeBtn.textContent = 'Transcribe';
    $('btn-cancel').classList.add('hidden');
    refreshActionButtons();
  });

  // Region list — appears after segmentation, cleared when a new image is uploaded
  on('sse-segmentation', data => renderRegionList(data.regions || []));
  on('image-uploaded', () => {
    const regionList = $('seg-regions-list');
    regionList.classList.add('hidden');
    regionList.innerHTML = '';
  });
}
164
+
165
/**
 * Fetch the engine list and fill the engine dropdown.
 *
 * Available engines become selectable options; unavailable ones are listed
 * disabled in an "Unavailable" group together with their reason. The last
 * used engine is preselected when it is still available, then the selection
 * handler runs once. With no available engine a placeholder option is shown
 * and nothing else happens. A failure is written to the engine description.
 */
async function loadEngines() {
  try {
    const resp = await api('/api/engines');
    state.engines = await resp.json();

    const select = $('engine-select');
    select.innerHTML = '';

    const available = [];
    const unavailable = [];
    for (const eng of state.engines) {
      (eng.available ? available : unavailable).push(eng);
    }

    if (available.length === 0) {
      select.innerHTML = '<option>No engines available</option>';
      return;
    }

    const savedEngine = loadSavedEngineName();

    const makeOption = (value, label, disabled = false) => {
      const opt = document.createElement('option');
      opt.value = value;
      opt.textContent = label;
      if (disabled) opt.disabled = true;
      return opt;
    };

    available.forEach(eng => select.appendChild(makeOption(eng.name, eng.name)));

    if (unavailable.length > 0) {
      const group = document.createElement('optgroup');
      group.label = 'Unavailable';
      unavailable.forEach(eng => {
        const reason = eng.unavailable_reason || 'missing deps';
        group.appendChild(makeOption(eng.name, `${eng.name} (${reason})`, true));
      });
      select.appendChild(group);
    }

    // Restore last used engine if available
    if (savedEngine && available.some(e => e.name === savedEngine)) {
      select.value = savedEngine;
    }
    select.disabled = false;
    onEngineSelected();
  } catch (err) {
    $('engine-description').textContent = `Error loading engines: ${err.message}`;
  }
}
213
+
214
/**
 * React to a change of the selected engine: update the description and
 * segmentation controls, then fetch the engine's config schema and rebuild the
 * config form from it.
 *
 * After the form is built this restores saved (non-password) values, wires the
 * provider → model-list and provider → API-key-hint behaviour when those
 * fields exist, shows the Kraken preset row for the Kraken engine, and
 * auto-loads the model when a saved config exists and the schema has no
 * dynamic fields.
 *
 * The schema request is asynchronous; if the selection has changed by the time
 * it settles, the outdated result (or error) is dropped so it cannot be mixed
 * into the form of the engine selected since.
 */
async function onEngineSelected() {
  const name = $('engine-select').value;
  const eng = state.engines.find(e => e.name === name);
  state.currentEngine = eng;

  // Description
  $('engine-description').textContent = eng?.description || '';

  // Show/hide segmentation controls based on engine capability
  updateSegmentationVisibility(eng);

  // Load config schema
  const configForm = $('config-form');
  configForm.innerHTML = '';

  if (!eng) return;

  const selectionChanged = () => $('engine-select').value !== name;

  try {
    const resp = await api(`/api/engine/${encodeURIComponent(name)}/config-schema`);
    const schema = await resp.json();
    if (selectionChanged()) return;

    // Another request for the same engine may already have filled the form.
    configForm.innerHTML = '';
    for (const field of schema.fields || []) {
      configForm.appendChild(createField(field));
    }

    // Restore saved config values for this engine (skip password fields for security)
    const savedCfg = loadSavedEngineConfig(name);
    if (savedCfg) {
      for (const el of configForm.querySelectorAll('[data-key]')) {
        if (el.dataset.passwordField) continue; // never prefill secrets
        const val = savedCfg[el.dataset.key];
        if (val == null) continue;
        if (el.type === 'checkbox') el.checked = !!val;
        else el.value = val;
      }
    }

    $('btn-load-model').disabled = false;

    // For Commercial APIs: when provider changes, swap model list and update key hint
    const providerSel = $('cfg-provider');
    const modelSel = $('cfg-model');
    if (providerSel && modelSel) {
      const syncModelList = async () => {
        // Clear model list and auto-fetch from live API if a key is available
        _populateSelect(modelSel, []); // show "— click ↻ to load —"
        modelSel.dispatchEvent(new Event('change'));

        // Auto-trigger fetch if we have a browser key for this provider
        const prov = providerSel.value.toLowerCase();
        const keyEl = $('cfg-api_key');
        const hasBrowser = _hasBrowserKey(prov);
        const hasTyped = keyEl?.value?.trim().length > 0;
        if (hasBrowser || hasTyped) {
          const refreshBtn = modelSel.closest('.config-field')?.querySelector('.btn-refresh');
          if (refreshBtn) refreshBtn.click();
        }
      };
      providerSel.addEventListener('change', syncModelList);
      syncModelList(); // run once on load to match default provider
    }

    const keyInput = $('cfg-api_key');
    if (providerSel && keyInput) {
      const updateKeyHint = () => {
        const slot = providerSel.value.toLowerCase();
        const hasBrowser = _hasBrowserKey(slot);
        const saveRow = keyInput.closest('.config-field')?.querySelector('.key-save-row');
        const saveBox = saveRow?.querySelector('input[type="checkbox"]');
        if (hasBrowser) {
          keyInput.placeholder = '•••••••• (saved in browser — leave blank to keep)';
          keyInput.dataset.hasBrowser = 'true';
          keyInput.disabled = false;
          if (saveRow) { saveRow.style.display = ''; saveRow.querySelector('label').textContent = 'Key saved in browser'; }
          if (saveBox) saveBox.checked = true;
        } else {
          keyInput.placeholder = 'Paste API key here';
          keyInput.disabled = false;
          delete keyInput.dataset.hasBrowser;
          if (saveRow) { saveRow.style.display = ''; saveRow.querySelector('label').textContent = 'Save key in browser'; }
          if (saveBox) saveBox.checked = false;
        }
      };
      providerSel.addEventListener('change', updateKeyHint);
      updateKeyHint(); // run once on load
    }

    // Kraken: show preset dropdown and load preset list
    const krakenPresetRow = $('kraken-preset-row');
    if (krakenPresetRow) {
      if (name === 'Kraken') {
        krakenPresetRow.classList.remove('hidden');
        _loadKrakenPresets();
      } else {
        krakenPresetRow.classList.add('hidden');
      }
    }

    // Auto-load model if this engine was previously configured.
    // Skip engines with dynamic model lists (need live fetch first — user loads manually).
    const hasDynamic = schema.fields?.some(f => f.dynamic);
    if (savedCfg && !hasDynamic) {
      onLoadModel();
    }
  } catch (err) {
    if (selectionChanged()) return;
    // Build the notice with textContent: the message may contain text that
    // came from the server and must not be parsed as HTML.
    configForm.innerHTML = '';
    const notice = document.createElement('p');
    notice.className = 'muted';
    notice.textContent = `Error: ${err.message}`;
    configForm.appendChild(notice);
  }
}
322
+
323
let _krakenPresetsLoaded = false;

/**
 * Fill the Kraken preset dropdown from `/api/kraken/presets`.
 *
 * Does nothing once the presets were loaded successfully or when the
 * dropdown element is absent. A failed load only sets the status text, so
 * a later call retries the request.
 *
 * @returns {Promise<void>}
 */
async function _loadKrakenPresets() {
  if (_krakenPresetsLoaded) return;
  const sel = $('kraken-preset-select');
  const status = $('kraken-preset-status');
  if (!sel) return;

  // Bind the change handler exactly once per element. Binding it after the
  // fetch on every call stacked duplicate listeners whenever a load failed
  // and was retried.
  if (!sel.dataset.presetListenerBound) {
    sel.dataset.presetListenerBound = 'true';
    sel.addEventListener('change', () => {
      const statusEl = $('kraken-preset-status');
      const modelPathEl = $('cfg-model_path');
      const val = sel.value;
      if (!val) {
        if (statusEl) statusEl.textContent = '';
        return;
      }
      if (statusEl) {
        statusEl.textContent = val === 'blla-local'
          ? '📁 Local model — loads instantly'
          : '⬇️ Auto-downloads from Zenodo on first use (~30–120s)';
      }
      // A selected preset takes priority over a typed path, so clear the path.
      if (modelPathEl) modelPathEl.value = '';
    });
  }

  try {
    const resp = await fetch('/api/kraken/presets');
    // Treat HTTP errors as failures instead of caching an empty list.
    if (!resp.ok) throw new Error(`HTTP ${resp.status}`);
    const data = await resp.json();

    sel.innerHTML = '';
    const blank = document.createElement('option');
    blank.value = '';
    blank.textContent = '— use model path above —';
    sel.appendChild(blank);

    for (const p of data?.presets || []) {
      const opt = document.createElement('option');
      opt.value = p.id;
      const icon = p.source === 'local' ? '📁' : '⬇️';
      opt.textContent = `${icon} ${p.label} (${p.language})`;
      sel.appendChild(opt);
    }

    _krakenPresetsLoaded = true;
    // Drop any failure message left over from an earlier attempt.
    if (status) status.textContent = '';
  } catch (e) {
    if (status) status.textContent = 'Could not load presets';
  }
}
365
+
366
/**
 * Toggle the segmentation control group for the selected engine.
 *
 * Engines that transcribe a whole page do their own segmentation, so the
 * controls are hidden for them. With no engine object the controls stay
 * visible.
 *
 * @param {object|null|undefined} eng - engine descriptor; only
 *   `requires_line_segmentation` is read.
 */
function updateSegmentationVisibility(eng) {
  const controls = $('seg-controls');
  const lineSegRequired = eng ? eng.requires_line_segmentation : true;
  if (!controls) return;
  controls.style.display = lineSegRequired ? '' : 'none';
}
378
+
379
/**
 * Build the DOM for one engine-configuration field.
 *
 * The returned wrapper always contains a control whose id is
 * `cfg-<field.key>` and whose `data-key` is `field.key`. Depending on
 * `field.type` it is a checkbox, select, number, password, textarea or
 * (by default) text input.
 *
 * @param {object} field - schema entry; `type`, `key` and `label` are read
 *   for every field, the remaining properties depend on the type.
 * @returns {HTMLDivElement} wrapper element, not yet attached to the document.
 */
function createField(field) {
  const wrapper = document.createElement('div');

  if (field.type === 'checkbox') {
    wrapper.className = 'config-field config-field-checkbox';
    const input = document.createElement('input');
    input.type = 'checkbox';
    input.id = `cfg-${field.key}`;
    input.dataset.key = field.key;
    input.checked = field.default ?? false;

    const label = document.createElement('label');
    label.htmlFor = input.id;
    label.textContent = field.label;

    // Checkbox fields put the control before its label.
    wrapper.appendChild(input);
    wrapper.appendChild(label);
    return wrapper;
  }

  wrapper.className = 'config-field';
  const label = document.createElement('label');
  label.htmlFor = `cfg-${field.key}`;
  label.textContent = field.label;
  wrapper.appendChild(label);

  if (field.type === 'select') {
    _appendSelectControl(wrapper, field);
  } else if (field.type === 'number') {
    const input = document.createElement('input');
    input.type = 'number';
    input.id = `cfg-${field.key}`;
    input.dataset.key = field.key;
    if (field.min != null) input.min = field.min;
    if (field.max != null) input.max = field.max;
    input.value = field.default ?? '';
    wrapper.appendChild(input);
  } else if (field.type === 'password') {
    _appendPasswordControl(wrapper, field);
  } else if (field.type === 'textarea') {
    const ta = document.createElement('textarea');
    ta.id = `cfg-${field.key}`;
    ta.dataset.key = field.key;
    ta.rows = field.rows || 3;
    ta.value = field.default ?? '';
    if (field.placeholder) ta.placeholder = field.placeholder;
    ta.style.width = '100%';
    ta.style.resize = 'vertical';
    wrapper.appendChild(ta);
    if (field.hint) {
      const hint = document.createElement('small');
      hint.textContent = field.hint;
      hint.style.color = 'var(--text-muted, #888)';
      wrapper.appendChild(hint);
    }
  } else {
    // Any other type falls back to a plain text input.
    const input = document.createElement('input');
    input.type = 'text';
    input.id = `cfg-${field.key}`;
    input.dataset.key = field.key;
    input.value = field.default ?? '';
    if (field.placeholder) input.placeholder = field.placeholder;
    wrapper.appendChild(input);
  }
  return wrapper;
}

/**
 * Append a <select> plus its optional refresh button, custom-value input
 * and upload row to `wrapper`.
 */
function _appendSelectControl(wrapper, field) {
  const selectRow = document.createElement('div');
  selectRow.className = 'select-row';

  const select = document.createElement('select');
  select.id = `cfg-${field.key}`;
  select.dataset.key = field.key;
  if (field.per_provider_options) {
    // Kept on the element so provider changes can swap the option list later.
    select.dataset.perProviderOptions = JSON.stringify(field.per_provider_options);
  }
  _populateSelect(select, field.options || [], field.default);
  selectRow.appendChild(select);

  if (field.dynamic) {
    const hint = document.createElement('span');
    hint.className = 'dynamic-hint muted';
    hint.textContent = field.dynamic_hint || 'Click ↻ to load models';

    const refreshBtn = document.createElement('button');
    refreshBtn.type = 'button';
    refreshBtn.className = 'btn-refresh';
    refreshBtn.title = 'Refresh model list from server';
    refreshBtn.textContent = '↻';
    refreshBtn.addEventListener('click', () => _refreshSelectOptions(field, select, hint, refreshBtn));

    selectRow.appendChild(refreshBtn);
    wrapper.appendChild(selectRow);
    wrapper.appendChild(hint);
  } else {
    wrapper.appendChild(selectRow);
  }

  // Text input that is only shown while the "__custom__" sentinel is selected.
  let syncCustomVisibility = () => {};
  if (field.custom_key) {
    const customInput = document.createElement('input');
    customInput.type = 'text';
    customInput.id = `cfg-${field.custom_key}`;
    customInput.dataset.key = field.custom_key;
    customInput.placeholder = field.custom_placeholder || 'Enter custom value';
    customInput.style.marginTop = '4px';

    syncCustomVisibility = () => {
      const isCustom = select.value === '__custom__';
      customInput.style.display = isCustom ? '' : 'none';
      customInput.required = isCustom;
    };
    select.addEventListener('change', syncCustomVisibility);
    syncCustomVisibility(); // run once on creation

    wrapper.appendChild(customInput);
  }

  if (field.upload) {
    _appendUploadControl(wrapper, select, syncCustomVisibility);
  }
}

/**
 * Click handler of the refresh button: fetch the live model list and
 * repopulate `select`, reporting the outcome in `hint`.
 */
async function _refreshSelectOptions(field, select, hint, refreshBtn) {
  const engineName = $('engine-select').value;
  const providerEl = $('cfg-provider');
  const keyEl = $('cfg-api_key');
  const provider = providerEl?.value?.toLowerCase() || 'openai';
  const keySlot = engineName === 'OpenWebUI' ? 'openwebui' : provider;
  // Fall back to '' so a missing key is never stringified into the query.
  const apiKey = keyEl?.value?.trim() || _loadBrowserKey(keySlot) || '';

  refreshBtn.textContent = '…';
  refreshBtn.disabled = true;
  try {
    const baseUrl = $('cfg-base_url')?.value?.trim() || '';
    let data;
    if (engineName === 'OpenWebUI') {
      if (!baseUrl) throw new Error('Enter your OpenWebUI base URL');
      if (!apiKey) throw new Error('Enter your OpenWebUI API key');
      const models = await _fetchOpenWebUIModelsInBrowser(baseUrl, apiKey);
      data = { models };
    } else {
      // NOTE(review): the API key travels in the query string of a GET
      // request, where it can end up in server and proxy logs. Moving it to
      // a header or POST body needs a matching server change.
      const params = new URLSearchParams({ provider, api_key: apiKey, base_url: baseUrl });
      const resp = await fetch(
        `/api/engine/${encodeURIComponent(engineName)}/models?${params}`
      );
      const parsed = await resp.json().catch(() => null);
      if (!resp.ok && !parsed?.error) {
        const detail = typeof parsed?.detail === 'string' ? parsed.detail : '';
        throw new Error(detail || resp.statusText || `HTTP ${resp.status}`);
      }
      if (parsed == null) throw new Error('Server returned an invalid model list');
      data = parsed;
    }

    // A response without `models` is treated as an empty list.
    const models = Array.isArray(data.models) ? data.models : [];
    if (data.error) {
      hint.textContent = `Error: ${data.error}`;
    } else if (models.length === 0) {
      hint.textContent = 'No models found';
    } else {
      const current = select.value;
      // Build options, keep __custom__ at the end if present
      const newOpts = models.map(m => ({ label: m, value: m }));
      if (field.custom_key) newOpts.push({ label: 'Custom model ID…', value: '__custom__' });
      _populateSelect(select, newOpts, current);
      hint.textContent = `${models.length} models loaded`;
    }
  } catch (e) {
    hint.textContent = `Error: ${e.message}`;
  } finally {
    refreshBtn.textContent = '↻';
    refreshBtn.disabled = false;
  }
}

/**
 * Append the "Upload .mlmodel…" row. After a successful upload the select
 * is repopulated with the options returned by the server and
 * `syncCustomVisibility` is re-run.
 */
function _appendUploadControl(wrapper, select, syncCustomVisibility) {
  const uploadRow = document.createElement('div');
  uploadRow.className = 'upload-model-row';
  uploadRow.style.cssText = 'display:flex;align-items:center;gap:6px;margin-top:6px;';

  const fileInput = document.createElement('input');
  fileInput.type = 'file';
  fileInput.accept = '.mlmodel';
  fileInput.style.display = 'none';

  const uploadBtn = document.createElement('button');
  uploadBtn.type = 'button';
  uploadBtn.className = 'btn-secondary btn-sm';
  uploadBtn.textContent = 'Upload .mlmodel…';
  uploadBtn.title = 'Upload a Kraken model file from your computer';

  const uploadStatus = document.createElement('span');
  uploadStatus.className = 'muted';
  uploadStatus.style.fontSize = '0.85em';

  uploadBtn.addEventListener('click', () => fileInput.click());

  fileInput.addEventListener('change', async () => {
    const f = fileInput.files[0];
    if (!f) return;
    uploadStatus.textContent = `Uploading ${f.name}…`;
    uploadBtn.disabled = true;
    try {
      const fd = new FormData();
      fd.append('file', f);
      const resp = await fetch('/api/models/upload', { method: 'POST', body: fd });
      if (!resp.ok) {
        const err = await resp.json().catch(() => ({ detail: resp.statusText }));
        throw new Error(err.detail || resp.statusText);
      }
      const data = await resp.json();
      // Only repopulate when the server sent an option list; otherwise a
      // successful upload would be reported as failed.
      if (Array.isArray(data.options)) {
        _populateSelect(select, data.options, data.path);
      }
      uploadStatus.textContent = `Uploaded: ${data.filename}`;
      // The selection may have moved away from (or onto) "__custom__".
      syncCustomVisibility();
    } catch (e) {
      uploadStatus.textContent = `Upload failed: ${e.message}`;
    } finally {
      uploadBtn.disabled = false;
      fileInput.value = ''; // allow re-selecting the same file
    }
  });

  uploadRow.appendChild(fileInput);
  uploadRow.appendChild(uploadBtn);
  uploadRow.appendChild(uploadStatus);
  wrapper.appendChild(uploadRow);
}

/**
 * Append a password input and its "save key in browser" checkbox row.
 * The input is marked with `data-password-field`; its placeholder shows
 * whether a key is already stored in the browser for the current slot.
 */
function _appendPasswordControl(wrapper, field) {
  const input = document.createElement('input');
  input.type = 'password';
  input.id = `cfg-${field.key}`;
  input.dataset.key = field.key;
  input.dataset.passwordField = 'true';

  // Storage slot: selected provider, else 'openwebui', else the field key.
  const resolveKeySlot = () => {
    const providerEl = $('cfg-provider');
    if (providerEl) return providerEl.value.toLowerCase();
    const engineEl = $('engine-select');
    if (engineEl?.value === 'OpenWebUI') return 'openwebui';
    return field.key;
  };

  const hasSavedKey = Boolean(_hasBrowserKey(resolveKeySlot()));
  if (hasSavedKey) {
    input.placeholder = '•••••••• (saved in browser — leave blank to keep)';
    input.dataset.hasBrowser = 'true';
  } else {
    input.placeholder = field.placeholder || 'Paste API key here';
    delete input.dataset.hasBrowser;
  }
  input.disabled = false;
  wrapper.appendChild(input);

  // "Save key in browser" checkbox
  const saveRow = document.createElement('div');
  saveRow.className = 'key-save-row';
  const saveBox = document.createElement('input');
  saveBox.type = 'checkbox';
  saveBox.id = `cfg-${field.key}-save`;
  saveBox.dataset.saveFor = field.key;
  saveBox.checked = hasSavedKey;
  const saveLabel = document.createElement('label');
  saveLabel.htmlFor = saveBox.id;
  saveLabel.textContent = hasSavedKey ? 'Key saved in browser' : 'Save key in browser';
  saveRow.appendChild(saveBox);
  saveRow.appendChild(saveLabel);
  wrapper.appendChild(saveRow);
}
648
+
649
/**
 * Read the current engine configuration out of the config form.
 *
 * Every element carrying `data-key` contributes one entry: checkboxes yield
 * booleans, number inputs yield `Number(value)`, everything else its string
 * value. "Save key" checkboxes are skipped. A typed secret is trimmed; a
 * blank secret field is filled from the key stored in the browser.
 *
 * @returns {Object<string, *>} config keyed by `data-key`.
 */
function collectConfig() {
  const config = {};
  const fields = $('config-form').querySelectorAll('[data-key]');
  for (const el of fields) {
    if (el.dataset.saveFor) continue; // "save key" checkboxes are not config
    const key = el.dataset.key;

    if (el.type === 'checkbox') {
      config[key] = el.checked;
      continue;
    }
    if (el.type === 'number') {
      config[key] = Number(el.value);
      continue;
    }
    if (el.dataset.passwordField) {
      // Trim typed secrets: pasted keys often carry stray whitespace, and the
      // copy persisted to the browser is trimmed as well.
      const typed = el.value.trim();
      if (typed) {
        config[key] = typed;
        continue;
      }
      // Blank password field — inject key from browser localStorage
      const providerEl = $('cfg-provider');
      let slot = key;
      if (providerEl) slot = providerEl.value.toLowerCase();
      else if ($('engine-select')?.value === 'OpenWebUI') slot = 'openwebui';
      config[key] = _loadBrowserKey(slot); // may be empty — server will check env next
      continue;
    }
    config[key] = el.value;
  }
  return config;
}
673
+
674
/**
 * Persist typed API keys to browser storage after a successful load.
 *
 * A typed key is always saved, and the field is then cleared and switched to
 * the "saved" hint. Leaving the field blank with the "save" box unchecked is
 * the explicit opt-out: the stored key is deleted and the hint is reset.
 * Engines without a key slot are ignored.
 *
 * @param {string} engineName - name of the engine that was just loaded.
 */
function _persistNewKeys(engineName) {
  // The storage slot depends only on the engine (and the selected provider),
  // so resolve it once instead of per checkbox.
  let slot = null;
  if (engineName === 'OpenWebUI') {
    slot = 'openwebui';
  } else if (engineName === 'Commercial APIs') {
    slot = $('cfg-provider')?.value?.toLowerCase() || 'openai';
  }
  if (!slot) return;

  const saveBoxes = $('config-form').querySelectorAll('[data-save-for]');
  for (const box of saveBoxes) {
    const keyField = $(`cfg-${box.dataset.saveFor}`);
    const newKey = keyField?.value?.trim();
    const label = box.nextElementSibling;

    if (newKey) {
      if (_saveBrowserKey(slot, newKey)) {
        keyField.value = ''; // clear field; hint shows key is saved
        keyField.placeholder = '•••••••• (saved in browser — leave blank to keep)';
        keyField.dataset.hasBrowser = 'true';
        box.checked = true;
        if (label) label.textContent = 'Key saved in browser';
      } else {
        box.checked = false;
        if (label) label.textContent = 'Could not save key in browser';
      }
    } else if (!box.checked && _hasBrowserKey(slot)) {
      // Explicit opt-out: unchecked + no typed key → delete saved key
      _saveBrowserKey(slot, '');
      if (keyField) {
        delete keyField.dataset.hasBrowser;
        // The key is gone, so stop advertising it as saved.
        keyField.placeholder = 'Paste API key here';
      }
      if (label) label.textContent = 'Save key in browser';
    }
  }
}
713
+
714
/**
 * Load the selected engine with the configuration currently in the form.
 *
 * OpenWebUI is "loaded" purely in the browser: the config is validated and
 * kept for direct requests. Every other engine is loaded through
 * `POST /api/engine/load`. On success the typed keys and the non-secret
 * config are persisted and `engine-loaded` is emitted; on failure the status
 * badge shows the error and `state.engineLoaded` is reset.
 *
 * @returns {Promise<void>}
 */
async function onLoadModel() {
  const name = $('engine-select').value;
  const config = collectConfig();
  // Attach Kraken preset ID if one is selected
  if (name === 'Kraken') {
    const presetSel = $('kraken-preset-select');
    if (presetSel?.value) config.preset_id = presetSel.value;
  }
  const btn = $('btn-load-model');
  const status = $('engine-status');

  btn.classList.add('loading');
  btn.textContent = 'Loading...';
  status.className = 'status-badge status-loading';
  // A previous failure leaves an inline danger colour behind; clear it so the
  // loading/loaded badge is not rendered in the error colour.
  status.style.color = '';
  status.textContent = `Loading ${name}...`;
  status.classList.remove('hidden');

  try {
    if (name === 'OpenWebUI') {
      config.base_url = _normalizeBaseUrl(config.base_url);
      config.model = _resolveOpenWebUIModel(config);
      if (!config.base_url) throw new Error('Enter your OpenWebUI base URL');
      if (!config.api_key) throw new Error('Enter your OpenWebUI API key');
      if (!config.model) throw new Error('Load the model list or enter an OpenWebUI model ID');

      _browserOpenWebUIConfig = { ...config };
      state.engineLoaded = true;
      status.className = 'status-badge status-loaded';
      status.textContent = `${name} ready in browser (${config.model})`;

      _persistNewKeys(name);
      const storedConfig = { ...config };
      delete storedConfig.api_key; // never persist the secret with the config
      saveEngineConfig(name, storedConfig);
      emit('engine-loaded', {
        success: true,
        load_time_s: 0,
        engine_name: name,
        browser_direct: true,
      });
      return;
    }

    const resp = await api('/api/engine/load', {
      method: 'POST',
      body: JSON.stringify({ engine_name: name, config }),
    });
    // Without this check a failed load was reported as "loaded (undefined s)".
    if (!resp.ok) {
      const err = await resp.json().catch(() => ({ detail: resp.statusText }));
      throw new Error(err.detail || resp.statusText || 'Model load failed');
    }
    const data = await resp.json();

    state.engineLoaded = true;
    status.className = 'status-badge status-loaded';
    status.textContent = `${name} loaded (${data.load_time_s}s)`;

    _persistNewKeys(name); // save keys only after the typed key was used for loading

    // Persist engine + config for next session
    const storedConfig = { ...config };
    delete storedConfig.api_key;
    saveEngineConfig(name, storedConfig);

    emit('engine-loaded', data);
  } catch (err) {
    status.className = 'status-badge';
    status.style.color = 'var(--danger)';
    status.textContent = `Error: ${err.message}`;
    state.engineLoaded = false;
  } finally {
    btn.classList.remove('loading');
    btn.textContent = 'Load Model';
  }
}
785
+
786
/**
 * Start a transcription of the current image with the loaded engine.
 *
 * OpenWebUI is handled directly in the browser. All other engines go through
 * `POST /api/transcribe`, whose response is read as a stream of
 * `event:`/`data:` records; each record is re-emitted as `sse-<event>` with
 * the parsed JSON payload. Errors are reported via `transcription-error`
 * (or `sse-cancelled` for an abort).
 *
 * @returns {Promise<void>}
 */
async function onTranscribe() {
  if (state.isProcessing) return;
  if (!state.engineLoaded || !state.imageId) return;

  state.isProcessing = true;
  const btn = $('btn-transcribe');
  btn.classList.add('loading');
  btn.textContent = 'Transcribing...';
  btn.disabled = true;
  $('btn-cancel').classList.remove('hidden');

  const segMethod = $('seg-method').value;
  const segDevice = $('seg-device').value;
  const maxColumns = parseInt($('seg-max-columns')?.value || '6', 10);
  // The control holds a percentage; the request field is a fraction.
  const splitWidth = parseFloat($('seg-split-width')?.value || '40') / 100;
  const textDirection = $('seg-text-direction')?.value || 'horizontal-lr';

  emit('transcription-start');

  // Parse one "event: …\ndata: …" record and re-emit it.
  const dispatchRecord = (record) => {
    if (!record.trim()) return;
    const eventMatch = record.match(/event: (\w+)/);
    const dataMatch = record.match(/data: (.+)/s);
    if (eventMatch && dataMatch) {
      emit(`sse-${eventMatch[1]}`, JSON.parse(dataMatch[1]));
    }
  };

  try {
    if ($('engine-select').value === 'OpenWebUI') {
      await transcribeOpenWebUIInBrowser();
      return;
    }

    // Collect live config overrides — non-password form fields are sent at
    // transcription time so changes (e.g. custom_prompt, thinking_mode) take
    // effect immediately without requiring a model reload.
    const liveOverrides = {};
    for (const el of $('config-form').querySelectorAll('[data-key]')) {
      if (el.dataset.saveFor) continue; // skip "save key" checkboxes
      if (el.dataset.passwordField) continue; // never resend secrets
      const key = el.dataset.key;
      if (el.type === 'checkbox') liveOverrides[key] = el.checked;
      else if (el.type === 'number') liveOverrides[key] = Number(el.value);
      else liveOverrides[key] = el.value;
    }

    const resp = await fetch('/api/transcribe', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        image_id: state.imageId,
        seg_method: segMethod,
        seg_device: segDevice,
        max_columns: maxColumns,
        split_width_fraction: splitWidth,
        text_direction: textDirection,
        engine_config_overrides: liveOverrides,
      }),
    });

    if (!resp.ok) {
      const err = await resp.json().catch(() => ({ detail: resp.statusText }));
      throw new Error(err.detail || 'Transcription failed');
    }

    const reader = resp.body.getReader();
    const decoder = new TextDecoder();
    let buffer = '';

    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      buffer += decoder.decode(value, { stream: true });

      const records = buffer.split('\n\n');
      buffer = records.pop(); // keep incomplete
      for (const record of records) dispatchRecord(record);
    }

    // Flush the decoder and handle a final record that was not terminated by
    // a blank line; previously such a record was silently dropped.
    buffer += decoder.decode();
    if (buffer.trim()) {
      try {
        dispatchRecord(buffer);
      } catch (parseErr) {
        // A truncated tail is not a transcription failure; just report it.
        console.warn('Ignoring incomplete trailing stream record:', parseErr);
      }
    }
  } catch (err) {
    if (err.name === 'AbortError') emit('sse-cancelled', {});
    else emit('transcription-error', { message: err.message });
  } finally {
    _browserOpenWebUIAbort = null;
  }
}
873
+
874
/**
 * Transcribe the current image by calling an OpenWebUI server directly from
 * the browser.
 *
 * Uses the config captured at load time (or the live form as fallback),
 * downloads the uploaded image, sends it as a data URL to
 * `<base_url>/chat/completions` and reports the answer as a single
 * page-sized "line" through the same `sse-*` events the server flow uses.
 *
 * @throws {Error} when the config is incomplete, the image cannot be
 *   fetched, or the server answers with an error or non-JSON body.
 */
async function transcribeOpenWebUIInBrowser() {
  const cfg = { ...(_browserOpenWebUIConfig || collectConfig()) };
  cfg.base_url = _normalizeBaseUrl(cfg.base_url);
  cfg.api_key = cfg.api_key || _loadBrowserKey('openwebui');
  cfg.model = _resolveOpenWebUIModel(cfg);
  if (!cfg.base_url) throw new Error('Enter your OpenWebUI base URL');
  if (!cfg.api_key) throw new Error('Enter your OpenWebUI API key');
  if (!cfg.model) throw new Error('Load the model list or enter an OpenWebUI model ID');

  // Box covering the whole page; read fresh each time it is needed.
  const wholePageBox = () => [0, 0, state.imageInfo?.width || 0, state.imageInfo?.height || 0];

  const imageResp = await fetch(`/api/image/${state.imageId}`);
  if (!imageResp.ok) throw new Error('Could not load uploaded image');
  const imageBlob = await imageResp.blob();
  const dataUrl = await _blobToDataUrl(imageBlob);

  emit('sse-segmentation', {
    num_lines: 1,
    bboxes: [wholePageBox()],
    source: 'page',
  });

  const customPrompt = (cfg.custom_prompt || '').trim();
  const prompt = customPrompt ||
    'Transcribe all handwritten text in this manuscript image. Preserve the original language and layout. Output only the transcribed text without any additional commentary.';

  const userMessage = {
    role: 'user',
    content: [
      { type: 'text', text: prompt },
      { type: 'image_url', image_url: { url: dataUrl } },
    ],
  };
  const requestBody = {
    model: cfg.model,
    messages: [userMessage],
    temperature: Number.isFinite(cfg.temperature) ? cfg.temperature : 0.1,
  };
  // max_tokens is only sent when a positive limit was configured.
  if (cfg.max_tokens && cfg.max_tokens > 0) requestBody.max_tokens = cfg.max_tokens;

  _browserOpenWebUIAbort = new AbortController();
  const startedAt = Date.now();
  const headers = {
    'Authorization': `Bearer ${cfg.api_key}`,
    'Content-Type': 'application/json',
    'Accept': 'application/json',
  };
  const resp = await fetch(`${cfg.base_url}/chat/completions`, {
    method: 'POST',
    headers,
    body: JSON.stringify(requestBody),
    signal: _browserOpenWebUIAbort.signal,
  });

  const raw = await resp.text();
  if (!resp.ok) {
    throw new Error(`OpenWebUI HTTP ${resp.status}: ${raw.slice(0, 240)}`);
  }
  let payload;
  try {
    payload = JSON.parse(raw);
  } catch (_) {
    throw new Error(`OpenWebUI returned non-JSON response: ${raw.slice(0, 240)}`);
  }

  const transcript = (payload.choices?.[0]?.message?.content || '').trim();
  let tokenUsage = null;
  if (payload.usage) {
    tokenUsage = {
      prompt_tokens: payload.usage.prompt_tokens,
      output_tokens: payload.usage.completion_tokens,
      total_tokens: payload.usage.total_tokens,
    };
  }

  const line = {
    index: 0,
    text: transcript,
    confidence: null,
    bbox: wholePageBox(),
    region: 0,
  };

  const progress = { current: 1, total: 1, line };
  if (tokenUsage) progress.token_usage = tokenUsage;
  emit('sse-progress', progress);

  const complete = {
    lines: [line],
    // elapsed seconds, rounded to two decimals
    total_time_s: Math.round((Date.now() - startedAt) / 10) / 100,
    engine: 'OpenWebUI',
    browser_direct: true,
  };
  if (tokenUsage) complete.token_usage = tokenUsage;
  emit('sse-complete', complete);
}
957
+
958
/**
 * Enable the Transcribe button only while an engine is loaded, an image is
 * present and no job is running.
 */
function updateTranscribeBtn() {
  const transcribeBtn = $('btn-transcribe');
  const canTranscribe = state.engineLoaded && state.imageId && !state.isProcessing;
  transcribeBtn.disabled = !canTranscribe;
}
961
+
962
/**
 * Enable the Segment button only while an image is present and no job is
 * running.
 */
function updateSegmentBtn() {
  const segmentBtn = $('btn-segment');
  const canSegment = state.imageId && !state.isProcessing;
  segmentBtn.disabled = !canSegment;
}
965
+
966
/**
 * Run segmentation only (no transcription) for the current image and show
 * the result as a preview.
 *
 * The segmentation controls are sent as query parameters to
 * `/api/image/<id>/segment`; the response is emitted as `sse-segmentation`
 * followed by `segment-preview`. Failures are reported with a toast.
 *
 * @returns {Promise<void>}
 */
async function onSegment() {
  if (!state.imageId || state.isProcessing) return;

  const segmentBtn = $('btn-segment');
  const query = {
    method: $('seg-method').value,
    device: $('seg-device').value,
    max_columns: parseInt($('seg-max-columns')?.value || '6', 10),
    // the control holds a percentage; the API takes a fraction
    split_width_fraction: parseFloat($('seg-split-width')?.value || '40') / 100,
    text_direction: $('seg-text-direction')?.value || 'horizontal-lr',
  };

  segmentBtn.classList.add('loading');
  segmentBtn.textContent = 'Segmenting…';
  segmentBtn.disabled = true;

  try {
    const params = new URLSearchParams(query);
    const resp = await api(`/api/image/${state.imageId}/segment?${params}`);
    if (!resp.ok) {
      const failure = await resp.json().catch(() => ({ detail: resp.statusText }));
      throw new Error(failure.detail || resp.statusText);
    }
    const result = await resp.json();

    // Same event as the transcription flow, so the boxes get drawn.
    emit('sse-segmentation', result);
    const isWholePage = result.source === 'page';
    if (!isWholePage) {
      toast(`${result.num_lines} lines found (${result.source})`, 'success', 3000);
    }
    emit('segment-preview'); // switch mobile tab to image view
  } catch (err) {
    toast(`Segmentation failed: ${err.message}`, 'error');
  } finally {
    segmentBtn.classList.remove('loading');
    segmentBtn.textContent = 'Segment';
    updateSegmentBtn();
  }
}
1006
+
1007
/**
 * Populate a <select> with an array of options.
 *
 * Each option may be a plain value or a `{label, value}` record; null
 * entries are skipped. A missing or empty list yields a single placeholder
 * option. Afterwards the previous selection is restored when an option with
 * the same value (compared as strings) still exists.
 *
 * @param {HTMLSelectElement} select - element to refill.
 * @param {Array<string|{label: string, value: *}>|null|undefined} options
 * @param {*} [previousValue] - value to re-select if still available.
 */
function _populateSelect(select, options, previousValue) {
  // Tolerate a missing list (e.g. a server response without `options`).
  const items = Array.isArray(options) ? options : [];
  select.innerHTML = '';

  if (items.length === 0) {
    const placeholder = document.createElement('option');
    placeholder.value = '';
    placeholder.textContent = '— click ↻ to load —';
    select.appendChild(placeholder);
    return;
  }

  for (const opt of items) {
    if (opt == null) continue; // typeof null is 'object' — skip explicitly
    const isRecord = typeof opt === 'object';
    const o = document.createElement('option');
    o.value = isRecord ? opt.value : opt;
    o.textContent = isRecord ? opt.label : opt;
    select.appendChild(o);
  }

  if (previousValue != null) {
    // Option values are strings in the DOM, so compare as strings; otherwise
    // a numeric or boolean default could never be restored.
    const wanted = String(previousValue);
    const stillThere = Array.from(select.options).some(o => String(o.value) === wanted);
    if (stillThere) select.value = wanted;
  }
}
1033
+
1034
// Same palette as image-viewer.js REGION_COLORS
const _REGION_COLORS = [
  'rgba(255,160,30,0.9)', 'rgba(46,213,115,0.9)', 'rgba(232,65,24,0.9)',
  'rgba(52,172,224,0.9)', 'rgba(162,16,213,0.9)', 'rgba(255,211,42,0.9)',
  'rgba(18,203,196,0.9)', 'rgba(253,89,166,0.9)',
];

/**
 * Render the list of detected regions with a delete button per region.
 *
 * An empty or missing list hides the container. Deleting a region calls
 * `DELETE /api/image/<id>/region/<index>` and re-emits the response as
 * `sse-segmentation`; failures are reported with a toast and re-enable the
 * button.
 *
 * @param {Array<{num_lines: number}>|null|undefined} regions
 */
function renderRegionList(regions) {
  const list = $('seg-regions-list');
  const items = regions ?? [];
  list.innerHTML = '';
  if (!items.length) { list.classList.add('hidden'); return; }
  list.classList.remove('hidden');

  const hdr = document.createElement('div');
  hdr.className = 'seg-regions-header';
  hdr.textContent = `Regions (${items.length})`;
  list.appendChild(hdr);

  items.forEach((r, i) => {
    const row = document.createElement('div');
    row.className = 'seg-region-row';

    const dot = document.createElement('span');
    dot.className = 'seg-region-dot';
    // Colours repeat when there are more regions than palette entries.
    dot.style.background = _REGION_COLORS[i % _REGION_COLORS.length];

    const label = document.createElement('span');
    label.className = 'seg-region-label';
    label.textContent = `R${i + 1}`;

    const count = document.createElement('span');
    count.className = 'seg-region-count';
    count.textContent = `${r.num_lines} line${r.num_lines !== 1 ? 's' : ''}`;

    const delBtn = document.createElement('button');
    delBtn.type = 'button'; // never act as a submit button inside a form
    delBtn.className = 'seg-region-del btn-icon';
    delBtn.textContent = '×';
    delBtn.title = 'Delete this region';
    delBtn.addEventListener('click', async () => {
      delBtn.disabled = true;
      try {
        const resp = await api(`/api/image/${state.imageId}/region/${i}`, { method: 'DELETE' });
        // Do not treat an error body as a new segmentation result.
        if (!resp.ok) {
          const failure = await resp.json().catch(() => ({ detail: resp.statusText }));
          throw new Error(failure.detail || resp.statusText);
        }
        const data = await resp.json();
        emit('sse-segmentation', data);
        toast(`Region R${i + 1} removed`, 'info', 2000);
      } catch (err) {
        toast(`Delete failed: ${err.message}`, 'error');
        delBtn.disabled = false;
      }
    });

    row.appendChild(dot);
    row.appendChild(label);
    row.appendChild(count);
    row.appendChild(delBtn);
    list.appendChild(row);
  });
}
web/static/components/image-viewer.js ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * Image Viewer — upload, display, bbox overlay
3
+ */
4
+
5
+ import { state, emit, on, api, fitZoom, toast } from '../app.js';
6
+
7
+ const $ = id => document.getElementById(id);
8
+ const IMAGE_EXTENSIONS = new Set(['.jpg', '.jpeg', '.png', '.tif', '.tiff', '.bmp', '.gif', '.webp']);
9
+
10
+ function extensionOf(file) {
11
+ const name = file?.name || '';
12
+ const dot = name.lastIndexOf('.');
13
+ return dot >= 0 ? name.slice(dot).toLowerCase() : '';
14
+ }
15
+
16
+ function isImageOrPdf(file) {
17
+ const ext = extensionOf(file);
18
+ return file.type.startsWith('image/') || ext === '.pdf' || IMAGE_EXTENSIONS.has(ext);
19
+ }
20
+
21
+ export function initImageViewer() {
22
+ const uploadArea = $('upload-area');
23
+ const fileInput = $('file-input');
24
+ const xmlInput = $('xml-input');
25
+ const viewerScroll = $('viewer-scroll');
26
+ const viewerPlaceholder = $('viewer-placeholder');
27
+
28
+ const handleDroppedFiles = files => {
29
+ const img = files.find(isImageOrPdf);
30
+ const xml = files.find(f => f.name.toLowerCase().endsWith('.xml'));
31
+ if (img) uploadFile(img);
32
+ if (xml) uploadXml(xml); // queued after image upload sets imageId
33
+ };
34
+
35
+ // Click to browse image
36
+ uploadArea.addEventListener('click', () => fileInput.click());
37
+
38
+ // File selected
39
+ fileInput.addEventListener('change', () => {
40
+ if (fileInput.files.length > 0) uploadFile(fileInput.files[0]);
41
+ });
42
+
43
+ // Drag & drop — accept image, PDF, and XML on the upload box or viewer.
44
+ const dropTargets = [uploadArea, viewerScroll, viewerPlaceholder].filter(Boolean);
45
+ dropTargets.forEach(target => {
46
+ target.addEventListener('dragover', e => {
47
+ e.preventDefault();
48
+ uploadArea.classList.add('dragover');
49
+ if (viewerPlaceholder && !state.imageId) viewerPlaceholder.classList.add('dragover');
50
+ });
51
+ target.addEventListener('dragleave', e => {
52
+ if (!e.currentTarget.contains(e.relatedTarget)) {
53
+ uploadArea.classList.remove('dragover');
54
+ viewerPlaceholder?.classList.remove('dragover');
55
+ }
56
+ });
57
+ target.addEventListener('drop', e => {
58
+ e.preventDefault();
59
+ uploadArea.classList.remove('dragover');
60
+ viewerPlaceholder?.classList.remove('dragover');
61
+ handleDroppedFiles(Array.from(e.dataTransfer.files));
62
+ });
63
+ });
64
+
65
+ // Keep the explicit upload area compatible with batch-panel's capture-phase
66
+ // drop interception for multi-image queues.
67
+ uploadArea.addEventListener('drop', e => {
68
+ e.preventDefault();
69
+ });
70
+
71
+ // XML file picker
72
+ xmlInput.addEventListener('change', () => {
73
+ if (xmlInput.files.length > 0) uploadXml(xmlInput.files[0]);
74
+ });
75
+
76
+ // Batch panel: load a completed item's image into the viewer
77
+ on('batch-item-start', ({ imageId, filename }) => {
78
+ state.imageId = imageId;
79
+ // Clear bboxes immediately for the new item
80
+ currentBboxes = [];
81
+ currentRegions = [];
82
+ const img = $('page-image');
83
+ img.src = `/api/image/${imageId}`;
84
+ $('image-container').classList.remove('hidden');
85
+ $('viewer-placeholder').classList.add('hidden');
86
+ img.onload = () => {
87
+ const canvas = $('overlay-canvas');
88
+ canvas.width = img.naturalWidth;
89
+ canvas.height = img.naturalHeight;
90
+ fitZoom();
91
+ // Redraw any bboxes that arrived before the image finished loading
92
+ if (currentBboxes.length > 0) {
93
+ drawBboxes(currentBboxes, -1, currentRegions);
94
+ } else {
95
+ const ctx = canvas.getContext('2d');
96
+ ctx.clearRect(0, 0, canvas.width, canvas.height);
97
+ }
98
+ };
99
+ $('image-info').textContent = filename;
100
+ $('xml-upload-row').classList.remove('hidden');
101
+ $('xml-status').textContent = 'No PAGE XML';
102
+ $('xml-status').classList.remove('xml-ok');
103
+ emit('transcription-start', {});
104
+ });
105
+
106
+ // Draw bboxes after segmentation; keep state.regions in sync
107
+ on('sse-segmentation', data => {
108
+ state.regions = data.regions || [];
109
+ if (data.source === 'page') {
110
+ // Page-level engine: clear any old line bboxes, don't draw full-page box
111
+ drawBboxes([], -1, []);
112
+ } else {
113
+ drawBboxes(data.bboxes, -1, state.regions);
114
+ }
115
+ if (data.source === 'pagexml') {
116
+ $('xml-status').textContent = `PAGE XML: ${data.num_lines} lines`;
117
+ }
118
+ });
119
+
120
+ // Highlight line on click from transcription panel
121
+ on('highlight-line', ({ index }) => highlightBbox(index));
122
+
123
+ // Click on canvas → highlight the clicked bbox and emit highlight-line
124
+ const canvas = $('overlay-canvas');
125
+ canvas.addEventListener('click', e => {
126
+ if (currentBboxes.length === 0) return;
127
+
128
+ const img = $('page-image');
129
+ // Scale factor: natural image coords / displayed canvas coords
130
+ const scaleX = img.naturalWidth / img.clientWidth;
131
+ const scaleY = img.naturalHeight / img.clientHeight;
132
+
133
+ const rect = canvas.getBoundingClientRect();
134
+ const clickX = (e.clientX - rect.left) * scaleX;
135
+ const clickY = (e.clientY - rect.top) * scaleY;
136
+
137
+ for (let i = 0; i < currentBboxes.length; i++) {
138
+ const [x1, y1, x2, y2] = currentBboxes[i];
139
+ if (clickX >= x1 && clickX <= x2 && clickY >= y1 && clickY <= y2) {
140
+ emit('highlight-line', { index: i });
141
+ break;
142
+ }
143
+ }
144
+ });
145
+ }
146
+
147
+ async function uploadFile(file) {
148
+ const formData = new FormData();
149
+ formData.append('file', file);
150
+
151
+ $('image-info').textContent = 'Uploading...';
152
+
153
+ try {
154
+ const resp = await fetch('/api/image/upload', {
155
+ method: 'POST',
156
+ body: formData,
157
+ });
158
+ if (!resp.ok) {
159
+ const err = await resp.json();
160
+ throw new Error(err.detail);
161
+ }
162
+ const data = await resp.json();
163
+
164
+ // PDF: redirect all pages to batch panel
165
+ if (data.is_pdf) {
166
+ $('image-info').textContent = `PDF: ${data.num_pages} page(s) — added to batch queue`;
167
+ emit('pdf-pages-ready', data);
168
+ return;
169
+ }
170
+
171
+ state.imageId = data.image_id;
172
+ state.imageInfo = data;
173
+
174
+ // Display image — show container, hide placeholder
175
+ const img = $('page-image');
176
+ img.src = `/api/image/${data.image_id}`;
177
+ $('image-container').classList.remove('hidden');
178
+ $('viewer-placeholder').classList.add('hidden');
179
+
180
+ // Wait for image to load to size canvas and fit zoom
181
+ img.onload = () => {
182
+ const canvas = $('overlay-canvas');
183
+ canvas.width = img.naturalWidth;
184
+ canvas.height = img.naturalHeight;
185
+ fitZoom(); // sets img.style.width/height and canvas display size
186
+ };
187
+
188
+ $('image-info').textContent = `${data.filename} (${data.width}×${data.height})`;
189
+ // Show XML upload row
190
+ $('xml-upload-row').classList.remove('hidden');
191
+ $('xml-status').textContent = 'No PAGE XML';
192
+ $('xml-status').classList.remove('xml-ok');
193
+ emit('image-uploaded', data);
194
+ } catch (err) {
195
+ $('image-info').textContent = `Error: ${err.message}`;
196
+ toast(`Upload failed: ${err.message}`, 'error', 7000);
197
+ }
198
+ }
199
+
200
+ async function uploadXml(file) {
201
+ if (!state.imageId) {
202
+ // Will retry after image upload finishes
203
+ on('image-uploaded', () => uploadXml(file), { once: true });
204
+ return;
205
+ }
206
+ const xmlStatus = $('xml-status');
207
+ xmlStatus.textContent = 'Uploading XML...';
208
+ xmlStatus.classList.remove('xml-ok');
209
+ try {
210
+ const formData = new FormData();
211
+ formData.append('file', file);
212
+ const resp = await fetch(`/api/image/${state.imageId}/xml`, {
213
+ method: 'POST',
214
+ body: formData,
215
+ });
216
+ if (!resp.ok) {
217
+ const err = await resp.json();
218
+ throw new Error(err.detail);
219
+ }
220
+ xmlStatus.textContent = `✓ ${file.name}`;
221
+ xmlStatus.classList.add('xml-ok');
222
+ emit('xml-uploaded', { filename: file.name });
223
+ } catch (err) {
224
+ xmlStatus.textContent = `XML error: ${err.message}`;
225
+ }
226
+ }
227
+
228
+ let currentBboxes = [];
229
+ let currentRegions = [];
230
+
231
+ // Distinct colours for up to 8 regions (cycling)
232
+ const REGION_COLORS = [
233
+ 'rgba(255, 160, 30, 0.55)', // orange
234
+ 'rgba( 46, 213, 115, 0.55)', // green
235
+ 'rgba(232, 65, 24, 0.55)', // red
236
+ 'rgba( 52, 172, 224, 0.55)', // blue
237
+ 'rgba(162, 16, 213, 0.55)', // purple
238
+ 'rgba(255, 211, 42, 0.55)', // yellow
239
+ 'rgba( 18, 203, 196, 0.55)', // teal
240
+ 'rgba(253, 89, 166, 0.55)', // pink
241
+ ];
242
+
243
+ function drawBboxes(bboxes, highlightIndex = -1, regions = []) {
244
+ currentBboxes = bboxes;
245
+ currentRegions = regions;
246
+ const canvas = $('overlay-canvas');
247
+ const img = $('page-image');
248
+ const ctx = canvas.getContext('2d');
249
+
250
+ // Keep canvas display size in sync with zoom-controlled img size
251
+ canvas.style.width = img.style.width || img.clientWidth + 'px';
252
+ canvas.style.height = img.style.height || img.clientHeight + 'px';
253
+
254
+ ctx.clearRect(0, 0, canvas.width, canvas.height);
255
+
256
+ // Draw region outlines first (underneath line boxes)
257
+ regions.forEach((r, ri) => {
258
+ const [x1, y1, x2, y2] = r.bbox;
259
+ const color = REGION_COLORS[ri % REGION_COLORS.length];
260
+ ctx.strokeStyle = color;
261
+ ctx.lineWidth = 2.5;
262
+ ctx.setLineDash([8, 4]);
263
+ ctx.strokeRect(x1, y1, x2 - x1, y2 - y1);
264
+ ctx.setLineDash([]);
265
+ // Subtle fill
266
+ ctx.fillStyle = color.replace('0.55', '0.07');
267
+ ctx.fillRect(x1, y1, x2 - x1, y2 - y1);
268
+ // Region label
269
+ ctx.fillStyle = color.replace('0.55', '0.9');
270
+ ctx.font = 'bold 13px sans-serif';
271
+ ctx.fillText(`R${ri + 1} (${r.num_lines} lines)`, x1 + 4, y1 + 16);
272
+ });
273
+
274
+ // Draw line boxes on top
275
+ for (let i = 0; i < bboxes.length; i++) {
276
+ const [x1, y1, x2, y2] = bboxes[i];
277
+ const isHighlighted = i === highlightIndex;
278
+
279
+ ctx.strokeStyle = isHighlighted ? '#e94560' : 'rgba(58, 134, 255, 0.6)';
280
+ ctx.lineWidth = isHighlighted ? 3 : 1.5;
281
+ ctx.strokeRect(x1, y1, x2 - x1, y2 - y1);
282
+
283
+ if (isHighlighted) {
284
+ ctx.fillStyle = 'rgba(233, 69, 96, 0.1)';
285
+ ctx.fillRect(x1, y1, x2 - x1, y2 - y1);
286
+ }
287
+ }
288
+ }
289
+
290
+ function highlightBbox(index) {
291
+ if (currentBboxes.length > 0) {
292
+ drawBboxes(currentBboxes, index, currentRegions);
293
+ }
294
+ }
web/static/components/transcription-panel.js ADDED
@@ -0,0 +1,482 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * Transcription Panel — SSE progress, results, export
3
+ */
4
+
5
+ import { state, emit, on, toast } from '../app.js';
6
+
7
// Shorthand for document.getElementById.
const $ = (elementId) => document.getElementById(elementId);
8
+
9
+ // ── Font selector ───────────────────────────────────────────────────────
10
// localStorage key under which the chosen results font is remembered.
const LS_FONT = 'polyscriptor_results_font';

// Choices offered by the font selector.
//   value – font-family name; '' selects the stylesheet default
//   gf    – family parameter for the Google Fonts CSS endpoint (if any)
//   local – flag set on the entry that carries no `gf` parameter
const FONTS = [
  { label: 'Monospace (default)', value: '' },
  { label: 'Monomakh Unicode ✦', value: 'Monomakh', local: true },
  { label: 'Old Standard TT', value: 'Old Standard TT', gf: 'Old+Standard+TT' },
  { label: 'Noto Serif', value: 'Noto Serif', gf: 'Noto+Serif' },
  { label: 'Crimson Pro', value: 'Crimson Pro', gf: 'Crimson+Pro' },
  { label: 'IM Fell English', value: 'IM Fell English', gf: 'IM+Fell+English' },
];

// Stylesheet URLs that have already been added to <head>.
const _loadedFonts = new Set();
22
+
23
/**
 * Add the Google Fonts stylesheet for one family to <head>, at most once per
 * URL (tracked in _loadedFonts).
 *
 * @param {string} gfParam - value for the `family` query parameter, inserted as given
 */
function _loadGoogleFont(gfParam) {
  const stylesheetUrl = `https://fonts.googleapis.com/css2?family=${gfParam}&display=swap`;
  const alreadyRequested = _loadedFonts.has(stylesheetUrl);
  if (!alreadyRequested) {
    const linkEl = document.createElement('link');
    linkEl.rel = 'stylesheet';
    linkEl.href = stylesheetUrl;
    document.head.appendChild(linkEl);
    _loadedFonts.add(stylesheetUrl);
  }
}
32
+
33
/**
 * Apply one of the FONTS entries to the results area by setting (or, for the
 * default entry, removing) the --font-results custom property on the root
 * element. Values not present in FONTS are ignored.
 *
 * @param {string} value - the `value` of the FONTS entry to apply
 */
function applyFont(value) {
  const font = FONTS.find(entry => entry.value === value);
  if (!font) return;

  if (font.gf) _loadGoogleFont(font.gf);

  const rootStyle = document.documentElement.style;
  if (!font.value) {
    // Default entry: fall back to whatever the stylesheet defines.
    rootStyle.removeProperty('--font-results');
    return;
  }
  rootStyle.setProperty('--font-results', `"${font.value}", Georgia, serif`);
}
44
+
45
/**
 * Wire up the transcription panel: confidence slider, search box, font
 * selector, column-layout toggle, the progress / completion / error event
 * handlers and the copy/export buttons.
 *
 * Looks up its DOM nodes by id and registers DOM and app-event listeners.
 */
export function initTranscriptionPanel() {
  let _transcribeStart = null; // timestamp (ms) of the first progress event of the current run
  let _etaBaseline = 0;        // data.current at that first event
  let _numRegions = 1;
  let _columnMode = false;

  // Confidence threshold slider
  const slider = $('conf-threshold');
  const sliderVal = $('conf-threshold-val');
  slider.addEventListener('input', () => {
    const threshold = parseInt(slider.value, 10);
    sliderVal.textContent = threshold + '%';
    applyConfidenceFilter(threshold);
  });

  // Search / filter
  const searchInput = $('results-search');
  searchInput.addEventListener('input', () => applySearch(searchInput.value));
  // Clear search on new transcription
  function resetSearch() {
    searchInput.value = '';
    $('results-search-row').classList.add('hidden');
    $('results-search-count').textContent = '';
  }

  // Font selector — populate, restore, handle changes
  const fontSel = $('font-select');
  for (const f of FONTS) {
    const o = document.createElement('option');
    o.value = f.value;
    o.textContent = f.label;
    fontSel.appendChild(o);
  }
  // localStorage access can throw (e.g. private mode); treat as "nothing saved".
  const savedFont = (() => { try { return localStorage.getItem(LS_FONT) || ''; } catch { return ''; } })();
  fontSel.value = savedFont;
  if (savedFont) applyFont(savedFont);
  fontSel.addEventListener('change', () => {
    applyFont(fontSel.value);
    try { localStorage.setItem(LS_FONT, fontSel.value); } catch { /* private mode */ }
  });

  // Column layout toggle
  $('btn-col-layout').addEventListener('click', () => {
    _columnMode = !_columnMode;
    $('btn-col-layout').classList.toggle('active', _columnMode);
    if (_columnMode) renderAllColumns();
    else renderAllFlat();
  });

  on('transcription-start', () => {
    state.lines = [];
    _transcribeStart = null;
    _etaBaseline = 0;
    _numRegions = 1;
    _columnMode = false;
    $('btn-col-layout').classList.add('hidden');
    $('btn-col-layout').classList.remove('active');
    $('transcription-lines').innerHTML = '';
    $('transcription-lines').classList.remove('col-layout');
    $('progress-container').classList.remove('hidden');
    $('results-footer').classList.add('hidden');
    $('conf-filter-row').classList.add('hidden');
    resetSearch();
    $('progress-fill').style.width = '0%';
    $('progress-fill').style.background = ''; // reset error colour
    $('progress-text').textContent = 'Segmenting...';
  });

  // Highlight line in transcription panel when a bbox is clicked (or line clicked)
  on('highlight-line', ({ index }) => {
    const container = $('transcription-lines');
    container.querySelectorAll('.line-active').forEach(el => el.classList.remove('line-active'));
    const target = container.querySelector(`[data-index="${index}"]`);
    if (target) {
      target.classList.add('line-active');
      target.scrollIntoView({ block: 'nearest', behavior: 'smooth' });
    }
  });

  on('sse-status', data => {
    $('progress-text').textContent = data.message;
  });

  on('sse-segmentation', data => {
    if (data.source === 'page') {
      $('progress-text').textContent = 'Processing full page...';
    } else {
      $('progress-text').textContent = `${data.num_lines} lines found. Transcribing...`;
    }
  });

  on('sse-progress', data => {
    // A zero total would otherwise produce "NaN%".
    const pct = data.total > 0 ? Math.round((data.current / data.total) * 100) : 0;
    $('progress-fill').style.width = pct + '%';

    // ETA. The clock starts at the first progress event, so only lines
    // completed AFTER that event may be counted towards the rate. With no
    // time elapsed yet (first event) there is no rate, and no ETA is shown
    // instead of a bogus "~0s left".
    const now = Date.now();
    if (_transcribeStart === null) {
      _transcribeStart = now;
      _etaBaseline = data.current;
    }
    const elapsed = (now - _transcribeStart) / 1000;
    const rate = elapsed > 0 ? (data.current - _etaBaseline) / elapsed : 0; // lines/s
    const remaining = rate > 0 ? Math.round((data.total - data.current) / rate) : null;
    const etaStr = remaining != null
      ? ` · ~${remaining < 60 ? remaining + 's' : Math.round(remaining / 60) + 'min'} left`
      : '';
    let tokenStr = '';
    if (data.token_usage) {
      const tu = data.token_usage;
      const parts = [];
      if (tu.prompt_tokens != null) parts.push(`in:${tu.prompt_tokens}`);
      if (tu.output_tokens != null) parts.push(`out:${tu.output_tokens}`);
      if (tu.thinking_tokens != null && tu.thinking_tokens > 0) parts.push(`think:${tu.thinking_tokens}`);
      if (parts.length) tokenStr = ` | ${parts.join(' ')} tok`;
    }
    $('progress-text').textContent = `${data.current} / ${data.total} lines${etaStr}${tokenStr}`;

    // Track the highest region index seen so the column toggle can be offered.
    _numRegions = Math.max(_numRegions, (data.line.region ?? 0) + 1);
    state.lines.push(data.line);
    appendLine(data.line);
  });

  on('sse-complete', data => {
    $('progress-container').classList.add('hidden');
    $('results-footer').classList.remove('hidden');
    $('btn-export-xml').classList.toggle('hidden', !!data.browser_direct);
    let summary = `${data.lines.length} lines in ${data.total_time_s}s (${data.engine})`;
    if (data.token_usage) {
      const tu = data.token_usage;
      const parts = [];
      if (tu.prompt_tokens != null) parts.push(`in: ${tu.prompt_tokens}`);
      if (tu.output_tokens != null) parts.push(`out: ${tu.output_tokens}`);
      if (tu.thinking_tokens != null && tu.thinking_tokens > 0)
        parts.push(`think: ${tu.thinking_tokens}`);
      if (parts.length) summary += ` | tokens: ${parts.join(', ')}`;
    }
    $('results-summary').textContent = summary;
    // Show confidence filter if any line has confidence data
    if (state.lines.some(l => l.confidence != null)) {
      $('conf-filter-row').classList.remove('hidden');
      slider.value = 0;
      sliderVal.textContent = '0%';
    }
    // Show search if there are results
    if (state.lines.length > 0) {
      $('results-search-row').classList.remove('hidden');
    }
    // Show column layout toggle if multiple regions detected
    if (_numRegions > 1) {
      $('btn-col-layout').classList.remove('hidden');
    }
    emit('transcription-complete', data);
  });

  on('sse-cancelled', () => {
    $('progress-text').textContent = 'Cancelled';
    $('progress-fill').style.width = '0%';
    // Show footer if we have partial results
    if (state.lines.length > 0) {
      $('results-footer').classList.remove('hidden');
      $('results-summary').textContent = `Cancelled — ${state.lines.length} lines transcribed`;
    }
    emit('transcription-complete', {});
  });

  on('sse-error', data => {
    $('progress-text').textContent = `Error: ${data.message}`;
    $('progress-fill').style.width = '0%';
    $('progress-fill').style.background = 'var(--danger)';
    emit('transcription-complete', {});
  });

  on('transcription-error', data => {
    $('progress-text').textContent = `Error: ${data.message}`;
    emit('transcription-complete', {});
  });

  // Also hide Export XML when a new transcription starts
  on('transcription-start', () => {
    $('btn-export-xml').classList.add('hidden');
  });

  $('btn-copy-text').addEventListener('click', copyText);
  $('btn-export-txt').addEventListener('click', exportTxt);
  $('btn-export-csv').addEventListener('click', exportCsv);
  $('btn-export-xml').addEventListener('click', exportXml);
}
228
+
229
/**
 * Re-render every line in state.lines as one flat list, leaving column
 * layout.
 */
function renderAllFlat() {
  const linesEl = $('transcription-lines');
  linesEl.innerHTML = '';
  linesEl.classList.remove('col-layout');
  for (const entry of state.lines) {
    appendLine(entry);
  }
}
235
+
236
/**
 * Re-render state.lines grouped by region: one column per region index from
 * 0 up to the highest index present (lines without a region go to column 0).
 * Each column has a header with its line count and a button that removes the
 * column from the page.
 */
function renderAllColumns() {
  const root = $('transcription-lines');
  root.innerHTML = '';
  root.classList.add('col-layout');

  // Bucket the lines by region index.
  let highest = 0;
  for (const { region } of state.lines) {
    highest = Math.max(highest, region ?? 0);
  }
  const buckets = [];
  for (let r = 0; r <= highest; r++) buckets.push([]);
  for (const entry of state.lines) {
    buckets[entry.region ?? 0].push(entry);
  }

  for (const [regionIdx, members] of buckets.entries()) {
    const columnEl = document.createElement('div');
    columnEl.className = 'region-column';

    const headerEl = document.createElement('div');
    headerEl.className = 'region-col-header';

    const titleEl = document.createElement('span');
    titleEl.textContent = `Column ${regionIdx + 1} (${members.length})`;
    headerEl.appendChild(titleEl);

    const hideBtn = document.createElement('button');
    hideBtn.className = 'region-col-close';
    hideBtn.textContent = '×';
    hideBtn.title = 'Hide this column';
    hideBtn.addEventListener('click', (evt) => {
      evt.stopPropagation();
      columnEl.remove();
    });
    headerEl.appendChild(hideBtn);

    columnEl.appendChild(headerEl);
    for (const entry of members) {
      appendLine(entry, columnEl);
    }
    root.appendChild(columnEl);
  }
}
268
+
269
/**
 * Build the row for one transcribed line (number, text, optional confidence
 * badge, optional collapsible reasoning) and append it to `container`.
 *
 * Interaction:
 *   - click on the row        → emits 'highlight-line' for this line
 *   - double-click on the text → inline editing; Enter or blur saves,
 *     Escape restores the stored text
 *
 * Saving writes to `line` itself and, if a different object is stored at
 * state.lines[line.index], to that one as well. Neither save nor revert
 * assumes that slot exists, so a line whose index is not its array position
 * no longer raises a TypeError.
 *
 * @param {{index: number, text: string, confidence?: number|null, thinking_text?: string}} line
 * @param {HTMLElement|null} [container=null] - target element; defaults to #transcription-lines
 */
function appendLine(line, container = null) {
  container = container || $('transcription-lines');
  const div = document.createElement('div');
  div.className = 'line-result';
  div.dataset.index = line.index;
  if (line.confidence != null) {
    div.dataset.confidence = Math.round(line.confidence * 100);
  }

  // Line number
  const numSpan = document.createElement('span');
  numSpan.className = 'line-num';
  numSpan.textContent = line.index + 1;

  // Editable text span
  const textSpan = document.createElement('span');
  textSpan.className = 'line-text';
  textSpan.textContent = line.text;

  // Confidence badge
  let confSpan = null;
  if (line.confidence != null) {
    const pct = Math.round(line.confidence * 100);
    const cls = pct >= 90 ? 'conf-high' : pct >= 75 ? 'conf-mid' : 'conf-low';
    confSpan = document.createElement('span');
    confSpan.className = `confidence ${cls}`;
    confSpan.textContent = pct + '%';
  }

  div.appendChild(numSpan);
  div.appendChild(textSpan);
  if (confSpan) div.appendChild(confSpan);

  // Thinking text (Gemini reasoning) — collapsible per line
  if (line.thinking_text) {
    const details = document.createElement('details');
    details.className = 'thinking-block';
    const summary = document.createElement('summary');
    summary.className = 'thinking-toggle';
    summary.textContent = 'reasoning';
    const pre = document.createElement('pre');
    pre.className = 'thinking-text';
    pre.textContent = line.thinking_text;
    details.appendChild(summary);
    details.appendChild(pre);
    div.appendChild(details);
  }

  // Single click → highlight bbox on image
  div.addEventListener('click', () => {
    if (textSpan.contentEditable === 'true') return; // don't interfere while editing
    emit('highlight-line', { index: line.index });
  });

  // Double-click → start inline editing
  textSpan.addEventListener('dblclick', e => {
    e.stopPropagation();
    textSpan.contentEditable = 'true';
    textSpan.focus();
    // Select all text for easy replacement
    const range = document.createRange();
    range.selectNodeContents(textSpan);
    const sel = window.getSelection();
    sel.removeAllRanges();
    sel.addRange(range);
  });

  // Save on blur or Enter
  const saveEdit = () => {
    textSpan.contentEditable = 'false';
    const newText = textSpan.textContent;
    if (newText !== line.text) {
      line.text = newText;
      const stored = state.lines[line.index];
      if (stored && stored !== line) stored.text = newText;
      div.classList.add('line-edited');
    }
  };
  textSpan.addEventListener('blur', saveEdit);
  textSpan.addEventListener('keydown', e => {
    if (e.key === 'Enter') { e.preventDefault(); saveEdit(); }
    if (e.key === 'Escape') {
      textSpan.textContent = (state.lines[line.index] ?? line).text; // revert
      textSpan.contentEditable = 'false';
    }
  });

  container.appendChild(div);
  // Auto-scroll only for the main flat container (not column sub-divs)
  if (container === $('transcription-lines')) {
    container.scrollTop = container.scrollHeight;
  }
}
360
+
361
/**
 * Dim every rendered line whose confidence percentage is below `threshold`.
 * Lines without a stored confidence count as 100 and are therefore never
 * dimmed by thresholds up to 100.
 *
 * @param {number} threshold - percentage to compare against
 */
function applyConfidenceFilter(threshold) {
  const rows = $('transcription-lines').querySelectorAll('.line-result');
  for (const row of rows) {
    const stored = row.dataset.confidence ?? '100';
    const percent = parseInt(stored, 10);
    row.classList.toggle('line-dimmed', percent < threshold);
  }
}
367
+
368
/**
 * Filter the rendered lines by a case-insensitive substring search.
 *
 * Lines that do not contain the query get the 'line-hidden' class; in
 * matching lines the first occurrence is wrapped in <mark>. An empty query
 * restores all lines to plain text. The match count is written to
 * #results-search-count.
 *
 * The search runs on the lower-cased text. Lower-casing can change a
 * string's length (e.g. "İ" becomes two code units), in which case offsets
 * found in the lower-cased string do not apply to the original; such lines
 * are shown as matches without a highlighted span instead of marking the
 * wrong characters.
 *
 * @param {string} query - raw text from the search box
 */
function applySearch(query) {
  const lines = $('transcription-lines').querySelectorAll('.line-result');
  const q = query.trim().toLowerCase();
  let matchCount = 0;

  lines.forEach(div => {
    const textSpan = div.querySelector('.line-text');
    if (!textSpan) return;
    // Use state.lines for the canonical text (survives inline edits and search markup);
    // fall back to the rendered text when the entry or its text is missing.
    const lineIdx = parseInt(div.dataset.index ?? '-1', 10);
    const stored = lineIdx >= 0 ? state.lines[lineIdx]?.text : undefined;
    const raw = typeof stored === 'string' ? stored : textSpan.textContent;

    if (!q) {
      // Clear search: restore plain text, remove hidden
      textSpan.textContent = raw;
      div.classList.remove('line-hidden');
      return;
    }

    const lc = raw.toLowerCase();
    const idx = lc.indexOf(q);
    if (idx === -1) {
      div.classList.add('line-hidden');
      return;
    }

    div.classList.remove('line-hidden');
    matchCount++;

    if (lc.length !== raw.length) {
      // Offsets in `lc` are not valid for `raw`: show the line unmarked.
      textSpan.textContent = raw;
      return;
    }

    // Highlight match with <mark> using safe DOM manipulation
    const before = raw.slice(0, idx);
    const match = raw.slice(idx, idx + q.length);
    const after = raw.slice(idx + q.length);
    textSpan.textContent = '';
    textSpan.appendChild(document.createTextNode(before));
    const mark = document.createElement('mark');
    mark.textContent = match;
    textSpan.appendChild(mark);
    textSpan.appendChild(document.createTextNode(after));
  });

  const countEl = $('results-search-count');
  countEl.textContent = q ? `${matchCount} match${matchCount !== 1 ? 'es' : ''}` : '';
}
412
+
413
+ // (escapeHtml no longer needed — we use textContent/DOM directly)
414
+
415
/**
 * Copy all transcribed lines (newline-separated) to the clipboard and flash
 * "Copied!" on the button for 1.5 s. Any failure in that sequence results in
 * a toast pointing to the TXT export. Does nothing when there are no lines.
 *
 * @returns {Promise<void>}
 */
async function copyText() {
  if (!state.lines.length) return;

  const joined = state.lines.map(({ text }) => text).join('\n');
  try {
    await navigator.clipboard.writeText(joined);
    const button = $('btn-copy-text');
    const previousLabel = button.textContent;
    button.textContent = 'Copied!';
    setTimeout(() => {
      button.textContent = previousLabel;
    }, 1500);
  } catch {
    toast('Clipboard not available — use Export TXT instead', 'error');
  }
}
428
+
429
/**
 * Download all transcribed lines as transcription.txt, one line of text per
 * row. Does nothing when there are no lines.
 */
function exportTxt() {
  const { lines } = state;
  if (lines.length === 0) return;
  const body = lines.map(({ text }) => text).join('\n');
  downloadFile('transcription.txt', body, 'text/plain');
}
434
+
435
/**
 * Download all transcribed lines as transcription.csv with the columns
 * Line, Text, Confidence, X1, Y1, X2, Y2.
 *
 * Text is always quoted, with embedded double quotes doubled. A missing text
 * is exported as an empty field instead of aborting the export; a missing
 * confidence or bbox leaves the corresponding fields empty.
 * Does nothing when there are no lines.
 */
function exportCsv() {
  if (state.lines.length === 0) return;
  const header = 'Line,Text,Confidence,X1,Y1,X2,Y2\n';
  const rows = state.lines.map(l => {
    const conf = l.confidence != null ? l.confidence.toFixed(4) : '';
    // Three commas = four empty coordinate fields.
    const bbox = l.bbox ? l.bbox.join(',') : ',,,';
    const text = String(l.text ?? '').replace(/"/g, '""');
    return `${l.index + 1},"${text}",${conf},${bbox}`;
  }).join('\n');
  downloadFile('transcription.csv', header + rows, 'text/csv');
}
445
+
446
/**
 * Offer `content` to the user as a file download via a temporary object URL.
 *
 * The object URL is released after a short delay rather than synchronously:
 * revoking it in the same task as the click can cancel the download in some
 * browsers, because the download has not started reading the blob yet.
 *
 * @param {string} filename - suggested name for the saved file
 * @param {string|BlobPart} content - file contents
 * @param {string} mime - MIME type for the blob
 */
function downloadFile(filename, content, mime) {
  const blob = new Blob([content], { type: mime });
  const url = URL.createObjectURL(blob);
  const a = document.createElement('a');
  a.href = url;
  a.download = filename;
  a.click();
  setTimeout(() => URL.revokeObjectURL(url), 1000);
}
455
+
456
/**
 * Extract the suggested filename from a Content-Disposition header value.
 *
 * Understands the RFC 5987 form (filename*=charset'lang'percent-encoded,
 * decoded as UTF-8), the quoted form (filename="...") and the bare token form
 * (filename=...). The extended form wins when both are present.
 *
 * @param {string|null} header - raw header value
 * @param {string} fallback - returned when no usable filename is found
 * @returns {string}
 */
function _filenameFromContentDisposition(header, fallback) {
  if (!header) return fallback;

  const extended = header.match(/filename\*\s*=\s*[^';]*'[^']*'([^;]+)/i);
  if (extended) {
    try {
      const decoded = decodeURIComponent(extended[1].trim());
      if (decoded) return decoded;
    } catch {
      // Malformed percent-encoding: fall through to the plain parameter.
    }
  }

  const plain = header.match(/filename\s*=\s*(?:"([^"]+)"|([^;]+))/i);
  const name = (plain?.[1] ?? plain?.[2] ?? '').trim();
  return name || fallback;
}

/**
 * Request the PAGE XML export for the current image and offer it as a
 * download, named after the response's Content-Disposition header when it
 * carries a filename ('transcription.xml' otherwise). Server and network
 * errors are reported as toasts. Does nothing when no image is loaded.
 *
 * @returns {Promise<void>}
 */
async function exportXml() {
  if (!state.imageId) return;
  try {
    const resp = await fetch(`/api/image/${state.imageId}/export-xml`, { method: 'POST' });
    if (!resp.ok) {
      const err = await resp.json().catch(() => ({ detail: resp.statusText }));
      toast(`XML export failed: ${err.detail || resp.statusText}`, 'error');
      return;
    }
    const blob = await resp.blob();
    // Use filename from Content-Disposition if provided, else fall back
    const filename = _filenameFromContentDisposition(
      resp.headers.get('Content-Disposition'), 'transcription.xml');
    const url = URL.createObjectURL(blob);
    const a = document.createElement('a');
    a.href = url;
    a.download = filename;
    a.click();
    // Deferred: revoking in the same task as the click can cancel the
    // download in some browsers.
    setTimeout(() => URL.revokeObjectURL(url), 1000);
  } catch (err) {
    toast(`XML export error: ${err.message}`, 'error');
  }
}
web/static/fonts/MonomakhUnicode-Regular.woff2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a07ebc9c97abc54866b6c8f35d6057f861f84a760127349f28c47c069a9cfea4
3
+ size 86480
web/static/index.html ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Polyscriptor HTR</title>
7
+ <link rel="stylesheet" href="/static/app.css">
8
+ </head>
9
+ <body>
10
+ <!-- Header -->
11
+ <header id="header">
12
+ <div class="header-left">
13
+ <span class="header-logo">⬡</span>
14
+ <h1>Polyscriptor <span class="header-sub">HTR</span></h1>
15
+ </div>
16
+ <div class="header-right">
17
+ <div id="gpu-status" class="gpu-widget"></div>
18
+ <button id="btn-help" class="btn-icon" title="Help">?</button>
19
+ </div>
20
+ </header>
21
+
22
+ <!-- Main 3-column layout -->
23
+ <main id="app">
24
+ <!-- Left: Engine + Image controls -->
25
+ <aside id="engine-panel" class="panel" data-panel="settings">
26
+
27
+
28
+ <section class="panel-section">
29
+ <h2>HTR Engine</h2>
30
+ <label for="engine-select">Engine</label>
31
+ <select id="engine-select" disabled>
32
+ <option>Loading engines…</option>
33
+ </select>
34
+ <p id="engine-description" class="muted"></p>
35
+ <div id="config-form"></div>
36
+ <div id="kraken-preset-row" class="hidden" style="margin-top:8px">
37
+ <label for="kraken-preset-select" style="display:block;font-size:0.78rem;margin-bottom:3px">Kraken Model Preset</label>
38
+ <select id="kraken-preset-select" style="width:100%">
39
+ <option value="">Loading presets…</option>
40
+ </select>
41
+ <span id="kraken-preset-status" class="muted" style="font-size:0.72rem;display:block;margin-top:3px"></span>
42
+ </div>
43
+ <button id="btn-load-model" class="btn btn-primary" disabled>Load Model</button>
44
+ <div id="engine-status" class="status-badge hidden"></div>
45
+ </section>
46
+
47
+ <hr>
48
+
49
+ <section class="panel-section">
50
+ <h2>Image</h2>
51
+ <div id="upload-area" class="upload-area">
52
+ <svg class="upload-icon" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1.5">
53
+ <path stroke-linecap="round" stroke-linejoin="round"
54
+ d="M3 16.5v2.25A2.25 2.25 0 005.25 21h13.5A2.25 2.25 0 0021 18.75V16.5m-13.5-9L12 3m0 0l4.5 4.5M12 3v13.5"/>
55
+ </svg>
56
+ <p>Drop image or PDF, or click to browse</p>
57
+ <input type="file" id="file-input" accept="image/*,.pdf" multiple hidden>
58
+ </div>
59
+ <p id="image-info" class="muted"></p>
60
+ <div id="batch-queue-section" class="hidden">
61
+ <div class="batch-queue-header">
62
+ <span class="section-label">Queue</span>
63
+ <span id="batch-overall-progress" class="batch-overall-progress hidden"></span>
64
+ </div>
65
+ <div id="batch-list"></div>
66
+ <div class="batch-options-row">
67
+ <label class="checkbox-label" title="Use PAGE XML segmentation if a matching .xml file was uploaded for this image">
68
+ <input type="checkbox" id="batch-use-pagexml" checked>
69
+ Use PAGE XML
70
+ </label>
71
+ <label class="checkbox-label" title="Skip images that have already been transcribed in this session">
72
+ <input type="checkbox" id="batch-resume">
73
+ Resume
74
+ </label>
75
+ </div>
76
+ <div class="btn-row" style="margin-top:6px">
77
+ <button id="btn-process-batch" class="btn btn-primary btn-small">Process All</button>
78
+ <button id="btn-clear-batch" class="btn btn-small btn-outline">Clear</button>
79
+ </div>
80
+ <div id="batch-export-row" class="btn-row hidden" style="margin-top:6px">
81
+ <button id="btn-export-batch-txt" class="btn btn-small">All TXT</button>
82
+ <button id="btn-export-batch-csv" class="btn btn-small">All CSV</button>
83
+ <button id="btn-export-batch-txt-zip" class="btn btn-small btn-primary">Download ZIP (TXT)</button>
84
+ <button id="btn-export-batch-thinking-zip" class="btn btn-small btn-primary">Download ZIP (Thinking)</button>
85
+ <button id="btn-export-batch-xml" class="btn btn-small btn-primary">Download ZIP (XML)</button>
86
+ </div>
87
+ </div>
88
+ <div id="xml-upload-row" class="xml-row hidden">
89
+ <span id="xml-status" class="muted">No PAGE XML</span>
90
+ <label class="btn btn-small btn-outline" for="xml-input">
91
+ Upload XML
92
+ <input type="file" id="xml-input" accept=".xml" hidden multiple>
93
+ </label>
94
+ </div>
95
+ </section>
96
+
97
+ <hr>
98
+
99
+ <section class="panel-section" id="seg-controls">
100
+ <h2>Segmentation</h2>
101
+ <label for="seg-method">Method</label>
102
+ <select id="seg-method">
103
+ <option value="kraken" selected>Kraken Classical</option>
104
+ <option value="hpp">HPP / projection profile fallback</option>
105
+ <option value="kraken-blla" disabled>Kraken Neural / blla (server only)</option>
106
+ </select>
107
+
108
+ <label for="seg-device">Device</label>
109
+ <select id="seg-device">
110
+ <option value="cpu">CPU</option>
111
+ <option value="cuda:0">GPU 0</option>
112
+ <option value="cuda:1">GPU 1</option>
113
+ </select>
114
+
115
+ <div id="blla-options" style="display:none">
116
+ <div style="display:flex;gap:12px;align-items:center;flex-wrap:wrap">
117
+ <div style="display:flex;flex-direction:column;gap:3px">
118
+ <label for="seg-max-columns">Max columns</label>
119
+ <input type="number" id="seg-max-columns" min="1" max="12" value="6" style="width:60px">
120
+ </div>
121
+ <div style="display:flex;flex-direction:column;gap:3px">
122
+ <label for="seg-split-width">Split width %</label>
123
+ <input type="number" id="seg-split-width" min="5" max="80" value="40" step="5" style="width:60px" title="Min region width (% of page) to trigger sub-column splitting. Lower = split narrower regions. Double pages: try 20.">
124
+ </div>
125
+ </div>
126
+ <div style="margin-top:6px">
127
+ <label for="seg-text-direction">Reading direction</label>
128
+ <select id="seg-text-direction" title="Controls column reading order. Use horizontal-rl for Arabic, Ottoman, Hebrew manuscripts.">
129
+ <option value="horizontal-lr">LTR (Latin, Cyrillic, …)</option>
130
+ <option value="horizontal-rl">RTL (Arabic, Ottoman, Hebrew, …)</option>
131
+ <option value="vertical-lr">Vertical LTR</option>
132
+ <option value="vertical-rl">Vertical RTL</option>
133
+ </select>
134
+ </div>
135
+ </div>
136
+ </section>
137
+
138
+ <div id="seg-regions-list" class="hidden"></div>
139
+
140
+ <div class="panel-footer">
141
+ <div class="btn-row footer-btn-row">
142
+ <button id="btn-segment" class="btn btn-outline" disabled title="Preview line segmentation without transcribing">Segment</button>
143
+ <button id="btn-transcribe" class="btn btn-accent" disabled>Transcribe</button>
144
+ </div>
145
+ </div>
146
+ </aside>
147
+ <div class="panel-resize-handle" id="resize-left" title="Drag to resize"></div>
148
+
149
+ <!-- Center: Image viewer -->
150
+ <section id="viewer-panel" class="panel" data-panel="image">
151
+ <!-- Zoom toolbar — only visible when image is loaded -->
152
+ <div id="zoom-toolbar" class="zoom-toolbar hidden">
153
+ <button class="zoom-btn" id="btn-zoom-out" title="Zoom out">−</button>
154
+ <span id="zoom-level" class="zoom-level">100%</span>
155
+ <button class="zoom-btn" id="btn-zoom-in" title="Zoom in">+</button>
156
+ <button class="zoom-btn zoom-fit" id="btn-zoom-fit" title="Fit to view">⊡</button>
157
+ <span class="zoom-toolbar-sep"></span>
158
+ <button class="btn btn-small btn-outline nav-btn" id="btn-nav-prev" title="Previous image (←)" disabled>‹ Prev</button>
159
+ <span id="batch-nav-label" class="batch-nav-label-toolbar"></span>
160
+ <button class="btn btn-small btn-outline nav-btn" id="btn-nav-next" title="Next image (→)" disabled>Next ›</button>
161
+ </div>
162
+ <!-- Scroll area fills remaining height -->
163
+ <div id="viewer-scroll">
164
+ <div id="viewer-placeholder" class="viewer-placeholder">
165
+ <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1">
166
+ <rect x="3" y="3" width="18" height="18" rx="2"/>
167
+ <circle cx="8.5" cy="8.5" r="1.5"/>
168
+ <path stroke-linecap="round" stroke-linejoin="round" d="M21 15l-5-5L5 21"/>
169
+ </svg>
170
+ <p>Upload an image to begin</p>
171
+ </div>
172
+ <div id="image-container" class="hidden">
173
+ <img id="page-image">
174
+ <canvas id="overlay-canvas"></canvas>
175
+ </div>
176
+ </div>
177
+ </section>
178
+ <div class="panel-resize-handle" id="resize-right" title="Drag to resize"></div>
179
+
180
+ <!-- Right: Transcription results -->
181
+ <section id="results-panel" class="panel" data-panel="results">
182
+ <div class="results-header">
183
+ <div class="results-header-row">
184
+ <h2>Transcription</h2>
185
+ <div class="results-header-controls">
186
+ <select id="font-select" class="font-select" title="Transcription font"></select>
187
+ <button id="btn-col-layout" class="btn-icon hidden" title="Toggle column layout">⊞</button>
188
+ </div>
189
+ </div>
190
+ <div id="results-search-row" class="results-search-row hidden">
191
+ <input type="search" id="results-search" placeholder="Search lines…" autocomplete="off">
192
+ <span id="results-search-count" class="muted"></span>
193
+ </div>
194
+ <div id="conf-filter-row" class="conf-filter-row hidden">
195
+ <label>Min conf: <strong id="conf-threshold-val">0%</strong></label>
196
+ <input type="range" id="conf-threshold" min="0" max="100" value="0" step="5">
197
+ </div>
198
+ <div id="progress-container" class="hidden">
199
+ <div id="progress-bar"><div id="progress-fill"></div></div>
200
+ <div class="progress-row">
201
+ <p id="progress-text" class="muted">0 / 0 lines</p>
202
+ <button id="btn-cancel" class="btn btn-small hidden">Cancel</button>
203
+ </div>
204
+ </div>
205
+ </div>
206
+ <div id="transcription-lines"></div>
207
+ <div id="results-footer" class="hidden">
208
+ <p id="results-summary" class="muted"></p>
209
+ <div class="btn-row">
210
+ <button id="btn-copy-text" class="btn btn-small">Copy Text</button>
211
+ <button id="btn-export-txt" class="btn btn-small">TXT</button>
212
+ <button id="btn-export-csv" class="btn btn-small">CSV</button>
213
+ <button id="btn-export-xml" class="btn btn-small hidden">XML</button>
214
+ </div>
215
+ </div>
216
+ </section>
217
+ </main>
218
+
219
+ <!-- Mobile tab bar (visible < 700px) -->
220
+ <nav id="mobile-tabs">
221
+ <button class="tab-btn active" data-target="settings">
222
+ <svg viewBox="0 0 20 20" fill="currentColor"><path fill-rule="evenodd" d="M11.49 3.17c-.38-1.56-2.6-1.56-2.98 0a1.532 1.532 0 01-2.286.948c-1.372-.836-2.942.734-2.106 2.106.54.886.061 2.042-.947 2.287-1.561.379-1.561 2.6 0 2.978a1.532 1.532 0 01.947 2.287c-.836 1.372.734 2.942 2.106 2.106a1.532 1.532 0 012.287.947c.379 1.561 2.6 1.561 2.978 0a1.533 1.533 0 012.287-.947c1.372.836 2.942-.734 2.106-2.106a1.533 1.533 0 01.947-2.287c1.561-.379 1.561-2.6 0-2.978a1.532 1.532 0 01-.947-2.287c.836-1.372-.734-2.942-2.106-2.106a1.532 1.532 0 01-2.287-.947zM10 13a3 3 0 100-6 3 3 0 000 6z" clip-rule="evenodd"/></svg>
223
+ Settings
224
+ </button>
225
+ <button class="tab-btn" data-target="image">
226
+ <svg viewBox="0 0 20 20" fill="currentColor"><path fill-rule="evenodd" d="M4 3a2 2 0 00-2 2v10a2 2 0 002 2h12a2 2 0 002-2V5a2 2 0 00-2-2H4zm12 12H4l4-8 3 6 2-4 3 6z" clip-rule="evenodd"/></svg>
227
+ Image
228
+ </button>
229
+ <button class="tab-btn" data-target="results">
230
+ <svg viewBox="0 0 20 20" fill="currentColor"><path fill-rule="evenodd" d="M4 4a2 2 0 012-2h4.586A2 2 0 0112 2.586L15.414 6A2 2 0 0116 7.414V16a2 2 0 01-2 2H6a2 2 0 01-2-2V4zm2 6a1 1 0 011-1h6a1 1 0 110 2H7a1 1 0 01-1-1zm1 3a1 1 0 100 2h6a1 1 0 100-2H7z" clip-rule="evenodd"/></svg>
231
+ Results
232
+ </button>
233
+ </nav>
234
+
235
+ <!-- Help modal -->
236
+ <dialog id="help-modal">
237
+ <div class="modal-header">
238
+ <h2>Polyscriptor HTR — Quick Guide</h2>
239
+ <button id="btn-help-close" class="btn-icon">✕</button>
240
+ </div>
241
+ <div class="modal-body">
242
+ <h3>Quick Start</h3>
243
+ <ol>
244
+ <li><strong>Select an engine</strong> from the dropdown and configure it (model path, API key, etc.).</li>
245
+ <li>Click <strong>Load Model</strong> and wait for the green status badge.</li>
246
+ <li><strong>Upload an image</strong> by dragging it onto the upload area or clicking to browse.</li>
247
+ <li>Optionally click <strong>Segment</strong> to preview line detection before transcribing.</li>
248
+ <li>Click <strong>Transcribe</strong>. Lines appear one by one as they are processed.</li>
249
+ <li><strong>Export</strong> the result as TXT, CSV, or PAGE XML.</li>
250
+ </ol>
251
+
252
+ <h3>Source Code</h3>
253
+ <p>
254
+ The public Polyscriptor source code is available on
255
+ <a href="https://github.com/achimrabus/polyscriptor" target="_blank" rel="noopener noreferrer">GitHub</a>.
256
+ This Hugging Face Space runs a curated hosted demo configuration.
257
+ </p>
258
+
259
+ <h3>Engines</h3>
260
+ <table>
261
+ <tr><th>Engine</th><th>Best for</th></tr>
262
+ <tr><td>CRNN-CTC</td><td>Fastest; works well on Church Slavonic, Glagolitic, Ukrainian with trained models</td></tr>
263
+ <tr><td>TrOCR</td><td>HuggingFace Transformer OCR; good general-purpose accuracy</td></tr>
264
+ <tr><td>Qwen3-VL</td><td>Large vision-language model; best quality but slow, needs GPU</td></tr>
265
+ <tr><td>Kraken</td><td>Classical HTR; good for Latin scripts</td></tr>
266
+ <tr><td>Party</td><td>Whole-page transformer; requires PAGE XML with line segmentation</td></tr>
267
+ <tr><td>Commercial APIs</td><td>OpenAI / Gemini / Claude — cloud inference, no local GPU needed</td></tr>
268
+ <tr><td>OpenWebUI</td><td>Locally hosted models via OpenWebUI/Ollama</td></tr>
269
+ </table>
270
+
271
+ <h3>Segmentation</h3>
272
+ <ul>
273
+ <li><strong>Kraken Classical</strong> — default line segmentation in this Hugging Face CPU demo.</li>
274
+ <li><strong>HPP</strong> — horizontal projection profile fallback.</li>
275
+ <li><strong>Kraken Neural / blla</strong> — available on the full server setup, but not enabled in this Space.</li>
276
+ <li><strong>PAGE XML upload</strong> — skip segmentation entirely by uploading an existing PAGE XML annotation (e.g. from Transkribus).</li>
277
+ </ul>
278
+
279
+ <h3>Tips</h3>
280
+ <ul>
281
+ <li>Click a transcription line to highlight the corresponding bounding box in the image.</li>
282
+ <li>Confidence badges: <span class="conf-high demo-badge">high ≥90%</span> <span class="conf-mid demo-badge">mid ≥75%</span> <span class="conf-low demo-badge">low &lt;75%</span></li>
283
+ <li>Line-segmenting engines (CRNN-CTC, TrOCR, Kraken) use the segmentation method above. Page-level engines (Party, Qwen3-VL, Commercial APIs) do their own segmentation.</li>
284
+ <li>API keys can be saved on the server — enter the key once, check <em>Save key on server</em>.</li>
285
+ <li>Uploads are kept for 24 hours, then cleaned up automatically.</li>
286
+ </ul>
287
+
288
+ <h3>Keyboard</h3>
289
+ <ul>
290
+ <li><kbd>Esc</kbd> — close this dialog</li>
291
+ </ul>
292
+ </div>
293
+ </dialog>
294
+
295
+ <!-- Toast notification container -->
296
+ <div id="toast-container"></div>
297
+
298
+ <script type="module" src="/static/app.js"></script>
299
+ <script>
300
// Help dialog: opened from the header "?" button, closed via its "✕" button
// or by clicking the dialog element itself (i.e. outside the content boxes).
const modal = document.getElementById('help-modal');
const openHelp = document.getElementById('btn-help');
const closeHelp = document.getElementById('btn-help-close');
openHelp.addEventListener('click', () => {
  modal.showModal();
});
closeHelp.addEventListener('click', () => {
  modal.close();
});
modal.addEventListener('click', (event) => {
  if (event.target !== modal) return;
  modal.close();
});

// Mobile tab bar: each tab button names a panel via data-target; the panel
// whose data-panel matches receives the "panel-active" class.
const tabBtns = document.querySelectorAll('.tab-btn');
const panels = document.querySelectorAll('[data-panel]');
for (const tab of tabBtns) {
  tab.addEventListener('click', () => {
    const wanted = tab.dataset.target;
    for (const other of tabBtns) {
      other.classList.remove('active');
    }
    tab.classList.add('active');
    for (const panel of panels) {
      panel.classList.toggle('panel-active', panel.dataset.panel === wanted);
    }
  });
}

// Initial state: the settings panel is the active one.
document.querySelector('[data-panel="settings"]').classList.add('panel-active');
321
+ </script>
322
+ </body>
323
+ </html>
web/static/pwa/demo.css ADDED
@@ -0,0 +1,698 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* ── Design tokens (matching main app) ───────────────────────────────── */
2
+ :root {
3
+ --bg: #111827;
4
+ --bg-panel: #1f2937;
5
+ --bg-card: #1a2333;
6
+ --bg-input: #111827;
7
+ --bg-hover: #2a3a52;
8
+ --text: #e2e8f0;
9
+ --text-muted: #64748b;
10
+ --text-dim: #94a3b8;
11
+ --accent: #e94560;
12
+ --primary: #3b82f6;
13
+ --primary-dark: #2563eb;
14
+ --primary-glow: rgba(59,130,246,0.25);
15
+ --success: #22c55e;
16
+ --warning: #f59e0b;
17
+ --danger: #ef4444;
18
+ --border: #2d3f59;
19
+ --border-light: #3a4f6e;
20
+ --radius: 10px;
21
+ --radius-sm: 6px;
22
+ --font: 'Segoe UI', system-ui, -apple-system, sans-serif;
23
+ --font-mono: 'Consolas', 'Fira Code', monospace;
24
+ --header-h: 52px;
25
+ --safe-bottom: env(safe-area-inset-bottom, 0px);
26
+ }
27
+
28
+ /* ── Reset ───────────────────────────────────────────────────────────── */
29
+ *, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
30
+
31
+ html, body {
32
+ height: 100%;
33
+ font-family: var(--font);
34
+ background: var(--bg);
35
+ color: var(--text);
36
+ -webkit-text-size-adjust: 100%;
37
+ }
38
+
39
+ body {
40
+ display: flex;
41
+ flex-direction: column;
42
+ min-height: 100dvh;
43
+ overflow-x: hidden;
44
+ }
45
+
46
+ /* ── Header ──────────────────────────────────────────────────────────── */
47
+ #header {
48
+ position: sticky;
49
+ top: 0;
50
+ z-index: 100;
51
+ height: var(--header-h);
52
+ display: flex;
53
+ align-items: center;
54
+ justify-content: space-between;
55
+ padding: 0 16px;
56
+ padding-top: env(safe-area-inset-top, 0);
57
+ background: var(--bg-panel);
58
+ border-bottom: 1px solid var(--border);
59
+ flex-shrink: 0;
60
+ }
61
+
62
+ .header-brand {
63
+ display: flex;
64
+ align-items: center;
65
+ gap: 8px;
66
+ user-select: none;
67
+ }
68
+
69
+ .logo-hex {
70
+ font-size: 1.5rem;
71
+ color: var(--primary);
72
+ line-height: 1;
73
+ }
74
+
75
+ .logo-text {
76
+ font-size: 1.05rem;
77
+ font-weight: 700;
78
+ letter-spacing: -0.01em;
79
+ color: var(--text);
80
+ }
81
+
82
+ .logo-sub {
83
+ font-weight: 400;
84
+ color: var(--text-dim);
85
+ font-size: 0.9em;
86
+ }
87
+
88
+ /* Engine status pill */
89
+ .engine-pill {
90
+ display: flex;
91
+ align-items: center;
92
+ gap: 6px;
93
+ padding: 4px 10px;
94
+ border-radius: 20px;
95
+ font-size: 0.75rem;
96
+ font-weight: 600;
97
+ border: 1px solid transparent;
98
+ transition: all 0.2s;
99
+ }
100
+ .engine-pill--unknown { background: var(--bg); border-color: var(--border); color: var(--text-muted); }
101
+ .engine-pill--loaded { background: rgba(34,197,94,0.12); border-color: rgba(34,197,94,0.4); color: var(--success); }
102
+ .engine-pill--unloaded { background: rgba(239,68,68,0.1); border-color: rgba(239,68,68,0.3); color: var(--danger); }
103
+ .engine-pill--loading { background: rgba(245,158,11,0.1); border-color: rgba(245,158,11,0.3); color: var(--warning); }
104
+
105
+ .pill-dot {
106
+ width: 7px;
107
+ height: 7px;
108
+ border-radius: 50%;
109
+ background: currentColor;
110
+ flex-shrink: 0;
111
+ }
112
+
113
+ .engine-pill--loading .pill-dot {
114
+ animation: pulse-dot 1s ease-in-out infinite;
115
+ }
116
+
117
+ @keyframes pulse-dot {
118
+ 0%, 100% { opacity: 1; }
119
+ 50% { opacity: 0.3; }
120
+ }
121
+
122
+ /* ── Toast ───────────────────────────────────────────────────────────── */
123
+ #toast-container {
124
+ position: fixed;
125
+ top: calc(var(--header-h) + 8px);
126
+ left: 50%;
127
+ transform: translateX(-50%);
128
+ z-index: 200;
129
+ display: flex;
130
+ flex-direction: column;
131
+ gap: 6px;
132
+ width: calc(100% - 32px);
133
+ max-width: 420px;
134
+ pointer-events: none;
135
+ }
136
+
137
+ .toast {
138
+ padding: 10px 14px;
139
+ border-radius: var(--radius-sm);
140
+ font-size: 0.85rem;
141
+ font-weight: 500;
142
+ pointer-events: auto;
143
+ animation: toast-in 0.25s ease;
144
+ }
145
+ .toast--info { background: var(--bg-panel); border: 1px solid var(--border); color: var(--text); }
146
+ .toast--success { background: rgba(34,197,94,0.15); border: 1px solid rgba(34,197,94,0.4); color: var(--success); }
147
+ .toast--error { background: rgba(239,68,68,0.15); border: 1px solid rgba(239,68,68,0.4); color: #fca5a5; }
148
+ .toast--warn { background: rgba(245,158,11,0.12); border: 1px solid rgba(245,158,11,0.35); color: var(--warning); }
149
+
150
+ @keyframes toast-in {
151
+ from { opacity: 0; transform: translateY(-8px); }
152
+ to { opacity: 1; transform: translateY(0); }
153
+ }
154
+
155
+ /* ── Main scroll area ─────────────────────────────────────────────────── */
156
+ #main {
157
+ flex: 1;
158
+ overflow-y: auto;
159
+ padding: 14px;
160
+ padding-bottom: calc(16px + var(--safe-bottom));
161
+ display: flex;
162
+ flex-direction: column;
163
+ gap: 12px;
164
+ }
165
+
166
+ /* ── Cards ───────────────────────────────────────────────────────────── */
167
+ .card {
168
+ background: var(--bg-card);
169
+ border: 1px solid var(--border);
170
+ border-radius: var(--radius);
171
+ padding: 16px;
172
+ }
173
+
174
+ .card-header {
175
+ display: flex;
176
+ align-items: center;
177
+ justify-content: space-between;
178
+ margin-bottom: 14px;
179
+ }
180
+
181
+ .card-header h2 {
182
+ font-size: 0.9rem;
183
+ font-weight: 700;
184
+ text-transform: uppercase;
185
+ letter-spacing: 0.06em;
186
+ color: var(--text-dim);
187
+ }
188
+
189
+ /* ── Buttons ──────────────────────────────────────────────────────────── */
190
+ .btn {
191
+ display: inline-flex;
192
+ align-items: center;
193
+ justify-content: center;
194
+ gap: 8px;
195
+ padding: 12px 18px;
196
+ border: 1px solid transparent;
197
+ border-radius: var(--radius-sm);
198
+ font-family: var(--font);
199
+ font-size: 0.95rem;
200
+ font-weight: 600;
201
+ cursor: pointer;
202
+ transition: all 0.15s;
203
+ min-height: 48px;
204
+ user-select: none;
205
+ -webkit-tap-highlight-color: transparent;
206
+ white-space: nowrap;
207
+ }
208
+
209
+ .btn svg {
210
+ width: 18px;
211
+ height: 18px;
212
+ flex-shrink: 0;
213
+ }
214
+
215
+ .btn:disabled {
216
+ opacity: 0.4;
217
+ cursor: not-allowed;
218
+ }
219
+
220
+ .btn-primary {
221
+ background: var(--primary);
222
+ color: #fff;
223
+ border-color: var(--primary);
224
+ }
225
+ .btn-primary:not(:disabled):hover,
226
+ .btn-primary:not(:disabled):active {
227
+ background: var(--primary-dark);
228
+ border-color: var(--primary-dark);
229
+ }
230
+
231
+ .btn-secondary {
232
+ background: var(--bg-panel);
233
+ color: var(--text);
234
+ border-color: var(--border-light);
235
+ }
236
+ .btn-secondary:not(:disabled):hover,
237
+ .btn-secondary:not(:disabled):active {
238
+ background: var(--bg-hover);
239
+ border-color: var(--primary);
240
+ color: var(--primary);
241
+ }
242
+
243
+ .btn-danger {
244
+ background: rgba(239,68,68,0.15);
245
+ color: var(--danger);
246
+ border-color: rgba(239,68,68,0.4);
247
+ }
248
+ .btn-danger:not(:disabled):hover,
249
+ .btn-danger:not(:disabled):active {
250
+ background: rgba(239,68,68,0.25);
251
+ }
252
+
253
+ .btn-ghost {
254
+ background: transparent;
255
+ color: var(--text-muted);
256
+ border-color: transparent;
257
+ padding: 4px 8px;
258
+ min-height: unset;
259
+ font-size: 0.8rem;
260
+ }
261
+ .btn-ghost:hover { color: var(--text); }
262
+
263
+ .btn-small { font-size: 0.8rem; padding: 6px 10px; min-height: 32px; }
264
+
265
+ /* Capture button — accent-colored, full-width primary CTA */
266
+ .btn-capture {
267
+ background: linear-gradient(135deg, var(--primary) 0%, #6366f1 100%);
268
+ color: #fff;
269
+ border-color: transparent;
270
+ flex: 1;
271
+ padding: 14px;
272
+ font-size: 1rem;
273
+ }
274
+ .btn-capture:not(:disabled):hover,
275
+ .btn-capture:not(:disabled):active {
276
+ opacity: 0.9;
277
+ transform: translateY(-1px);
278
+ }
279
+ .btn-capture:not(:disabled):active {
280
+ transform: translateY(0);
281
+ }
282
+
283
+ /* ── Upload section ───────────────────────────────────────────────────── */
284
+ .upload-btn-row {
285
+ display: flex;
286
+ gap: 10px;
287
+ }
288
+
289
+ .btn-upload {
290
+ flex: 0 0 auto;
291
+ padding: 14px 16px;
292
+ }
293
+
294
+ /* Image preview */
295
+ #image-preview-wrap {
296
+ margin-top: 14px;
297
+ }
298
+
299
+ #image-container {
300
+ position: relative;
301
+ display: inline-block;
302
+ width: 100%;
303
+ border-radius: var(--radius-sm);
304
+ overflow: hidden;
305
+ background: #000;
306
+ }
307
+
308
+ #preview-img {
309
+ display: block;
310
+ width: 100%;
311
+ height: auto;
312
+ max-height: 55vh;
313
+ object-fit: contain;
314
+ }
315
+
316
+ #bbox-canvas {
317
+ position: absolute;
318
+ top: 0;
319
+ left: 0;
320
+ width: 100%;
321
+ height: 100%;
322
+ pointer-events: none;
323
+ }
324
+
325
+ .preview-meta {
326
+ display: flex;
327
+ align-items: center;
328
+ justify-content: space-between;
329
+ margin-top: 8px;
330
+ }
331
+
332
+ .meta-filename {
333
+ font-size: 0.78rem;
334
+ color: var(--text-muted);
335
+ overflow: hidden;
336
+ text-overflow: ellipsis;
337
+ white-space: nowrap;
338
+ max-width: 70%;
339
+ }
340
+
341
+ /* ── Engine card ──────────────────────────────────────────────────────── */
342
+ .field-row {
343
+ display: flex;
344
+ flex-direction: column;
345
+ gap: 5px;
346
+ margin-bottom: 10px;
347
+ }
348
+
349
+ .field-row label {
350
+ font-size: 0.75rem;
351
+ font-weight: 600;
352
+ color: var(--text-dim);
353
+ text-transform: uppercase;
354
+ letter-spacing: 0.05em;
355
+ }
356
+
357
+ select {
358
+ width: 100%;
359
+ padding: 10px 12px;
360
+ background: var(--bg-input);
361
+ color: var(--text);
362
+ border: 1px solid var(--border);
363
+ border-radius: var(--radius-sm);
364
+ font-family: var(--font);
365
+ font-size: 0.9rem;
366
+ cursor: pointer;
367
+ min-height: 44px;
368
+ -webkit-appearance: none;
369
+ appearance: none;
370
+ background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='12' height='8' viewBox='0 0 12 8'%3E%3Cpath d='M1 1l5 5 5-5' stroke='%2364748b' stroke-width='1.5' fill='none' stroke-linecap='round'/%3E%3C/svg%3E");
371
+ background-repeat: no-repeat;
372
+ background-position: right 12px center;
373
+ padding-right: 36px;
374
+ }
375
+
376
+ select:focus {
377
+ outline: none;
378
+ border-color: var(--primary);
379
+ box-shadow: 0 0 0 3px var(--primary-glow);
380
+ }
381
+
382
+ /* Badges */
383
+ .badge {
384
+ padding: 3px 9px;
385
+ border-radius: 20px;
386
+ font-size: 0.72rem;
387
+ font-weight: 700;
388
+ text-transform: uppercase;
389
+ letter-spacing: 0.05em;
390
+ }
391
+ .badge--loading { background: rgba(100,116,139,0.2); color: var(--text-muted); }
392
+ .badge--loaded { background: rgba(34,197,94,0.15); color: var(--success); }
393
+ .badge--unloaded { background: rgba(239,68,68,0.12); color: var(--danger); }
394
+ .badge--info { background: rgba(59,130,246,0.15); color: var(--primary); }
395
+
396
+ /* Advanced details */
397
+ #advanced-details {
398
+ margin-top: 8px;
399
+ border-top: 1px solid var(--border);
400
+ padding-top: 10px;
401
+ }
402
+
403
+ #advanced-details summary {
404
+ font-size: 0.8rem;
405
+ color: var(--text-muted);
406
+ cursor: pointer;
407
+ user-select: none;
408
+ padding: 2px 0;
409
+ list-style: none;
410
+ display: flex;
411
+ align-items: center;
412
+ gap: 6px;
413
+ }
414
+
415
+ #advanced-details summary::before {
416
+ content: '›';
417
+ font-size: 1.1em;
418
+ transition: transform 0.2s;
419
+ display: inline-block;
420
+ }
421
+
422
+ #advanced-details[open] summary::before {
423
+ transform: rotate(90deg);
424
+ }
425
+
426
+ .advanced-inner {
427
+ margin-top: 10px;
428
+ }
429
+
430
+ /* ── Actions card ─────────────────────────────────────────────────────── */
431
+ .actions-card {
432
+ display: flex;
433
+ flex-wrap: wrap;
434
+ gap: 10px;
435
+ padding: 12px;
436
+ }
437
+
438
+ #btn-cancel {
439
+ flex: 0 0 100%;
440
+ }
441
+
442
+ .btn-action {
443
+ flex: 1;
444
+ padding: 14px 10px;
445
+ }
446
+
447
+ .btn-segment {
448
+ background: var(--bg-panel);
449
+ color: var(--text-dim);
450
+ border-color: var(--border-light);
451
+ }
452
+
453
+ .btn-segment:not(:disabled) {
454
+ color: var(--text);
455
+ }
456
+
457
+ .btn-segment:not(:disabled):hover,
458
+ .btn-segment:not(:disabled):active {
459
+ background: var(--bg-hover);
460
+ border-color: var(--primary);
461
+ color: var(--primary);
462
+ }
463
+
464
+ /* ── Progress card ────────────────────────────────────────────────────── */
465
+ #progress-bar-wrap {
466
+ height: 6px;
467
+ background: var(--bg-panel);
468
+ border-radius: 3px;
469
+ overflow: hidden;
470
+ margin-bottom: 10px;
471
+ }
472
+
473
+ #progress-bar {
474
+ height: 100%;
475
+ background: linear-gradient(90deg, var(--primary), #6366f1);
476
+ border-radius: 3px;
477
+ transition: width 0.3s ease;
478
+ }
479
+
480
+ .status-text {
481
+ font-size: 0.82rem;
482
+ color: var(--text-dim);
483
+ min-height: 1.4em;
484
+ }
485
+
486
+ /* ── Results card ─────────────────────────────────────────────────────── */
487
+ #results-list {
488
+ display: flex;
489
+ flex-direction: column;
490
+ gap: 6px;
491
+ margin-bottom: 14px;
492
+ max-height: 50vh;
493
+ overflow-y: auto;
494
+ }
495
+
496
+ .result-line {
497
+ display: flex;
498
+ gap: 10px;
499
+ align-items: flex-start;
500
+ padding: 8px 10px;
501
+ background: var(--bg-panel);
502
+ border-radius: var(--radius-sm);
503
+ border: 1px solid var(--border);
504
+ animation: line-in 0.2s ease;
505
+ }
506
+
507
+ @keyframes line-in {
508
+ from { opacity: 0; transform: translateY(4px); }
509
+ to { opacity: 1; transform: translateY(0); }
510
+ }
511
+
512
+ .line-num {
513
+ font-size: 0.72rem;
514
+ font-weight: 700;
515
+ color: var(--text-muted);
516
+ min-width: 22px;
517
+ padding-top: 1px;
518
+ flex-shrink: 0;
519
+ font-family: var(--font-mono);
520
+ }
521
+
522
/* Transcribed text of one result row; takes all space between number and badge. */
.line-text {
  flex: 1;
  font-size: 0.88rem;
  line-height: 1.45;
  color: var(--text);
  /* Legacy fallback: `word-break: break-word` is deprecated but still honoured
     by browsers that predate `overflow-wrap: anywhere`. */
  word-break: break-word;
  /* Standard replacement: break long unspaced tokens instead of overflowing. */
  overflow-wrap: anywhere;
}
529
+
530
+ .line-conf {
531
+ font-size: 0.7rem;
532
+ font-weight: 600;
533
+ padding: 2px 6px;
534
+ border-radius: 4px;
535
+ flex-shrink: 0;
536
+ align-self: flex-start;
537
+ margin-top: 1px;
538
+ }
539
+ .conf-high { background: rgba(34,197,94,0.15); color: var(--success); }
540
+ .conf-mid { background: rgba(245,158,11,0.15); color: var(--warning); }
541
+ .conf-low { background: rgba(239,68,68,0.12); color: var(--danger); }
542
+
543
+ .results-actions {
544
+ display: flex;
545
+ gap: 10px;
546
+ }
547
+
548
+ .results-actions .btn {
549
+ flex: 1;
550
+ font-size: 0.85rem;
551
+ padding: 10px 12px;
552
+ }
553
+
554
+ /* ── Landscape layout ─────────────────────────────────────────────────── */
555
+ @media (orientation: landscape) and (max-height: 600px) {
556
+ #main {
557
+ display: grid;
558
+ grid-template-columns: 1fr 1fr;
559
+ grid-template-rows: auto;
560
+ align-items: start;
561
+ }
562
+
563
+ #upload-card { grid-column: 1; grid-row: 1 / 3; }
564
+ #engine-card { grid-column: 2; grid-row: 1; }
565
+ #actions-card { grid-column: 2; grid-row: 2; }
566
+ #progress-card { grid-column: 1 / 3; }
567
+ #results-card { grid-column: 1 / 3; }
568
+
569
+ #preview-img {
570
+ max-height: 70vh;
571
+ }
572
+ }
573
+
574
+ /* ── Desktop (>= 768px) ───────────────────────────────────────────────── */
575
+ @media (min-width: 768px) {
576
+ #main {
577
+ max-width: 580px;
578
+ margin: 0 auto;
579
+ padding: 20px 0 40px;
580
+ }
581
+ }
582
+
583
+ /* ── Utility ──────────────────────────────────────────────────────────── */
584
+ .hidden { display: none !important; }
585
+
586
+ /* Scrollbar styling */
587
+ #results-list::-webkit-scrollbar { width: 4px; }
588
+ #results-list::-webkit-scrollbar-track { background: transparent; }
589
+ #results-list::-webkit-scrollbar-thumb { background: var(--border-light); border-radius: 2px; }
590
+
591
+ /* Focus visible for accessibility */
592
+ :focus-visible {
593
+ outline: 2px solid var(--primary);
594
+ outline-offset: 2px;
595
+ }
596
+
597
+ /* ── Photo Review Overlay ─────────────────────────────────────────────── */
598
+ #photo-review {
599
+ position: fixed;
600
+ inset: 0;
601
+ background: #0a0a0a;
602
+ z-index: 500;
603
+ /* Use block layout instead of flex to avoid the iOS Safari flex+overflow-y scroll bug */
604
+ display: block;
605
+ overflow-y: auto;
606
+ -webkit-overflow-scrolling: touch;
607
+ }
608
+
609
+ #photo-review[hidden] {
610
+ display: none;
611
+ }
612
+
613
+ #review-inner {
614
+ width: 100%;
615
+ max-width: 600px;
616
+ margin: 0 auto;
617
+ padding: max(14px, env(safe-area-inset-top, 0px)) 14px calc(14px + env(safe-area-inset-bottom, 0px));
618
+ box-sizing: border-box;
619
+ display: flex;
620
+ flex-direction: column;
621
+ gap: 12px;
622
+ }
623
+
624
+ #review-warn {
625
+ background: rgba(245, 158, 11, 0.2);
626
+ color: #fef3c7;
627
+ border: 1px solid rgba(245, 158, 11, 0.45);
628
+ padding: 10px 14px;
629
+ border-radius: 8px;
630
+ font-size: 0.875rem;
631
+ text-align: center;
632
+ line-height: 1.4;
633
+ }
634
+
635
+ #review-warn[hidden] { display: none; }
636
+
637
+ #review-img-outer {
638
+ text-align: center;
639
+ }
640
+
641
+ #review-img-wrap {
642
+ display: inline-block;
643
+ position: relative;
644
+ max-width: 100%;
645
+ border-radius: 8px;
646
+ overflow: hidden;
647
+ background: #111;
648
+ vertical-align: top;
649
+ }
650
+
651
+ #review-img {
652
+ display: block;
653
+ max-width: 100%;
654
+ max-height: 45vh; /* fallback */
655
+ max-height: 45svh; /* small viewport height: excludes browser chrome on iOS/Android */
656
+ width: auto;
657
+ height: auto;
658
+ }
659
+
660
+ #review-crop-canvas {
661
+ position: absolute;
662
+ inset: 0;
663
+ width: 100%;
664
+ height: 100%;
665
+ pointer-events: none;
666
+ touch-action: none;
667
+ cursor: crosshair;
668
+ }
669
+
670
+ #review-toolbar {
671
+ display: flex;
672
+ gap: 8px;
673
+ flex-wrap: wrap;
674
+ align-items: center;
675
+ justify-content: center;
676
+ }
677
+
678
+ .btn-icon {
679
+ font-size: 1.2rem;
680
+ padding: 8px 16px;
681
+ min-width: 48px;
682
+ }
683
+
684
+ #review-actions {
685
+ display: flex;
686
+ gap: 10px;
687
+ }
688
+
689
+ #review-actions .btn {
690
+ flex: 1;
691
+ }
692
+
693
+ @media (orientation: landscape) and (max-height: 500px) {
694
+ #review-img {
695
+ max-height: 30vh;
696
+ max-height: 30svh;
697
+ }
698
+ }
web/static/pwa/demo.html ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0, viewport-fit=cover">
6
+ <meta name="theme-color" content="#3b82f6">
7
+ <meta name="mobile-web-app-capable" content="yes">
8
+ <meta name="apple-mobile-web-app-capable" content="yes">
9
+ <meta name="apple-mobile-web-app-status-bar-style" content="black-translucent">
10
+ <meta name="apple-mobile-web-app-title" content="Polyscriptor">
11
+ <title>Polyscriptor HTR Demo</title>
12
+ <link rel="manifest" href="/manifest.json">
13
+ <link rel="apple-touch-icon" href="/static/pwa/icons/icon-192.png">
14
+ <link rel="stylesheet" href="/static/pwa/demo.css">
15
+ </head>
16
+ <body>
17
+
18
+ <!-- Header -->
19
+ <header id="header">
20
+ <div class="header-brand">
21
+ <span class="logo-hex">⬡</span>
22
+ <span class="logo-text">Polyscriptor <span class="logo-sub">HTR Demo</span></span>
23
+ </div>
24
+ <div class="header-actions">
25
+ <div id="engine-pill" class="engine-pill engine-pill--unknown" title="Engine status">
26
+ <span class="pill-dot"></span>
27
+ <span id="engine-pill-text">…</span>
28
+ </div>
29
+ </div>
30
+ </header>
31
+
32
+ <!-- Toast container -->
33
+ <div id="toast-container" aria-live="polite"></div>
34
+
35
+ <!-- Main content (scrollable) -->
36
+ <main id="main">
37
+
38
+ <!-- Card: Upload / Camera -->
39
+ <section id="upload-card" class="card">
40
+ <div id="upload-buttons" class="upload-btn-row">
41
+ <button id="btn-camera" class="btn btn-capture">
42
+ <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
43
+ <path d="M23 19a2 2 0 0 1-2 2H3a2 2 0 0 1-2-2V8a2 2 0 0 1 2-2h4l2-3h6l2 3h4a2 2 0 0 1 2 2z"/>
44
+ <circle cx="12" cy="13" r="4"/>
45
+ </svg>
46
+ <span>Take Photo</span>
47
+ </button>
48
+ <button id="btn-file" class="btn btn-secondary btn-upload">
49
+ <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
50
+ <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
51
+ <polyline points="17 8 12 3 7 8"/>
52
+ <line x1="12" y1="3" x2="12" y2="15"/>
53
+ </svg>
54
+ <span>Upload Image</span>
55
+ </button>
56
+ </div>
57
+
58
+ <!-- Hidden inputs -->
59
+ <input id="file-camera" type="file" accept="image/*" capture="environment" hidden>
60
+ <input id="file-picker" type="file" accept="image/*,application/pdf" hidden>
61
+
62
+ <!-- Image preview with bbox canvas overlay -->
63
+ <div id="image-preview-wrap" hidden>
64
+ <div id="image-container">
65
+ <img id="preview-img" alt="Uploaded image">
66
+ <canvas id="bbox-canvas"></canvas>
67
+ </div>
68
+ <div class="preview-meta">
69
+ <span id="preview-filename" class="meta-filename"></span>
70
+ <button id="btn-clear-image" class="btn-ghost btn-small">✕ Remove</button>
71
+ </div>
72
+ </div>
73
+ </section>
74
+
75
+ <!-- Card: Engine & Model -->
76
+ <section id="engine-card" class="card">
77
+ <div class="card-header">
78
+ <h2>HTR Engine</h2>
79
+ <span id="model-status-badge" class="badge badge--loading">checking…</span>
80
+ </div>
81
+
82
+ <div id="engine-controls">
83
+ <div class="field-row">
84
+ <label for="engine-select">Engine</label>
85
+ <select id="engine-select">
86
+ <option value="">Loading…</option>
87
+ </select>
88
+ </div>
89
+
90
+ <div class="field-row" id="model-row" hidden>
91
+ <label for="model-select">Model</label>
92
+ <select id="model-select">
93
+ <option value="">Select engine first</option>
94
+ </select>
95
+ </div>
96
+
97
+ <button id="btn-load-model" class="btn btn-secondary" hidden>
98
+ Load Model
99
+ </button>
100
+ </div>
101
+
102
+ <!-- Advanced: segmentation -->
103
+ <details id="advanced-details">
104
+ <summary>Advanced options</summary>
105
+ <div class="advanced-inner">
106
+ <div class="field-row">
107
+ <label for="seg-method-select">Line segmentation</label>
108
+ <select id="seg-method-select">
109
+ <option value="kraken" selected>Kraken Classical</option>
110
+ <option value="hpp">Projection Profile fallback</option>
111
+ <option value="kraken-blla" disabled>Kraken Neural / blla (server only)</option>
112
+ </select>
113
+ </div>
114
+ </div>
115
+ </details>
116
+ </section>
117
+
118
+ <!-- Card: Actions -->
119
+ <section id="actions-card" class="card actions-card">
120
+ <button id="btn-segment" class="btn btn-action btn-segment" disabled>
121
+ <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
122
+ <rect x="3" y="3" width="18" height="18" rx="2"/>
123
+ <line x1="3" y1="9" x2="21" y2="9"/>
124
+ <line x1="3" y1="15" x2="21" y2="15"/>
125
+ </svg>
126
+ Detect Lines
127
+ </button>
128
+ <button id="btn-transcribe" class="btn btn-action btn-primary" disabled>
129
+ <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
130
+ <path d="M12 20h9"/><path d="M16.5 3.5a2.121 2.121 0 0 1 3 3L7 19l-4 1 1-4L16.5 3.5z"/>
131
+ </svg>
132
+ Transcribe
133
+ </button>
134
+ <button id="btn-cancel" class="btn btn-danger" hidden>
135
+ Cancel
136
+ </button>
137
+ </section>
138
+
139
+ <!-- Card: Progress -->
140
+ <section id="progress-card" class="card" hidden>
141
+ <div id="progress-bar-wrap">
142
+ <div id="progress-bar" style="width:0%"></div>
143
+ </div>
144
+ <p id="status-text" class="status-text"></p>
145
+ </section>
146
+
147
+ <!-- Card: Results -->
148
+ <section id="results-card" class="card" hidden>
149
+ <div class="card-header">
150
+ <h2>Transcription</h2>
151
+ <span id="line-count" class="badge badge--info"></span>
152
+ </div>
153
+ <div id="results-list"></div>
154
+ <div id="results-actions" class="results-actions">
155
+ <button id="btn-copy" class="btn btn-secondary">
156
+ <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
157
+ <rect x="9" y="9" width="13" height="13" rx="2"/>
158
+ <path d="M5 15H4a2 2 0 0 1-2-2V4a2 2 0 0 1 2-2h9a2 2 0 0 1 2 2v1"/>
159
+ </svg>
160
+ Copy All
161
+ </button>
162
+ <button id="btn-export-txt" class="btn btn-secondary">
163
+ <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
164
+ <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
165
+ <polyline points="7 10 12 15 17 10"/>
166
+ <line x1="12" y1="15" x2="12" y2="3"/>
167
+ </svg>
168
+ Save TXT
169
+ </button>
170
+ </div>
171
+ </section>
172
+
173
+ </main>
174
+
175
+ <!-- Photo Review Overlay — shown after camera capture, before upload -->
176
+ <div id="photo-review" hidden>
177
+ <div id="review-inner">
178
+ <div id="review-warn" hidden>
179
+ Landscape photo detected - for line segmentation, please rotate to portrait (↺ or ↻)
180
+ </div>
181
+ <div id="review-img-outer">
182
+ <div id="review-img-wrap">
183
+ <img id="review-img" alt="Photo preview">
184
+ <canvas id="review-crop-canvas"></canvas>
185
+ </div>
186
+ </div>
187
+ <div id="review-toolbar">
188
+ <button id="btn-rotate-ccw" class="btn btn-secondary btn-icon" title="Rotate 90° left">↺</button>
189
+ <button id="btn-rotate-cw" class="btn btn-secondary btn-icon" title="Rotate 90° right">↻</button>
190
+ <button id="btn-auto-crop" class="btn btn-secondary">Auto crop page</button>
191
+ <button id="btn-crop-start" class="btn btn-secondary">✂ Manual crop</button>
192
+ <button id="btn-crop-apply" class="btn btn-primary" hidden>Apply crop</button>
193
+ <button id="btn-crop-cancel" class="btn btn-ghost btn-small" hidden>Cancel</button>
194
+ </div>
195
+ <div id="review-actions">
196
+ <button id="btn-retake" class="btn btn-secondary">Retake</button>
197
+ <button id="btn-use-photo" class="btn btn-primary">Use photo →</button>
198
+ </div>
199
+ </div>
200
+ </div>
201
+
202
+ <script src="/static/pwa/demo.js" type="module"></script>
203
+ </body>
204
+ </html>
web/static/pwa/demo.js ADDED
@@ -0,0 +1,1069 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * Polyscriptor PWA Demo — App Logic
3
+ * Self-contained (no imports from main app.js).
4
+ * Cache-bust: 2026-05-18 (photo review CSS fix)
5
+ */
6
+
7
+ // ── LocalStorage keys ──────────────────────────────────────────────────
8
/** localStorage key: name of the engine selected last time. */
const LS_ENGINE = 'pwa_last_engine';

/** localStorage key: line-segmentation method chosen last time. */
const LS_SEG_METHOD = 'pwa_seg_method';

/**
 * Build the per-engine localStorage key that remembers the last model.
 * @param {string} name - engine name
 * @returns {string} storage key
 */
const LS_MODEL = (name) => {
  return `pwa_last_model_${name}`;
};
11
+
12
+ // ── State ──────────────────────────────────────────────────────────────
13
/**
 * Mutable application state shared by every handler in this module.
 * All fields are declared up front so the full shape is visible in one place.
 */
const state = {
  imageId: null,          // id of the uploaded image, null when none
  imageInfo: null,        // { width, height, filename }
  bboxes: [],             // [[x1,y1,x2,y2], …]
  lines: [],              // [{index, text, confidence, bbox}, …]
  engines: [],            // from /api/engines
  loadedEngine: null,     // currently active engine name in pool
  modelFieldKey: null,    // config-schema key carrying the model choice (set by onEngineChange, read by loadModel)
  engineChangeSeq: 0,     // guards against stale async schema responses
  isSegmenting: false,
  isTranscribing: false,
  sseAbort: null,         // AbortController for active SSE
};
25
+
26
+ // ── DOM refs ───────────────────────────────────────────────────────────
27
/** Shorthand for document.getElementById. */
const $ = (id) => document.getElementById(id);

/**
 * Cached DOM references, resolved once at module load.
 * Each property name maps to the element id listed beside it.
 */
const el = Object.fromEntries(
  Object.entries({
    // Upload / preview
    btnCamera: 'btn-camera',
    btnFile: 'btn-file',
    fileCamera: 'file-camera',
    filePicker: 'file-picker',
    previewWrap: 'image-preview-wrap',
    previewImg: 'preview-img',
    bboxCanvas: 'bbox-canvas',
    previewFilename: 'preview-filename',
    btnClearImage: 'btn-clear-image',

    // Engine & model
    engineSelect: 'engine-select',
    modelRow: 'model-row',
    modelSelect: 'model-select',
    btnLoadModel: 'btn-load-model',
    modelStatusBadge: 'model-status-badge',
    enginePill: 'engine-pill',
    enginePillText: 'engine-pill-text',

    segMethodSelect: 'seg-method-select',

    // Actions
    btnSegment: 'btn-segment',
    btnTranscribe: 'btn-transcribe',
    btnCancel: 'btn-cancel',

    // Progress
    progressCard: 'progress-card',
    progressBar: 'progress-bar',
    statusText: 'status-text',

    // Results
    resultsCard: 'results-card',
    resultsList: 'results-list',
    lineCount: 'line-count',
    btnCopy: 'btn-copy',
    btnExportTxt: 'btn-export-txt',

    // Photo review overlay
    photoReview: 'photo-review',
    reviewImg: 'review-img',
    reviewCropCanvas: 'review-crop-canvas',
    reviewWarn: 'review-warn',
    btnRotateCCW: 'btn-rotate-ccw',
    btnRotateCW: 'btn-rotate-cw',
    btnAutoCrop: 'btn-auto-crop',
    btnCropStart: 'btn-crop-start',
    btnCropApply: 'btn-crop-apply',
    btnCropCancel: 'btn-crop-cancel',
    btnRetake: 'btn-retake',
    btnUsePhoto: 'btn-use-photo',
  }).map(([key, id]) => [key, $(id)]),
);
77
+
78
+ // ── Photo Review State ─────────────────────────────────────────────────
79
/** Working state of the photo-review overlay (rotate / crop before upload). */
const reviewState = {
  // Off-screen working canvas (rotated / cropped).
  canvas: null,
  // Whether manual crop mode is active.
  cropMode: false,
  // Image-coord pointer-down position.
  cropStart: null,
  // Current crop rectangle {x, y, w, h} in image coords.
  cropRect: null,
  // File name of the photo under review.
  srcFilename: '',
};
86
+
87
+ // ── Toast ──────────────────────────────────────────────────────────────
88
/**
 * Show a transient notification inside #toast-container.
 * @param {string} msg - text to display (inserted as plain text)
 * @param {string} [type='info'] - suffix of the `toast--…` modifier class
 * @param {number} [ms=4000] - delay before the toast is removed
 */
function toast(msg, type = 'info', ms = 4000) {
  const host = $('toast-container');
  const node = Object.assign(document.createElement('div'), {
    className: `toast toast--${type}`,
    textContent: msg,
  });
  host.appendChild(node);
  setTimeout(() => {
    node.remove();
  }, ms);
}
96
+
97
+ // ── API helper ─────────────────────────────────────────────────────────
98
/**
 * Thin fetch wrapper for the JSON API.
 *
 * Sends `Content-Type: application/json` unless the body is a FormData
 * (the browser must set the multipart boundary itself). Caller-supplied
 * headers always win.
 *
 * @param {string} path - request URL
 * @param {RequestInit} [options] - passed through to fetch
 * @returns {Promise<Response>} the response, only when `resp.ok`
 * @throws {Error} with the server's `detail` / `message` text, or
 *   `HTTP <status>` when the error body carries neither
 */
async function api(path, options = {}) {
  const isFormData = typeof FormData !== 'undefined' && options.body instanceof FormData;
  const headers = {
    ...(isFormData ? {} : { 'Content-Type': 'application/json' }),
    ...(options.headers || {}),
  };
  const resp = await fetch(path, { ...options, headers });
  if (!resp.ok) {
    // Error bodies are JSON when available; fall back to the status text.
    const err = await resp.json().catch(() => ({ detail: resp.statusText }));
    // `detail` may be a structured value (e.g. a list of validation errors);
    // serialise it instead of letting Error() print "[object Object]".
    const raw = err?.detail || err?.message;
    let message = '';
    if (typeof raw === 'string') {
      message = raw;
    } else if (raw) {
      message = JSON.stringify(raw);
    }
    throw new Error(message || `HTTP ${resp.status}`);
  }
  return resp;
}
107
+
108
+ // ── Engine pill ────────────────────────────────────────────────────────
109
/**
 * Update the header engine pill.
 * @param {string} status - suffix of the `engine-pill--…` modifier class
 *   (named `status` so it does not shadow the module-level `state` store)
 * @param {string} text - label shown inside the pill
 */
function setPill(status, text) {
  el.enginePill.className = `engine-pill engine-pill--${status}`;
  el.enginePillText.textContent = text;
}
113
+
114
+ // ── Engine status (check pool) ─────────────────────────────────────────
115
/**
 * Ask the server which engine (if any) is currently loaded and mirror that
 * in `state.loadedEngine`, the header pill, the status badge and the load
 * controls. Any failure is shown as "Offline".
 * @returns {Promise<void>}
 */
async function checkEngineStatus() {
  try {
    const resp = await api('/api/engine/status');
    const data = await resp.json();

    // Response: { loaded: bool, engine_name: str, config: {...} }
    if (data.loaded && data.engine_name) {
      state.loadedEngine = data.engine_name;
      setPill('loaded', data.engine_name);
      setBadge('loaded', 'Model loaded');
      // Pre-select the matching engine in the dropdown. Compare option values
      // directly rather than building a CSS selector from the name: a quote
      // or backslash in the name would make querySelector throw.
      const hasOption = Array.from(el.engineSelect.options)
        .some((opt) => opt.value === data.engine_name);
      if (hasOption) {
        el.engineSelect.value = data.engine_name;
      }
      // Hide load controls — engine already active
      el.btnLoadModel.hidden = true;
      el.modelRow.hidden = true;
    } else {
      state.loadedEngine = null;
      setPill('unloaded', 'No model');
      setBadge('unloaded', 'No model loaded');
      el.btnLoadModel.hidden = false;
    }
    updateActionButtons();
  } catch {
    // Request or JSON decoding failed: present the server as unreachable.
    setPill('unknown', 'Offline');
    setBadge('loading', 'Checking…');
  }
}
144
+
145
/**
 * Update the model-status badge in the engine card.
 * @param {string} type - suffix of the `badge--…` modifier class
 * @param {string} text - badge label
 */
function setBadge(type, text) {
  Object.assign(el.modelStatusBadge, {
    className: `badge badge--${type}`,
    textContent: text,
  });
}
149
+
150
+ // ── Load engines list ──────────────────────────────────────────────────
151
/**
 * Fetch the engine list, fill the engine dropdown with the available ones,
 * restore the previously selected engine and load its model options.
 * On failure the dropdown shows an error entry and a toast is raised.
 * @returns {Promise<void>}
 */
async function loadEngines() {
  try {
    const resp = await api('/api/engines');
    const data = await resp.json();
    // /api/engines returns a plain array
    state.engines = Array.isArray(data) ? data : (data.engines || []);

    el.engineSelect.innerHTML = '';
    const avail = state.engines.filter((e) => e.available);

    if (avail.length === 0) {
      el.engineSelect.innerHTML = '<option value="">No engines available</option>';
      return;
    }

    for (const eng of avail) {
      const opt = document.createElement('option');
      opt.value = eng.name;
      opt.textContent = eng.display_name || eng.name;
      el.engineSelect.appendChild(opt);
    }

    // Restore last selection. The stored value is matched against option
    // values directly: interpolating it into a CSS selector would throw on
    // quotes/backslashes and be misreported below as a server failure.
    const last = localStorage.getItem(LS_ENGINE);
    const hasLast = Boolean(last)
      && Array.from(el.engineSelect.options).some((opt) => opt.value === last);
    if (hasLast) {
      el.engineSelect.value = last;
    }

    await onEngineChange();
  } catch (err) {
    // Keep the cause visible for debugging; the UI only shows a generic message.
    console.warn('loadEngines failed:', err);
    el.engineSelect.innerHTML = '<option value="">Failed to load engines</option>';
    toast('Could not reach server', 'error');
  }
}
185
+
186
+ // ── Engine selection changed ───────────────────────────────────────────
187
/**
 * React to a change of the engine dropdown: remember the choice, then fetch
 * the engine's config schema and fill the model dropdown from its first
 * non-dynamic select field. Responses that arrive after the user has moved
 * on to another engine are discarded.
 * @returns {Promise<void>}
 */
async function onEngineChange() {
  const name = el.engineSelect.value;
  if (!name) return;
  const requestSeq = ++state.engineChangeSeq;
  localStorage.setItem(LS_ENGINE, name);

  // If this engine is already the loaded one, hide load controls
  if (name === state.loadedEngine) {
    el.modelRow.hidden = true;
    el.btnLoadModel.hidden = true;
    return;
  }

  el.modelRow.hidden = false;
  el.modelSelect.innerHTML = '<option>Loading…</option>';
  el.btnLoadModel.hidden = false;
  el.btnLoadModel.disabled = true;
  state.modelFieldKey = null;

  // True once a newer change superseded this request.
  const isStale = () => requestSeq !== state.engineChangeSeq || el.engineSelect.value !== name;

  try {
    // Use config-schema (same as main app) — it has the full model option list
    const resp = await api(`/api/engine/${encodeURIComponent(name)}/config-schema`);
    const schema = await resp.json();

    if (isStale()) {
      return;
    }

    // Find first non-dynamic select field → that's the model selector
    const selectField = (schema.fields || []).find(
      (f) => f.type === 'select' && !f.dynamic,
    );

    el.modelSelect.innerHTML = '';

    if (selectField && (selectField.options || []).length > 0) {
      state.modelFieldKey = selectField.key;
      for (const opt of selectField.options) {
        // Options are either plain values or { value, label } records.
        const isRecord = opt !== null && typeof opt === 'object';
        const value = isRecord ? opt.value : opt;
        const o = document.createElement('option');
        o.value = value;
        // Fall back to the value so a missing label never renders "undefined".
        o.textContent = isRecord ? (opt.label ?? value) : opt;
        el.modelSelect.appendChild(o);
      }
      // Restore last selection or apply schema default. Option values are
      // compared directly; model paths may hold characters that are not
      // valid inside a quoted CSS attribute selector.
      const lastModel = localStorage.getItem(LS_MODEL(name));
      const hasLast = Boolean(lastModel)
        && Array.from(el.modelSelect.options).some((o) => o.value === lastModel);
      if (hasLast) {
        el.modelSelect.value = lastModel;
      } else if (selectField.default != null) {
        el.modelSelect.value = selectField.default;
      }
    } else {
      // No static options (e.g. API-based engines) — show Default
      state.modelFieldKey = selectField?.key || 'model_path';
      const o = document.createElement('option');
      o.value = '';
      o.textContent = 'Default';
      el.modelSelect.appendChild(o);
    }

    el.btnLoadModel.disabled = false;
  } catch (err) {
    if (isStale()) {
      return;
    }
    // Schema unavailable: fall back to a single "Default" entry so the
    // engine can still be loaded with its built-in configuration.
    console.warn(`config-schema request for "${name}" failed:`, err);
    el.modelSelect.innerHTML = '<option value="">Default</option>';
    state.modelFieldKey = 'model_path';
    el.btnLoadModel.disabled = false;
  }
}
256
+
257
+ // ── Load model ─────────────────────────────────────────────────────────
258
/**
 * Load the engine chosen in the dropdown (with the chosen model, if any) on
 * the server and reflect the outcome in pill, badge, controls and a toast.
 * The model choice is remembered per engine in localStorage.
 * @returns {Promise<void>}
 */
async function loadModel() {
  const engineName = el.engineSelect.value;
  if (!engineName) {
    return;
  }

  const chosenModel = el.modelSelect.value || '';
  localStorage.setItem(LS_MODEL(engineName), chosenModel);

  const button = el.btnLoadModel;
  button.disabled = true;
  button.textContent = 'Loading…';
  setPill('loading', 'Loading…');
  setBadge('loading', 'Loading…');

  try {
    // The key comes from the config schema (e.g. 'model_path' for CRNN-CTC/TrOCR/Kraken).
    const config = {};
    if (chosenModel) {
      config[state.modelFieldKey || 'model_path'] = chosenModel;
    }
    const payload = { engine_name: engineName, config };
    await api('/api/engine/load', { method: 'POST', body: JSON.stringify(payload) });

    state.loadedEngine = engineName;
    setPill('loaded', engineName);
    setBadge('loaded', 'Model loaded');
    button.hidden = true;
    el.modelRow.hidden = true;
    toast(`${engineName} loaded`, 'success');
  } catch (e) {
    setPill('unloaded', 'Load failed');
    setBadge('unloaded', 'Load failed');
    toast(`Load failed: ${e.message}`, 'error');
  } finally {
    // Always restore the button and refresh the action buttons.
    button.disabled = false;
    button.textContent = 'Load Model';
    updateActionButtons();
  }
}
295
+
296
+ // ── Update action button states ────────────────────────────────────────
297
/**
 * Enable or disable the action buttons from the current state.
 * Detect Lines and Transcribe need an image and a loaded engine and are
 * blocked while an operation runs; Cancel is shown only while one runs.
 */
function updateActionButtons() {
  const busy = state.isSegmenting || state.isTranscribing;
  const ready = Boolean(state.imageId) && Boolean(state.loadedEngine) && !busy;

  el.btnSegment.disabled = !ready;
  el.btnTranscribe.disabled = !ready;
  el.btnCancel.hidden = !busy;
}
306
+
307
+ // ── File upload ────────────────────────────────────────────────────────
308
/**
 * Upload an image or PDF, store the returned image id and dimensions in
 * `state`, and show the preview. For a PDF only the first page is used.
 * Failures are reported through a toast; the previous image stays selected.
 * @param {File|Blob|null|undefined} file - file to upload; no-op when falsy
 * @returns {Promise<void>}
 */
async function uploadFile(file) {
  if (!file) return;

  const fd = new FormData();
  fd.append('file', file);

  setStatus('Uploading…');
  el.progressCard.hidden = false;
  setProgress(0);

  try {
    // Plain fetch (not api()): the browser must set the multipart Content-Type.
    const resp = await fetch('/api/image/upload?max_dim=2400', { method: 'POST', body: fd });
    if (!resp.ok) {
      const err = await resp.json().catch(() => ({ detail: resp.statusText }));
      throw new Error(err.detail || 'Upload failed');
    }
    const data = await resp.json();

    if (data.is_pdf) {
      // PDF: use first page. Fail with a readable message when the server
      // returned no pages, instead of a TypeError on `undefined`.
      const pages = Array.isArray(data.pages) ? data.pages : [];
      const first = pages[0];
      if (!first) {
        throw new Error('PDF contains no pages');
      }
      state.imageId = first.image_id;
      state.imageInfo = { width: first.width, height: first.height, filename: first.filename };
      toast(`PDF uploaded — using page 1 of ${pages.length}`, 'info');
    } else {
      state.imageId = data.image_id;
      state.imageInfo = { width: data.width, height: data.height, filename: data.filename };
    }

    // Show preview
    el.previewImg.src = `/api/image/${state.imageId}`;
    el.previewFilename.textContent = state.imageInfo.filename || file.name;
    el.previewWrap.hidden = false;
    clearBboxes();

    // Clear old results
    hideResults();
    setStatus('Image ready');
    setProgress(100);
    // Hide the progress card shortly after, unless another operation that
    // uses the same card has started in the meantime.
    setTimeout(() => {
      if (!state.isSegmenting && !state.isTranscribing) {
        el.progressCard.hidden = true;
      }
    }, 800);
    updateActionButtons();
  } catch (e) {
    toast(`Upload failed: ${e.message}`, 'error');
    setStatus('');
    el.progressCard.hidden = true;
  }
}
355
+
356
+ // ── Clear image ────────────────────────────────────────────────────────
357
/**
 * Drop the current image: reset image-related state, hide the preview,
 * wipe the bbox overlay and results, then refresh the action buttons.
 */
function clearImage() {
  Object.assign(state, {
    imageId: null,
    imageInfo: null,
    bboxes: [],
    lines: [],
  });

  el.previewWrap.hidden = true;
  el.previewImg.src = '';

  clearBboxes();
  hideResults();
  updateActionButtons();
}
368
+
369
+ // ── BBox canvas ────────────────────────────────────────────────────────
370
/** Erase the bbox overlay canvas and forget the stored boxes. */
function clearBboxes() {
  const { bboxCanvas: overlay } = el;
  overlay.getContext('2d').clearRect(0, 0, overlay.width, overlay.height);
  state.bboxes = [];
}
376
+
377
+ // Draw bounding boxes scaled to displayed image size
378
/**
 * Draw line bounding boxes on the overlay canvas, scaled from image pixels
 * to the size at which the preview image is currently displayed.
 *
 * @param {number[][]} bboxes - boxes as [x1, y1, x2, y2] in image pixels
 * @param {number} [highlightIdx=-1] - index of a box to emphasise, -1 for none
 */
function drawBboxes(bboxes, highlightIdx = -1) {
  const img = el.previewImg;
  const canvas = el.bboxCanvas;
  const ctx = canvas.getContext('2d');

  // Match canvas to displayed size (assigning width/height also resets it).
  canvas.width = img.offsetWidth;
  canvas.height = img.offsetHeight;
  ctx.clearRect(0, 0, canvas.width, canvas.height);

  if (!bboxes || bboxes.length === 0 || !state.imageInfo) return;

  const scaleX = img.offsetWidth / state.imageInfo.width;
  const scaleY = img.offsetHeight / state.imageInfo.height;

  // Color palette for lines — use distinct hues. Each entry is an open
  // `rgba(r,g,b,` prefix that is completed with an alpha value below.
  const COLORS = [
    'rgba(59,130,246,',   // blue
    'rgba(99,102,241,',   // indigo
    'rgba(34,197,94,',    // green
    'rgba(245,158,11,',   // amber
    'rgba(239,68,68,',    // red
    'rgba(168,85,247,',   // purple
    'rgba(20,184,166,',   // teal
    'rgba(249,115,22,',   // orange
  ];
  const LABEL_HEIGHT = 14;   // height of the number tab above each box
  const LABEL_PAD = 3;       // horizontal padding inside the tab
  const LABEL_BASELINE = 11; // text baseline offset from the tab's top edge

  bboxes.forEach((bbox, i) => {
    const [x1, y1, x2, y2] = bbox;
    const x = x1 * scaleX;
    const y = y1 * scaleY;
    const w = (x2 - x1) * scaleX;
    const h = (y2 - y1) * scaleY;

    const colorBase = COLORS[i % COLORS.length];
    const isHighlighted = i === highlightIdx;
    const fillAlpha = isHighlighted ? 0.25 : 0.10;
    const strokeAlpha = isHighlighted ? 1.0 : 0.7;

    ctx.fillStyle = `${colorBase}${fillAlpha})`;
    ctx.strokeStyle = `${colorBase}${strokeAlpha})`;
    ctx.lineWidth = isHighlighted ? 2 : 1.5;

    ctx.fillRect(x, y, w, h);
    ctx.strokeRect(x, y, w, h);

    // Line number label. The tab normally sits just above the box; clamp it
    // to the canvas so boxes touching the top edge keep a visible number.
    ctx.font = 'bold 10px monospace';
    const label = String(i + 1);
    const tw = ctx.measureText(label).width + LABEL_PAD * 2;
    const labelTop = Math.max(0, y - LABEL_HEIGHT);
    ctx.fillStyle = `${colorBase}0.85)`;
    ctx.fillRect(x, labelTop, tw, LABEL_HEIGHT);
    ctx.fillStyle = '#fff';
    ctx.fillText(label, x + LABEL_PAD, labelTop + LABEL_BASELINE);
  });
}
436
+
437
+ // ── Segment ────────────────────────────────────────────────────────────
438
/**
 * Run line segmentation on the current image with the method chosen under
 * "Advanced options", draw the returned boxes and report the count.
 * The chosen method is remembered in localStorage.
 * @returns {Promise<void>}
 */
async function segmentImage() {
  if (!state.imageId) return;

  state.isSegmenting = true;
  updateActionButtons();
  el.progressCard.hidden = false;
  setProgress(0);
  setStatus('Detecting lines…');
  clearBboxes();

  const method = el.segMethodSelect.value || 'kraken';
  localStorage.setItem(LS_SEG_METHOD, method);

  try {
    // NOTE(review): the device is hard-coded to cuda:0; presumably the
    // server falls back to CPU when no GPU is present — confirm.
    const url = `/api/image/${state.imageId}/segment?method=${encodeURIComponent(method)}&device=cuda%3A0`;
    const resp = await api(url);
    const data = await resp.json();

    state.bboxes = data.bboxes || [];
    drawBboxes(state.bboxes);

    // One pluralised summary for both status line and toast, so they agree
    // (the toast used to read "1 lines detected").
    const count = state.bboxes.length;
    const summary = `${count} line${count !== 1 ? 's' : ''} detected`;
    setStatus(summary);
    setProgress(100);
    toast(summary, 'success', 2500);
  } catch (e) {
    toast(`Segmentation failed: ${e.message}`, 'error');
    setStatus('Segmentation failed');
  } finally {
    state.isSegmenting = false;
    updateActionButtons();
    // Leave the card visible if a transcription started in the meantime.
    setTimeout(() => { if (!state.isTranscribing) el.progressCard.hidden = true; }, 1500);
  }
}
471
+
472
+ // ── Transcribe (SSE) ───────────────────────────────────────────────────
473
/**
 * Start a transcription of the current image and consume the server's
 * event stream, passing every event to handleSSEEvent. The request can be
 * cancelled through the AbortController kept in `state.sseAbort`; an abort
 * ends silently, any other failure raises a toast.
 * @returns {Promise<void>}
 */
async function startTranscription() {
  if (!state.imageId || !state.loadedEngine) return;

  state.isTranscribing = true;
  state.lines = [];
  updateActionButtons();

  el.progressCard.hidden = false;
  setProgress(0);
  setStatus('Starting transcription…');
  el.resultsCard.hidden = true;
  el.resultsList.innerHTML = '';

  const method = el.segMethodSelect.value || 'kraken';

  const body = JSON.stringify({
    image_id: state.imageId,
    seg_method: method,
    seg_device: 'cuda:0',
  });

  const abort = new AbortController();
  state.sseAbort = abort;

  // Parse one event block ("event: …" / "data: …" lines) and dispatch it.
  // Blocks without a data line (comments, keep-alives) are ignored.
  const dispatchBlock = (block) => {
    const fieldLines = block.split('\n');
    const eventLine = fieldLines.find((l) => l.startsWith('event:'));
    const dataLine = fieldLines.find((l) => l.startsWith('data:'));
    if (!dataLine) return;

    // Cut exactly at the colon and trim: the space after "event:" / "data:"
    // is optional, so a fixed offset past it would eat the first character
    // of the value when the server omits it.
    const event = eventLine ? eventLine.slice('event:'.length).trim() : 'message';
    const payload = JSON.parse(dataLine.slice('data:'.length).trim());

    handleSSEEvent(event, payload);
  };

  try {
    const resp = await fetch('/api/transcribe', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body,
      signal: abort.signal,
    });

    if (!resp.ok) {
      const err = await resp.json().catch(() => ({ detail: resp.statusText }));
      throw new Error(err.detail || 'Transcription failed');
    }

    const reader = resp.body.getReader();
    const decoder = new TextDecoder();
    let buffer = '';

    while (true) {
      const { done, value } = await reader.read();
      if (done) break;

      // Normalise CRLF on the accumulated buffer rather than on the chunk,
      // so a "\r\n" pair split across two chunks is still recognised.
      buffer = (buffer + decoder.decode(value, { stream: true })).replace(/\r\n/g, '\n');
      const parts = buffer.split('\n\n');
      buffer = parts.pop(); // last part may be incomplete

      for (const part of parts) {
        dispatchBlock(part);
      }
    }
    // Whatever is left in `buffer` is an unterminated event and is dropped.
  } catch (e) {
    if (e.name !== 'AbortError') {
      toast(`Transcription error: ${e.message}`, 'error');
      setStatus('Error');
    }
  } finally {
    state.isTranscribing = false;
    state.sseAbort = null;
    updateActionButtons();
  }
}
544
+
545
/**
 * Apply one decoded transcription stream event to the UI and to `state`.
 *
 * Recognized event names: status, segmentation, progress, complete,
 * cancelled, error. Any other name is ignored.
 *
 * @param {string} event - Event name from the stream.
 * @param {object} payload - JSON payload that came with the event.
 */
function handleSSEEvent(event, payload) {
  if (event === 'status') {
    setStatus(payload.message || '');
    return;
  }

  if (event === 'segmentation') {
    state.bboxes = payload.bboxes || [];
    drawBboxes(state.bboxes);
    setStatus(`${state.bboxes.length} lines detected — transcribing…`);
    return;
  }

  if (event === 'progress') {
    const { current, total, line } = payload;
    const percent = total > 0 ? (current / total) * 100 : 0;
    setProgress(percent);
    setStatus(`Transcribing line ${current} / ${total}…`);

    if (line) {
      state.lines.push(line);
      appendResultLine(line);
      // Redraw with the box of the line just transcribed highlighted.
      drawBboxes(state.bboxes, line.index);
    }

    // Reveal the results card as soon as the first line has arrived.
    const firstResult = el.resultsCard.hidden && state.lines.length === 1;
    if (firstResult) {
      el.resultsCard.hidden = false;
      el.resultsCard.scrollIntoView({ behavior: 'smooth', block: 'nearest' });
    }
    return;
  }

  if (event === 'complete') {
    setProgress(100);
    const count = (payload.lines || []).length;
    const secs = payload.total_time_s ? ` in ${payload.total_time_s}s` : '';
    setStatus(`Done — ${count} lines${secs}`);
    el.lineCount.textContent = `${count} lines`;
    el.lineCount.className = 'badge badge--info';

    // Final redraw without any highlighted box.
    drawBboxes(state.bboxes);
    toast(`Transcription complete (${count} lines)`, 'success');
    setTimeout(() => { el.progressCard.hidden = true; }, 1200);
    return;
  }

  if (event === 'cancelled') {
    setStatus('Cancelled');
    toast('Transcription cancelled', 'warn', 2500);
    setTimeout(() => { el.progressCard.hidden = true; }, 1000);
    return;
  }

  if (event === 'error') {
    toast(`Error: ${payload.message}`, 'error');
    setStatus('Error');
  }
}
605
+
606
+ // ── Result line DOM ────────────────────────────────────────────────────
607
/**
 * Append one transcribed line (number, text, optional confidence badge) to
 * the results list and scroll the list to its end.
 *
 * @param {{index: number, text?: string, confidence?: number|null}} line
 *   `index` is zero-based; `confidence` is multiplied by 100 for display.
 */
function appendResultLine(line) {
  // Small factory for the spans that make up a row.
  const makeSpan = (className, text) => {
    const span = document.createElement('span');
    span.className = className;
    span.textContent = text;
    return span;
  };

  const row = document.createElement('div');
  row.className = 'result-line';

  const numberSpan = makeSpan('line-num', String(line.index + 1));
  const textSpan = makeSpan('line-text', line.text || '');
  row.appendChild(numberSpan);
  row.appendChild(textSpan);

  const hasConfidence = line.confidence !== null && line.confidence !== undefined;
  if (hasConfidence) {
    const pct = Math.round(line.confidence * 100);
    let tier = 'conf-low';
    if (pct >= 90) {
      tier = 'conf-high';
    } else if (pct >= 75) {
      tier = 'conf-mid';
    }
    row.appendChild(makeSpan(`line-conf ${tier}`, `${pct}%`));
  }

  el.resultsList.appendChild(row);
  // Keep the newest line in view.
  el.resultsList.scrollTop = el.resultsList.scrollHeight;
}
634
+
635
+ // ── Cancel ─────────────────────────────────────────────────────────────
636
/**
 * Cancel a running transcription: abort the local stream (if any), then ask
 * the server to cancel. A failure of the cancel request is deliberately
 * ignored.
 *
 * @returns {Promise<void>}
 */
async function cancelTranscription() {
  const controller = state.sseAbort;
  if (controller) {
    controller.abort();
  }

  const request = { method: 'POST', body: '{}' };
  try {
    await api('/api/transcribe/cancel', request);
  } catch { /* ignore */ }
}
642
+
643
+ // ── Progress helpers ───────────────────────────────────────────────────
644
/**
 * Set the progress bar width.
 *
 * @param {number} pct - Percentage; values outside 0..100 are clamped.
 */
function setProgress(pct) {
  const clamped = Math.max(0, Math.min(100, pct));
  el.progressBar.style.width = `${clamped}%`;
}
647
+
648
/**
 * Show a status message in the status text element.
 *
 * @param {string} msg - Text to display; assigned via textContent.
 */
function setStatus(msg) {
  const { statusText } = el;
  statusText.textContent = msg;
}
651
+
652
+ // ── Hide results ───────────────────────────────────────────────────────
653
/**
 * Hide the results card and discard displayed and stored transcription
 * lines, including the line-count label.
 */
function hideResults() {
  const { resultsCard, resultsList, lineCount } = el;
  resultsCard.hidden = true;
  resultsList.innerHTML = '';
  state.lines = [];
  lineCount.textContent = '';
}
659
+
660
+ // ── Copy all ───────────────────────────────────────────────────────────
661
/**
 * Copy all transcribed lines (joined with newlines) to the clipboard and
 * report the outcome with a toast.
 *
 * The async Clipboard API is only exposed in secure contexts; when it is
 * missing, a "Copy failed" toast is shown instead of throwing a TypeError.
 */
function copyAll() {
  const text = state.lines.map(l => l.text || '').join('\n');
  if (!text) { toast('Nothing to copy', 'warn', 2000); return; }

  const clipboard = typeof navigator !== 'undefined' ? navigator.clipboard : undefined;
  if (!clipboard || typeof clipboard.writeText !== 'function') {
    toast('Copy failed', 'error');
    return;
  }

  clipboard.writeText(text)
    .then(() => toast('Copied to clipboard', 'success', 2000))
    .catch(() => toast('Copy failed', 'error'));
}
668
+
669
+ // ── Export TXT ─────────────────────────────────────────────────────────
670
/**
 * Download all transcribed lines as a UTF-8 text file.
 *
 * The file is named after the uploaded image (extension replaced by `.txt`),
 * or `transcription.txt` when no filename is known. Shows a warning toast
 * when there is nothing to export.
 */
function exportTxt() {
  const text = state.lines.map(l => l.text || '').join('\n');
  if (!text) { toast('Nothing to export', 'warn', 2000); return; }
  const blob = new Blob([text], { type: 'text/plain;charset=utf-8' });
  const url = URL.createObjectURL(blob);
  const a = document.createElement('a');
  a.href = url;
  a.download = (state.imageInfo?.filename?.replace(/\.[^.]+$/, '') || 'transcription') + '.txt';
  a.click();
  // Revoke on a later task: revoking synchronously can race the browser's
  // start of the download and abort it.
  setTimeout(() => URL.revokeObjectURL(url), 0);
}
681
+
682
+ // ── Redraw bboxes on image resize ─────────────���────────────────────────
683
/**
 * Resize callback: redraw the stored bounding boxes, if there are any.
 */
function onImageResize() {
  const boxes = state.bboxes;
  if (boxes.length <= 0) return;
  drawBboxes(boxes);
}
686
+
687
+ // ── Photo Review ────────────────────────────────────────────────────────
688
+
689
/**
 * Open the photo review overlay for a freshly captured file.
 *
 * The file is decoded into an off-screen canvas stored in
 * `reviewState.canvas`. The overlay is shown, and page scrolling locked, only
 * once decoding has succeeded; a decode failure shows an error toast.
 *
 * @param {File} file - Image file from the camera input.
 */
function openPhotoReview(file) {
  reviewState.srcFilename = file.name || 'photo.jpg';
  reviewState.cropMode = false;
  reviewState.cropStart = null;
  reviewState.cropRect = null;

  const objectUrl = URL.createObjectURL(file);
  const photo = new Image();

  photo.onload = () => {
    URL.revokeObjectURL(objectUrl);

    const working = document.createElement('canvas');
    working.width = photo.naturalWidth;
    working.height = photo.naturalHeight;
    working.getContext('2d').drawImage(photo, 0, 0);

    reviewState.canvas = working;
    updateReviewDisplay();
    el.photoReview.hidden = false;
    document.body.style.overflow = 'hidden';
  };

  photo.onerror = () => {
    URL.revokeObjectURL(objectUrl);
    toast('Could not load photo', 'error');
  };

  photo.src = objectUrl;
}
714
+
715
/**
 * Close the photo review overlay: hide it, restore page scrolling, drop the
 * working canvas and crop state, and reset the crop buttons.
 */
function closePhotoReview() {
  el.photoReview.hidden = true;
  document.body.style.overflow = '';
  Object.assign(reviewState, { canvas: null, cropMode: false, cropRect: null });
  resetCropUI();
}
723
+
724
/**
 * Refresh the review image from the working canvas. After the image has
 * loaded, the crop overlay is resized to match and the orientation warning
 * is re-evaluated. Does nothing when there is no working canvas.
 */
function updateReviewDisplay() {
  const working = reviewState.canvas;
  if (!working) return;

  const preview = el.reviewImg;
  preview.onload = () => {
    syncCropCanvas();
    checkReviewOrientation();
  };
  preview.src = working.toDataURL('image/jpeg', 0.9);
}
732
+
733
/**
 * Show the orientation warning when the working canvas is wider than tall,
 * and hide it otherwise.
 *
 * This runs from the review image's asynchronous `onload`. The overlay may
 * have been closed by then (`reviewState.canvas` is null), in which case
 * nothing happens.
 */
function checkReviewOrientation() {
  const canvas = reviewState.canvas;
  if (!canvas) return;
  const landscape = canvas.width > canvas.height;
  el.reviewWarn.hidden = !landscape;
}
737
+
738
/**
 * Resize the crop overlay canvas to the on-screen size of the review image
 * and clear it. Does nothing while the image has zero rendered width.
 */
function syncCropCanvas() {
  const overlay = el.reviewCropCanvas;
  const { width: shownWidth, height: shownHeight } = el.reviewImg.getBoundingClientRect();
  if (!shownWidth) return;

  overlay.width = Math.round(shownWidth);
  overlay.height = Math.round(shownHeight);
  overlay.getContext('2d').clearRect(0, 0, overlay.width, overlay.height);
}
746
+
747
+ // ── Auto-Crop (adaptive page detection) ────────────────────────────────
748
+
749
/**
 * Heuristically detect the page inside the working canvas and crop to it.
 *
 * Each pixel gets a "page score" (HSV value minus a saturation penalty), so
 * bright, unsaturated pixels score high. Scores are averaged per row and per
 * column. The first and last row/column whose average exceeds a threshold
 * derived from the image's border band become the crop edges, padded by a
 * few pixels. If the detected area is under 20% of the original width or
 * height, nothing is cropped and the user is asked to crop manually.
 */
function autoDetectAndCrop() {
  if (!reviewState.canvas) return;
  exitCropMode();

  const canvas = reviewState.canvas;
  const { width, height } = canvas;
  const data = canvas.getContext('2d').getImageData(0, 0, width, height).data;

  // Single pass: accumulate page-likelihood per row and per column.
  // Heuristic: white paper is typically bright with low saturation.
  const rowSum = new Float32Array(height);
  const colSum = new Float32Array(width);
  let borderSum = 0;
  let borderCount = 0;

  // The outer 8% on each side is sampled as "background".
  const borderBandY = Math.max(1, Math.floor(height * 0.08));
  const borderBandX = Math.max(1, Math.floor(width * 0.08));

  for (let y = 0; y < height; y++) {
    for (let x = 0; x < width; x++) {
      const i = (y * width + x) * 4;
      const r = data[i];
      const g = data[i + 1];
      const b = data[i + 2];

      // HSV value and saturation of the pixel.
      const v = Math.max(r, g, b);
      const min = Math.min(r, g, b);
      const s = v === 0 ? 0 : (v - min) / v;

      const pageScore = v - (s * 90);
      rowSum[y] += pageScore;
      colSum[x] += pageScore;

      const isBorderPixel = y < borderBandY || y >= (height - borderBandY) || x < borderBandX || x >= (width - borderBandX);
      if (isBorderPixel) {
        borderSum += pageScore;
        borderCount += 1;
      }
    }
  }

  // Adaptive threshold: slightly above the border's mean score, capped.
  const borderMean = borderCount > 0 ? (borderSum / borderCount) : 40;
  const THRESHOLD = Math.min(230, borderMean + 14);
  const PAD = 12;

  // Inclusive pixel indices of the detected page edges.
  let top = 0, bottom = height - 1, left = 0, right = width - 1;
  for (let y = 0; y < height; y++) { if (rowSum[y] / width > THRESHOLD) { top = y; break; } }
  for (let y = height - 1; y >= 0; y--) { if (rowSum[y] / width > THRESHOLD) { bottom = y; break; } }
  for (let x = 0; x < width; x++) { if (colSum[x] / height > THRESHOLD) { left = x; break; } }
  for (let x = width - 1; x >= 0; x--) { if (colSum[x] / height > THRESHOLD) { right = x; break; } }

  // Apply padding and clamp
  top = Math.max(0, top - PAD);
  bottom = Math.min(height - 1, bottom + PAD);
  left = Math.max(0, left - PAD);
  right = Math.min(width - 1, right + PAD);

  // Edges are inclusive, so the extent is (last - first + 1). Without the
  // +1 every auto-crop would drop one pixel row and one pixel column.
  const w = right - left + 1;
  const h = bottom - top + 1;

  // Sanity check: don't crop to less than 20% of original
  if (w < width * 0.2 || h < height * 0.2) {
    toast('Page not detected clearly - please crop manually', 'warn');
    return;
  }

  const dst = document.createElement('canvas');
  dst.width = w;
  dst.height = h;
  dst.getContext('2d').drawImage(canvas, left, top, w, h, 0, 0, w, h);
  reviewState.canvas = dst;
  updateReviewDisplay();
}
822
+
823
+ // ── Rotate ─────────────────────────────────────────────────────────────
824
+
825
/**
 * Rotate the working canvas around its centre into a new canvas whose width
 * and height are swapped, then refresh the preview. Any active crop
 * selection is discarded first.
 *
 * @param {number} angle - Rotation in degrees. The output dimensions are
 *   always swapped, so this is only meaningful for quarter turns (±90).
 */
function rotateReview(angle) {
  const original = reviewState.canvas;
  if (!original) return;
  exitCropMode();

  const rotated = document.createElement('canvas');
  rotated.width = original.height;
  rotated.height = original.width;

  const radians = angle * Math.PI / 180;
  const g = rotated.getContext('2d');
  // Rotate about the centre of the target, then draw the source centred.
  g.translate(rotated.width / 2, rotated.height / 2);
  g.rotate(radians);
  g.drawImage(original, -original.width / 2, -original.height / 2);

  reviewState.canvas = rotated;
  updateReviewDisplay();
}
839
+
840
+ // ── Crop ───────────────────────────────────────────────────────────────
841
+
842
/**
 * Switch the review overlay into manual crop mode: clear any previous
 * selection, show only the "cancel" crop button, let the overlay canvas
 * receive pointer events, and resize/clear it.
 */
function enterCropMode() {
  Object.assign(reviewState, { cropMode: true, cropRect: null, cropStart: null });

  const { btnCropStart, btnCropApply, btnCropCancel, reviewCropCanvas } = el;
  btnCropStart.hidden = true;
  btnCropApply.hidden = true;
  btnCropCancel.hidden = false;
  reviewCropCanvas.style.pointerEvents = 'auto';

  syncCropCanvas();
}
852
+
853
/**
 * Leave manual crop mode: forget the selection, stop the overlay canvas from
 * receiving pointer events, restore the crop buttons, and resize/clear the
 * overlay.
 */
function exitCropMode() {
  Object.assign(reviewState, { cropMode: false, cropStart: null, cropRect: null });
  el.reviewCropCanvas.style.pointerEvents = 'none';
  resetCropUI();
  syncCropCanvas();
}
861
+
862
/**
 * Put the crop buttons back into their idle state: only "start crop" is
 * visible.
 */
function resetCropUI() {
  const { btnCropStart, btnCropApply, btnCropCancel } = el;
  btnCropStart.hidden = false;
  btnCropApply.hidden = true;
  btnCropCancel.hidden = true;
}
867
+
868
/**
 * Convert a pointer event's client position into pixel coordinates of the
 * working canvas, clamped to the canvas bounds.
 *
 * @param {{clientX: number, clientY: number}} e - Pointer event.
 * @returns {{x: number, y: number}} Position in working-canvas pixels.
 */
function pointerToImageCoords(e) {
  const box = el.reviewCropCanvas.getBoundingClientRect();
  const { width: imageW, height: imageH } = reviewState.canvas;
  const clamp = (value, upper) => Math.max(0, Math.min(upper, value));

  return {
    x: clamp((e.clientX - box.left) * (imageW / box.width), imageW),
    y: clamp((e.clientY - box.top) * (imageH / box.height), imageH),
  };
}
876
+
877
/**
 * Pointer-down handler for the crop overlay: begin a new selection at the
 * pointer position. Ignored outside crop mode.
 *
 * @param {PointerEvent} e
 */
function onCropPointerDown(e) {
  if (!reviewState.cropMode) return;
  e.preventDefault();

  // Capture the pointer so the drag keeps reporting to the overlay canvas.
  el.reviewCropCanvas.setPointerCapture(e.pointerId);

  const anchor = pointerToImageCoords(e);
  reviewState.cropStart = anchor;
  reviewState.cropRect = null;
  el.btnCropApply.hidden = true;
}
885
+
886
/**
 * Pointer-move handler for the crop overlay: while a drag is in progress,
 * update the selection rectangle (in working-canvas pixels) and redraw the
 * overlay. Ignored outside crop mode or when no drag has started.
 *
 * @param {PointerEvent} e
 */
function onCropPointerMove(e) {
  const anchor = reviewState.cropStart;
  if (!reviewState.cropMode || !anchor) return;
  e.preventDefault();

  const here = pointerToImageCoords(e);
  // Normalize so x/y is the top-left corner whichever way the user drags.
  const selection = {
    x: Math.min(anchor.x, here.x),
    y: Math.min(anchor.y, here.y),
    w: Math.abs(here.x - anchor.x),
    h: Math.abs(here.y - anchor.y),
  };
  reviewState.cropRect = selection;
  drawCropOverlay();
}
898
+
899
/**
 * Pointer-up handler for the crop overlay: end the drag and reveal the
 * "apply crop" button when the selection is larger than 20 px in both
 * dimensions. Ignored outside crop mode.
 *
 * @param {PointerEvent} e
 */
function onCropPointerUp(e) {
  if (!reviewState.cropMode) return;
  e.preventDefault();
  reviewState.cropStart = null;

  const selection = reviewState.cropRect;
  const largeEnough = Boolean(selection) && selection.w > 20 && selection.h > 20;
  if (largeEnough) {
    el.btnCropApply.hidden = false;
  }
}
908
+
909
/**
 * Paint the crop overlay: dim the whole image, punch out the selected
 * rectangle and outline it. The selection is stored in working-canvas pixels
 * and scaled to the overlay canvas here. Does nothing without a selection.
 */
function drawCropOverlay() {
  const overlay = el.reviewCropCanvas;
  const g = overlay.getContext('2d');
  const selection = reviewState.cropRect;
  if (!selection) return;

  // Scale factors from working-canvas pixels to overlay pixels.
  const kx = overlay.width / reviewState.canvas.width;
  const ky = overlay.height / reviewState.canvas.height;
  const left = selection.x * kx;
  const top = selection.y * ky;
  const boxW = selection.w * kx;
  const boxH = selection.h * ky;

  g.clearRect(0, 0, overlay.width, overlay.height);
  g.fillStyle = 'rgba(0,0,0,0.55)';
  g.fillRect(0, 0, overlay.width, overlay.height);
  g.clearRect(left, top, boxW, boxH);
  g.strokeStyle = 'rgba(255,255,255,0.9)';
  g.lineWidth = 2;
  g.strokeRect(left, top, boxW, boxH);
}
928
+
929
/**
 * Crop the working canvas to the current selection (rounded to whole
 * pixels), leave crop mode and refresh the preview. Selections missing or
 * smaller than 20 px in either dimension are ignored.
 */
function applyReviewCrop() {
  const selection = reviewState.cropRect;
  if (!selection || selection.w < 20 || selection.h < 20) return;

  const srcX = Math.round(selection.x);
  const srcY = Math.round(selection.y);
  const cropW = Math.round(selection.w);
  const cropH = Math.round(selection.h);

  const cropped = document.createElement('canvas');
  cropped.width = cropW;
  cropped.height = cropH;
  cropped.getContext('2d').drawImage(
    reviewState.canvas,
    srcX, srcY, cropW, cropH,
    0, 0, cropW, cropH
  );

  reviewState.canvas = cropped;
  exitCropMode();
  updateReviewDisplay();
}
944
+
945
+ // ── Confirm / Retake ────────────────────────────────────────────────────
946
+
947
/**
 * Discard the photo under review and reopen the camera file input. The
 * input's value is cleared first so that choosing the same file again still
 * fires a change event.
 */
function retakePhoto() {
  closePhotoReview();
  const cameraInput = el.fileCamera;
  cameraInput.value = '';
  cameraInput.click();
}
952
+
953
/**
 * Accept the reviewed photo: encode the working canvas as a JPEG, wrap it in
 * a File named after the original capture (extension replaced by `.jpg`),
 * close the overlay and hand the file to `uploadFile`.
 *
 * The "use photo" button is disabled while encoding is in flight. Does
 * nothing when there is no working canvas.
 */
function confirmPhoto() {
  const working = reviewState.canvas;
  if (!working) return;

  el.btnUsePhoto.disabled = true;

  const onEncoded = (blob) => {
    if (!blob) {
      toast('Error while processing photo', 'error');
      el.btnUsePhoto.disabled = false;
      return;
    }
    const stem = reviewState.srcFilename.replace(/\.[^.]+$/, '');
    const photo = new File([blob], stem + '.jpg', { type: 'image/jpeg' });
    closePhotoReview();
    el.btnUsePhoto.disabled = false;
    uploadFile(photo);
  };

  working.toBlob(onEncoded, 'image/jpeg', 0.92);
}
969
+
970
+ // ── Register service worker ─────────────────────────────────────────────
971
/**
 * Derive a version string for the service worker from the Last-Modified
 * header of the PWA script, fetched with an uncached HEAD request.
 *
 * @returns {Promise<string>} The header's timestamp in milliseconds as a
 *   string, or 'dev' when the request fails or the header is missing or
 *   unparseable.
 */
async function detectPwaVersion() {
  try {
    const { headers } = await fetch('/static/pwa/demo.js', {
      method: 'HEAD',
      cache: 'no-store',
    });
    // An absent header parses to NaN and falls through to the fallback.
    const stamp = Date.parse(headers.get('last-modified') || '');
    if (Number.isFinite(stamp) && stamp > 0) {
      return String(stamp);
    }
  } catch {
    // Fallback below
  }
  return 'dev';
}
987
+
988
// Register the service worker after the page has loaded. The detected
// version is passed as a query parameter on the worker URL; a registration
// failure is only logged.
if ('serviceWorker' in navigator) {
  const registerWorker = async () => {
    try {
      const version = await detectPwaVersion();
      const workerUrl = `/sw.js?v=${encodeURIComponent(version)}`;
      const registration = await navigator.serviceWorker.register(workerUrl, { scope: '/' });
      // Ask for an update check right away; its failure is not an error.
      registration.update().catch(() => {});
    } catch (e) {
      console.warn('SW registration failed:', e);
    }
  };
  window.addEventListener('load', registerWorker);
}
999
+
1000
+ // ── Init ───────────────────────────────────────────────────────────────
1001
/**
 * Wire up all UI event handlers, restore the persisted segmentation method,
 * start observing preview-image resizes, and trigger the initial engine
 * load.
 */
function init() {
  // Camera button — open review overlay instead of uploading directly
  el.btnCamera.addEventListener('click', () => el.fileCamera.click());
  el.fileCamera.addEventListener('change', () => {
    if (el.fileCamera.files[0]) openPhotoReview(el.fileCamera.files[0]);
    // Clear so that picking the same file again fires another change event.
    el.fileCamera.value = '';
  });

  // Photo review
  el.btnRotateCCW.addEventListener('click', () => rotateReview(-90));
  el.btnRotateCW.addEventListener('click', () => rotateReview(90));
  el.btnAutoCrop.addEventListener('click', autoDetectAndCrop);
  el.btnCropStart.addEventListener('click', enterCropMode);
  el.btnCropApply.addEventListener('click', applyReviewCrop);
  el.btnCropCancel.addEventListener('click', exitCropMode);
  el.btnRetake.addEventListener('click', retakePhoto);
  el.btnUsePhoto.addEventListener('click', confirmPhoto);
  el.reviewCropCanvas.addEventListener('pointerdown', onCropPointerDown);
  el.reviewCropCanvas.addEventListener('pointermove', onCropPointerMove);
  el.reviewCropCanvas.addEventListener('pointerup', onCropPointerUp);

  // File picker button
  el.btnFile.addEventListener('click', () => el.filePicker.click());
  el.filePicker.addEventListener('change', () => {
    if (el.filePicker.files[0]) uploadFile(el.filePicker.files[0]);
    el.filePicker.value = '';
  });

  // Clear image
  el.btnClearImage.addEventListener('click', clearImage);

  // Engine select
  el.engineSelect.addEventListener('change', onEngineChange);

  // Load model
  el.btnLoadModel.addEventListener('click', loadModel);

  // Segment
  el.btnSegment.addEventListener('click', segmentImage);

  // Transcribe
  el.btnTranscribe.addEventListener('click', startTranscription);

  // Cancel
  el.btnCancel.addEventListener('click', cancelTranscription);

  // Export
  el.btnCopy.addEventListener('click', copyAll);
  el.btnExportTxt.addEventListener('click', exportTxt);

  // Seg method persistence
  let savedSeg = null;
  try {
    savedSeg = localStorage.getItem(LS_SEG_METHOD);
  } catch (storageErr) {
    // Storage may be blocked; fall back to the select's default.
    console.warn('Could not read saved segmentation method:', storageErr);
  }
  // Compare option values directly instead of building a CSS selector from
  // the stored string: a value containing `"` or `]` would make
  // querySelector throw and abort the rest of init.
  const savedSegOption = savedSeg
    ? Array.from(el.segMethodSelect.options).find(opt => opt.value === savedSeg)
    : null;
  if (savedSegOption && !savedSegOption.disabled) {
    el.segMethodSelect.value = savedSeg;
  }
  el.segMethodSelect.addEventListener('change', () => {
    localStorage.setItem(LS_SEG_METHOD, el.segMethodSelect.value);
  });

  // Redraw bboxes on layout changes (image resize)
  const ro = new ResizeObserver(onImageResize);
  ro.observe(el.previewImg);

  // Initial data load
  loadEngines().then(checkEngineStatus);
}
1068
+
1069
+ document.addEventListener('DOMContentLoaded', init);
web/static/pwa/icons/icon-192.png ADDED

Git LFS Details

  • SHA256: cac30492acbc8fff49fd2e166c0a5610148dd73c832c6cfe9f48bbacca2b94b6
  • Pointer size: 130 Bytes
  • Size of remote file: 37.5 kB
web/static/pwa/icons/icon-512.png ADDED

Git LFS Details

  • SHA256: 006af190dcb8989e09a2d36566fea9371aa92e5aa95b9f772176f262be35401a
  • Pointer size: 131 Bytes
  • Size of remote file: 160 kB
web/static/pwa/manifest.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "Polyscriptor HTR Demo",
3
+ "short_name": "Polyscriptor",
4
+ "description": "Handwritten Text Recognition — capture a photo and transcribe it instantly",
5
+ "start_url": "/demo",
6
+ "scope": "/",
7
+ "display": "standalone",
8
+ "orientation": "portrait-primary",
9
+ "background_color": "#111827",
10
+ "theme_color": "#3b82f6",
11
+ "icons": [
12
+ {
13
+ "src": "/static/pwa/icons/icon-192.png",
14
+ "sizes": "192x192",
15
+ "type": "image/png",
16
+ "purpose": "any maskable"
17
+ },
18
+ {
19
+ "src": "/static/pwa/icons/icon-512.png",
20
+ "sizes": "512x512",
21
+ "type": "image/png",
22
+ "purpose": "any maskable"
23
+ }
24
+ ],
25
+ "categories": ["productivity", "utilities"],
26
+ "lang": "en"
27
+ }
web/static/pwa/sw.js ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * Polyscriptor PWA — Service Worker
3
+ * Caches static assets for faster startup; API calls always go to network.
4
+ */
5
+
6
// Version tag read from the `v` query parameter of this worker's own script
// URL; 'dev' when the parameter is absent or empty.
const SW_VERSION = new URL(self.location.href).searchParams.get('v') || 'dev';
// Cache name is version-specific; the activate handler deletes every cache
// whose name differs from this one.
const CACHE = `polyscriptor-pwa-${SW_VERSION}`;
// URLs fetched and stored in the cache at install time. The fetch handler
// also uses this list to decide which network responses to store.
const STATIC = [
  '/demo',
  '/static/pwa/demo.html',
  '/static/pwa/demo.css',
  '/static/pwa/demo.js',
  '/static/pwa/manifest.json',
  '/static/pwa/icons/icon-192.png',
  '/static/pwa/icons/icon-512.png',
];
17
+
18
// Install: pre-cache every STATIC URL, bypassing the HTTP cache
// (`cache: 'reload'`), then skip the waiting phase.
self.addEventListener('install', (event) => {
  const precache = async () => {
    const cache = await caches.open(CACHE);
    const requests = STATIC.map((path) => new Request(path, { cache: 'reload' }));
    await cache.addAll(requests);
    await self.skipWaiting();
  };
  event.waitUntil(precache());
});
28
+
29
// Activate: delete every cache other than the current one, then take control
// of already-open clients.
self.addEventListener('activate', (event) => {
  const dropStaleCaches = async () => {
    const names = await caches.keys();
    const stale = names.filter((name) => name !== CACHE);
    await Promise.all(stale.map((name) => caches.delete(name)));
    await self.clients.claim();
  };
  event.waitUntil(dropStaleCaches());
});
36
+
37
// Fetch strategy:
//  - /api/*            network only; a JSON 503 is synthesized when the
//                      network request fails.
//  - non-GET requests  left to the browser (the Cache API stores GET only).
//  - everything else   cache first, falling back to the network; successful
//                      responses for STATIC paths are stored in the cache.
self.addEventListener('fetch', e => {
  const request = e.request;
  const url = new URL(request.url);

  // API calls: always network-only (no caching)
  if (url.pathname.startsWith('/api/')) {
    e.respondWith(fetch(request).catch(() =>
      new Response(JSON.stringify({ detail: 'No server connection' }), {
        status: 503,
        headers: { 'Content-Type': 'application/json' },
      })
    ));
    return;
  }

  // cache.put() rejects for non-GET requests, so do not intercept them.
  if (request.method !== 'GET') return;

  // Static assets: cache-first
  e.respondWith(
    caches.match(request).then(cached => {
      if (cached) return cached;

      return fetch(request).then(resp => {
        const cacheable = resp.ok
          && STATIC.some(s => url.pathname === s || url.pathname.startsWith(s));
        if (cacheable) {
          // Clone synchronously, before the response is handed to the page:
          // once its body is being read, clone() throws.
          const copy = resp.clone();
          // Keep the worker alive until the write finishes; a failed write
          // is non-fatal and must not surface as an unhandled rejection.
          e.waitUntil(
            caches.open(CACHE)
              .then(c => c.put(request, copy))
              .catch(err => console.warn('Cache write failed:', err))
          );
        }
        return resp;
      });
    })
  );
});