Achim Rabus committed on
Commit ·
78431ff
1
Parent(s): c8ba8c4
Deploy Polyscriptor HTR Space demo
Browse files- .dockerignore +10 -0
- .gitattributes +2 -0
- Dockerfile +23 -0
- README.md +73 -7
- engines/__init__.py +17 -0
- engines/commercial_api_engine.py +768 -0
- engines/kraken_engine.py +535 -0
- engines/openwebui_engine.py +505 -0
- engines/pylaia_engine.py +414 -0
- hf-space/README.md +43 -0
- hf-space/SPACE_README.md +78 -0
- hf-space/requirements.txt +21 -0
- htr_engine_base.py +398 -0
- inference_commercial_api.py +760 -0
- inference_page.py +946 -0
- inference_pylaia_native.py +453 -0
- kraken_segmenter.py +823 -0
- page_xml_exporter.py +276 -0
- web/polyscriptor_server.py +2237 -0
- web/server_config.yaml +25 -0
- web/static/app.css +1269 -0
- web/static/app.js +298 -0
- web/static/components/batch-panel.js +735 -0
- web/static/components/engine-panel.js +1091 -0
- web/static/components/image-viewer.js +294 -0
- web/static/components/transcription-panel.js +482 -0
- web/static/fonts/MonomakhUnicode-Regular.woff2 +3 -0
- web/static/index.html +323 -0
- web/static/pwa/demo.css +698 -0
- web/static/pwa/demo.html +204 -0
- web/static/pwa/demo.js +1069 -0
- web/static/pwa/icons/icon-192.png +3 -0
- web/static/pwa/icons/icon-512.png +3 -0
- web/static/pwa/manifest.json +27 -0
- web/static/pwa/sw.js +60 -0
.dockerignore
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.git
|
| 2 |
+
.pytest_cache
|
| 3 |
+
__pycache__
|
| 4 |
+
**/__pycache__
|
| 5 |
+
*.pyc
|
| 6 |
+
*.ipynb
|
| 7 |
+
*.zip
|
| 8 |
+
models
|
| 9 |
+
htr_gui
|
| 10 |
+
Documentation
|
.gitattributes
CHANGED
|
@@ -19,6 +19,7 @@
|
|
| 19 |
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
*.pkl filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 22 |
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
*.rar filter=lfs diff=lfs merge=lfs -text
|
|
@@ -29,6 +30,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 29 |
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
*.wasm filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 32 |
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 19 |
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
| 23 |
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 24 |
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 25 |
*.rar filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 30 |
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 31 |
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 32 |
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.woff2 filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 36 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim

ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    POLYSCRIPTOR_DEMO_MODE=hf_space \
    HF_HOME=/tmp/huggingface \
    PORT=7860

# System libraries are installed as root before dropping privileges.
RUN apt-get update && apt-get install -y --no-install-recommends \
        libgl1 \
        libglib2.0-0 \
    && rm -rf /var/lib/apt/lists/*

# Dependencies go into the system site-packages so the unprivileged user can
# import them without needing write access.
COPY hf-space/requirements.txt /tmp/requirements-hf-space.txt
RUN pip install --no-cache-dir -r /tmp/requirements-hf-space.txt

# Hugging Face Docker Spaces run the container as UID 1000. Create that user
# and hand it the application directory so runtime writes do not fail on
# root-owned paths.
RUN useradd --create-home --uid 1000 user \
    && mkdir -p /app \
    && chown user:user /app
USER user
ENV HOME=/home/user

WORKDIR /app

COPY --chown=user:user . /app

EXPOSE 7860

CMD ["python", "-m", "uvicorn", "web.polyscriptor_server:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
|
@@ -1,12 +1,78 @@
|
|
| 1 |
---
|
| 2 |
-
title: Polyscriptor
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
-
license:
|
| 9 |
-
short_description: Demo of Polyscriptor HTR
|
| 10 |
---
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Polyscriptor HTR Demo
|
| 3 |
+
emoji: 📝
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: gray
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
+
license: apache-2.0
|
|
|
|
| 9 |
---
|
| 10 |
|
| 11 |
+
# Polyscriptor HTR Demo
|
| 12 |
+
|
| 13 |
+
Polyscriptor is a browser-based demo for handwritten text recognition (HTR) on
|
| 14 |
+
historical Slavic manuscript material. This Hugging Face Space runs a constrained
|
| 15 |
+
public version of the Polyscriptor FastAPI/Web interface.
|
| 16 |
+
|
| 17 |
+
The hosted demo is intended for quick inspection and teaching. It is not the full
|
| 18 |
+
local research environment used for training, batch processing, GPU inference, or
|
| 19 |
+
private manuscript collections.
|
| 20 |
+
|
| 21 |
+
## Source Code
|
| 22 |
+
|
| 23 |
+
The public Polyscriptor source code is available on GitHub:
|
| 24 |
+
|
| 25 |
+
https://github.com/achimrabus/polyscriptor
|
| 26 |
+
|
| 27 |
+
This Hugging Face Space contains the curated hosted demo deployment. The GitHub
|
| 28 |
+
repository contains the broader Polyscriptor codebase, including the web UI,
|
| 29 |
+
engine plugins, segmentation code, training utilities, and local workflows.
|
| 30 |
+
|
| 31 |
+
## What This Demo Supports
|
| 32 |
+
|
| 33 |
+
- CRNN-CTC / PyLaia-inspired HTR presets for selected public model repositories.
|
| 34 |
+
- User-supplied API keys for OpenAI, Gemini, Claude, and OpenWebUI-compatible
|
| 35 |
+
endpoints.
|
| 36 |
+
- Public model download from the Hugging Face Hub, primarily under
|
| 37 |
+
`achimrabus/*`.
|
| 38 |
+
- CPU-only inference.
|
| 39 |
+
- Kraken Classical line segmentation, with HPP as a lightweight fallback.
|
| 40 |
+
- Temporary image uploads during the active session.
|
| 41 |
+
|
| 42 |
+
## Limitations
|
| 43 |
+
|
| 44 |
+
- No private models are bundled with this Space.
|
| 45 |
+
- API-based engines require users to paste their own API key in the browser
|
| 46 |
+
form. The Space does not ship with shared provider credentials.
|
| 47 |
+
- Uploaded files are treated as temporary runtime data and are not part of the
|
| 48 |
+
repository.
|
| 49 |
+
- Large local GPU/VLM engines from the full Polyscriptor workflow are not
|
| 50 |
+
enabled here.
|
| 51 |
+
- Accuracy depends strongly on script, language, writing style, image quality,
|
| 52 |
+
and segmentation quality.
|
| 53 |
+
|
| 54 |
+
## Model Notes
|
| 55 |
+
|
| 56 |
+
The demo uses publicly available model presets. For best results, choose a model
|
| 57 |
+
that matches the manuscript tradition as closely as possible. The current public
|
| 58 |
+
Polyscriptor model cards are available at:
|
| 59 |
+
|
| 60 |
+
https://huggingface.co/achimrabus
|
| 61 |
+
|
| 62 |
+
## Project Context
|
| 63 |
+
|
| 64 |
+
Polyscriptor is developed for historical HTR workflows, with a focus on Slavic
|
| 65 |
+
manuscripts and reproducible comparison of OCR/HTR engines. The full development
|
| 66 |
+
repository contains additional tooling for local use, training, evaluation, and
|
| 67 |
+
batch processing; this Space contains only the hosted demo configuration.
|
| 68 |
+
|
| 69 |
+
## Privacy
|
| 70 |
+
|
| 71 |
+
Do not upload sensitive or unpublished manuscript images unless you are
|
| 72 |
+
comfortable processing them in a hosted public demo environment. The application
|
| 73 |
+
uses temporary server-side files during processing, but this Space should be
|
| 74 |
+
treated as a public demonstration service rather than a secure private workflow.
|
| 75 |
+
|
| 76 |
+
For API-based engines, provider keys are entered by the user at runtime. Do not
|
| 77 |
+
commit keys to this repository or add them to the Space configuration unless you
|
| 78 |
+
intend to provide a shared project credential.
|
engines/__init__.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""HTR Engine Plugins
|
| 2 |
+
|
| 3 |
+
This package contains plugin implementations for different HTR engines.
|
| 4 |
+
Each engine module implements the HTREngine interface defined in htr_engine_base.py.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
__all__ = [
|
| 8 |
+
"TrOCREngine",
|
| 9 |
+
"Qwen3Engine",
|
| 10 |
+
"PyLaiaEngine",
|
| 11 |
+
"KrakenEngine",
|
| 12 |
+
"CommercialAPIEngine",
|
| 13 |
+
"PartyEngine",
|
| 14 |
+
"DeepSeekOCREngine",
|
| 15 |
+
"LightOnOCREngine",
|
| 16 |
+
"PaddleOCREngine",
|
| 17 |
+
]
|
engines/commercial_api_engine.py
ADDED
|
@@ -0,0 +1,768 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Commercial API Engine Plugin
|
| 3 |
+
|
| 4 |
+
Wraps commercial HTR APIs (OpenAI, Gemini, Claude) as a unified plugin.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import Dict, Any, Optional
|
| 10 |
+
import numpy as np
|
| 11 |
+
|
| 12 |
+
from htr_engine_base import HTREngine, TranscriptionResult
|
| 13 |
+
|
| 14 |
+
# Load environment variables from .env file
|
| 15 |
+
try:
|
| 16 |
+
from dotenv import load_dotenv
|
| 17 |
+
# Look for .env in the project root (parent of engines/)
|
| 18 |
+
env_path = Path(__file__).parent.parent / ".env"
|
| 19 |
+
if env_path.exists():
|
| 20 |
+
load_dotenv(env_path)
|
| 21 |
+
print(f"[CommercialAPIEngine] Loaded environment variables from {env_path}")
|
| 22 |
+
except ImportError:
|
| 23 |
+
print("[CommercialAPIEngine] Warning: python-dotenv not installed. API keys will not be loaded from .env file.")
|
| 24 |
+
print("Install with: pip install python-dotenv")
|
| 25 |
+
|
| 26 |
+
try:
|
| 27 |
+
from PyQt6.QtWidgets import (
|
| 28 |
+
QWidget, QVBoxLayout, QHBoxLayout, QLabel, QComboBox,
|
| 29 |
+
QPushButton, QCheckBox, QLineEdit, QGroupBox, QTextEdit
|
| 30 |
+
)
|
| 31 |
+
from PyQt6.QtCore import Qt
|
| 32 |
+
PYQT_AVAILABLE = True
|
| 33 |
+
except ImportError:
|
| 34 |
+
PYQT_AVAILABLE = False
|
| 35 |
+
QWidget = object
|
| 36 |
+
|
| 37 |
+
try:
|
| 38 |
+
from inference_commercial_api import (
|
| 39 |
+
OpenAIInference, GeminiInference, ClaudeInference,
|
| 40 |
+
check_api_availability,
|
| 41 |
+
OPENAI_MODELS, GEMINI_MODELS, CLAUDE_MODELS,
|
| 42 |
+
fetch_openai_models, fetch_gemini_models
|
| 43 |
+
)
|
| 44 |
+
COMMERCIAL_API_AVAILABLE = True
|
| 45 |
+
API_AVAILABILITY = check_api_availability()
|
| 46 |
+
except ImportError:
|
| 47 |
+
COMMERCIAL_API_AVAILABLE = False
|
| 48 |
+
API_AVAILABILITY = {"openai": False, "gemini": False, "claude": False}
|
| 49 |
+
OPENAI_MODELS = []
|
| 50 |
+
GEMINI_MODELS = []
|
| 51 |
+
CLAUDE_MODELS = []
|
| 52 |
+
fetch_openai_models = lambda api_key=None: []
|
| 53 |
+
fetch_gemini_models = lambda api_key=None: []
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
class CommercialAPIEngine(HTREngine):
|
| 57 |
+
"""Commercial API HTR engine plugin."""
|
| 58 |
+
|
| 59 |
+
def __init__(self):
|
| 60 |
+
# Instance attributes (avoid type annotations here for broader runtime compatibility in some environments)
|
| 61 |
+
self.model = None # Can be OpenAI, Gemini, or Claude
|
| 62 |
+
self._config_widget = None
|
| 63 |
+
self._current_provider = None
|
| 64 |
+
|
| 65 |
+
# Widget references
|
| 66 |
+
self._provider_combo = None
|
| 67 |
+
self._model_combo = None
|
| 68 |
+
self._custom_model_edit = None
|
| 69 |
+
self._use_custom_model_check = None
|
| 70 |
+
self._refresh_models_btn = None
|
| 71 |
+
self._api_key_edit = None
|
| 72 |
+
self._show_key_check = None
|
| 73 |
+
self._prompt_edit = None
|
| 74 |
+
self._thinking_combo = None
|
| 75 |
+
self._temperature_edit = None
|
| 76 |
+
self._max_tokens_edit = None
|
| 77 |
+
self._early_exit_check = None
|
| 78 |
+
self._auto_continue_check = None
|
| 79 |
+
self._max_continuations_edit = None
|
| 80 |
+
|
| 81 |
+
def get_name(self) -> str:
    """Return the display name of this engine."""
    engine_name = "Commercial APIs"
    return engine_name
|
| 83 |
+
|
| 84 |
+
def get_description(self) -> str:
    """Return a one-line summary of the providers this engine wraps."""
    summary = "OpenAI GPT-4V, Google Gemini, Anthropic Claude vision APIs"
    return summary
|
| 86 |
+
|
| 87 |
+
def is_available(self) -> bool:
    """Report whether the wrapper module imported and at least one provider is usable."""
    if not COMMERCIAL_API_AVAILABLE:
        return False
    return any(API_AVAILABILITY.values())
|
| 89 |
+
|
| 90 |
+
def get_unavailable_reason(self) -> str:
    """Explain why the engine is unavailable; empty string when it is available."""
    if COMMERCIAL_API_AVAILABLE and any(API_AVAILABILITY.values()):
        return ""
    if not COMMERCIAL_API_AVAILABLE:
        # The inference wrapper module itself could not be imported.
        return "Commercial API support not available. Install with: pip install openai google-generativeai anthropic"
    # The wrapper imported, but no provider library was reported as usable.
    return "No API libraries installed. Install at least one: openai, google-generativeai, or anthropic"
|
| 96 |
+
|
| 97 |
+
def get_config_widget(self):
    """Create (once) and return the Commercial API configuration panel.

    The panel is cached in self._config_widget, so repeated calls return the
    same widget. Each group box is built by a dedicated private helper; the
    helpers store the interactive widgets on self.

    Raises:
        RuntimeError: if PyQt6 is not installed. Without this guard the
            method would fail later with an opaque NameError, because only
            QWidget receives a fallback binding when the PyQt6 import fails.
    """
    if self._config_widget is not None:
        return self._config_widget

    if not PYQT_AVAILABLE:
        raise RuntimeError(
            "PyQt6 is required to build the Commercial API configuration widget"
        )

    widget = QWidget()
    layout = QVBoxLayout()

    layout.addWidget(self._build_provider_group())
    layout.addWidget(self._build_model_group())
    layout.addWidget(self._build_key_group())
    layout.addWidget(self._build_prompt_group())
    layout.addWidget(self._build_thinking_group())
    layout.addWidget(self._build_advanced_group())

    layout.addStretch()
    widget.setLayout(layout)

    self._config_widget = widget

    # Initialize model list based on default provider
    self._on_provider_changed(self._provider_combo.currentText())

    return widget


def _build_provider_group(self):
    """Build the "API Provider" group and store the provider combo box."""
    provider_group = QGroupBox("API Provider")
    provider_layout = QVBoxLayout()

    self._provider_combo = QComboBox()
    available_providers = []
    if API_AVAILABILITY.get("openai", False):
        available_providers.append("OpenAI")
    if API_AVAILABILITY.get("gemini", False):
        available_providers.append("Gemini")
    if API_AVAILABILITY.get("claude", False):
        available_providers.append("Claude")

    if not available_providers:
        available_providers = ["No APIs available"]

    # Items are added before the signal is connected, so populating the combo
    # does not fire _on_provider_changed.
    self._provider_combo.addItems(available_providers)
    self._provider_combo.currentTextChanged.connect(self._on_provider_changed)
    provider_layout.addWidget(self._provider_combo)

    provider_group.setLayout(provider_layout)
    return provider_group


def _build_model_group(self):
    """Build the "Model" group: model dropdown, refresh button, custom model ID."""
    model_group = QGroupBox("Model")
    model_layout = QVBoxLayout()

    # Dropdown for standard models
    model_dropdown_layout = QHBoxLayout()
    self._model_combo = QComboBox()
    model_dropdown_layout.addWidget(self._model_combo)

    # Refresh models button
    self._refresh_models_btn = QPushButton("🔄 Refresh")
    self._refresh_models_btn.setToolTip("Fetch latest models from API")
    self._refresh_models_btn.setMaximumWidth(80)
    self._refresh_models_btn.clicked.connect(self._on_refresh_models)
    model_dropdown_layout.addWidget(self._refresh_models_btn)

    model_layout.addLayout(model_dropdown_layout)

    # Custom model ID checkbox and field
    custom_model_layout = QHBoxLayout()
    self._use_custom_model_check = QCheckBox("Use custom model ID:")
    self._use_custom_model_check.toggled.connect(self._on_custom_model_toggled)
    custom_model_layout.addWidget(self._use_custom_model_check)

    self._custom_model_edit = QLineEdit()
    self._custom_model_edit.setPlaceholderText("e.g., gpt-4.5, o1-preview-2024-12-17")
    self._custom_model_edit.setEnabled(False)  # Disabled by default
    custom_model_layout.addWidget(self._custom_model_edit)

    model_layout.addLayout(custom_model_layout)

    model_hint = QLabel("💡 Use custom model ID for bleeding-edge models not in the dropdown")
    model_hint.setStyleSheet("color: gray; font-size: 8pt;")
    model_hint.setWordWrap(True)
    model_layout.addWidget(model_hint)

    model_group.setLayout(model_layout)
    return model_group


def _build_key_group(self):
    """Build the "API Key" group: masked key field plus a "Show" toggle."""
    key_group = QGroupBox("API Key")
    key_layout = QVBoxLayout()

    key_input_layout = QHBoxLayout()
    self._api_key_edit = QLineEdit()
    self._api_key_edit.setEchoMode(QLineEdit.EchoMode.Password)
    self._api_key_edit.setPlaceholderText("Enter your API key")
    key_input_layout.addWidget(self._api_key_edit)

    self._show_key_check = QCheckBox("Show")
    self._show_key_check.toggled.connect(self._toggle_key_visibility)
    key_input_layout.addWidget(self._show_key_check)
    key_layout.addLayout(key_input_layout)

    key_hint = QLabel("API keys are stored locally in .trocr_gui/")
    key_hint.setStyleSheet("color: gray; font-size: 9pt;")
    key_layout.addWidget(key_hint)

    key_group.setLayout(key_layout)
    return key_group


def _build_prompt_group(self):
    """Build the "Prompt & Sampling" group: prompt text, temperature, max tokens."""
    prompt_group = QGroupBox("Prompt & Sampling (Optional)")
    prompt_layout = QVBoxLayout()

    self._prompt_edit = QTextEdit()
    self._prompt_edit.setPlaceholderText("Enter custom transcription prompt...")
    self._prompt_edit.setMaximumHeight(100)
    prompt_layout.addWidget(self._prompt_edit)

    # Temperature control
    temp_row = QHBoxLayout()
    temp_row.addWidget(QLabel("Temperature:"))
    self._temperature_edit = QLineEdit()
    self._temperature_edit.setPlaceholderText("1.0 (default)")
    self._temperature_edit.setToolTip(
        "Sampling temperature (web default ~1.0).\n"
        "Use 0-0.3 for deterministic; >1 can increase variability."
    )
    self._temperature_edit.setMaximumWidth(90)
    temp_row.addWidget(self._temperature_edit)
    temp_row.addStretch()
    prompt_layout.addLayout(temp_row)

    # Max output tokens control
    tokens_row = QHBoxLayout()
    tokens_row.addWidget(QLabel("Max output tokens:"))
    self._max_tokens_edit = QLineEdit()
    self._max_tokens_edit.setPlaceholderText("4096 preview / 2048 default")
    self._max_tokens_edit.setToolTip(
        "Upper limit on generated tokens. Lowering may force earlier output.\n"
        "Raising (e.g. 8192) may help high reasoning but risks long 'thinking'."
    )
    self._max_tokens_edit.setMaximumWidth(130)
    tokens_row.addWidget(self._max_tokens_edit)
    tokens_row.addStretch()
    prompt_layout.addLayout(tokens_row)

    prompt_group.setLayout(prompt_layout)
    return prompt_group


def _build_thinking_group(self):
    """Build the "Thinking Mode (Gemini only)" group with the reasoning selector."""
    thinking_group = QGroupBox("Thinking Mode (Gemini only)")
    thinking_layout = QVBoxLayout()

    # (Removed warning banner recommending alternative models; preview model retained for Church Slavonic use)

    thinking_row = QHBoxLayout()
    thinking_row.addWidget(QLabel("Reasoning:"))
    self._thinking_combo = QComboBox()
    self._thinking_combo.addItems(["Auto (Low for preview)", "Low (Fast)", "High (More reasoning)"])
    self._thinking_combo.setToolTip(
        "Low: Fast, direct output\n"
        "High: Slower, uses more tokens for reasoning\n"
        "Auto: Uses Low for preview models to avoid token waste"
    )
    thinking_row.addWidget(self._thinking_combo)
    thinking_row.addStretch()
    thinking_layout.addLayout(thinking_row)

    thinking_group.setLayout(thinking_layout)
    return thinking_group


def _build_advanced_group(self):
    """Build the "Gemini Advanced" group: streaming, continuation and fallback knobs."""
    advanced_group = QGroupBox("Gemini Advanced")
    adv_layout = QVBoxLayout()

    # Row 1: Checkboxes
    adv_row1 = QHBoxLayout()
    self._early_exit_check = QCheckBox("Early exit on first chunk")
    self._early_exit_check.setChecked(True)
    self._early_exit_check.setToolTip("If checked, streaming returns after first non-empty text chunk. Uncheck to collect full stream.")
    adv_row1.addWidget(self._early_exit_check)

    self._auto_continue_check = QCheckBox("Auto continuation")
    self._auto_continue_check.setChecked(False)  # Default: off for speed
    self._auto_continue_check.setToolTip("If checked, performs additional continuation calls to capture missed trailing text.")
    adv_row1.addWidget(self._auto_continue_check)
    adv_row1.addStretch()
    adv_layout.addLayout(adv_row1)

    # Row 2: Continuation settings (symmetrical grid)
    adv_row2 = QHBoxLayout()
    self._max_continuations_edit = self._add_labeled_edit(
        adv_row2, "Max passes:", "2",
        "Maximum number of continuation attempts (2-3 recommended)", 60,
    )
    adv_row2.addSpacing(20)
    self._min_new_chars_edit = self._add_labeled_edit(
        adv_row2, "Min new chars:", "50",
        "Minimum number of new characters required to accept a continuation chunk.", 60,
    )
    adv_row2.addStretch()
    adv_layout.addLayout(adv_row2)

    # Row 3: Token & fallback settings (symmetrical grid)
    adv_row3 = QHBoxLayout()
    self._low_initial_tokens_edit = self._add_labeled_edit(
        adv_row3, "Low-mode tokens:", "6144",
        "Initial max_output_tokens for LOW thinking before fallback escalation (4096-8192).", 60,
    )
    adv_row3.addSpacing(20)
    self._reasoning_fallback_edit = self._add_labeled_edit(
        adv_row3, "Fallback %:", "0.6",
        "Fraction of token budget consumed internally (no output) that triggers early fallback (0.5-0.8).", 60,
    )
    adv_row3.addSpacing(20)
    self._fallback_cap_edit = self._add_labeled_edit(
        adv_row3, "Fallback cap:", "8192",
        "Maximum tokens for fallback attempt. Increase for page-wise recognition (e.g. 12288 or 16384).", 70,
    )
    adv_row3.addStretch()
    adv_layout.addLayout(adv_row3)

    advanced_group.setLayout(adv_layout)
    return advanced_group


def _add_labeled_edit(self, row, label, default_text, tooltip, width):
    """Append a label and a fixed-width, pre-filled line edit to row; return the edit."""
    row.addWidget(QLabel(label))
    edit = QLineEdit()
    edit.setText(default_text)
    edit.setToolTip(tooltip)
    edit.setFixedWidth(width)
    row.addWidget(edit)
    return edit
|
| 331 |
+
|
| 332 |
+
def _get_api_key_file(self) -> 'Path':
|
| 333 |
+
"""Get path to API key storage file."""
|
| 334 |
+
from pathlib import Path
|
| 335 |
+
storage_dir = Path.home() / ".trocr_gui"
|
| 336 |
+
storage_dir.mkdir(exist_ok=True)
|
| 337 |
+
return storage_dir / "api_keys.json"
|
| 338 |
+
|
| 339 |
+
def _load_saved_api_key(self):
|
| 340 |
+
"""Load saved API key for current provider."""
|
| 341 |
+
try:
|
| 342 |
+
import json
|
| 343 |
+
key_file = self._get_api_key_file()
|
| 344 |
+
|
| 345 |
+
if key_file.exists():
|
| 346 |
+
with open(key_file, "r") as f:
|
| 347 |
+
keys = json.load(f)
|
| 348 |
+
|
| 349 |
+
provider = self._provider_combo.currentText().lower()
|
| 350 |
+
if provider in keys:
|
| 351 |
+
self._api_key_edit.setText(keys[provider])
|
| 352 |
+
except Exception as e:
|
| 353 |
+
print(f"Warning: Could not load saved API key: {e}")
|
| 354 |
+
|
| 355 |
+
def _save_api_key(self):
|
| 356 |
+
"""Save API key for current provider."""
|
| 357 |
+
try:
|
| 358 |
+
import json
|
| 359 |
+
key_file = self._get_api_key_file()
|
| 360 |
+
|
| 361 |
+
# Load existing keys
|
| 362 |
+
keys = {}
|
| 363 |
+
if key_file.exists():
|
| 364 |
+
with open(key_file, "r") as f:
|
| 365 |
+
keys = json.load(f)
|
| 366 |
+
|
| 367 |
+
# Update key for current provider
|
| 368 |
+
provider = self._provider_combo.currentText().lower()
|
| 369 |
+
api_key = self._api_key_edit.text().strip()
|
| 370 |
+
|
| 371 |
+
if api_key:
|
| 372 |
+
keys[provider] = api_key
|
| 373 |
+
|
| 374 |
+
with open(key_file, "w") as f:
|
| 375 |
+
json.dump(keys, f, indent=2)
|
| 376 |
+
except Exception as e:
|
| 377 |
+
print(f"Warning: Could not save API key: {e}")
|
| 378 |
+
|
| 379 |
+
def _on_provider_changed(self, provider: str):
    """Repopulate the model dropdown for provider and prefill its key from the environment.

    Does nothing until the model combo box exists. The key field is only
    overwritten when a non-empty key is found in the environment.
    """
    combo = self._model_combo
    if combo is None:
        return

    combo.clear()

    models_by_provider = {
        "OpenAI": OPENAI_MODELS,
        "Gemini": GEMINI_MODELS,
        "Claude": CLAUDE_MODELS,
    }
    if provider in models_by_provider:
        combo.addItems(models_by_provider[provider])
    else:
        combo.addItem("No models available")

    # Auto-load API key from environment variables
    key_field = self._api_key_edit
    if key_field is None:
        return

    env_key = self._get_api_key_from_env(provider)
    if env_key:
        key_field.setText(env_key)
        print(f"[CommercialAPIEngine] Loaded {provider} API key from environment")
|
| 401 |
+
|
| 402 |
+
def _get_api_key_from_env(self, provider: str) -> Optional[str]:
|
| 403 |
+
"""Get API key from environment variables based on provider."""
|
| 404 |
+
env_var_map = {
|
| 405 |
+
"OpenAI": "OPENAI_API_KEY",
|
| 406 |
+
"Gemini": "GOOGLE_API_KEY",
|
| 407 |
+
"Claude": "ANTHROPIC_API_KEY"
|
| 408 |
+
}
|
| 409 |
+
|
| 410 |
+
env_var = env_var_map.get(provider)
|
| 411 |
+
if env_var:
|
| 412 |
+
return os.getenv(env_var, "")
|
| 413 |
+
|
| 414 |
+
def _toggle_key_visibility(self, checked: bool):
|
| 415 |
+
"""Toggle API key visibility."""
|
| 416 |
+
if checked:
|
| 417 |
+
self._api_key_edit.setEchoMode(QLineEdit.EchoMode.Normal)
|
| 418 |
+
else:
|
| 419 |
+
self._api_key_edit.setEchoMode(QLineEdit.EchoMode.Password)
|
| 420 |
+
|
| 421 |
+
def _on_custom_model_toggled(self, checked: bool):
|
| 422 |
+
"""Enable/disable custom model field."""
|
| 423 |
+
self._custom_model_edit.setEnabled(checked)
|
| 424 |
+
self._model_combo.setEnabled(not checked)
|
| 425 |
+
|
| 426 |
+
def _on_refresh_models(self):
|
| 427 |
+
"""Refresh model list from API dynamically."""
|
| 428 |
+
if self._model_combo is None or self._api_key_edit is None:
|
| 429 |
+
return
|
| 430 |
+
|
| 431 |
+
provider = self._provider_combo.currentText()
|
| 432 |
+
api_key = self._api_key_edit.text().strip()
|
| 433 |
+
|
| 434 |
+
if not api_key:
|
| 435 |
+
print(f"[CommercialAPIEngine] Cannot refresh models: No API key provided")
|
| 436 |
+
return
|
| 437 |
+
|
| 438 |
+
print(f"[CommercialAPIEngine] Refreshing {provider} models from API...")
|
| 439 |
+
|
| 440 |
+
# Save current selection
|
| 441 |
+
current_model = self._model_combo.currentText()
|
| 442 |
+
|
| 443 |
+
# Fetch models dynamically
|
| 444 |
+
if provider == "OpenAI":
|
| 445 |
+
models = fetch_openai_models(api_key)
|
| 446 |
+
elif provider == "Gemini":
|
| 447 |
+
models = fetch_gemini_models(api_key)
|
| 448 |
+
else:
|
| 449 |
+
print(f"[CommercialAPIEngine] Dynamic refresh not supported for {provider}")
|
| 450 |
+
return
|
| 451 |
+
|
| 452 |
+
# Update dropdown
|
| 453 |
+
self._model_combo.clear()
|
| 454 |
+
self._model_combo.addItems(models)
|
| 455 |
+
|
| 456 |
+
# Restore selection if possible
|
| 457 |
+
idx = self._model_combo.findText(current_model)
|
| 458 |
+
if idx >= 0:
|
| 459 |
+
self._model_combo.setCurrentIndex(idx)
|
| 460 |
+
|
| 461 |
+
print(f"[CommercialAPIEngine] Refreshed {len(models)} models for {provider}")
|
| 462 |
+
|
| 463 |
+
def get_config(self) -> Dict[str, Any]:
    """Collect the current widget state into a configuration dict.

    Returns an empty dict while the configuration widget has not been built.
    The effective ``model`` is the custom model ID when the custom-model
    checkbox is ticked, otherwise the dropdown selection. A blank prompt is
    reported as None.
    """
    if self._config_widget is None:
        return {}

    custom_enabled = self._use_custom_model_check.isChecked()
    custom_id = self._custom_model_edit.text().strip()
    prompt = self._prompt_edit.toPlainText().strip()

    # Use custom model if checkbox is enabled, otherwise use dropdown
    selected_model = custom_id if custom_enabled else self._model_combo.currentText()

    settings: Dict[str, Any] = {}
    settings["provider"] = self._provider_combo.currentText()
    settings["model"] = selected_model
    settings["api_key"] = self._api_key_edit.text().strip()
    settings["custom_prompt"] = prompt or None
    settings["use_custom_model"] = custom_enabled
    settings["custom_model_id"] = custom_id
    return settings
|
| 484 |
+
|
| 485 |
+
def set_config(self, config: Dict[str, Any]):
    """Restore a configuration dict (as produced by ``get_config``) to the widgets.

    Does nothing while the configuration widget has not been built. Missing
    or None entries leave the corresponding widget untouched where that is
    meaningful:

    * ``provider`` / ``model`` are selected only when present in the dropdowns.
    * ``custom_model_id`` is restored whenever it is provided, so the value
      survives a get_config/set_config round trip even while the custom-model
      checkbox is off.
    * ``api_key`` is applied only when non-empty, so a configuration saved
      without a key does not erase a key already present in the field.
    * ``custom_prompt`` is applied only when non-empty.
    """
    if self._config_widget is None:
        return

    provider = config.get("provider") or ""
    idx = self._provider_combo.findText(provider)
    if idx >= 0:
        self._provider_combo.setCurrentIndex(idx)

    # Restore custom model checkbox and field
    use_custom = bool(config.get("use_custom_model", False))
    self._use_custom_model_check.setChecked(use_custom)

    custom_model_id = config.get("custom_model_id") or ""
    if use_custom or custom_model_id:
        self._custom_model_edit.setText(custom_model_id)

    if not use_custom:
        model = config.get("model") or ""
        idx = self._model_combo.findText(model)
        if idx >= 0:
            self._model_combo.setCurrentIndex(idx)

    # Only overwrite the key field when the config actually carries a key.
    api_key = config.get("api_key")
    if api_key:
        self._api_key_edit.setText(api_key)

    custom_prompt = config.get("custom_prompt")
    if custom_prompt:
        self._prompt_edit.setPlainText(custom_prompt)
|
| 513 |
+
|
| 514 |
+
def load_model(self, config: Dict[str, Any]) -> bool:
    """Initialise the API client for the configured provider.

    Reads ``provider``, ``model`` and ``api_key`` from ``config``. The
    configuration is validated before the previous client is touched: a
    missing key or an unknown provider returns False and leaves any
    already-loaded client in place. On success the previous client is
    unloaded, the new one is stored in ``self.model`` and
    ``self._current_provider`` is set to the lower-cased provider name.

    Returns True on success, False otherwise. Exceptions raised while
    constructing the client are printed and reset the engine to the
    unloaded state.
    """
    try:
        provider = config.get("provider", "")
        model_name = config.get("model", "")
        api_key = config.get("api_key", "")

        if not api_key:
            print("Error: No API key provided")
            return False

        # Resolve the client class first so an unsupported provider does not
        # discard a client that is currently working.
        if provider == "OpenAI":
            client_cls = OpenAIInference
        elif provider == "Gemini":
            client_cls = GeminiInference
        elif provider == "Claude":
            client_cls = ClaudeInference
        else:
            print(f"Error: Unknown API provider: {provider!r}")
            return False

        # Unload previous model
        self.unload_model()

        # Initialize appropriate client
        self.model = client_cls(api_key=api_key, model=model_name)
        self._current_provider = provider.lower()
        return True

    except Exception as e:
        print(f"Error initializing API client: {e}")
        self.model = None
        self._current_provider = None
        return False
|
| 548 |
+
|
| 549 |
+
def unload_model(self):
    """Drop the API client, if any, and forget which provider it belonged to."""
    if self.model is None:
        return
    # Rebinding to None releases the engine's reference to the client.
    self.model = None
    self._current_provider = None
|
| 555 |
+
|
| 556 |
+
def is_model_loaded(self) -> bool:
    """Report whether an API client is currently held in ``self.model``."""
    client = self.model
    return client is not None
|
| 559 |
+
|
| 560 |
+
def transcribe_line(self, image: np.ndarray, config: Optional[Dict[str, Any]] = None) -> TranscriptionResult:
    """Transcribe an image through the loaded commercial API client.

    Args:
        image: Image to transcribe. A numpy array is converted to a PIL image;
            any other object is handed to the client unchanged.
        config: Settings dict; when None, ``self.get_config()`` is used.
            Keys read here: ``custom_prompt``, ``temperature``,
            ``max_output_tokens``, ``thinking_mode`` and ``model``.

    Returns:
        A TranscriptionResult with the transcribed text (empty string when
        the client returns nothing), confidence 1.0 and metadata holding the
        provider, the model and, when the client exposes ``last_usage``, its
        token usage and thinking text. If no client is loaded, or any
        exception occurs, a result with a bracketed message and confidence
        0.0 is returned instead; exceptions are printed, not raised.

    Settings are taken from the Qt widgets when they exist and have usable
    text, and from ``config`` otherwise (e.g. when no widgets were created).
    A temperature that is still unset is sent as 0.0; a non-positive token
    limit means "no limit".
    """
    if self.model is None:
        return TranscriptionResult(text="[API client not initialized]", confidence=0.0)

    if config is None:
        config = self.get_config()

    def _cast(value, caster):
        """Convert ``value`` with ``caster``; None for None or unparseable input."""
        if value is None:
            return None
        try:
            return caster(value)
        except (TypeError, ValueError):
            return None

    def _field(attr_name, caster):
        """Parse an optional line edit's text; None if absent, blank or invalid."""
        widget = getattr(self, attr_name, None)
        if widget is None:
            return None
        text = widget.text().strip()
        return _cast(text, caster) if text else None

    def _field_or(attr_name, caster, default):
        """Like ``_field`` but falls back to ``default`` instead of None."""
        parsed = _field(attr_name, caster)
        return default if parsed is None else parsed

    custom_prompt = config.get("custom_prompt")

    try:
        # Convert numpy array to PIL Image
        from PIL import Image
        pil_image = Image.fromarray(image) if isinstance(image, np.ndarray) else image

        # Sampling settings shared by every provider: widget first, then the
        # config dict as fallback.
        temperature = _field("_temperature_edit", float)
        if temperature is None:
            temperature = _cast(config.get("temperature"), float)
        max_tokens = _field("_max_tokens_edit", int)
        if max_tokens is None:
            max_tokens = _cast(config.get("max_output_tokens"), int)
        # Treat 0 as "no limit" (HTML number fields send 0 for blank)
        if max_tokens is not None and max_tokens <= 0:
            max_tokens = None
        effective_temperature = temperature if temperature is not None else 0.0

        # All API clients have transcribe() method
        # It returns a string directly, not a dict
        if self._current_provider == "gemini":
            # Thinking mode: from the combo box when present, else from config.
            fast_direct = False
            thinking_combo = getattr(self, "_thinking_combo", None)
            if thinking_combo is not None:
                thinking_label = thinking_combo.currentText()
                if "Low" in thinking_label:
                    thinking_mode = "low"
                    fast_direct = True  # low mode: request immediate output
                elif "High" in thinking_label:
                    thinking_mode = "high"
                else:
                    thinking_mode = None  # Auto
            else:
                thinking_mode = config.get("thinking_mode") or None

            # Without the checkbox, early exit stays disabled.
            early_exit_check = getattr(self, "_early_exit_check", None)
            fast_direct_early_exit = (
                early_exit_check.isChecked() if early_exit_check is not None else False
            )

            # Continuation settings
            auto_continue_check = getattr(self, "_auto_continue_check", None)
            auto_continue = bool(
                auto_continue_check is not None and auto_continue_check.isChecked()
            )
            max_auto_continuations = _field_or("_max_continuations_edit", int, 2)
            continuation_min_new_chars = _field_or("_min_new_chars_edit", int, 50)

            # Without the widget the reasoning fallback is disabled (1.0);
            # with it, 0.6 is the default when the field is blank or invalid.
            if getattr(self, "_reasoning_fallback_edit", None) is None:
                reasoning_fallback_threshold = 1.0
            else:
                reasoning_fallback_threshold = _field_or("_reasoning_fallback_edit", float, 0.6)

            fallback_cap = _field_or("_fallback_cap_edit", int, 8192)

            # Override max_tokens for LOW thinking mode if specified
            if thinking_mode == "low":
                low_initial = _field("_low_initial_tokens_edit", int)
                if low_initial is not None and low_initial > 0:
                    max_tokens = low_initial
                    print(f"🔧 LOW thinking mode: overriding max_output_tokens to {max_tokens}")

            # Debug: show the values that are actually sent
            print(f"📊 Final settings: thinking_mode={thinking_mode}, max_output_tokens={max_tokens or 'model default'}, temp={effective_temperature}")

            # Enable retry logic for Gemini to handle content blocking
            text = self.model.transcribe(
                pil_image,
                prompt=custom_prompt,
                temperature=effective_temperature,
                max_output_tokens=max_tokens,  # None = no limit, model uses its own maximum
                auto_retry_on_block=True,
                safety_relax=True,
                verbose_block_logging=True,
                thinking_mode=thinking_mode,
                fast_direct=fast_direct,
                fast_direct_early_exit=fast_direct_early_exit,
                auto_continue=auto_continue,
                max_auto_continuations=max_auto_continuations,
                continuation_min_new_chars=continuation_min_new_chars,
                reasoning_fallback_threshold=reasoning_fallback_threshold,
                fallback_max_output_tokens=fallback_cap,
                record_stats_csv="gemini_runs.csv",
                apply_restriction_prompt=False  # Let model reason freely — improves transcription quality
            )
        else:
            thinking_mode = config.get("thinking_mode") or None
            text = self.model.transcribe(
                pil_image,
                prompt=custom_prompt,
                temperature=effective_temperature,
                max_output_tokens=max_tokens,  # None = no limit, model uses its own maximum
                thinking_mode=thinking_mode,
            )

        meta: Dict[str, Any] = {
            "provider": self._current_provider,
            "model": config.get("model", ""),
        }
        if hasattr(self.model, "last_usage") and self.model.last_usage:
            usage = dict(self.model.last_usage)
            # Thinking text is reported separately from the token counters.
            thinking_text = usage.pop("thinking_text", None)
            meta["token_usage"] = usage
            if thinking_text:
                meta["thinking_text"] = thinking_text
        return TranscriptionResult(
            text=text if text else "",
            confidence=1.0,  # API models don't provide confidence
            metadata=meta,
        )

    except Exception as e:
        print(f"Error in API transcription: {e}")
        import traceback
        traceback.print_exc()
        return TranscriptionResult(text=f"[API Error: {e}]", confidence=0.0)
|
| 755 |
+
|
| 756 |
+
def get_capabilities(self) -> Dict[str, bool]:
    """Return the fixed capability flags advertised by this engine."""
    return dict(
        batch_processing=False,   # APIs typically process one at a time
        confidence_scores=False,  # Most don't provide confidence
        beam_search=False,        # Internal to API
        language_model=True,      # All are language models
        preprocessing=True,       # APIs handle preprocessing
    )
|
| 765 |
+
|
| 766 |
+
def requires_line_segmentation(self) -> bool:
    """Always False: this engine declares that it needs no line segmentation."""
    needs_segmentation = False
    return needs_segmentation
|
engines/kraken_engine.py
ADDED
|
@@ -0,0 +1,535 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Kraken HTR Engine Plugin
|
| 3 |
+
|
| 4 |
+
Wraps the Kraken OCR system as a plugin for the unified GUI.
|
| 5 |
+
Kraken is specialized for historical document OCR with robust segmentation and recognition.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import sys
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from typing import Dict, Any, Optional
|
| 11 |
+
import numpy as np
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def _print(msg: str) -> None:
|
| 15 |
+
"""Print with graceful fallback if console can't encode the message (e.g. Windows CP-1252)."""
|
| 16 |
+
try:
|
| 17 |
+
print(msg)
|
| 18 |
+
except UnicodeEncodeError:
|
| 19 |
+
print(msg.encode("ascii", errors="replace").decode("ascii"))
|
| 20 |
+
|
| 21 |
+
from htr_engine_base import HTREngine, TranscriptionResult
|
| 22 |
+
|
| 23 |
+
try:
|
| 24 |
+
from PyQt6.QtWidgets import (
|
| 25 |
+
QWidget, QVBoxLayout, QHBoxLayout, QLabel, QComboBox,
|
| 26 |
+
QPushButton, QLineEdit, QFileDialog, QGroupBox, QCheckBox
|
| 27 |
+
)
|
| 28 |
+
from PyQt6.QtCore import Qt
|
| 29 |
+
PYQT_AVAILABLE = True
|
| 30 |
+
except ImportError:
|
| 31 |
+
PYQT_AVAILABLE = False
|
| 32 |
+
QWidget = object
|
| 33 |
+
|
| 34 |
+
try:
|
| 35 |
+
from kraken import rpred
|
| 36 |
+
from kraken.lib import vgsl, models
|
| 37 |
+
KRAKEN_AVAILABLE = True
|
| 38 |
+
except ImportError:
|
| 39 |
+
KRAKEN_AVAILABLE = False
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
# Local model (included in repo).
# This is a relative path; load_model checks it with Path(...).exists(), so it
# is resolved against the process working directory.
LOCAL_BLLA_MODEL = "pagexml/blla.mlmodel"

# Preset Kraken models — local + Zenodo community models (auto-download on first use).
# Keys are the preset IDs stored in the config as "preset_id". Every entry has
# "description", "language" and "source"; entries with source "local" carry a
# "path", entries with source "zenodo" carry a "zenodo_id" (a DOI).
# NOTE(review): "blla-local" and "arabic-muharaf" are described as segmentation
# models, yet load_model wraps every preset in a TorchSeqRecognizer — confirm
# they are usable for recognition.
KRAKEN_MODELS = {
    "blla-local": {
        "path": LOCAL_BLLA_MODEL,
        "description": "BLLA Segmentation Model (Local, Default)",
        "language": "multi",
        "source": "local"
    },
    # --- VERIFIED ZENODO MODELS ---
    # CATMuS-Print: printed text, multilingual, verified DOI 10.5281/zenodo.10592716
    "catmus-print": {
        "zenodo_id": "10.5281/zenodo.10592716",
        "description": "CATMuS-Print (Modern Printed Text, multilingual)",
        "language": "multi",
        "source": "zenodo"
    },
    # Arabic handwritten segmentation (Muharaf Corpus), verified DOI 10.5281/zenodo.14295555
    "arabic-muharaf": {
        "zenodo_id": "10.5281/zenodo.14295555",
        "description": "Arabic Handwritten Segmentation (Muharaf Corpus)",
        "language": "arabic",
        "source": "zenodo"
    },
}
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
class KrakenEngine(HTREngine):
|
| 72 |
+
"""Kraken HTR engine plugin."""
|
| 73 |
+
|
| 74 |
+
def __init__(self):
|
| 75 |
+
self.model: Optional[Any] = None # TorchSeqRecognizer
|
| 76 |
+
self._config_widget: Optional[QWidget] = None
|
| 77 |
+
|
| 78 |
+
# Widget references
|
| 79 |
+
self._model_source_combo: Optional[QComboBox] = None
|
| 80 |
+
self._preset_combo: Optional[QComboBox] = None
|
| 81 |
+
self._custom_model_edit: Optional[QLineEdit] = None
|
| 82 |
+
self._bidi_reorder_check: Optional[QCheckBox] = None
|
| 83 |
+
|
| 84 |
+
def get_name(self) -> str:
    """Return the fixed name string of this engine."""
    engine_name = "Kraken"
    return engine_name
|
| 86 |
+
|
| 87 |
+
def get_description(self) -> str:
    """Return the fixed one-line description of this engine."""
    summary = "Kraken OCR - Specialized for historical documents with .mlmodel support"
    return summary
|
| 89 |
+
|
| 90 |
+
def is_available(self) -> bool:
    """Report whether the ``kraken`` package could be imported at module load."""
    kraken_importable = KRAKEN_AVAILABLE
    return kraken_importable
|
| 92 |
+
|
| 93 |
+
def get_unavailable_reason(self) -> str:
    """Return an install hint when Kraken is missing, otherwise an empty string."""
    return "" if KRAKEN_AVAILABLE else "Kraken not installed. Install with: pip install kraken"
|
| 97 |
+
|
| 98 |
+
def get_config_widget(self) -> QWidget:
    """Create Kraken configuration panel.

    The panel is built once and cached in ``self._config_widget``; later
    calls return the same widget. It contains a model-source selector, a
    preset-model group, a custom-model group (hidden initially) and a
    recognition-settings group.

    Returns:
        The configuration widget.

    Raises:
        RuntimeError: If PyQt6 could not be imported.
    """
    if not PYQT_AVAILABLE:
        raise RuntimeError("PyQt6 not installed. Install with: pip install PyQt6")
    if self._config_widget is not None:
        return self._config_widget

    widget = QWidget()
    layout = QVBoxLayout()

    # Model source selection
    source_group = QGroupBox("Model Source")
    source_layout = QVBoxLayout()

    self._model_source_combo = QComboBox()
    # Items are added before the signal is connected, so filling the combo
    # does not invoke the handler.
    self._model_source_combo.addItems(["Preset Models", "Custom Model File"])
    self._model_source_combo.currentTextChanged.connect(self._on_model_source_changed)
    source_layout.addWidget(self._model_source_combo)

    source_group.setLayout(source_layout)
    layout.addWidget(source_group)

    # Preset models group
    self._preset_group = QGroupBox("Preset Model")
    preset_layout = QVBoxLayout()

    self._preset_combo = QComboBox()
    # Populate first, connect afterwards: the handler opens a file dialog for
    # the custom entry and should only react to later selection changes.
    self._populate_preset_models()
    self._preset_combo.currentIndexChanged.connect(self._on_preset_model_changed)
    preset_layout.addWidget(QLabel("Model:"))
    preset_layout.addWidget(self._preset_combo)

    preset_hint = QLabel("Note: Zenodo models (⬇️) auto-download on first use")
    preset_hint.setStyleSheet("color: gray; font-size: 9pt;")
    preset_layout.addWidget(preset_hint)

    self._preset_group.setLayout(preset_layout)
    layout.addWidget(self._preset_group)

    # Custom model group
    self._custom_group = QGroupBox("Custom Model")
    custom_layout = QVBoxLayout()

    custom_layout.addWidget(QLabel("Model File (.mlmodel):"))
    model_layout = QHBoxLayout()
    self._custom_model_edit = QLineEdit()
    self._custom_model_edit.setPlaceholderText("Path to .mlmodel file")
    model_layout.addWidget(self._custom_model_edit)

    browse_btn = QPushButton("Browse...")
    browse_btn.clicked.connect(self._browse_model)
    model_layout.addWidget(browse_btn)
    custom_layout.addLayout(model_layout)

    self._custom_group.setLayout(custom_layout)
    self._custom_group.setVisible(False)  # Hidden by default
    layout.addWidget(self._custom_group)

    # Recognition settings
    settings_group = QGroupBox("Recognition Settings")
    settings_layout = QVBoxLayout()

    self._bidi_reorder_check = QCheckBox("Bidirectional Text Reordering")
    self._bidi_reorder_check.setChecked(True)
    self._bidi_reorder_check.setToolTip("Enable for RTL languages (Arabic, Hebrew, etc.)")
    settings_layout.addWidget(self._bidi_reorder_check)

    settings_group.setLayout(settings_layout)
    layout.addWidget(settings_group)

    # Push all groups to the top of the panel.
    layout.addStretch()
    widget.setLayout(layout)

    # Cache so repeated calls reuse the same widget instance.
    self._config_widget = widget
    return widget
|
| 173 |
+
|
| 174 |
+
def _populate_preset_models(self):
|
| 175 |
+
"""Populate preset models dropdown with local and Zenodo models."""
|
| 176 |
+
if self._preset_combo is None:
|
| 177 |
+
return
|
| 178 |
+
|
| 179 |
+
self._preset_combo.clear()
|
| 180 |
+
|
| 181 |
+
if not KRAKEN_MODELS:
|
| 182 |
+
self._preset_combo.addItem("No presets available")
|
| 183 |
+
return
|
| 184 |
+
|
| 185 |
+
# Local model first
|
| 186 |
+
for model_id, info in KRAKEN_MODELS.items():
|
| 187 |
+
if info.get("source") == "local":
|
| 188 |
+
desc = info.get('description', model_id)
|
| 189 |
+
self._preset_combo.addItem(f"📁 {desc}", userData=model_id)
|
| 190 |
+
break
|
| 191 |
+
|
| 192 |
+
self._preset_combo.insertSeparator(self._preset_combo.count())
|
| 193 |
+
|
| 194 |
+
# Zenodo models
|
| 195 |
+
for model_id, info in KRAKEN_MODELS.items():
|
| 196 |
+
if info.get("source") == "zenodo":
|
| 197 |
+
desc = info.get('description', model_id)
|
| 198 |
+
lang = info.get('language', '')
|
| 199 |
+
self._preset_combo.addItem(f"⬇️ {desc} ({lang})", userData=model_id)
|
| 200 |
+
|
| 201 |
+
self._preset_combo.insertSeparator(self._preset_combo.count())
|
| 202 |
+
self._preset_combo.addItem("📂 Browse Custom File...", userData="__custom__")
|
| 203 |
+
|
| 204 |
+
def _on_model_source_changed(self, source: str):
|
| 205 |
+
"""Toggle between preset and custom model selection."""
|
| 206 |
+
is_preset = (source == "Preset Models")
|
| 207 |
+
self._preset_group.setVisible(is_preset)
|
| 208 |
+
self._custom_group.setVisible(not is_preset)
|
| 209 |
+
|
| 210 |
+
def _on_preset_model_changed(self, index: int):
|
| 211 |
+
"""Handle preset selection — open file browser for custom option."""
|
| 212 |
+
model_id = self._preset_combo.currentData()
|
| 213 |
+
if model_id == "__custom__":
|
| 214 |
+
file_path, _ = QFileDialog.getOpenFileName(
|
| 215 |
+
self._config_widget,
|
| 216 |
+
"Select Kraken Model File",
|
| 217 |
+
"",
|
| 218 |
+
"Kraken Models (*.mlmodel);;All Files (*)"
|
| 219 |
+
)
|
| 220 |
+
if file_path:
|
| 221 |
+
self._model_source_combo.setCurrentText("Custom Model File")
|
| 222 |
+
self._custom_model_edit.setText(file_path)
|
| 223 |
+
self._preset_combo.blockSignals(True)
|
| 224 |
+
self._preset_combo.setCurrentIndex(0)
|
| 225 |
+
self._preset_combo.blockSignals(False)
|
| 226 |
+
|
| 227 |
+
def _browse_model(self):
|
| 228 |
+
"""Open file dialog to select model file."""
|
| 229 |
+
file_path, _ = QFileDialog.getOpenFileName(
|
| 230 |
+
self._config_widget,
|
| 231 |
+
"Select Kraken Model",
|
| 232 |
+
"models",
|
| 233 |
+
"Kraken Models (*.mlmodel);;All Files (*)"
|
| 234 |
+
)
|
| 235 |
+
|
| 236 |
+
if file_path:
|
| 237 |
+
self._custom_model_edit.setText(file_path)
|
| 238 |
+
|
| 239 |
+
def get_config(self) -> Dict[str, Any]:
    """Collect the current widget state into a configuration dict.

    Returns an empty dict while the configuration widget has not been built.
    Otherwise the dict always has ``model_source`` ("preset" or "custom") and
    ``bidi_reordering``. In custom mode ``model_path`` is the text of the
    path field; in preset mode ``preset_id`` and ``model_path`` are added
    only when the selected entry is a known preset.
    """
    if self._config_widget is None:
        return {}

    uses_preset = self._model_source_combo.currentText() == "Preset Models"

    settings = {
        "model_source": "preset" if uses_preset else "custom",
        "bidi_reordering": self._bidi_reorder_check.isChecked(),
    }

    if not uses_preset:
        settings["model_path"] = self._custom_model_edit.text()
        return settings

    chosen = self._preset_combo.currentData()
    if chosen and chosen in KRAKEN_MODELS:
        settings["preset_id"] = chosen
        # Presets without a "path" entry yield None here.
        settings["model_path"] = KRAKEN_MODELS[chosen].get("path")
    return settings
|
| 260 |
+
|
| 261 |
+
def set_config(self, config: Dict[str, Any]):
    """Apply a configuration dict to the widgets.

    Does nothing while the configuration widget has not been built. The
    model source defaults to "preset". In preset mode the dropdown entry
    whose data equals ``preset_id`` is selected if one exists; otherwise the
    custom path field is set from ``model_path``. The bidi checkbox defaults
    to checked.
    """
    if self._config_widget is None:
        return

    wants_preset = config.get("model_source", "preset") == "preset"
    source_label = "Preset Models" if wants_preset else "Custom Model File"
    self._model_source_combo.setCurrentText(source_label)

    if wants_preset:
        wanted_id = config.get("preset_id", "")
        combo = self._preset_combo
        # First row whose item data matches the requested preset, if any.
        match = next(
            (row for row in range(combo.count()) if combo.itemData(row) == wanted_id),
            None,
        )
        if match is not None:
            combo.setCurrentIndex(match)
    else:
        self._custom_model_edit.setText(config.get("model_path", ""))

    self._bidi_reorder_check.setChecked(config.get("bidi_reordering", True))
|
| 279 |
+
|
| 280 |
+
def load_model(self, config: Dict[str, Any]) -> bool:
    """Load Kraken model (local or Zenodo auto-download).

    Model resolution order:

    1. A known ``preset_id``: Zenodo presets are fetched through
       ``_download_zenodo_model``; local presets use their configured path.
    2. Otherwise ``model_path`` from ``config``.
    3. Otherwise the bundled default ``LOCAL_BLLA_MODEL``.

    A relative path that does not exist in the current working directory is
    also tried relative to the repository root (the parent of this file's
    directory), so the bundled model is found regardless of where the
    process was started.

    Returns True when the recognizer was created and stored in
    ``self.model``; False when Kraken is not installed, the download failed,
    the file is missing or loading raised (``self.model`` is then reset to
    None).
    """
    if not KRAKEN_AVAILABLE:
        print("Error: Kraken not installed. Install with: pip install kraken")
        return False

    try:
        model_path = config.get("model_path")
        preset_id = config.get("preset_id")

        # Resolve Zenodo preset: download if needed
        if preset_id and preset_id in KRAKEN_MODELS:
            model_info = KRAKEN_MODELS[preset_id]
            source = model_info.get("source")
            if source == "zenodo":
                zenodo_id = model_info.get("zenodo_id")
                model_path = self._download_zenodo_model(zenodo_id, preset_id)
                if not model_path:
                    print(f"Error: Failed to download Zenodo model '{preset_id}'")
                    return False
            elif source == "local":
                model_path = model_info.get("path")

        # Fall back to default local blla model
        if not model_path:
            model_path = LOCAL_BLLA_MODEL
            print(f"No model specified, using default: {model_path}")

        # Relative paths are tried against the CWD first, then the repo root.
        resolved = Path(model_path)
        if not resolved.exists() and not resolved.is_absolute():
            repo_candidate = Path(__file__).resolve().parent.parent / resolved
            if repo_candidate.exists():
                resolved = repo_candidate

        if not resolved.exists():
            print(f"Error: Model file not found: {model_path}")
            print("For Zenodo models, run: kraken get <zenodo_id>")
            return False

        model_path = str(resolved)
        vgsl_model = vgsl.TorchVGSLModel.load_model(model_path)
        from kraken.lib.models import TorchSeqRecognizer
        self.model = TorchSeqRecognizer(vgsl_model, device='cpu')
        print(f"Kraken model loaded from: {model_path}")
        return True

    except Exception as e:
        import traceback
        print(f"Error loading Kraken model: {e}")
        print(traceback.format_exc())
        self.model = None
        return False
|
| 320 |
+
|
| 321 |
+
def _download_zenodo_model(self, zenodo_id: str, model_name: str) -> Optional[str]:
    """Download a Kraken model from Zenodo via `kraken get`.

    Models are cached in `kraken_models/` inside the repo root, under the
    name ``<model_name>.mlmodel``. A cached file (exact name, or any
    ``.mlmodel`` whose stem contains ``model_name``) is reused without
    touching the network.

    Args:
        zenodo_id: Identifier passed verbatim to ``kraken get``.
        model_name: Preset name used for the cached file name.

    Returns:
        Local path on success, None on failure (missing ``kraken`` binary,
        missing id, non-zero exit, timeout, or the downloaded file could not
        be located).
    """
    import os
    import subprocess
    import shutil
    import sys
    import time

    if not zenodo_id:
        # Without an id there is nothing to fetch; fail with a clear message
        # instead of crashing on zenodo_id.split() below.
        _print(f"❌ No Zenodo ID configured for model '{model_name}'")
        return None

    # Prefer the kraken binary from the same venv as this Python process
    # (shutil.which only searches PATH, which may not include the venv bin/ in
    # systemd services that invoke uvicorn directly without activating the venv).
    venv_kraken = Path(sys.executable).parent / "kraken"
    kraken_cmd = str(venv_kraken) if venv_kraken.exists() else shutil.which("kraken")
    if not kraken_cmd:
        _print("❌ 'kraken' command not found. Install with: pip install kraken")
        _print(f"💡 Manual download: https://zenodo.org/record/{zenodo_id.split('/')[-1]}")
        return None

    repo_root = Path(__file__).parent.parent
    models_dir = repo_root / "kraken_models"
    models_dir.mkdir(exist_ok=True)
    model_path = models_dir / f"{model_name}.mlmodel"

    if model_path.exists():
        _print(f"✅ Using cached Zenodo model: {model_path}")
        return str(model_path)

    # Check for any existing name-matched file
    for existing in models_dir.glob("*.mlmodel"):
        if model_name.lower() in existing.stem.lower():
            _print(f"✅ Found existing model: {existing}")
            return str(existing)

    _print(f"📥 Downloading Zenodo model {zenodo_id} …")
    _print(f"📂 Will save to: {model_path}")
    _print("⏳ This may take a few minutes on first use …")

    # Remember when the download began so that only files written by THIS
    # run are considered afterwards.
    started_at = time.time()

    try:
        result = subprocess.run(
            [kraken_cmd, "get", zenodo_id],
            capture_output=True, text=True, timeout=300
        )
        if result.returncode == 0:
            # Directories in which kraken / htrmopo may have stored the model.
            # NOTE(review): the XDG entry is presumably where Linux installs
            # (e.g. the Docker image) keep the htrmopo cache - confirm against
            # the installed kraken version.
            xdg_data_home = os.environ.get("XDG_DATA_HOME")
            linux_data_dir = Path(xdg_data_home) if xdg_data_home else Path.home() / ".local" / "share"
            search_dirs = [
                Path.home() / "Library" / "Application Support" / "htrmopo",
                linux_data_dir / "htrmopo",
                Path.home() / ".kraken",
            ]

            # Collect every .mlmodel modified since the download started
            # (2 s slack for coarse filesystem timestamps) and keep the
            # newest one. Taking "the first file touched in the last two
            # minutes" could copy a different, recently fetched model under
            # this preset's name and cache it permanently.
            fresh = []
            for d in search_dirs:
                if not d.exists():
                    continue
                for p in d.rglob("*.mlmodel"):
                    mtime = p.stat().st_mtime
                    if mtime >= started_at - 2:
                        fresh.append((mtime, p))
            downloaded = max(fresh, key=lambda item: item[0])[1] if fresh else None

            if downloaded and downloaded.exists():
                shutil.copy2(downloaded, model_path)
                _print(f"✅ Model saved to: {model_path}")
                return str(model_path)
            else:
                _print("⚠️ Download succeeded but couldn't locate the file")
        else:
            _print(f"❌ kraken get failed (exit {result.returncode}): {result.stderr}")
        _print(f"💡 Manual: kraken get {zenodo_id} then copy to {models_dir}/")
    except subprocess.TimeoutExpired:
        _print("⏱️ Download timeout (>5 min). Try manually: kraken get " + zenodo_id)
    except Exception as e:
        _print(f"❌ Download error: {e}")

    return None
|
| 397 |
+
|
| 398 |
+
def unload_model(self):
    """Drop the loaded recognizer and release cached CUDA memory, if any."""
    if self.model is None:
        # Nothing is loaded, so there is nothing to release.
        return

    # Remove the attribute first so the old object loses this reference,
    # then restore the "not loaded" sentinel that is_model_loaded() checks.
    del self.model
    self.model = None

    import torch
    cuda_present = torch.cuda.is_available()
    if cuda_present:
        torch.cuda.empty_cache()
|
| 408 |
+
|
| 409 |
+
def is_model_loaded(self) -> bool:
    """Report whether ``self.model`` currently holds a recognizer."""
    no_model = self.model is None
    return not no_model
|
| 412 |
+
|
| 413 |
+
def transcribe_line(self, image: np.ndarray, config: Optional[Dict[str, Any]] = None) -> TranscriptionResult:
    """Recognise the text of one line image with the loaded Kraken model.

    The image is converted to grayscale (never binarized) and wrapped in a
    one-line segmentation that spans the whole picture, because ``rpred``
    works on segmented pages rather than bare line crops.

    Args:
        image: Line image as a numpy array; any other object is passed
            through as if it were already a PIL image.
        config: Optional settings; only ``bidi_reordering`` is read.
            Defaults to ``self.get_config()``.

    Returns:
        A TranscriptionResult with the predicted text and the mean of the
        per-character confidences. Errors are reported in the text field
        with confidence 0.0 rather than raised.
    """
    if self.model is None:
        return TranscriptionResult(text="[Model not loaded]", confidence=0.0)

    settings = self.get_config() if config is None else config

    try:
        import numpy as np
        from PIL import Image as PILImage

        line_image = PILImage.fromarray(image) if isinstance(image, np.ndarray) else image

        # Grayscale only. Thresholding is deliberately avoided: it removes
        # stroke detail that grayscale-trained models rely on.
        if line_image.mode != 'L':
            line_image = line_image.convert('L')

        from kraken.containers import BaselineLine, Segmentation

        width, height = line_image.width, line_image.height
        # Coordinates are 0-indexed, so the last valid pixel is size - 1;
        # anything larger is rejected as outside the image bounds.
        right, bottom = width - 1, height - 1
        mid_y = height // 2

        # One synthetic line: horizontal baseline through the middle and a
        # rectangular boundary around the whole image.
        whole_line = BaselineLine(
            id='line_0',
            baseline=[[0, mid_y], [right, mid_y]],
            boundary=[[0, 0], [right, 0], [right, bottom], [0, bottom]],
            text='',
            tags=None,
            split=None
        )

        layout = Segmentation(
            type='baselines',
            imagename='line',
            text_direction='horizontal-lr',
            script_detection=False,
            lines=[whole_line],
            regions={},
            line_orders=[]
        )

        use_bidi = settings.get("bidi_reordering", True)

        # rpred yields lazily; materialise it so the first record can be read.
        predictions = list(rpred.rpred(
            network=self.model,
            im=line_image,
            bounds=layout,
            bidi_reordering=use_bidi
        ))

        if not predictions:
            return TranscriptionResult(text="", confidence=0.0)

        first = predictions[0]
        scores = first.confidences
        if scores:
            mean_score = sum(scores) / len(scores)
        else:
            mean_score = 1.0

        return TranscriptionResult(
            text=first.prediction,
            confidence=mean_score,
            metadata={"model": "kraken"}
        )

    except Exception as e:
        import traceback
        print(f"Error in Kraken transcription: {e}")
        print(traceback.format_exc())
        return TranscriptionResult(text=f"[Error: {e}]", confidence=0.0)
|
| 508 |
+
|
| 509 |
+
def get_capabilities(self) -> Dict[str, bool]:
    """Return the feature flags advertised by the Kraken engine.

    Only ``confidence_scores`` is enabled (transcribe_line averages the
    per-character confidences); every other flag is reported as False.
    """
    flag_names = (
        "batch_processing",
        "confidence_scores",
        "beam_search",
        "language_model",
        "preprocessing",
    )
    flags = {name: False for name in flag_names}
    flags["confidence_scores"] = True
    return flags
|
| 518 |
+
|
| 519 |
+
|
| 520 |
+
def download_preset_model(preset_name: str) -> Optional[str]:
    """Resolve a Kraken preset to a local model file, downloading if required.

    Module-level convenience for callers that only need the file path
    (batch processing, web server) and do not want a fully initialised
    KrakenEngine.

    Args:
        preset_name: Key into ``KRAKEN_MODELS``.

    Returns:
        The local file path, or None when the preset is unknown, has an
        unsupported source, or the download fails.
    """
    if preset_name not in KRAKEN_MODELS:
        print(f"Unknown Kraken preset: '{preset_name}'. Available: {list(KRAKEN_MODELS)}")
        return None

    entry = KRAKEN_MODELS[preset_name]
    origin = entry.get("source")

    if origin == "local":
        return entry.get("path")

    if origin == "zenodo":
        # __new__ skips __init__: only the download helper is needed here.
        bare_engine = KrakenEngine.__new__(KrakenEngine)
        return bare_engine._download_zenodo_model(entry["zenodo_id"], preset_name)

    return None
|
engines/openwebui_engine.py
ADDED
|
@@ -0,0 +1,505 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
OpenWebUI Engine Plugin
|
| 3 |
+
|
| 4 |
+
Wraps the OpenWebUI API (OpenAI-compatible) from uni-freiburg.de as an HTR engine.
|
| 5 |
+
Supports multiple models available on the OpenWebUI platform.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from typing import Dict, Any, Optional, List
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
import numpy as np
|
| 11 |
+
from PIL import Image
|
| 12 |
+
import io
|
| 13 |
+
import base64
|
| 14 |
+
|
| 15 |
+
from htr_engine_base import HTREngine, TranscriptionResult
|
| 16 |
+
|
| 17 |
+
try:
|
| 18 |
+
from PyQt6.QtWidgets import (
|
| 19 |
+
QWidget, QVBoxLayout, QHBoxLayout, QLabel, QComboBox,
|
| 20 |
+
QPushButton, QCheckBox, QLineEdit, QGroupBox, QTextEdit,
|
| 21 |
+
QSpinBox
|
| 22 |
+
)
|
| 23 |
+
from PyQt6.QtCore import Qt
|
| 24 |
+
PYQT_AVAILABLE = True
|
| 25 |
+
except ImportError:
|
| 26 |
+
PYQT_AVAILABLE = False
|
| 27 |
+
QWidget = object
|
| 28 |
+
|
| 29 |
+
try:
|
| 30 |
+
from openai import OpenAI
|
| 31 |
+
OPENAI_AVAILABLE = True
|
| 32 |
+
except ImportError:
|
| 33 |
+
OPENAI_AVAILABLE = False
|
| 34 |
+
|
| 35 |
+
try:
|
| 36 |
+
from dotenv import load_dotenv
|
| 37 |
+
DOTENV_AVAILABLE = True
|
| 38 |
+
except ImportError:
|
| 39 |
+
DOTENV_AVAILABLE = False
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class OpenWebUIEngine(HTREngine):
    """OpenWebUI API HTR engine plugin (OpenAI-compatible).

    Holds an ``openai.OpenAI`` client pointed at the configured base URL and
    sends each image as a base64-encoded PNG inside a chat-completion
    request. An optional PyQt6 panel exposes API key, model, temperature,
    token limit and a custom prompt.
    """

    def __init__(self):
        self.client: Optional[OpenAI] = None
        self._config_widget: Optional[QWidget] = None
        self._available_models: List[str] = []

        # Store config from load_model for batch processing
        self._loaded_config: Dict[str, Any] = {}

        # Widget references
        self._model_combo: Optional[QComboBox] = None
        self._api_key_edit: Optional[QLineEdit] = None
        self._show_key_check: Optional[QCheckBox] = None
        self._prompt_edit: Optional[QTextEdit] = None
        self._temperature_spin: Optional[QSpinBox] = None
        self._max_tokens_spin: Optional[QSpinBox] = None
        self._refresh_models_btn: Optional[QPushButton] = None

        # Default API configuration; filled in by load_model()
        self.base_url = ""

        # Load environment variables from .env file (only once when instantiated)
        self._load_env_variables()

    def _load_env_variables(self):
        """Load environment variables from .env file if available.

        Kept as a separate entry point for existing callers; the work is
        done by ``_load_env_file`` so the file is read only once.
        """
        self._load_env_file()

    def _load_env_file(self):
        """Load environment variables from project root's .env file.

        Looks for .env in the project root directory (parent of engines/).
        Silently skips loading if python-dotenv is not installed or if .env doesn't exist.

        If .env loading fails or is skipped, the engine will still work if the API key
        is provided through the config dict.
        """
        if not DOTENV_AVAILABLE:
            return

        env_path = Path(__file__).parent.parent / ".env"
        if env_path.exists():
            load_dotenv(env_path)

    def get_name(self) -> str:
        """Return the display name of the engine."""
        return "OpenWebUI"

    def get_description(self) -> str:
        """Return a one-line description of the engine."""
        return "OpenWebUI API from openwebui.uni-freiburg.de (OpenAI-compatible, multiple models)"

    def is_available(self) -> bool:
        """The engine is usable whenever the ``openai`` package imported."""
        return OPENAI_AVAILABLE

    def get_unavailable_reason(self) -> str:
        """Explain why the engine is unavailable, or return an empty string."""
        if not OPENAI_AVAILABLE:
            return "OpenAI library not installed. Install with: pip install openai"
        return ""

    def get_config_widget(self) -> QWidget:
        """Create OpenWebUI configuration panel.

        The widget is built once and cached; later calls return the same
        instance.
        """
        if self._config_widget is not None:
            return self._config_widget

        widget = QWidget()
        layout = QVBoxLayout()

        # API Key section
        key_group = QGroupBox("API Key")
        key_layout = QVBoxLayout()

        key_input_layout = QHBoxLayout()
        self._api_key_edit = QLineEdit()
        self._api_key_edit.setEchoMode(QLineEdit.EchoMode.Password)
        self._api_key_edit.setPlaceholderText("Enter your OpenWebUI API key")
        key_input_layout.addWidget(self._api_key_edit)

        self._show_key_check = QCheckBox("Show")
        self._show_key_check.toggled.connect(self._toggle_key_visibility)
        key_input_layout.addWidget(self._show_key_check)
        key_layout.addLayout(key_input_layout)

        key_hint = QLabel("Get your API key from https://openwebui.uni-freiburg.de")
        key_hint.setStyleSheet("color: gray; font-size: 9pt;")
        key_layout.addWidget(key_hint)

        key_group.setLayout(key_layout)
        layout.addWidget(key_group)

        # Model selection with refresh button
        model_group = QGroupBox("Model Selection")
        model_layout = QVBoxLayout()

        model_select_layout = QHBoxLayout()
        self._model_combo = QComboBox()
        self._model_combo.setMinimumWidth(300)
        model_select_layout.addWidget(self._model_combo)

        self._refresh_models_btn = QPushButton("Refresh Models")
        self._refresh_models_btn.clicked.connect(self._refresh_models)
        model_select_layout.addWidget(self._refresh_models_btn)

        model_layout.addLayout(model_select_layout)

        model_hint = QLabel("Click 'Refresh Models' to load available models from the server")
        model_hint.setStyleSheet("color: gray; font-size: 9pt;")
        model_layout.addWidget(model_hint)

        model_group.setLayout(model_layout)
        layout.addWidget(model_group)

        # Generation parameters
        params_group = QGroupBox("Generation Parameters")
        params_layout = QVBoxLayout()

        # Temperature is edited as an integer number of hundredths
        temp_layout = QHBoxLayout()
        temp_layout.addWidget(QLabel("Temperature:"))
        self._temperature_spin = QSpinBox()
        self._temperature_spin.setRange(0, 100)
        self._temperature_spin.setValue(10)  # 0.1
        self._temperature_spin.setSuffix(" (×0.01)")
        temp_layout.addWidget(self._temperature_spin)
        temp_layout.addStretch()
        params_layout.addLayout(temp_layout)

        # Max tokens
        tokens_layout = QHBoxLayout()
        tokens_layout.addWidget(QLabel("Max Tokens:"))
        self._max_tokens_spin = QSpinBox()
        self._max_tokens_spin.setRange(100, 4096)
        self._max_tokens_spin.setValue(500)
        tokens_layout.addWidget(self._max_tokens_spin)
        tokens_layout.addStretch()
        params_layout.addLayout(tokens_layout)

        params_group.setLayout(params_layout)
        layout.addWidget(params_group)

        # Custom prompt section
        prompt_group = QGroupBox("Custom Prompt (Optional)")
        prompt_layout = QVBoxLayout()

        self._prompt_edit = QTextEdit()
        self._prompt_edit.setPlaceholderText(
            "Enter custom transcription prompt...\n\n"
            "Default prompt:\n"
            "Transcribe the text in this historical manuscript line image. "
            "Return only the transcribed text without any explanation or formatting."
        )
        self._prompt_edit.setMaximumHeight(120)
        prompt_layout.addWidget(self._prompt_edit)

        prompt_group.setLayout(prompt_layout)
        layout.addWidget(prompt_group)

        layout.addStretch()
        widget.setLayout(layout)

        self._config_widget = widget

        # Try to load saved API key
        self._load_saved_api_key()

        return widget

    def _toggle_key_visibility(self, checked: bool):
        """Toggle API key visibility."""
        if checked:
            self._api_key_edit.setEchoMode(QLineEdit.EchoMode.Normal)
        else:
            self._api_key_edit.setEchoMode(QLineEdit.EchoMode.Password)

    def _get_api_key_file(self) -> 'Path':
        """Get path to API key storage file (creating its directory)."""
        storage_dir = Path.home() / ".trocr_gui"
        storage_dir.mkdir(exist_ok=True)
        return storage_dir / "api_keys.json"

    def _load_saved_api_key(self):
        """Load saved API key into the key field; failures only warn."""
        try:
            import json
            key_file = self._get_api_key_file()

            if key_file.exists():
                with open(key_file, "r") as f:
                    keys = json.load(f)

                if "openwebui" in keys:
                    self._api_key_edit.setText(keys["openwebui"])
        except Exception as e:
            print(f"Warning: Could not load saved API key: {e}")

    def _save_api_key(self):
        """Save API key from the key field; failures only warn.

        Other providers' entries in the shared JSON file are preserved.
        """
        try:
            import json
            import os
            key_file = self._get_api_key_file()

            # Load existing keys
            keys = {}
            if key_file.exists():
                with open(key_file, "r") as f:
                    keys = json.load(f)

            # Update key for OpenWebUI
            api_key = self._api_key_edit.text().strip()

            if api_key:
                keys["openwebui"] = api_key

                with open(key_file, "w") as f:
                    json.dump(keys, f, indent=2)

                # The file holds secrets in plaintext: restrict it to the
                # owner. Best effort only - some platforms ignore POSIX modes.
                try:
                    os.chmod(key_file, 0o600)
                except OSError as chmod_error:
                    print(f"Warning: Could not restrict permissions on {key_file}: {chmod_error}")
        except Exception as e:
            print(f"Warning: Could not save API key: {e}")

    def _refresh_models(self):
        """Fetch available models from OpenWebUI API and fill the combo box."""
        api_key = self._api_key_edit.text().strip()

        if not api_key:
            self._model_combo.clear()
            self._model_combo.addItem("Please enter API key first")
            return

        if not self.base_url:
            # base_url stays empty until load_model() has run with a
            # "base_url" entry; never build a client (and hand it the API
            # key) without a known server address.
            self._model_combo.clear()
            self._model_combo.addItem("Please load the engine with a base URL first")
            return

        try:
            # Create temporary client to fetch models
            client = OpenAI(
                base_url=self.base_url,
                api_key=api_key
            )

            # Fetch models
            models = client.models.list()

            self._available_models = [model.id for model in models.data]

            # Update combo box
            self._model_combo.clear()
            if self._available_models:
                self._model_combo.addItems(sorted(self._available_models))
                print(f"[OpenWebUI] Loaded {len(self._available_models)} models")
            else:
                self._model_combo.addItem("No models found")

        except Exception as e:
            print(f"Error fetching models: {e}")
            self._model_combo.clear()
            self._model_combo.addItem(f"Error: {str(e)[:50]}")

    def get_config(self) -> Dict[str, Any]:
        """Extract configuration from widget controls.

        Returns an empty dict when the widget has not been created.
        """
        if self._config_widget is None:
            return {}

        prompt_text = self._prompt_edit.toPlainText().strip()

        return {
            "api_key": self._api_key_edit.text().strip(),
            "model": self._model_combo.currentText(),
            "temperature": self._temperature_spin.value() / 100.0,
            "max_tokens": self._max_tokens_spin.value(),
            "custom_prompt": prompt_text if prompt_text else None,
        }

    def set_config(self, config: Dict[str, Any]):
        """Restore configuration to widget controls.

        Does nothing when the widget has not been created. Missing or
        ``None`` entries fall back to the widget defaults.
        """
        if self._config_widget is None:
            return

        self._api_key_edit.setText(config.get("api_key") or "")

        model = config.get("model") or ""
        idx = self._model_combo.findText(model)
        if idx >= 0:
            self._model_combo.setCurrentIndex(idx)

        temperature = config.get("temperature", 0.1)
        if temperature is None:
            temperature = 0.1
        # round(), not int(): 0.29 * 100 is 28.999999999999996 in binary
        # floating point, and truncation would restore 28 instead of 29.
        self._temperature_spin.setValue(round(float(temperature) * 100))

        max_tokens = config.get("max_tokens", 500)
        if max_tokens is None:
            max_tokens = 500
        self._max_tokens_spin.setValue(int(max_tokens))

        # Always write the prompt so a config without one clears whatever
        # text was left in the editor by a previous configuration.
        self._prompt_edit.setPlainText(config.get("custom_prompt") or "")

    def load_model(self, config: Dict[str, Any]) -> bool:
        """Initialize OpenWebUI client.

        Args:
            config: Must contain ``api_key`` and ``base_url``; the whole
                dict is remembered for later ``transcribe_line`` calls.

        Returns:
            True when the client was created, False otherwise.
        """
        try:
            api_key = config.get("api_key", "")

            if not api_key:
                print("Error: No API key provided. Paste your key in the field.")
                return False

            # "or" guards against an explicit None, which has no .strip()
            base_url = str(config.get("base_url") or "").strip().rstrip("/")
            if not base_url:
                print("Error: No OpenWebUI base URL provided.")
                return False

            # Store config for batch processing (model, temperature, etc.)
            self._loaded_config = config.copy()

            # Save API key for future use
            if self._api_key_edit and self._api_key_edit.text().strip():
                self._save_api_key()

            self.base_url = base_url

            # Initialize client
            self.client = OpenAI(
                base_url=self.base_url,
                api_key=api_key
            )

            model = config.get("model", config.get("model_id", "unknown"))
            print(f"[OpenWebUI] Client initialized with base URL: {self.base_url}, model: {model}")
            return True

        except Exception as e:
            print(f"Error initializing OpenWebUI client: {e}")
            self.client = None
            return False

    def unload_model(self):
        """Unload OpenWebUI client and forget the stored configuration."""
        if self.client is not None:
            self.client = None
        self._loaded_config = {}

    def is_model_loaded(self) -> bool:
        """Check if client is initialized."""
        return self.client is not None

    def transcribe_line(self, image: np.ndarray, config: Optional[Dict[str, Any]] = None) -> TranscriptionResult:
        """Transcribe a line image with OpenWebUI API.

        Args:
            image: Image as a numpy array; any other object is treated as
                an already constructed PIL image.
            config: Optional settings (``model``/``model_id``,
                ``temperature``, ``max_tokens``, ``custom_prompt``).
                Defaults to the config stored by ``load_model``, then to
                the widget values.

        Returns:
            A TranscriptionResult. Confidence is fixed at 1.0 on success;
            errors are reported in the text field with confidence 0.0.
        """
        if self.client is None:
            return TranscriptionResult(text="[OpenWebUI client not initialized]", confidence=0.0)

        if config is None:
            # First try loaded config (from batch processing), then GUI config
            if self._loaded_config:
                config = self._loaded_config
            else:
                config = self.get_config()

        try:
            # Convert numpy array to PIL Image
            if isinstance(image, np.ndarray):
                pil_image = Image.fromarray(image)
            else:
                pil_image = image

            # Convert to RGB if needed
            if pil_image.mode != 'RGB':
                pil_image = pil_image.convert('RGB')

            # Encode image to base64
            buffered = io.BytesIO()
            pil_image.save(buffered, format="PNG")
            img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')

            # Prepare prompt
            custom_prompt = config.get("custom_prompt")
            if custom_prompt:
                prompt = custom_prompt
            else:
                prompt = (
                    "Transcribe the text in this historical manuscript line image. "
                    "Return only the transcribed text without any explanation or formatting."
                )

            # Get model and parameters. load_model() accepts "model_id" as
            # an alias for "model", so honour the same alias here.
            model = config.get("model") or config.get("model_id") or "gpt-4-vision-preview"
            temperature = config.get("temperature", 0.1)
            max_tokens = config.get("max_tokens")
            # Treat 0 as "no limit" (HTML number fields send 0 for blank)
            if max_tokens is not None and max_tokens <= 0:
                max_tokens = None

            # Call OpenWebUI API (OpenAI-compatible)
            api_kwargs = dict(
                model=model,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": prompt
                            },
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{img_base64}"
                                }
                            }
                        ]
                    }
                ],
                temperature=temperature,
            )
            if max_tokens is not None:
                api_kwargs["max_tokens"] = max_tokens
            response = self.client.chat.completions.create(**api_kwargs)

            # Extract transcription
            text = response.choices[0].message.content.strip()

            # Extract usage info
            usage = {}
            if hasattr(response, 'usage') and response.usage:
                usage = {
                    "prompt_tokens": response.usage.prompt_tokens,
                    "completion_tokens": response.usage.completion_tokens,
                    "total_tokens": response.usage.total_tokens
                }

            return TranscriptionResult(
                text=text,
                confidence=1.0,  # OpenWebUI doesn't provide confidence
                metadata={
                    "provider": "openwebui",
                    "model": model,
                    "usage": usage
                }
            )

        except Exception as e:
            print(f"Error in OpenWebUI transcription: {e}")
            import traceback
            traceback.print_exc()
            return TranscriptionResult(text=f"[OpenWebUI Error: {e}]", confidence=0.0)

    def get_capabilities(self) -> Dict[str, bool]:
        """OpenWebUI capabilities."""
        return {
            "batch_processing": False,
            "confidence_scores": False,
            "beam_search": False,
            "language_model": True,
            "preprocessing": True,
        }

    def requires_line_segmentation(self) -> bool:
        """OpenWebUI VLMs can process full pages directly without segmentation."""
        return False  # VLMs process full page images
|
engines/pylaia_engine.py
ADDED
|
@@ -0,0 +1,414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PyLaia Engine Plugin
|
| 3 |
+
|
| 4 |
+
Wraps the PyLaia CTC-based HTR inference system as a plugin.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Dict, Any, List, Optional
|
| 9 |
+
import numpy as np
|
| 10 |
+
|
| 11 |
+
from htr_engine_base import HTREngine, TranscriptionResult
|
| 12 |
+
|
| 13 |
+
try:
|
| 14 |
+
from PyQt6.QtWidgets import (
|
| 15 |
+
QWidget, QVBoxLayout, QHBoxLayout, QLabel, QComboBox,
|
| 16 |
+
QPushButton, QCheckBox, QLineEdit, QFileDialog,
|
| 17 |
+
QGroupBox, QDoubleSpinBox
|
| 18 |
+
)
|
| 19 |
+
from PyQt6.QtCore import Qt
|
| 20 |
+
PYQT_AVAILABLE = True
|
| 21 |
+
except ImportError:
|
| 22 |
+
PYQT_AVAILABLE = False
|
| 23 |
+
QWidget = object
|
| 24 |
+
|
| 25 |
+
try:
|
| 26 |
+
# Use native Linux implementation (no WSL dependency)
|
| 27 |
+
from inference_pylaia_native import PyLaiaInference, PYLAIA_MODELS
|
| 28 |
+
PYLAIA_AVAILABLE = True
|
| 29 |
+
PYLAIA_LM_AVAILABLE = False # Language model not yet implemented
|
| 30 |
+
except ImportError:
|
| 31 |
+
PYLAIA_AVAILABLE = False
|
| 32 |
+
PYLAIA_MODELS = {}
|
| 33 |
+
PYLAIA_LM_AVAILABLE = False
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class PyLaiaEngine(HTREngine):
    """PyLaia CTC-based HTR engine plugin.

    Wraps ``PyLaiaInference`` behind the ``HTREngine`` interface. The class
    builds the Qt configuration panel, resolves preset names (local paths or
    Hugging Face Hub repositories) to checkpoint/symbol files, and
    transcribes single line images.
    """

    def __init__(self):
        self.model: Optional[PyLaiaInference] = None
        # No LM-enabled inference class is imported by this module, so the
        # attribute is typed loosely instead of naming an undefined class.
        self.model_lm: Optional[Any] = None
        self._config_widget: Optional[QWidget] = None

        # Widget references (populated by get_config_widget()).
        self._model_combo: Optional[QComboBox] = None
        self._use_lm_check: Optional[QCheckBox] = None
        self._lm_weight_spin: Optional[QDoubleSpinBox] = None
        self._custom_model_edit: Optional[QLineEdit] = None
        self._custom_lm_edit: Optional[QLineEdit] = None
        self._browse_lm_btn: Optional[QPushButton] = None
        self._enable_spaces_check: Optional[QCheckBox] = None
        self._flip_rtl_check: Optional[QCheckBox] = None

    def get_name(self) -> str:
        """Return the display name of the engine."""
        return "CRNN-CTC (PyLaia-inspired)"

    def get_description(self) -> str:
        """Return a one-line description of the engine."""
        return "Puigcerver CRNN-CTC: clean-room PyTorch reimplementation of the PyLaia architecture"

    def get_aliases(self) -> List[str]:
        """Return alternative names under which this engine can be looked up."""
        return ["crnn-ctc", "CRNN-CTC", "PyLaia"]  # "PyLaia" kept for backward compatibility

    def is_available(self) -> bool:
        """Return True when the native inference module could be imported."""
        return PYLAIA_AVAILABLE

    def get_unavailable_reason(self) -> str:
        """Explain why the engine is unavailable (empty string if it is available)."""
        if not PYLAIA_AVAILABLE:
            return "CRNN-CTC engine not available. Check that inference_pylaia_native.py exists and dependencies are installed."
        return ""

    def get_config_widget(self) -> QWidget:
        """Create (once) and return the PyLaia configuration panel.

        Returns:
            QWidget: Panel with model, language-model and output option groups.

        Raises:
            RuntimeError: If PyQt6 is not installed.
        """
        if self._config_widget is not None:
            return self._config_widget

        if not PYQT_AVAILABLE:
            # Without this guard the first Qt class used below would fail
            # with an opaque NameError.
            raise RuntimeError("PyQt6 is required to build the CRNN-CTC configuration panel")

        widget = QWidget()
        layout = QVBoxLayout()

        # Model selection
        model_group = QGroupBox("Model Selection")
        model_layout = QVBoxLayout()

        # Preset models
        model_layout.addWidget(QLabel("Preset Model:"))
        self._model_combo = QComboBox()
        self._populate_preset_models()
        self._model_combo.currentTextChanged.connect(self._on_preset_changed)
        model_layout.addWidget(self._model_combo)

        # Custom model path
        model_layout.addWidget(QLabel("Custom Model Path:"))
        custom_layout = QHBoxLayout()
        self._custom_model_edit = QLineEdit()
        self._custom_model_edit.setPlaceholderText("Leave empty to use preset model")
        custom_layout.addWidget(self._custom_model_edit)
        browse_model_btn = QPushButton("Browse...")
        browse_model_btn.clicked.connect(self._browse_model)
        custom_layout.addWidget(browse_model_btn)
        model_layout.addLayout(custom_layout)

        model_group.setLayout(model_layout)
        layout.addWidget(model_group)

        # Language model settings
        lm_group = QGroupBox("Language Model (Optional)")
        lm_layout = QVBoxLayout()

        self._use_lm_check = QCheckBox("Use Language Model")
        self._use_lm_check.setChecked(False)
        self._use_lm_check.toggled.connect(self._on_lm_toggled)
        if not PYLAIA_LM_AVAILABLE:
            self._use_lm_check.setEnabled(False)
            # This module sets the LM flag to False with the note "not yet
            # implemented", so advising the user to install kenlm was misleading.
            self._use_lm_check.setToolTip(
                "Language model decoding is not available for this engine (not yet implemented)."
            )
        lm_layout.addWidget(self._use_lm_check)

        # LM weight
        weight_layout = QHBoxLayout()
        weight_layout.addWidget(QLabel("LM Weight:"))
        self._lm_weight_spin = QDoubleSpinBox()
        self._lm_weight_spin.setRange(0.0, 10.0)
        self._lm_weight_spin.setValue(1.5)
        self._lm_weight_spin.setSingleStep(0.1)
        self._lm_weight_spin.setToolTip("Higher = more influence from language model")
        self._lm_weight_spin.setEnabled(False)
        weight_layout.addWidget(self._lm_weight_spin)
        weight_layout.addStretch()
        lm_layout.addLayout(weight_layout)

        # Custom LM path
        lm_layout.addWidget(QLabel("Custom LM Path:"))
        custom_lm_layout = QHBoxLayout()
        self._custom_lm_edit = QLineEdit()
        self._custom_lm_edit.setPlaceholderText("Leave empty for auto-detection")
        self._custom_lm_edit.setEnabled(False)
        custom_lm_layout.addWidget(self._custom_lm_edit)
        browse_lm_btn = QPushButton("Browse...")
        browse_lm_btn.clicked.connect(self._browse_lm)
        browse_lm_btn.setEnabled(False)
        self._browse_lm_btn = browse_lm_btn
        custom_lm_layout.addWidget(browse_lm_btn)
        lm_layout.addLayout(custom_lm_layout)

        lm_group.setLayout(lm_layout)
        layout.addWidget(lm_group)

        # Output options
        output_group = QGroupBox("Output Options")
        output_layout = QVBoxLayout()

        self._enable_spaces_check = QCheckBox("Convert <space> tokens to spaces")
        self._enable_spaces_check.setChecked(True)
        self._enable_spaces_check.setToolTip(
            "When enabled, <space> or <SPACE> tokens in the vocabulary are converted to actual spaces.\n"
            "Disable to keep them as literal <space> text."
        )
        output_layout.addWidget(self._enable_spaces_check)

        self._flip_rtl_check = QCheckBox("RTL manuscript (flip line images)")
        self._flip_rtl_check.setChecked(False)
        self._flip_rtl_check.setToolTip(
            "Flip line images horizontally for right-to-left scripts.\n"
            "Required for models trained on RTL manuscripts (Ottoman, Arabic, Hebrew, etc.)\n"
            "with left-to-right transcriptions (Latin transliteration)."
        )
        output_layout.addWidget(self._flip_rtl_check)

        output_group.setLayout(output_layout)
        layout.addWidget(output_group)

        layout.addStretch()
        widget.setLayout(layout)

        self._config_widget = widget
        return widget

    def _populate_preset_models(self):
        """Fill the preset dropdown with the keys of PYLAIA_MODELS."""
        if self._model_combo is None:
            return

        self._model_combo.clear()

        if not PYLAIA_MODELS:
            # Placeholder entry; load_model() recognises this exact text.
            self._model_combo.addItem("No preset models found")
            return

        for model_id in PYLAIA_MODELS.keys():
            self._model_combo.addItem(model_id)

    def _on_preset_changed(self, preset_name: str):
        """Slot for preset changes; currently a no-op."""
        # Could add description display here
        pass

    def _on_lm_toggled(self, checked: bool):
        """Enable or disable the LM controls to follow the LM checkbox."""
        self._lm_weight_spin.setEnabled(checked)
        self._custom_lm_edit.setEnabled(checked)
        self._browse_lm_btn.setEnabled(checked)

    def _browse_model(self):
        """Open a file dialog and copy the chosen model path into the custom-model field."""
        file_path, _ = QFileDialog.getOpenFileName(
            self._config_widget,
            "Select CRNN-CTC Model",
            "models",
            "CRNN-CTC Models (*.ckpt *.pth *.pt);;All Files (*)"
        )

        if file_path:
            self._custom_model_edit.setText(file_path)

    def _browse_lm(self):
        """Open a file dialog and copy the chosen LM path into the custom-LM field."""
        file_path, _ = QFileDialog.getOpenFileName(
            self._config_widget,
            "Select KenLM Model",
            "models",
            "KenLM Models (*.arpa *.klm *.bin);;All Files (*)"
        )

        if file_path:
            self._custom_lm_edit.setText(file_path)

    def get_config(self) -> Dict[str, Any]:
        """Extract the configuration from the widget controls.

        Returns:
            Dict[str, Any]: Empty dict if the widget has not been created yet.
            Otherwise ``model_path`` (custom path wins over the preset),
            ``use_lm``, ``lm_weight``, ``enable_spaces``, ``flip_rtl`` and,
            when the LM is enabled and a custom path is entered, ``lm_path``.
        """
        if self._config_widget is None:
            return {}

        custom_model = self._custom_model_edit.text().strip()
        preset_model = self._model_combo.currentText()

        config = {
            "model_path": custom_model if custom_model else preset_model,
            "use_lm": self._use_lm_check.isChecked(),
            "lm_weight": self._lm_weight_spin.value(),
            "enable_spaces": self._enable_spaces_check.isChecked(),
            "flip_rtl": self._flip_rtl_check.isChecked(),
        }

        if config["use_lm"]:
            custom_lm = self._custom_lm_edit.text().strip()
            if custom_lm:
                config["lm_path"] = custom_lm

        return config

    def set_config(self, config: Dict[str, Any]):
        """Restore configuration values into the widget controls.

        Does nothing if the widget has not been created yet.

        Args:
            config: Configuration dictionary as produced by get_config().
        """
        if self._config_widget is None:
            return

        model_path = config.get("model_path", "")

        # Try to find in presets; anything else is treated as a custom path.
        idx = self._model_combo.findText(model_path)
        if idx >= 0:
            self._model_combo.setCurrentIndex(idx)
            self._custom_model_edit.clear()
        else:
            self._custom_model_edit.setText(model_path)

        # Never tick the LM box while it is disabled: the user could not
        # untick it again and get_config() would keep reporting use_lm=True.
        self._use_lm_check.setChecked(bool(config.get("use_lm", False)) and PYLAIA_LM_AVAILABLE)
        self._lm_weight_spin.setValue(config.get("lm_weight", 1.5))
        self._enable_spaces_check.setChecked(config.get("enable_spaces", True))
        self._flip_rtl_check.setChecked(config.get("flip_rtl", False))

        if "lm_path" in config:
            self._custom_lm_edit.setText(config["lm_path"])

    def load_model(self, config: Dict[str, Any]) -> bool:
        """Load a PyLaia model described by ``config``.

        ``config["model_path"]`` may be a key of PYLAIA_MODELS or a file path.
        A preset dict with a ``repo_id`` is downloaded from the Hugging Face
        Hub. Other preset dicts supply ``checkpoint``/``path`` and ``syms``.
        A plain string preset is used as the checkpoint path.

        Args:
            config: Configuration parameters (see get_config()).

        Returns:
            bool: True if a model was loaded. False if no model was selected
            or loading failed; failures are printed, not raised.
        """
        try:
            model_path = config.get("model_path", "")
            if not model_path or model_path == "No preset models found":
                return False

            # If it's a preset name, resolve to actual path and syms
            syms_path = None
            if model_path in PYLAIA_MODELS:
                preset_info = PYLAIA_MODELS[model_path]
                if isinstance(preset_info, dict):
                    if preset_info.get("repo_id"):
                        try:
                            from huggingface_hub import hf_hub_download
                        except ImportError as exc:
                            raise RuntimeError(
                                "huggingface_hub is required for Hugging Face model presets"
                            ) from exc
                        repo_id = preset_info["repo_id"]
                        model_path = hf_hub_download(
                            repo_id=repo_id,
                            filename=preset_info.get("checkpoint", "best_model.pt"),
                        )
                        syms_path = hf_hub_download(
                            repo_id=repo_id,
                            filename=preset_info.get("syms", "symbols.txt"),
                        )
                    else:
                        model_path = preset_info.get("checkpoint", preset_info.get("path", model_path))
                        syms_path = preset_info.get("syms")
                # If preset_info is just a string, use it as the path
                elif isinstance(preset_info, str):
                    model_path = preset_info

            use_lm = config.get("use_lm", False)

            # Unload previous model
            self.unload_model()

            if use_lm and PYLAIA_LM_AVAILABLE:
                # Load with language model. The LM class is not imported by
                # this module, so look it up defensively and fail clearly.
                lm_cls = globals().get("PyLaiaInferenceLM")
                if lm_cls is None:
                    raise RuntimeError("PyLaiaInferenceLM is not available in this installation")

                lm_weight = config.get("lm_weight", 1.5)
                lm_path = config.get("lm_path")

                self.model_lm = lm_cls(
                    model_path=model_path,
                    lm_path=lm_path,
                    lm_weight=lm_weight
                )
                self.model = None
            else:
                if use_lm:
                    # Make the fallback visible instead of silently ignoring the request.
                    print("Warning: language model requested but not available; loading model without LM")

                # Load without language model
                # PyLaiaInference expects checkpoint_path, syms_path, and enable_spaces
                enable_spaces = config.get("enable_spaces", True)
                self.model = PyLaiaInference(
                    checkpoint_path=model_path,
                    syms_path=syms_path,
                    enable_spaces=enable_spaces
                )
                self.model_lm = None

            return True

        except Exception as e:
            import traceback
            print(f"Error loading PyLaia model: {e}")
            print(traceback.format_exc())
            self.model = None
            self.model_lm = None
            return False

    def unload_model(self):
        """Drop the loaded model(s) and release cached GPU memory if possible."""
        self.model = None
        self.model_lm = None

        # Free GPU memory. Unloading must not fail when torch is missing.
        try:
            import torch
        except ImportError:
            return
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def is_model_loaded(self) -> bool:
        """Return True if either the plain or the LM-backed model is loaded."""
        return self.model is not None or self.model_lm is not None

    def transcribe_line(self, image: np.ndarray, config: Optional[Dict[str, Any]] = None) -> TranscriptionResult:
        """Transcribe a single line image with PyLaia.

        Args:
            image: Line image as a numpy array (other objects are passed
                through to the model unchanged).
            config: Optional per-call options; ``flip_rtl`` mirrors the image
                horizontally before recognition.

        Returns:
            TranscriptionResult: Recognised text and confidence. When no model
            is loaded or recognition fails, the text is a bracketed message
            and the confidence is 0.0.
        """
        if not self.is_model_loaded():
            return TranscriptionResult(text="[Model not loaded]", confidence=0.0)

        try:
            # Convert numpy to PIL
            from PIL import Image as PILImage
            if isinstance(image, np.ndarray):
                pil_image = PILImage.fromarray(image)
            else:
                pil_image = image

            # Flip horizontally for RTL scripts
            if config and config.get("flip_rtl", False):
                pil_image = pil_image.transpose(PILImage.FLIP_LEFT_RIGHT)

            # Prefer the LM-backed model when one is loaded.
            if self.model_lm is not None:
                result = self.model_lm.transcribe(pil_image)
            else:
                result = self.model.transcribe(pil_image)

            # transcribe() is expected to return a (text, confidence) tuple.
            if isinstance(result, tuple):
                text, confidence = result
            else:
                # Fallback for dict-style results
                text = result.get("text", "")
                confidence = result.get("confidence", 1.0)

            return TranscriptionResult(
                text=text,
                confidence=confidence,
                metadata={"model": "pylaia"}
            )

        except Exception as e:
            import traceback
            print(f"Error in PyLaia transcription: {e}")
            print(traceback.format_exc())
            return TranscriptionResult(text=f"[Error: {e}]", confidence=0.0)

    def get_capabilities(self) -> Dict[str, bool]:
        """Report which optional features this engine supports."""
        return {
            "batch_processing": False,  # Could be implemented
            "confidence_scores": True,  # CTC provides confidence
            "beam_search": False,  # Beam search is not exposed by this plugin
            "language_model": PYLAIA_LM_AVAILABLE,  # Optional KenLM
            "preprocessing": False,  # External preprocessing recommended
        }
|
hf-space/README.md
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Polyscriptor HTR Demo
|
| 3 |
+
emoji: 📝
|
| 4 |
+
colorFrom: teal
|
| 5 |
+
colorTo: slate
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
license: apache-2.0
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
# Polyscriptor HTR Demo
|
| 12 |
+
|
| 13 |
+
This is the hosted Hugging Face Spaces demo for Polyscriptor. It runs the
|
| 14 |
+
existing FastAPI/Web UI with a constrained demo mode:
|
| 15 |
+
|
| 16 |
+
- CRNN-CTC (PyLaia-inspired) engines only.
|
| 17 |
+
- Public model presets are downloaded from `achimrabus/*` Hugging Face model repos.
|
| 18 |
+
- CPU inference only.
|
| 19 |
+
- Kraken Classical line segmentation, with HPP as a lightweight fallback.
|
| 20 |
+
- Temporary uploads only.
|
| 21 |
+
|
| 22 |
+
The normal Polyscriptor server, local GPU workflow, and the existing mobile PWA
|
| 23 |
+
demo under `web/static/pwa/` are not changed by this Space configuration.
|
| 24 |
+
|
| 25 |
+
## Source Code
|
| 26 |
+
|
| 27 |
+
Public source repository:
|
| 28 |
+
|
| 29 |
+
https://github.com/achimrabus/polyscriptor
|
| 30 |
+
|
| 31 |
+
The Space repository is a curated deployment snapshot for the hosted demo. The
|
| 32 |
+
GitHub repository contains the broader Polyscriptor codebase and local workflows.
|
| 33 |
+
|
| 34 |
+
## Deployment Note
|
| 35 |
+
|
| 36 |
+
Hugging Face Docker Spaces expect the `Dockerfile` at the root of the Space
|
| 37 |
+
repository. This branch includes a root `Dockerfile` for direct Space builds and
|
| 38 |
+
keeps the Space-specific notes and dependency set in `hf-space/`.
|
| 39 |
+
|
| 40 |
+
When publishing into a dedicated Space repository under
|
| 41 |
+
`https://huggingface.co/spaces/achimrabus/...`, use `hf-space/SPACE_README.md`
|
| 42 |
+
as the Space repository root `README.md`. The Polyscriptor project README is
|
| 43 |
+
left untouched in this branch.
|
hf-space/SPACE_README.md
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Polyscriptor HTR Demo
|
| 3 |
+
emoji: 📝
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: gray
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
license: apache-2.0
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
# Polyscriptor HTR Demo
|
| 12 |
+
|
| 13 |
+
Polyscriptor is a browser-based demo for handwritten text recognition (HTR) on
|
| 14 |
+
historical Slavic manuscript material. This Hugging Face Space runs a constrained
|
| 15 |
+
public version of the Polyscriptor FastAPI/Web interface.
|
| 16 |
+
|
| 17 |
+
The hosted demo is intended for quick inspection and teaching. It is not the full
|
| 18 |
+
local research environment used for training, batch processing, GPU inference, or
|
| 19 |
+
private manuscript collections.
|
| 20 |
+
|
| 21 |
+
## Source Code
|
| 22 |
+
|
| 23 |
+
The public Polyscriptor source code is available on GitHub:
|
| 24 |
+
|
| 25 |
+
https://github.com/achimrabus/polyscriptor
|
| 26 |
+
|
| 27 |
+
This Hugging Face Space contains the curated hosted demo deployment. The GitHub
|
| 28 |
+
repository contains the broader Polyscriptor codebase, including the web UI,
|
| 29 |
+
engine plugins, segmentation code, training utilities, and local workflows.
|
| 30 |
+
|
| 31 |
+
## What This Demo Supports
|
| 32 |
+
|
| 33 |
+
- CRNN-CTC / PyLaia-inspired HTR presets for selected public model repositories.
|
| 34 |
+
- User-supplied API keys for OpenAI, Gemini, Claude, and OpenWebUI-compatible
|
| 35 |
+
endpoints.
|
| 36 |
+
- Public model download from the Hugging Face Hub, primarily under
|
| 37 |
+
`achimrabus/*`.
|
| 38 |
+
- CPU-only inference.
|
| 39 |
+
- Kraken Classical line segmentation, with HPP as a lightweight fallback.
|
| 40 |
+
- Temporary image uploads during the active session.
|
| 41 |
+
|
| 42 |
+
## Limitations
|
| 43 |
+
|
| 44 |
+
- No private models are bundled with this Space.
|
| 45 |
+
- API-based engines require users to paste their own API key in the browser
|
| 46 |
+
form. The Space does not ship with shared provider credentials.
|
| 47 |
+
- Uploaded files are treated as temporary runtime data and are not part of the
|
| 48 |
+
repository.
|
| 49 |
+
- Large local GPU/VLM engines from the full Polyscriptor workflow are not
|
| 50 |
+
enabled here.
|
| 51 |
+
- Accuracy depends strongly on script, language, writing style, image quality,
|
| 52 |
+
and segmentation quality.
|
| 53 |
+
|
| 54 |
+
## Model Notes
|
| 55 |
+
|
| 56 |
+
The demo uses publicly available model presets. For best results, choose a model
|
| 57 |
+
that matches the manuscript tradition as closely as possible. The current public
|
| 58 |
+
Polyscriptor model cards are available at:
|
| 59 |
+
|
| 60 |
+
https://huggingface.co/achimrabus
|
| 61 |
+
|
| 62 |
+
## Project Context
|
| 63 |
+
|
| 64 |
+
Polyscriptor is developed for historical HTR workflows, with a focus on Slavic
|
| 65 |
+
manuscripts and reproducible comparison of OCR/HTR engines. The full development
|
| 66 |
+
repository contains additional tooling for local use, training, evaluation, and
|
| 67 |
+
batch processing; this Space contains only the hosted demo configuration.
|
| 68 |
+
|
| 69 |
+
## Privacy
|
| 70 |
+
|
| 71 |
+
Do not upload sensitive or unpublished manuscript images unless you are
|
| 72 |
+
comfortable processing them in a hosted public demo environment. The application
|
| 73 |
+
uses temporary server-side files during processing, but this Space should be
|
| 74 |
+
treated as a public demonstration service rather than a secure private workflow.
|
| 75 |
+
|
| 76 |
+
For API-based engines, provider keys are entered by the user at runtime. Do not
|
| 77 |
+
commit keys to this repository or add them to the Space configuration unless you
|
| 78 |
+
intend to provide a shared project credential.
|
hf-space/requirements.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
--extra-index-url https://download.pytorch.org/whl/cpu
|
| 2 |
+
|
| 3 |
+
torch>=2.5.1,<2.10
|
| 4 |
+
torchvision>=0.20.1,<0.25
|
| 5 |
+
numpy>=2.0,<2.1
|
| 6 |
+
pillow==11.1.0
|
| 7 |
+
opencv-python-headless==4.11.0.86
|
| 8 |
+
scikit-learn>=1.5,<1.6
|
| 9 |
+
scipy>=1.13,<1.14
|
| 10 |
+
kraken>=6.0.0,<7.0.0
|
| 11 |
+
fastapi>=0.111.0
|
| 12 |
+
uvicorn[standard]>=0.29.0
|
| 13 |
+
python-multipart>=0.0.9
|
| 14 |
+
pymupdf>=1.24.0
|
| 15 |
+
pyyaml==6.0.2
|
| 16 |
+
huggingface_hub>=0.23.0
|
| 17 |
+
python-Levenshtein>=0.23.0
|
| 18 |
+
openai>=1.50.0
|
| 19 |
+
anthropic>=0.34.0
|
| 20 |
+
google-genai>=1.0.0
|
| 21 |
+
google-generativeai>=0.8.0
|
htr_engine_base.py
ADDED
|
@@ -0,0 +1,398 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
HTR Engine Plugin System - Base Classes and Registry
|
| 3 |
+
|
| 4 |
+
This module defines the plugin architecture for HTR (Handwritten Text Recognition) engines.
|
| 5 |
+
All HTR engines (TrOCR, Qwen3, CRNN-CTC, Kraken, etc.) implement the HTREngine interface.
|
| 6 |
+
|
| 7 |
+
Design principles:
|
| 8 |
+
- Abstraction: Each engine is self-contained and interchangeable
|
| 9 |
+
- Scalability: New engines can be added without modifying existing code
|
| 10 |
+
- Consistency: All engines expose the same interface to the GUI
|
| 11 |
+
- Flexibility: Each engine can have custom configuration widgets
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
from abc import ABC, abstractmethod
|
| 15 |
+
from typing import Dict, Any, Optional, List
|
| 16 |
+
from dataclasses import dataclass
|
| 17 |
+
import os
|
| 18 |
+
import numpy as np
|
| 19 |
+
|
| 20 |
+
try:
|
| 21 |
+
from PyQt6.QtWidgets import QWidget, QVBoxLayout, QLabel
|
| 22 |
+
PYQT_AVAILABLE = True
|
| 23 |
+
except ImportError:
|
| 24 |
+
PYQT_AVAILABLE = False
|
| 25 |
+
QWidget = object
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
@dataclass
class TranscriptionResult:
    """Result from HTR engine transcription.

    ``metadata`` may be omitted or passed as ``None``. In both cases it is
    replaced by a fresh empty dict, so instances never share one dict.
    """
    # Recognised text (engines in this project also put bracketed error messages here).
    text: str
    # Confidence reported by the engine; defaults to 1.0 when none is given.
    confidence: float = 1.0
    # Free-form engine details; None is normalised to {} in __post_init__.
    metadata: Optional[Dict[str, Any]] = None

    def __post_init__(self):
        # A literal {} default is rejected by dataclasses (mutable default),
        # so None is the sentinel and is swapped for a new dict per instance.
        if self.metadata is None:
            self.metadata = {}
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class HTREngine(ABC):
    """Contract that every HTR engine plugin has to fulfil.

    The GUI and the batch-processing code rely on the methods declared
    here, so an engine is compatible with both once it implements all of
    the abstract methods. The non-abstract methods supply defaults that a
    concrete engine may override.
    """

    @abstractmethod
    def get_name(self) -> str:
        """Return the engine's display name.

        Returns:
            str: Human-readable engine name (e.g., "TrOCR", "Qwen3 VLM")
        """

    @abstractmethod
    def get_description(self) -> str:
        """Return a short description of the engine.

        Returns:
            str: One-line description (e.g., "Transformer-based OCR for manuscripts")
        """

    @abstractmethod
    def is_available(self) -> bool:
        """Report whether the engine's dependencies are installed and working.

        Returns:
            bool: True if the engine can be used, False otherwise
        """

    @abstractmethod
    def get_unavailable_reason(self) -> str:
        """Explain why the engine cannot be used (when is_available() is False).

        Returns:
            str: Explanation and installation instructions
        """

    @abstractmethod
    def get_config_widget(self) -> QWidget:
        """Build the configuration widget for this engine.

        The widget holds all engine-specific controls (model selection, beam
        search, preprocessing options, etc.) and is embedded by the GUI in
        its configuration panel.

        Returns:
            QWidget: Qt widget with engine configuration controls
        """

    @abstractmethod
    def get_config(self) -> Dict[str, Any]:
        """Read the current settings out of the config widget.

        The returned dictionary is what gets passed to transcribe_line().

        Returns:
            Dict[str, Any]: Configuration parameters
        """

    @abstractmethod
    def set_config(self, config: Dict[str, Any]):
        """Push configuration values into the config widget.

        Used to restore saved settings when switching engines.

        Args:
            config: Configuration parameters
        """

    @abstractmethod
    def load_model(self, config: Dict[str, Any]) -> bool:
        """Load the HTR model described by ``config``.

        Args:
            config: Configuration parameters (from get_config())

        Returns:
            bool: True if the model loaded successfully, False otherwise
        """

    @abstractmethod
    def unload_model(self):
        """Release the loaded model so its resources are freed.

        Called when switching to a different engine or closing the application.
        """

    @abstractmethod
    def is_model_loaded(self) -> bool:
        """Report whether a model is currently loaded.

        Returns:
            bool: True if the model is ready for inference
        """

    @abstractmethod
    def transcribe_line(self, image: np.ndarray, config: Optional[Dict[str, Any]] = None) -> TranscriptionResult:
        """Transcribe one line image.

        Args:
            image: Line image as numpy array (RGB, shape: H x W x 3)
            config: Optional configuration overrides

        Returns:
            TranscriptionResult: Transcription text and metadata
        """

    def requires_line_segmentation(self) -> bool:
        """Tell whether the engine needs pre-segmented lines.

        Returns:
            bool: True if lines must be segmented first (TrOCR, CRNN-CTC),
                  False if the engine handles full pages (Qwen3, Commercial APIs)
        """
        # Line-based engines are the common case, so that is the default.
        return True

    def transcribe_lines(self, images: List[np.ndarray], config: Optional[Dict[str, Any]] = None) -> List[TranscriptionResult]:
        """Transcribe several line images.

        This default simply calls transcribe_line() once per image, in order.
        Engines with real batch inference can override it.

        Args:
            images: List of line images
            config: Optional configuration overrides

        Returns:
            List[TranscriptionResult]: One result per input image
        """
        results = []
        for line_image in images:
            results.append(self.transcribe_line(line_image, config))
        return results

    def supports_batch(self) -> bool:
        """Tell whether transcribe_lines() is an optimized batch implementation.

        Returns:
            bool: True if transcribe_lines() is optimized, False if it just loops
        """
        return False

    def get_aliases(self) -> List[str]:
        """Return alternative names for this engine (e.g., short CLI aliases).

        Returns:
            List[str]: Alternative names accepted by the registry (default: none)
        """
        return list()

    def get_capabilities(self) -> Dict[str, bool]:
        """Describe what the engine can do.

        Returns:
            Dict with capability flags:
                - batch_processing: Supports batch inference
                - confidence_scores: Returns confidence scores
                - beam_search: Supports beam search decoding
                - language_model: Uses language model for post-processing
                - preprocessing: Has built-in preprocessing
        """
        # Only batch support is derived; every other flag is off by default.
        capabilities = {"batch_processing": self.supports_batch()}
        for flag in ("confidence_scores", "beam_search", "language_model", "preprocessing"):
            capabilities[flag] = False
        return capabilities
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
class HTREngineRegistry:
    """Registry of available HTR engines.

    Manages discovery, registration, and lookup of HTR engine instances.
    """

    # Each entry: (label used in warning messages, module path, class name).
    # The order of the entries is the order in which engines are registered.
    _DEMO_ENGINES = (
        ("CRNN-CTC", "engines.pylaia_engine", "PyLaiaEngine"),
        ("Commercial APIs", "engines.commercial_api_engine", "CommercialAPIEngine"),
        ("OpenWebUI", "engines.openwebui_engine", "OpenWebUIEngine"),
    )
    _ALL_ENGINES = (
        ("TrOCR", "engines.trocr_engine", "TrOCREngine"),
        ("Qwen3", "engines.qwen3_engine", "Qwen3Engine"),
        ("Churro", "engines.churro_engine", "ChurroEngine"),
        ("CRNN-CTC", "engines.pylaia_engine", "PyLaiaEngine"),
        ("Kraken", "engines.kraken_engine", "KrakenEngine"),
        ("Commercial API", "engines.commercial_api_engine", "CommercialAPIEngine"),
        ("Party", "engines.party_engine", "PartyEngine"),
        ("OpenWebUI", "engines.openwebui_engine", "OpenWebUIEngine"),
        ("DeepSeek-OCR", "engines.deepseek_ocr_engine", "DeepSeekOCREngine"),
        ("LightOnOCR", "engines.lighton_ocr_engine", "LightOnOCREngine"),
        ("PaddleOCR", "engines.paddle_engine", "PaddleOCREngine"),
    )

    def __init__(self):
        """Create an empty registry."""
        # Registration order is preserved in the list; the dict maps both
        # display names and aliases to the engine instance.
        self.engines: List["HTREngine"] = []
        self._engine_cache: Dict[str, "HTREngine"] = {}

    def register(self, engine: "HTREngine"):
        """Register an HTR engine.

        The engine becomes reachable by its display name and by every alias
        it reports through get_aliases().

        Args:
            engine: HTREngine instance to register
        """
        self.engines.append(engine)
        self._engine_cache[engine.get_name()] = engine
        for alias in engine.get_aliases():
            self._engine_cache[alias] = engine

    def discover_engines(self):
        """Automatically discover and register all available engines.

        Tries to import and instantiate each known engine. An engine that
        fails for any reason (missing package, native library that cannot be
        loaded, constructor error, ...) is reported with a warning and
        skipped, so one broken engine never prevents the others from being
        registered.

        When the environment variable POLYSCRIPTOR_DEMO_MODE equals
        "hf_space", only the reduced demo engine set is considered.
        """
        import importlib

        demo_mode = os.environ.get("POLYSCRIPTOR_DEMO_MODE") == "hf_space"
        engine_specs = self._DEMO_ENGINES if demo_mode else self._ALL_ENGINES

        for label, module_name, class_name in engine_specs:
            try:
                module = importlib.import_module(module_name)
                self.register(getattr(module, class_name)())
            except Exception as e:
                # Heavy ML dependencies fail in many ways besides ImportError;
                # treat every failure as "this engine is not loadable".
                print(f"Warning: Failed to load {label} engine: {e}")

    def get_available_engines(self) -> List["HTREngine"]:
        """Get list of engines with satisfied dependencies.

        Returns:
            List[HTREngine]: Engines whose is_available() returns True
        """
        return [e for e in self.engines if e.is_available()]

    def get_all_engines(self) -> List["HTREngine"]:
        """Get all registered engines (including unavailable ones).

        Returns:
            List[HTREngine]: All registered engines, in registration order
        """
        return self.engines

    def get_engine_by_name(self, name: str) -> Optional["HTREngine"]:
        """Get engine by display name or registered alias.

        Args:
            name: Engine display name or alias

        Returns:
            Optional[HTREngine]: Engine instance or None if not found
        """
        return self._engine_cache.get(name)

    def get_engine_names(self) -> List[str]:
        """Get list of available engine names.

        Returns:
            List[str]: Display names of the engines that are available
        """
        return [e.get_name() for e in self.get_available_engines()]
|
| 372 |
+
|
| 373 |
+
|
| 374 |
+
# Global registry instance (singleton pattern)
|
| 375 |
+
_global_registry: Optional[HTREngineRegistry] = None
|
| 376 |
+
|
| 377 |
+
|
| 378 |
+
def get_global_registry() -> "HTREngineRegistry":
    """Get global HTR engine registry (singleton).

    On first use a registry is created, stored in the module-level
    ``_global_registry`` and populated through discover_engines(). If
    discovery raises, the stored instance is cleared again before the
    exception propagates, so the next call retries discovery instead of
    silently returning a partially populated registry.

    Returns:
        HTREngineRegistry: Global registry instance

    Raises:
        Exception: Whatever discover_engines() raises is passed through.
    """
    global _global_registry
    if _global_registry is None:
        # Publish before discovering so a re-entrant call made while engines
        # are being loaded sees this instance rather than starting over.
        _global_registry = HTREngineRegistry()
        try:
            _global_registry.discover_engines()
        except BaseException:
            _global_registry = None
            raise
    return _global_registry
|
| 389 |
+
|
| 390 |
+
|
| 391 |
+
# Convenience function for GUI
|
| 392 |
+
def get_available_engine_names() -> List[str]:
    """Return the engine names reported by the global registry.

    Thin convenience wrapper around get_global_registry().get_engine_names().

    Returns:
        List[str]: Engine display names
    """
    registry = get_global_registry()
    return registry.get_engine_names()
|
inference_commercial_api.py
ADDED
|
@@ -0,0 +1,760 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Commercial VLM/LLM API inference for manuscript transcription.
|
| 3 |
+
|
| 4 |
+
Supports:
|
| 5 |
+
- OpenAI GPT-4 Vision / GPT-4o
|
| 6 |
+
- Google Gemini Pro Vision / Gemini Flash
|
| 7 |
+
- Anthropic Claude 3 (Opus, Sonnet, Haiku)
|
| 8 |
+
|
| 9 |
+
Usage:
|
| 10 |
+
# OpenAI
|
| 11 |
+
api = OpenAIInference(api_key="YOUR_OPENAI_API_KEY")
|
| 12 |
+
text = api.transcribe(image)
|
| 13 |
+
|
| 14 |
+
# Gemini
|
| 15 |
+
api = GeminiInference(api_key="YOUR_GEMINI_API_KEY")
|
| 16 |
+
text = api.transcribe(image)
|
| 17 |
+
|
| 18 |
+
# Claude
|
| 19 |
+
api = ClaudeInference(api_key="YOUR_ANTHROPIC_API_KEY")
|
| 20 |
+
text = api.transcribe(image)
|
| 21 |
+
"""
|
| 22 |
+
|
| 23 |
+
import base64
|
| 24 |
+
import io
|
| 25 |
+
import time
|
| 26 |
+
from abc import ABC, abstractmethod
|
| 27 |
+
from pathlib import Path
|
| 28 |
+
from typing import Optional, Dict, Any
|
| 29 |
+
from PIL import Image
|
| 30 |
+
|
| 31 |
+
# API clients (install with: pip install openai google-genai anthropic; google-generativeai is only the legacy Gemini fallback)
|
| 32 |
+
try:
|
| 33 |
+
from openai import OpenAI
|
| 34 |
+
OPENAI_AVAILABLE = True
|
| 35 |
+
except ImportError:
|
| 36 |
+
OPENAI_AVAILABLE = False
|
| 37 |
+
|
| 38 |
+
try:
|
| 39 |
+
from google import genai as _google_genai_new
|
| 40 |
+
from google.genai import types as _google_genai_types
|
| 41 |
+
GEMINI_AVAILABLE = True
|
| 42 |
+
GEMINI_NEW_SDK = True
|
| 43 |
+
except ImportError:
|
| 44 |
+
GEMINI_NEW_SDK = False
|
| 45 |
+
try:
|
| 46 |
+
import google.generativeai as genai # legacy fallback
|
| 47 |
+
GEMINI_AVAILABLE = True
|
| 48 |
+
except ImportError:
|
| 49 |
+
GEMINI_AVAILABLE = False
|
| 50 |
+
|
| 51 |
+
try:
|
| 52 |
+
from anthropic import Anthropic
|
| 53 |
+
CLAUDE_AVAILABLE = True
|
| 54 |
+
except ImportError:
|
| 55 |
+
CLAUDE_AVAILABLE = False
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
class BaseAPIInference(ABC):
    """Base class for commercial API inference.

    Stores the API key and the default prompt, declares the hooks each
    provider implements, and offers image helpers shared by the providers.
    """

    def __init__(self, api_key: str, default_prompt: Optional[str] = None):
        """
        Initialize API client.

        Args:
            api_key: API key for the service
            default_prompt: Default transcription prompt; when None or empty,
                the provider's _get_default_prompt() is used instead
        """
        self.api_key = api_key
        self.default_prompt = default_prompt or self._get_default_prompt()

    @abstractmethod
    def _get_default_prompt(self) -> str:
        """Get default transcription prompt."""
        pass

    @abstractmethod
    def transcribe(
        self,
        image: "Image.Image",
        prompt: Optional[str] = None,
        **kwargs
    ) -> str:
        """
        Transcribe a manuscript line image.

        Args:
            image: PIL Image
            prompt: Custom prompt (uses default if None)
            **kwargs: Provider-specific parameters

        Returns:
            Transcribed text
        """
        pass

    @staticmethod
    def encode_image_base64(image: "Image.Image", format: str = "PNG") -> str:
        """
        Encode PIL Image to base64 string.

        Args:
            image: PIL Image
            format: Image format (PNG, JPEG, etc.) passed to image.save()

        Returns:
            Base64-encoded image string
        """
        buffered = io.BytesIO()
        image.save(buffered, format=format)
        return base64.b64encode(buffered.getvalue()).decode("utf-8")

    @staticmethod
    def resize_image_if_needed(
        image: "Image.Image",
        max_dimension: int = 2048
    ) -> "Image.Image":
        """
        Resize image if larger than max dimension while preserving aspect ratio.

        The longer side is scaled down to max_dimension and the shorter side
        is scaled by the same factor. The shorter side never drops below
        1 pixel, so extremely elongated inputs (e.g. a very wide, very thin
        line strip) do not collapse to a zero-sized image.

        Args:
            image: PIL Image
            max_dimension: Maximum width or height

        Returns:
            Resized image (or original if already small enough)
        """
        width, height = image.size

        if width <= max_dimension and height <= max_dimension:
            return image

        # Calculate new size preserving aspect ratio; int() truncates, so
        # clamp the shorter side to at least one pixel.
        if width > height:
            new_width = max_dimension
            new_height = max(1, int(height * (max_dimension / width)))
        else:
            new_height = max_dimension
            new_width = max(1, int(width * (max_dimension / height)))

        return image.resize((new_width, new_height), Image.Resampling.LANCZOS)
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
class OpenAIInference(BaseAPIInference):
    """OpenAI GPT-4 Vision / GPT-4o inference."""

    def __init__(
        self,
        api_key: str,
        model: str = "gpt-4o",  # gpt-4o, gpt-4-vision-preview, gpt-4-turbo
        default_prompt: Optional[str] = None
    ):
        """
        Initialize OpenAI inference.

        Args:
            api_key: OpenAI API key
            model: Model name
            default_prompt: Default transcription prompt

        Raises:
            ImportError: If the openai package could not be imported
        """
        if not OPENAI_AVAILABLE:
            raise ImportError("OpenAI library not installed. Install with: pip install openai")

        super().__init__(api_key, default_prompt)
        self.model = model
        self.client = OpenAI(api_key=api_key)

    def _get_default_prompt(self) -> str:
        """Return the prompt used when the caller supplies none."""
        return (
            "Transcribe all handwritten text in this manuscript image. "
            "Preserve the original language (Cyrillic, Latin, etc.) and layout. "
            "Output only the transcribed text without any additional commentary."
        )

    def transcribe(
        self,
        image: Image.Image,
        prompt: Optional[str] = None,
        max_tokens: int = 500,
        temperature: float = 1.0,
        **kwargs
    ) -> str:
        """
        Transcribe with OpenAI GPT-4 Vision.

        Args:
            image: PIL Image
            prompt: Custom prompt
            max_tokens: Maximum tokens to generate
            temperature: Sampling temperature (web default ~1.0). Lower (0-0.3) = deterministic; higher = more variation.
            **kwargs: Additional OpenAI parameters

        Returns:
            Transcribed text, stripped of surrounding whitespace. An empty
            string is returned when the reply carries no text content.
        """
        prompt = prompt or self.default_prompt

        # Resize if needed (GPT-4V supports up to 2048x2048)
        image = self.resize_image_if_needed(image, max_dimension=2048)

        # Encode image
        base64_image = self.encode_image_base64(image, format="PNG")

        # API call
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{base64_image}"
                            }
                        }
                    ]
                }
            ],
            max_tokens=max_tokens,
            temperature=temperature,
            **kwargs
        )

        # message.content is nullable (e.g. when the model refuses), so guard
        # before stripping instead of failing with an AttributeError.
        content = response.choices[0].message.content
        return (content or "").strip()
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
class GeminiInference(BaseAPIInference):
|
| 230 |
+
"""Google Gemini inference via google-genai SDK (with legacy google-generativeai fallback)."""
|
| 231 |
+
|
| 232 |
+
# thinking_mode string -> thinking_budget token count (max tokens for internal reasoning)
|
| 233 |
+
# "low": 8000 — moderate budget; fast enough for most lines
|
| 234 |
+
# "high": None — ThinkingConfig is sent without a thinking_budget; model decides dynamically (no cap)
|
| 235 |
+
_THINKING_BUDGETS = {"low": 8000, "high": None}
|
| 236 |
+
|
| 237 |
+
def __init__(
    self,
    api_key: str,
    model: str = "gemini-2.0-flash",
    default_prompt: Optional[str] = None,
):
    """Set up a Gemini client for the SDK that was importable.

    Args:
        api_key: Gemini API key
        model: Model name
        default_prompt: Default transcription prompt

    Raises:
        ImportError: If neither Gemini SDK could be imported
    """
    if not GEMINI_AVAILABLE:
        raise ImportError(
            "Google AI library not installed. Install with: pip install google-genai"
        )

    super().__init__(api_key, default_prompt)
    self.model_name = model

    # Usage bookkeeping: last_usage is meant for UI token display after a
    # transcribe() call; _last_call_usage is refreshed by each generate call.
    self.last_usage: Dict[str, Any] = {}
    self._last_call_usage: Dict[str, Any] = {}

    if not GEMINI_NEW_SDK:
        # Legacy google-generativeai fallback: global configure + model object.
        genai.configure(api_key=api_key)
        self._legacy_model = genai.GenerativeModel(model)
    else:
        self._client = _google_genai_new.Client(api_key=api_key)
|
| 259 |
+
|
| 260 |
+
def _get_default_prompt(self) -> str:
|
| 261 |
+
return (
|
| 262 |
+
"Transcribe all handwritten text in this manuscript image. "
|
| 263 |
+
"Preserve the original language (Cyrillic, Latin, etc.) and layout. "
|
| 264 |
+
"Output only the transcribed text without any additional commentary."
|
| 265 |
+
)
|
| 266 |
+
|
| 267 |
+
def _build_config(self, temperature, max_output_tokens, thinking_budget, safety_settings,
                  request_thoughts: bool = True):
    """Assemble the GenerateContentConfig for a google-genai request.

    Args:
        temperature: Sampling temperature; always included.
        max_output_tokens: Output token cap; included only when truthy.
        thinking_budget: Cap for thinking tokens; only set on the
            ThinkingConfig when it is not None.
        safety_settings: Safety settings; included only when truthy.
        request_thoughts: When True (default) a ThinkingConfig with
            include_thoughts=True is attached so thought parts can be read
            back from the response. Pass False when retrying against a model
            that rejects ThinkingConfig entirely.

    Returns:
        The GenerateContentConfig built from the selected options.
    """
    options: Dict[str, Any] = {"temperature": temperature}

    optional_fields = (
        ("max_output_tokens", max_output_tokens),
        ("safety_settings", safety_settings),
    )
    for field_name, field_value in optional_fields:
        if field_value:
            options[field_name] = field_value

    if request_thoughts:
        # Thought text is always requested; the budget is only capped on demand.
        thinking_options: Dict[str, Any] = {"include_thoughts": True}
        if thinking_budget is not None:
            thinking_options["thinking_budget"] = thinking_budget
        options["thinking_config"] = _google_genai_types.ThinkingConfig(**thinking_options)

    return _google_genai_types.GenerateContentConfig(**options)
|
| 287 |
+
|
| 288 |
+
def _generate(self, prompt, image, temperature, thinking_budget, safety_settings, verbose):
    """Run one generate call and return the stripped response text.

    On the legacy SDK the request is sent through the legacy model object and
    the per-call usage record is reset to an empty dict. On the google-genai
    SDK the request is first sent with a ThinkingConfig; if the call fails
    with an error that mentions "thinking", or that contains both "400" and
    "invalid", it is retried once without ThinkingConfig. Any other error
    propagates. Token counts and any returned thought text are stored in
    self._last_call_usage.

    NOTE(review): resp.text is stripped unconditionally; if the SDK can
    return None for a response without text parts this raises
    AttributeError — confirm against the SDK and the caller's handling.
    """
    effective_temperature = temperature or 0.0

    if not GEMINI_NEW_SDK:
        # Legacy google-generativeai path
        legacy_cfg = genai.GenerationConfig(temperature=effective_temperature)
        legacy_resp = self._legacy_model.generate_content(
            [prompt, image], generation_config=legacy_cfg, safety_settings=safety_settings
        )
        self._last_call_usage = {}
        return legacy_resp.text.strip()

    config = self._build_config(effective_temperature, None, thinking_budget,
                                safety_settings, request_thoughts=True)
    try:
        resp = self._client.models.generate_content(
            model=self.model_name, contents=[prompt, image], config=config
        )
    except Exception as e:
        message = str(e)
        lowered = message.lower()
        # Non-thinking models reject ThinkingConfig with a 400/invalid error.
        thinking_rejected = "thinking" in lowered or ("400" in message and "invalid" in lowered)
        if not thinking_rejected:
            raise
        if verbose:
            print("Model does not support ThinkingConfig, retrying without.")
        config = self._build_config(effective_temperature, None, thinking_budget,
                                    safety_settings, request_thoughts=False)
        resp = self._client.models.generate_content(
            model=self.model_name, contents=[prompt, image], config=config
        )

    usage = getattr(resp, "usage_metadata", None)
    usage_fields = (
        ("prompt_tokens", "prompt_token_count"),
        ("output_tokens", "candidates_token_count"),
        ("thinking_tokens", "thoughts_token_count"),
        ("total_tokens", "total_token_count"),
    )
    self._last_call_usage = {
        key: (getattr(usage, attr, None) if usage else None)
        for key, attr in usage_fields
    }

    # Collect thought parts (present when include_thoughts=True was sent);
    # this is best-effort and must never fail the transcription.
    thoughts = []
    try:
        for candidate in (getattr(resp, "candidates", None) or []):
            content = getattr(candidate, "content", None)
            for part in (getattr(content, "parts", None) or []):
                if getattr(part, "thought", False) and getattr(part, "text", None):
                    thoughts.append(part.text)
    except Exception:
        pass
    self._last_call_usage["thinking_text"] = "\n\n".join(thoughts) if thoughts else None
    return resp.text.strip()
|
| 337 |
+
|
| 338 |
+
def _maybe_continue(
|
| 339 |
+
self,
|
| 340 |
+
current_text: str,
|
| 341 |
+
prompt: str,
|
| 342 |
+
image,
|
| 343 |
+
thinking_budget,
|
| 344 |
+
safety_settings,
|
| 345 |
+
auto_continue: bool,
|
| 346 |
+
max_auto_continuations: int,
|
| 347 |
+
continuation_min_new_chars: int,
|
| 348 |
+
verbose_block_logging: bool,
|
| 349 |
+
) -> str:
|
| 350 |
+
if not auto_continue:
|
| 351 |
+
return current_text
|
| 352 |
+
accumulated = current_text
|
| 353 |
+
for pass_idx in range(1, max_auto_continuations + 1):
|
| 354 |
+
continuation_prompt = (
|
| 355 |
+
f"{prompt}\n\nPartial transcription so far (DO NOT repeat it):\n"
|
| 356 |
+
f"{accumulated}\n\nContinue transcribing remaining, previously UNTRANSCRIBED text. "
|
| 357 |
+
"Output ONLY the new continuation without repeating prior characters."
|
| 358 |
+
)
|
| 359 |
+
try:
|
| 360 |
+
new_chunk = self._generate(
|
| 361 |
+
continuation_prompt, image, None, thinking_budget,
|
| 362 |
+
safety_settings, verbose_block_logging
|
| 363 |
+
)
|
| 364 |
+
except Exception as e:
|
| 365 |
+
if verbose_block_logging:
|
| 366 |
+
print(f"Continuation {pass_idx} failed: {e}")
|
| 367 |
+
break
|
| 368 |
+
if not new_chunk:
|
| 369 |
+
if verbose_block_logging:
|
| 370 |
+
print(f"Continuation {pass_idx}: no new text, stopping.")
|
| 371 |
+
break
|
| 372 |
+
# Guard against repetition
|
| 373 |
+
if accumulated and new_chunk.startswith(accumulated[:200]):
|
| 374 |
+
overlap_pos = new_chunk.find(accumulated[-50:])
|
| 375 |
+
if overlap_pos > 0:
|
| 376 |
+
new_chunk = new_chunk[overlap_pos + len(accumulated[-50:]):]
|
| 377 |
+
delta = len(new_chunk)
|
| 378 |
+
if delta < continuation_min_new_chars:
|
| 379 |
+
if verbose_block_logging:
|
| 380 |
+
print(f"Continuation {pass_idx}: only {delta} chars, stopping.")
|
| 381 |
+
break
|
| 382 |
+
accumulated += ("\n" if not accumulated.endswith("\n") else "") + new_chunk
|
| 383 |
+
if verbose_block_logging:
|
| 384 |
+
print(f"Continuation {pass_idx}: +{delta} chars (total {len(accumulated)})")
|
| 385 |
+
return accumulated
|
| 386 |
+
|
| 387 |
+
def transcribe(
    self,
    image,
    prompt: Optional[str] = None,
    temperature: float = 0.0,
    max_output_tokens: Optional[int] = None,
    auto_retry_on_block: bool = True,
    safety_relax: bool = True,
    verbose_block_logging: bool = True,
    thinking_mode: Optional[str] = None,
    fast_direct: bool = False,
    fast_direct_early_exit: bool = True,
    auto_continue: bool = False,
    max_auto_continuations: int = 2,
    continuation_min_new_chars: int = 50,
    reasoning_fallback_threshold: float = 1.0,
    record_stats_csv: Optional[str] = None,
    apply_restriction_prompt: bool = False,
    fallback_max_output_tokens: int = 8192,
    **kwargs,
) -> str:
    """Transcribe a manuscript image with Google Gemini.

    Performs a single ``self._generate`` call on the (possibly down-scaled)
    image, stores the usage of that call on ``self.last_usage``, optionally
    appends a stats row to a CSV file, and finally passes the text through
    ``self._maybe_continue``.

    Args:
        image: PIL Image or numpy array. Arrays are converted with
            ``PIL.Image.fromarray``; the image is then passed through
            ``self.resize_image_if_needed(image, max_dimension=3072)``.
        prompt: Transcription prompt (``self.default_prompt`` is used when
            this is None or empty).
        temperature: Sampling temperature (0.0 = deterministic); forwarded
            to ``self._generate``.
        max_output_tokens: Output token cap (None = model default).
            NOTE(review): not referenced anywhere in this method body, so
            it currently has no effect here -- confirm whether it should be
            forwarded to ``_generate``.
        safety_relax: When true and ``GEMINI_NEW_SDK`` is truthy, four harm
            categories are sent with threshold ``"BLOCK_NONE"``.
        verbose_block_logging: Enables the token/stats ``print`` output here
            and is forwarded to ``_generate`` / ``_maybe_continue``.
        thinking_mode: Key looked up in ``self._THINKING_BUDGETS``
            (the original docs list None | "low" | "high"); unknown keys
            silently yield a budget of None.
        record_stats_csv: Path to append a usage CSV row to (None to skip).
            The row is written before any continuation call, so it reflects
            the first generation only.
        auto_continue, max_auto_continuations, continuation_min_new_chars:
            Forwarded unchanged to ``self._maybe_continue``.
        auto_retry_on_block, fast_direct, fast_direct_early_exit,
        reasoning_fallback_threshold, apply_restriction_prompt,
        fallback_max_output_tokens, **kwargs:
            Accepted but not referenced in this method body
            (presumably kept for call-site compatibility -- TODO confirm).

    Returns:
        The value returned by ``self._maybe_continue`` (the transcription,
        possibly extended by continuation passes).

    Raises:
        ValueError: If ``self._generate`` raises; the original exception is
            chained as ``__cause__``.
    """
    from PIL import Image as _PIL_Image
    import numpy as np
    if isinstance(image, np.ndarray):
        image = _PIL_Image.fromarray(image)
    image = self.resize_image_if_needed(image, max_dimension=3072)
    prompt = prompt or self.default_prompt

    # Map thinking_mode to thinking_budget
    thinking_budget = self._THINKING_BUDGETS.get(thinking_mode)  # None if mode is None/unknown

    # Safety settings: only built for the new SDK; otherwise None is passed on.
    safety_settings = None
    if safety_relax and GEMINI_NEW_SDK:
        safety_settings = [
            _google_genai_types.SafetySetting(category=cat, threshold="BLOCK_NONE")
            for cat in (
                "HARM_CATEGORY_HARASSMENT",
                "HARM_CATEGORY_HATE_SPEECH",
                "HARM_CATEGORY_SEXUALLY_EXPLICIT",
                "HARM_CATEGORY_DANGEROUS_CONTENT",
            )
        ]

    # Reset per-call usage; presumably filled in by _generate -- verify there.
    self._last_call_usage = {}

    try:
        result_text = self._generate(
            prompt, image, temperature, thinking_budget, safety_settings, verbose_block_logging
        )
    except Exception as e:
        raise ValueError(f"Gemini transcription failed: {e}") from e

    # Persist usage for callers (e.g. statistics panel, CSV logging)
    self.last_usage = dict(self._last_call_usage)
    u = self.last_usage
    if verbose_block_logging and u.get("total_tokens"):
        print(
            f"[tokens] prompt={u.get('prompt_tokens')} "
            f"output={u.get('output_tokens')} "
            f"thinking={u.get('thinking_tokens')} "
            f"total={u.get('total_tokens')}"
        )

    if record_stats_csv:
        # Best-effort logging: a failure here must not lose the transcription.
        try:
            from datetime import datetime
            # NOTE(review): datetime.utcnow() is deprecated since Python 3.12;
            # switching to an aware timestamp would change the CSV format.
            with open(record_stats_csv, "a") as f:
                # Columns: timestamp, model, thinking mode, status, prompt/output/
                # thinking/total token counts, character length of the text.
                f.write(
                    f"{datetime.utcnow().isoformat()},"
                    f"{self.model_name},"
                    f"{thinking_mode or 'default'},"
                    f"final_success,"
                    f"{u.get('prompt_tokens')},"
                    f"{u.get('output_tokens')},"
                    f"{u.get('thinking_tokens')},"
                    f"{u.get('total_tokens')},"
                    f"{len(result_text)}\n"
                )
        except Exception as csv_e:
            if verbose_block_logging:
                print(f"Stats logging failed: {csv_e}")

    # Continuation (if enabled) happens after usage/stats were captured above.
    return self._maybe_continue(
        result_text, prompt, image, thinking_budget, safety_settings,
        auto_continue, max_auto_continuations, continuation_min_new_chars,
        verbose_block_logging,
    )
|
| 486 |
+
|
| 487 |
+
class ClaudeInference(BaseAPIInference):
    """Anthropic Claude 3 inference (Opus, Sonnet, Haiku)."""

    def __init__(
        self,
        api_key: str,
        model: str = "claude-3-5-sonnet-20241022",  # claude-3-5-sonnet-20241022, claude-3-opus-20240229, claude-3-haiku-20240307
        default_prompt: Optional[str] = None
    ):
        """
        Initialize Claude inference.

        Args:
            api_key: Anthropic API key
            model: Model name
            default_prompt: Default transcription prompt

        Raises:
            ImportError: If the ``anthropic`` package is not installed
                (``CLAUDE_AVAILABLE`` is false).
        """
        if not CLAUDE_AVAILABLE:
            raise ImportError("Anthropic library not installed. Install with: pip install anthropic")

        super().__init__(api_key, default_prompt)
        self.model = model
        self.client = Anthropic(api_key=api_key)

    def _get_default_prompt(self) -> str:
        """Return the built-in transcription prompt."""
        return (
            "Transcribe all handwritten text in this manuscript image. "
            "Preserve the original language (Cyrillic, Latin, etc.) and layout. "
            "Output only the transcribed text without any additional commentary."
        )

    def transcribe(
        self,
        image: Image.Image,
        prompt: Optional[str] = None,
        max_tokens: int = 500,
        temperature: float = 0.0,
        **kwargs
    ) -> str:
        """
        Transcribe with Anthropic Claude.

        Args:
            image: PIL Image
            prompt: Custom prompt (``self.default_prompt`` if None/empty)
            max_tokens: Maximum tokens to generate
            temperature: Sampling temperature (0.0 = deterministic)
            **kwargs: Additional parameters passed to ``messages.create``

        Returns:
            Transcribed text: the concatenation of all text blocks in the
            response, stripped of surrounding whitespace.

        Raises:
            ValueError: If the response contains no text block at all
                (previously this surfaced as an opaque IndexError or
                AttributeError).
        """
        prompt = prompt or self.default_prompt

        # Resize if needed (Claude supports up to 1568px on longest side)
        image = self.resize_image_if_needed(image, max_dimension=1568)

        # Encode image
        base64_image = self.encode_image_base64(image, format="PNG")

        # API call
        response = self.client.messages.create(
            model=self.model,
            max_tokens=max_tokens,
            temperature=temperature,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": base64_image
                            }
                        },
                        {
                            "type": "text",
                            "text": prompt
                        }
                    ]
                }
            ],
            **kwargs
        )

        # The content list may be empty or start with a non-text block, so
        # collect every block that actually carries text instead of blindly
        # indexing element 0.
        text_parts = [
            block.text
            for block in (response.content or [])
            if isinstance(getattr(block, "text", None), str)
        ]
        if not text_parts:
            raise ValueError(
                "Claude returned no text content "
                f"(stop_reason={getattr(response, 'stop_reason', None)})"
            )

        return "".join(text_parts).strip()
|
| 575 |
+
|
| 576 |
+
|
| 577 |
+
# Model availability checks
|
| 578 |
+
def check_api_availability() -> Dict[str, bool]:
    """Report the module-level availability flag of each API provider.

    Returns:
        Mapping with the keys ``"openai"``, ``"gemini"`` and ``"claude"``,
        each holding the corresponding ``*_AVAILABLE`` flag.
    """
    availability = dict(
        openai=OPENAI_AVAILABLE,
        gemini=GEMINI_AVAILABLE,
        claude=CLAUDE_AVAILABLE,
    )
    return availability
|
| 585 |
+
|
| 586 |
+
|
| 587 |
+
# Fallback API model lists (used if dynamic fetching fails)
# Each list is returned by the matching fetch_*_models() function when the
# provider library is missing, no API key is given, or the API call fails.
OPENAI_MODELS_FALLBACK = [
    "gpt-4o",
    "gpt-4o-mini",
    "gpt-4o-2024-11-20",
    "chatgpt-4o-latest",
    "gpt-4-turbo",
    "gpt-4-vision-preview",
    "o1-preview",
    "o1-mini",
]

GEMINI_MODELS_FALLBACK = [
    # Free tier models (generally available)
    "gemini-1.5-flash",
    "gemini-1.5-flash-002",
    "gemini-1.5-flash-8b",
    "gemini-2.0-flash-exp",
    # Paid/preview models (may require upgrade)
    "gemini-1.5-pro",
    "gemini-1.5-pro-002",
    "gemini-1.5-pro-exp-0827",
    # Experimental (may not be available to all users)
    "gemini-exp-1206",
    "gemini-exp-1121",
    # Gemini 3 preview models (latest, may have restrictions)
    "gemini-3-pro-preview",
]

# NOTE(review): model IDs are hard-coded snapshots -- verify against the
# provider's current model list before relying on them.
CLAUDE_MODELS_FALLBACK = [
    "claude-opus-4-6",
    "claude-sonnet-4-6",
    "claude-haiku-4-5-20251001",
    "claude-3-5-sonnet-20241022",
    "claude-3-5-haiku-20241022",
    "claude-3-opus-20240229",
    "claude-3-haiku-20240307",
]
|
| 625 |
+
|
| 626 |
+
|
| 627 |
+
def fetch_openai_models(api_key: Optional[str] = None) -> list:
    """
    Dynamically fetch available OpenAI models from API.

    Args:
        api_key: OpenAI API key. When missing/empty no request is made and
            the fallback list is returned (no environment variable is
            consulted by this function).

    Returns:
        List of model IDs whose name contains one of the known
        vision-family markers, sorted in reverse lexicographic order, or a
        copy of ``OPENAI_MODELS_FALLBACK`` if the library is unavailable,
        no key is given, nothing matches, or the request fails.
    """
    # Always hand out a copy so callers can never mutate the shared constant.
    if not OPENAI_AVAILABLE or not api_key:
        return list(OPENAI_MODELS_FALLBACK)

    # Substrings identifying the GPT-4 vision family and o1 models.
    vision_markers = (
        "gpt-4o", "gpt-4-turbo", "gpt-4-vision",
        "chatgpt-4o", "o1-", "gpt-4.5",  # Include potential GPT-4.5
    )

    try:
        client = OpenAI(api_key=api_key)
        models = client.models.list()

        vision_models = [
            model.id
            for model in models.data
            if any(marker in model.id for marker in vision_markers)
        ]

        # Reverse lexicographic order (roughly newest/best first)
        vision_models.sort(reverse=True)

        # Return dynamic list if we found models, otherwise fallback
        return vision_models if vision_models else list(OPENAI_MODELS_FALLBACK)

    except Exception as e:
        # Best effort: any API/network problem degrades to the static list.
        print(f"[OpenAI] Could not fetch models dynamically: {e}")
        print("[OpenAI] Using fallback model list")
        return list(OPENAI_MODELS_FALLBACK)
|
| 668 |
+
|
| 669 |
+
|
| 670 |
+
def fetch_gemini_models(api_key: Optional[str] = None) -> list:
    """Dynamically fetch available Gemini models; returns fallback list on failure.

    Args:
        api_key: Gemini API key. When missing/empty no request is made.

    Returns:
        Names (without the ``models/`` prefix) of models that support
        ``generateContent`` and start with ``"gemini"``, in reverse
        lexicographic order; or a copy of ``GEMINI_MODELS_FALLBACK`` when
        the library is unavailable, no key is given, nothing matches, or
        the request fails.
    """
    # Always hand out a copy so callers can never mutate the shared constant.
    if not GEMINI_AVAILABLE or not api_key:
        return list(GEMINI_MODELS_FALLBACK)
    try:
        if GEMINI_NEW_SDK:
            client = _google_genai_new.Client(api_key=api_key)
            candidates = [
                m for m in client.models.list()
                if "generateContent" in (getattr(m, "supported_actions", None) or [])
            ]
        else:
            genai.configure(api_key=api_key)
            candidates = [
                m for m in genai.list_models()
                if "generateContent" in m.supported_generation_methods
            ]
        # Skip entries without a name: one malformed entry must not throw
        # away the whole dynamically fetched list.
        models = [
            m.name.replace("models/", "")
            for m in candidates
            if getattr(m, "name", None)
        ]
        models = [m for m in models if m.startswith("gemini")]
        models.sort(reverse=True)
        return models if models else list(GEMINI_MODELS_FALLBACK)
    except Exception as e:
        # Best effort: any API/network problem degrades to the static list.
        print(f"[Gemini] Could not fetch models: {e}")
        return list(GEMINI_MODELS_FALLBACK)
|
| 697 |
+
|
| 698 |
+
def fetch_claude_models(api_key: Optional[str] = None) -> list:
    """
    Dynamically fetch available Claude models via Anthropic API.

    Args:
        api_key: Anthropic API key. When missing/empty no request is made.

    Returns:
        List of Claude model IDs in reverse lexicographic order (which only
        approximates "newest first"), or a copy of
        ``CLAUDE_MODELS_FALLBACK`` if the library is unavailable, no key is
        given, the API returns nothing, or the request fails.
    """
    # Always hand out a copy so callers can never mutate the shared constant.
    if not CLAUDE_AVAILABLE or not api_key:
        return list(CLAUDE_MODELS_FALLBACK)

    try:
        client = Anthropic(api_key=api_key)
        # Iterate the page object itself rather than reading `.data`: the
        # SDK's auto-pagination then follows further pages, whereas `.data`
        # only exposes the first page of results.
        model_ids = [m.id for m in client.models.list()]
        # Reverse sort (IDs contain dates like -20241022 or version numbers)
        model_ids.sort(reverse=True)
        return model_ids if model_ids else list(CLAUDE_MODELS_FALLBACK)

    except Exception as e:
        # Best effort: any API/network problem degrades to the static list.
        print(f"[Claude] Could not fetch models dynamically: {e}")
        return list(CLAUDE_MODELS_FALLBACK)
|
| 722 |
+
|
| 723 |
+
|
| 724 |
+
# Initialize model lists (will be updated when API keys are provided)
# Independent copies, so edits to these never touch the *_FALLBACK constants.
OPENAI_MODELS = list(OPENAI_MODELS_FALLBACK)
GEMINI_MODELS = list(GEMINI_MODELS_FALLBACK)
CLAUDE_MODELS = list(CLAUDE_MODELS_FALLBACK)


if __name__ == "__main__":
    # Command-line demo: transcribe one image with the chosen provider.
    import sys

    if len(sys.argv) < 4:
        print("Usage: python inference_commercial_api.py <provider> <api_key> <image_path>")
        print("Providers: openai, gemini, claude")
        sys.exit(1)

    provider, api_key, image_path = sys.argv[1].lower(), sys.argv[2], sys.argv[3]

    # Load image
    image = Image.open(image_path).convert("RGB")

    # Pick the inference client class for the requested provider
    inference_classes = {
        "openai": OpenAIInference,
        "gemini": GeminiInference,
        "claude": ClaudeInference,
    }
    inference_cls = inference_classes.get(provider)
    if inference_cls is None:
        print(f"Unknown provider: {provider}")
        sys.exit(1)
    api = inference_cls(api_key)

    # Transcribe
    print(f"Transcribing with {provider}...")
    text = api.transcribe(image)
    print(f"\nResult: {text}")
|
inference_page.py
ADDED
|
@@ -0,0 +1,946 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Whole-page OCR inference for Ukrainian handwritten text using TrOCR.
|
| 3 |
+
|
| 4 |
+
This script performs line segmentation and transcription on unsegmented page images.
|
| 5 |
+
|
| 6 |
+
Usage:
|
| 7 |
+
# Basic usage with checkpoint
|
| 8 |
+
python inference_page.py --image path/to/page.jpg --checkpoint models/ukrainian_model/checkpoint-3000
|
| 9 |
+
|
| 10 |
+
# With custom settings
|
| 11 |
+
python inference_page.py --image page.jpg --checkpoint checkpoint-3000 --num_beams 4 --output output.txt
|
| 12 |
+
|
| 13 |
+
# With Transkribus PAGE XML (uses existing segmentation)
|
| 14 |
+
python inference_page.py --image page.jpg --xml page.xml --checkpoint checkpoint-3000
|
| 15 |
+
|
| 16 |
+
Future: Can be extended with a GUI using tkinter or PyQt.
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
import argparse
|
| 20 |
+
import torch
|
| 21 |
+
from pathlib import Path
|
| 22 |
+
from PIL import Image, ImageDraw
|
| 23 |
+
import numpy as np
|
| 24 |
+
from typing import List, Tuple, Optional
|
| 25 |
+
import xml.etree.ElementTree as ET
|
| 26 |
+
from dataclasses import dataclass
|
| 27 |
+
import cv2
|
| 28 |
+
|
| 29 |
+
# Disable PIL DecompressionBomb protection for large manuscript images
|
| 30 |
+
Image.MAX_IMAGE_PIXELS = None
|
| 31 |
+
|
| 32 |
+
# Optional: the hosted Hugging Face Space uses this module for segmentation, but
|
| 33 |
+
# does not enable TrOCR inference. Avoid making transformers a startup dependency.
|
| 34 |
+
try:
|
| 35 |
+
from transformers import VisionEncoderDecoderModel, TrOCRProcessor
|
| 36 |
+
except ImportError:
|
| 37 |
+
VisionEncoderDecoderModel = None
|
| 38 |
+
TrOCRProcessor = None
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
@dataclass
class LineSegment:
    """Represents a segmented text line.

    Only ``image`` and ``bbox`` are required; the remaining fields default
    to None.
    """
    image: Image.Image  # image of the line (LineSegmenter stores the crop of ``bbox`` here)
    bbox: Tuple[int, int, int, int]  # x1, y1, x2, y2
    coords: Optional[List[Tuple[int, int]]] = None  # polygon coordinates if available
    text: Optional[str] = None  # transcription result
    confidence: Optional[float] = None  # average confidence score (0-1)
    char_confidences: Optional[List[float]] = None  # per-character confidence scores
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def sort_lines_by_region(regions, lines):
    """
    Put lines into reading order: regions from left to right, and inside
    each region the lines from top to bottom.

    A line belongs to the first region (in left-to-right order) whose
    bounding box contains the centre point of the line's bounding box.
    Lines that fall into no region are appended at the end, top to bottom.

    Args:
        regions: Region-like objects exposing ``.bbox`` (x1, y1, x2, y2)
                 and ``.id``. If empty/None, the lines are simply sorted
                 top-to-bottom.
        lines: Line-like objects exposing ``.bbox`` (x1, y1, x2, y2).

    Returns:
        A new list with the lines in reading order.
    """
    def top_edge(item):
        return item.bbox[1]

    def contains(region, px, py):
        left, top, right, bottom = region.bbox
        return left <= px <= right and top <= py <= bottom

    # Without region information fall back to a plain vertical sort.
    if not (regions and lines):
        return sorted(lines, key=top_edge)

    # Regions ordered by the horizontal centre of their bounding box.
    columns = sorted(
        regions,
        key=lambda region: (region.bbox[0] + region.bbox[2]) / 2,
    )

    buckets = {}
    for region in columns:
        buckets[region.id] = []
    leftovers = []

    for line in lines:
        center_x = (line.bbox[0] + line.bbox[2]) / 2
        center_y = (line.bbox[1] + line.bbox[3]) / 2
        home = next(
            (region for region in columns if contains(region, center_x, center_y)),
            None,
        )
        target = leftovers if home is None else buckets[home.id]
        target.append(line)

    # Concatenate the per-region groups, then whatever matched no region.
    ordered = []
    for region in columns:
        buckets[region.id].sort(key=top_edge)
        ordered.extend(buckets[region.id])
    leftovers.sort(key=top_edge)
    ordered.extend(leftovers)
    return ordered
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def normalize_background(image: Image.Image) -> Image.Image:
    """
    Normalize background to light gray (similar to Efendiev dataset).

    CRITICAL for Ukrainian dataset: Models trained on data with background
    normalization MUST have normalization applied at inference time as well.

    Steps: CLAHE on the lightness channel in LAB space, conversion to
    grayscale, then expansion back to a 3-channel RGB image.

    Args:
        image: PIL Image with potentially aged/colored background. Any
            mode is accepted; it is converted to RGB first.

    Returns:
        PIL Image (RGB, all three channels equal) with normalized background
    """
    # cv2.COLOR_RGB2LAB requires a 3-channel array, so grayscale, RGBA or
    # palette images are coerced to RGB first (no-op copy for RGB input).
    img_array = np.array(image.convert("RGB"))

    # Convert to LAB color space for better lighting normalization
    lab = cv2.cvtColor(img_array, cv2.COLOR_RGB2LAB)
    l, a, b = cv2.split(lab)

    # Apply CLAHE (Contrast Limited Adaptive Histogram Equalization) to L channel
    # This normalizes lighting variations across the image
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    l_normalized = clahe.apply(l)

    # Merge back and convert to RGB
    lab_normalized = cv2.merge([l_normalized, a, b])
    rgb_normalized = cv2.cvtColor(lab_normalized, cv2.COLOR_LAB2RGB)

    # Convert to grayscale to remove color variations (aged paper tones)
    gray = cv2.cvtColor(rgb_normalized, cv2.COLOR_RGB2GRAY)

    # Convert back to RGB so downstream code still receives 3 channels
    normalized_rgb = cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB)

    return Image.fromarray(normalized_rgb)
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
class LineSegmenter:
|
| 148 |
+
"""Improved line segmentation using horizontal projection with multiple strategies."""
|
| 149 |
+
|
| 150 |
+
def __init__(self, min_line_height: int = 15, min_gap: int = 5,
|
| 151 |
+
sensitivity: float = 0.02, use_morph: bool = True):
|
| 152 |
+
"""
|
| 153 |
+
Initialize LineSegmenter.
|
| 154 |
+
|
| 155 |
+
Args:
|
| 156 |
+
min_line_height: Minimum height of a line in pixels (default: 15, lowered for tighter spacing)
|
| 157 |
+
min_gap: Minimum gap between lines in pixels (default: 5, lowered for tight spacing)
|
| 158 |
+
sensitivity: Threshold for detecting text (0.01-0.1, lower = more sensitive, default: 0.02)
|
| 159 |
+
use_morph: Apply morphological operations to clean up detection (default: True)
|
| 160 |
+
"""
|
| 161 |
+
self.min_line_height = min_line_height
|
| 162 |
+
self.min_gap = min_gap
|
| 163 |
+
self.sensitivity = sensitivity
|
| 164 |
+
self.use_morph = use_morph
|
| 165 |
+
|
| 166 |
+
def segment_lines(self, image: Image.Image, debug: bool = False) -> List[LineSegment]:
|
| 167 |
+
"""
|
| 168 |
+
Segment page image into text lines using horizontal projection.
|
| 169 |
+
|
| 170 |
+
Improved algorithm:
|
| 171 |
+
1. Multiple binarization strategies (Otsu + Sauvola for different scripts)
|
| 172 |
+
2. Morphological operations to connect broken text
|
| 173 |
+
3. Lower sensitivity threshold for tight line spacing
|
| 174 |
+
4. Smart gap detection based on local context
|
| 175 |
+
|
| 176 |
+
Args:
|
| 177 |
+
image: Input page image (PIL Image)
|
| 178 |
+
debug: If True, visualize segmentation
|
| 179 |
+
|
| 180 |
+
Returns:
|
| 181 |
+
List of LineSegment objects
|
| 182 |
+
"""
|
| 183 |
+
# Convert to grayscale
|
| 184 |
+
gray = np.array(image.convert('L'))
|
| 185 |
+
|
| 186 |
+
# Try multiple binarization strategies and combine
|
| 187 |
+
from scipy.ndimage import gaussian_filter
|
| 188 |
+
blurred = gaussian_filter(gray, sigma=1.0)
|
| 189 |
+
|
| 190 |
+
# Strategy 1: Otsu's method (global threshold)
|
| 191 |
+
threshold_otsu = self._otsu_threshold(blurred)
|
| 192 |
+
binary_otsu = blurred < threshold_otsu
|
| 193 |
+
|
| 194 |
+
# Strategy 2: Adaptive threshold (local threshold, better for varying contrast)
|
| 195 |
+
binary_adaptive = self._adaptive_threshold(gray)
|
| 196 |
+
|
| 197 |
+
# Combine both strategies (logical OR to catch text in both)
|
| 198 |
+
binary = np.logical_or(binary_otsu, binary_adaptive)
|
| 199 |
+
|
| 200 |
+
# Apply morphological closing to connect broken characters
|
| 201 |
+
if self.use_morph:
|
| 202 |
+
from scipy.ndimage import binary_closing
|
| 203 |
+
# Horizontal structuring element to connect characters on same line
|
| 204 |
+
struct = np.ones((3, 5)) # 3 pixels tall, 5 pixels wide
|
| 205 |
+
binary = binary_closing(binary, structure=struct, iterations=2)
|
| 206 |
+
|
| 207 |
+
# Horizontal projection (sum of black pixels per row)
|
| 208 |
+
h_projection = binary.sum(axis=1)
|
| 209 |
+
|
| 210 |
+
# Adaptive threshold based on image statistics
|
| 211 |
+
# Use lower threshold for better sensitivity
|
| 212 |
+
if h_projection.max() > 0:
|
| 213 |
+
threshold = h_projection.max() * self.sensitivity
|
| 214 |
+
else:
|
| 215 |
+
# Fallback if no text detected
|
| 216 |
+
threshold = 1
|
| 217 |
+
|
| 218 |
+
is_text = h_projection > threshold
|
| 219 |
+
|
| 220 |
+
# Apply median filter to smooth out noise in projection
|
| 221 |
+
from scipy.ndimage import median_filter
|
| 222 |
+
is_text_smoothed = median_filter(is_text.astype(float), size=3) > 0.5
|
| 223 |
+
|
| 224 |
+
# Find continuous text regions with improved gap detection
|
| 225 |
+
lines = []
|
| 226 |
+
in_line = False
|
| 227 |
+
start_y = 0
|
| 228 |
+
gap_count = 0
|
| 229 |
+
|
| 230 |
+
for y in range(len(is_text_smoothed)):
|
| 231 |
+
if is_text_smoothed[y]:
|
| 232 |
+
if not in_line:
|
| 233 |
+
# Start of new line
|
| 234 |
+
start_y = y
|
| 235 |
+
in_line = True
|
| 236 |
+
gap_count = 0
|
| 237 |
+
else:
|
| 238 |
+
# Continue line, reset gap counter
|
| 239 |
+
gap_count = 0
|
| 240 |
+
else:
|
| 241 |
+
if in_line:
|
| 242 |
+
# Potential gap - count consecutive gap pixels
|
| 243 |
+
gap_count += 1
|
| 244 |
+
if gap_count >= self.min_gap:
|
| 245 |
+
# End of line (gap is large enough)
|
| 246 |
+
end_y = y - gap_count
|
| 247 |
+
if end_y - start_y >= self.min_line_height:
|
| 248 |
+
lines.append((start_y, end_y))
|
| 249 |
+
in_line = False
|
| 250 |
+
gap_count = 0
|
| 251 |
+
|
| 252 |
+
# Don't forget last line if image ends with text
|
| 253 |
+
if in_line and len(is_text_smoothed) - start_y >= self.min_line_height:
|
| 254 |
+
lines.append((start_y, len(is_text_smoothed)))
|
| 255 |
+
|
| 256 |
+
# Post-process: Merge lines that are too close (likely one line split incorrectly)
|
| 257 |
+
merged_lines = self._merge_close_lines(lines, max_gap=self.min_gap * 2)
|
| 258 |
+
|
| 259 |
+
# Create LineSegment objects
|
| 260 |
+
segments = []
|
| 261 |
+
width = image.width
|
| 262 |
+
|
| 263 |
+
for y1, y2 in merged_lines:
|
| 264 |
+
# Add padding (larger padding for better context)
|
| 265 |
+
padding = 8
|
| 266 |
+
y1_pad = max(0, y1 - padding)
|
| 267 |
+
y2_pad = min(image.height, y2 + padding)
|
| 268 |
+
|
| 269 |
+
# Crop line (full width for now, could be refined with vertical projection)
|
| 270 |
+
bbox = (0, y1_pad, width, y2_pad)
|
| 271 |
+
line_img = image.crop(bbox)
|
| 272 |
+
|
| 273 |
+
segments.append(LineSegment(
|
| 274 |
+
image=line_img,
|
| 275 |
+
bbox=bbox
|
| 276 |
+
))
|
| 277 |
+
|
| 278 |
+
if debug:
|
| 279 |
+
self._visualize_segmentation(image, segments, h_projection)
|
| 280 |
+
|
| 281 |
+
print(f"[LineSegmenter] Detected {len(segments)} lines (sensitivity={self.sensitivity}, min_height={self.min_line_height})")
|
| 282 |
+
|
| 283 |
+
return segments
|
| 284 |
+
|
| 285 |
+
@staticmethod
|
| 286 |
+
def _adaptive_threshold(gray: np.ndarray, block_size: int = 35) -> np.ndarray:
|
| 287 |
+
"""
|
| 288 |
+
Apply adaptive thresholding using a local window.
|
| 289 |
+
Better for images with varying illumination or contrast.
|
| 290 |
+
"""
|
| 291 |
+
# Use cv2 if available, otherwise fallback to simple method
|
| 292 |
+
try:
|
| 293 |
+
import cv2
|
| 294 |
+
# Adaptive Gaussian thresholding
|
| 295 |
+
binary = cv2.adaptiveThreshold(
|
| 296 |
+
gray.astype(np.uint8),
|
| 297 |
+
255,
|
| 298 |
+
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
|
| 299 |
+
cv2.THRESH_BINARY_INV,
|
| 300 |
+
block_size,
|
| 301 |
+
10
|
| 302 |
+
)
|
| 303 |
+
return binary > 0
|
| 304 |
+
except:
|
| 305 |
+
# Fallback: simple global threshold
|
| 306 |
+
threshold = np.mean(gray) - np.std(gray) * 0.5
|
| 307 |
+
return gray < threshold
|
| 308 |
+
|
| 309 |
+
@staticmethod
|
| 310 |
+
def _merge_close_lines(lines: List[Tuple[int, int]], max_gap: int = 10) -> List[Tuple[int, int]]:
|
| 311 |
+
"""Merge lines that are very close together (likely one line split incorrectly)."""
|
| 312 |
+
if not lines:
|
| 313 |
+
return lines
|
| 314 |
+
|
| 315 |
+
merged = [lines[0]]
|
| 316 |
+
for y1, y2 in lines[1:]:
|
| 317 |
+
prev_y1, prev_y2 = merged[-1]
|
| 318 |
+
gap = y1 - prev_y2
|
| 319 |
+
|
| 320 |
+
if gap <= max_gap:
|
| 321 |
+
# Merge with previous line
|
| 322 |
+
merged[-1] = (prev_y1, y2)
|
| 323 |
+
else:
|
| 324 |
+
# Add as new line
|
| 325 |
+
merged.append((y1, y2))
|
| 326 |
+
|
| 327 |
+
return merged
|
| 328 |
+
|
| 329 |
+
@staticmethod
|
| 330 |
+
def _otsu_threshold(gray_array: np.ndarray) -> float:
|
| 331 |
+
"""Compute Otsu's threshold."""
|
| 332 |
+
hist, bin_edges = np.histogram(gray_array, bins=256, range=(0, 256))
|
| 333 |
+
hist = hist.astype(float)
|
| 334 |
+
|
| 335 |
+
# Normalize
|
| 336 |
+
hist /= hist.sum()
|
| 337 |
+
|
| 338 |
+
# Cumulative sums
|
| 339 |
+
weight1 = np.cumsum(hist)
|
| 340 |
+
weight2 = np.cumsum(hist[::-1])[::-1]
|
| 341 |
+
|
| 342 |
+
# Cumulative means
|
| 343 |
+
mean1 = np.cumsum(hist * np.arange(256))
|
| 344 |
+
mean2 = (np.cumsum((hist * np.arange(256))[::-1])[::-1])
|
| 345 |
+
|
| 346 |
+
# Avoid division by zero
|
| 347 |
+
weight1 = np.clip(weight1, 1e-10, 1)
|
| 348 |
+
weight2 = np.clip(weight2, 1e-10, 1)
|
| 349 |
+
|
| 350 |
+
# Between-class variance
|
| 351 |
+
variance = weight1 * weight2 * ((mean1 / weight1) - (mean2 / weight2)) ** 2
|
| 352 |
+
|
| 353 |
+
return np.argmax(variance)
|
| 354 |
+
|
| 355 |
+
@staticmethod
def _visualize_segmentation(image: Image.Image, segments: List[LineSegment],
                            h_projection: Optional[np.ndarray] = None):
    """Show the detected line boxes on a copy of the page (debug aid).

    Each segment's bbox is outlined and labelled "Line N", alternating red
    and blue so neighbouring lines are easy to tell apart. When a projection
    profile is supplied it is plotted in a separate matplotlib window.
    """
    canvas = image.copy()
    pen = ImageDraw.Draw(canvas)
    palette = ('red', 'blue')

    for number, segment in enumerate(segments, start=1):
        left, top, right, bottom = segment.bbox
        # Odd-numbered lines are red, even-numbered lines blue
        shade = palette[(number - 1) % 2]
        pen.rectangle([left, top, right, bottom], outline=shade, width=2)
        pen.text((left + 5, top + 5), f"Line {number}", fill=shade)

    canvas.show()

    # The profile plot is optional
    if h_projection is None:
        return

    import matplotlib.pyplot as plt
    plt.figure(figsize=(12, 4))
    plt.plot(h_projection)
    plt.title("Horizontal Projection Profile")
    plt.xlabel("Y Position")
    plt.ylabel("Text Density")
    plt.grid(True)
    plt.show()
|
| 381 |
+
|
| 382 |
+
|
| 383 |
+
class PageXMLSegmenter:
    """Segment using existing Transkribus PAGE XML annotations.

    Line polygons are read from the XML file, rescaled to the size of the
    supplied image, cropped with a small padding and returned in reading
    order. After ``segment_lines()`` has run, ``region_data`` holds one dict
    per TextRegion (id, bbox, num_lines) for visualization in the viewer.
    """

    NS = {'page': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'}
    # Second namespace tried when the 2013 one finds no <Page> element.
    _NS_2019 = {'page': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15'}

    def __init__(self, xml_path: str):
        self.xml_path = Path(xml_path)
        # Defined up front so reading it before segment_lines() gives an
        # empty list instead of an AttributeError.
        self.region_data: list = []

    def segment_lines(self, image: 'Image.Image') -> List['LineSegment']:
        """Extract lines using PAGE XML coordinates with correct reading order.

        Args:
            image: Page image to crop the lines from.

        Returns:
            LineSegment objects (cropped image, padded bbox, polygon coords),
            regions first sorted by reading order, then lines inside each
            region sorted by reading order.
        """
        tree = ET.parse(self.xml_path)
        root = tree.getroot()

        # Determine scale factors: PAGE XML stores absolute pixel coords for the
        # original scan. If the uploaded image was resized, we must scale coords.
        ns = self.NS
        # Try both common PAGE XML namespaces (2013 and 2019 Transkribus variants)
        page_elem = root.find('.//page:Page', ns)
        if page_elem is None:
            page_elem = root.find('.//page:Page', self._NS_2019)
            if page_elem is not None:
                ns = self._NS_2019
        xml_w = int(page_elem.get('imageWidth', image.width)) if page_elem is not None else image.width
        xml_h = int(page_elem.get('imageHeight', image.height)) if page_elem is not None else image.height
        scale_x = image.width / xml_w if xml_w > 0 else 1.0
        scale_y = image.height / xml_h if xml_h > 0 else 1.0

        # Rebuilt on every call
        self.region_data = []

        regions_with_order = []

        for region in root.findall('.//page:TextRegion', ns):
            region_order = self._extract_reading_order(region.get('custom', ''))

            # Y coordinate used when the region carries no reading order
            region_y = self._get_region_y_position(region, ns)

            lines_with_order = []

            for text_line in region.findall('.//page:TextLine', ns):
                coords = self._read_polygon(text_line.find('page:Coords', ns), scale_x, scale_y)
                if not coords:
                    # Missing <Coords>, missing/empty points attribute, or a
                    # points string without any point: nothing to crop.
                    continue
                x1, y1, x2, y2 = self._get_bounding_box(coords)

                # Crop line with padding, clamped to the image
                padding = 5
                bbox = (
                    max(0, x1 - padding),
                    max(0, y1 - padding),
                    min(image.width, x2 + padding),
                    min(image.height, y2 + padding),
                )
                segment = LineSegment(
                    image=image.crop(bbox),
                    bbox=bbox,
                    coords=coords
                )

                line_order = self._extract_reading_order(text_line.get('custom', ''))
                lines_with_order.append((self._sort_key(line_order, y1), segment))

            lines_with_order.sort(key=lambda item: item[0])
            sorted_lines = [seg for _, seg in lines_with_order]

            # Collect TextRegion bbox for viewer visualization
            region_id = region.get('id', f'region_{len(regions_with_order)}')
            region_coords = self._read_polygon(region.find('page:Coords', ns), scale_x, scale_y)
            if region_coords:
                rx1, ry1, rx2, ry2 = self._get_bounding_box(region_coords)
                self.region_data.append({
                    "id": region_id,
                    "bbox": [rx1, ry1, rx2, ry2],
                    "num_lines": len(sorted_lines),
                })

            regions_with_order.append((self._sort_key(region_order, region_y), sorted_lines))

        # Sort regions by reading order (or Y position fallback)
        regions_with_order.sort(key=lambda item: item[0])

        # Flatten: concatenate all lines from all regions in order
        segments = []
        for _, region_lines in regions_with_order:
            segments.extend(region_lines)

        return segments

    @staticmethod
    def _sort_key(reading_order: Optional[int], y_position: int) -> Tuple[int, int]:
        """Build a sort key that never compares reading-order indices with pixel rows.

        Items with an explicit reading order sort first, by index; items
        without one follow, top to bottom. When all items are of one kind the
        resulting order equals a plain sort on that value.
        """
        if reading_order is not None:
            return (0, reading_order)
        return (1, y_position)

    def _read_polygon(self, coords_elem, scale_x: float, scale_y: float) -> List[Tuple[int, int]]:
        """Return the scaled polygon of a <Coords> element, or [] if it has none."""
        if coords_elem is None:
            return []
        points_str = coords_elem.get('points')
        if not points_str:
            return []
        coords = self._parse_coords(points_str)
        if scale_x != 1.0 or scale_y != 1.0:
            coords = [(int(x * scale_x), int(y * scale_y)) for x, y in coords]
        return coords

    @staticmethod
    def _extract_reading_order(custom_attr: str) -> Optional[int]:
        """Extract reading order index from custom attribute.

        Format: custom="readingOrder {index:5;}"
        Returns: 5 (or None if not found/parseable)
        """
        if not custom_attr or 'readingOrder' not in custom_attr:
            return None

        try:
            # Find "index:X;" pattern
            start = custom_attr.index('index:') + 6
            end = custom_attr.index(';', start)
            return int(custom_attr[start:end])
        except (ValueError, IndexError):
            return None

    def _get_region_y_position(self, region, ns=None) -> int:
        """Get Y position of region for fallback sorting.

        Uses the top Y of the region's own Coords; if those are missing or
        contain no points, the top Y of the first TextLine; otherwise 0.
        Values are in XML coordinates (not rescaled).
        """
        if ns is None:
            ns = self.NS

        candidates = [region.find('page:Coords', ns)]
        text_line = region.find('.//page:TextLine', ns)
        if text_line is not None:
            candidates.append(text_line.find('page:Coords', ns))

        for coords_elem in candidates:
            if coords_elem is None:
                continue
            coords_str = coords_elem.get('points')
            if not coords_str:
                continue
            coords = self._parse_coords(coords_str)
            if not coords:
                # e.g. whitespace-only points attribute; try the next source
                continue
            _, y1, _, _ = self._get_bounding_box(coords)
            return y1

        # Default fallback
        return 0

    @staticmethod
    def _parse_coords(coords_str: str) -> List[Tuple[int, int]]:
        """Parse a PAGE XML points string ("x1,y1 x2,y2 ...") into integer pairs.

        Decimal values are accepted and rounded to the nearest integer.
        A token without a comma raises IndexError and a non-numeric value
        raises ValueError, as before.
        """
        def to_int(value: str) -> int:
            try:
                return int(value)
            except ValueError:
                return int(round(float(value)))

        result = []
        for token in coords_str.split():
            parts = token.split(',')
            result.append((to_int(parts[0]), to_int(parts[1])))
        return result

    @staticmethod
    def _get_bounding_box(coords: List[Tuple[int, int]]) -> Tuple[int, int, int, int]:
        """Get the (min_x, min_y, max_x, max_y) box of a non-empty polygon."""
        xs = [p[0] for p in coords]
        ys = [p[1] for p in coords]
        return min(xs), min(ys), max(xs), max(ys)
|
| 560 |
+
|
| 561 |
+
|
| 562 |
+
class TrOCRInference:
    """TrOCR model inference.

    Loads a processor and a VisionEncoderDecoderModel (from the HuggingFace
    Hub or a local checkpoint) and transcribes single line images, optionally
    returning per-token confidences.
    """

    def __init__(self, model_path: str, device: Optional[str] = None,
                 base_model: str = "kazars24/trocr-base-handwritten-ru",
                 normalize_bg: bool = False,
                 flip_rtl: bool = False,
                 is_huggingface: bool = False):
        """
        Initialize TrOCR inference.

        Args:
            model_path: Path to local checkpoint or HuggingFace model ID
            device: 'cuda', 'cpu', or None for auto-detect
            base_model: Base model for processor (used with local checkpoints)
            normalize_bg: Apply background normalization
            flip_rtl: Flip line images horizontally for RTL scripts
            is_huggingface: If True, load from HuggingFace Hub instead of local path

        Raises:
            ImportError: If transformers is not installed.
        """
        self.model_path = model_path
        self.base_model = base_model
        self.normalize_bg = normalize_bg
        self.flip_rtl = flip_rtl
        self.is_huggingface = is_huggingface

        if device is None:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            self.device = device

        print(f"Loading model from {'HuggingFace Hub' if is_huggingface else 'local checkpoint'}: {model_path}...")
        print(f"Using device: {self.device}")
        print(f"Background normalization: {'Enabled' if self.normalize_bg else 'Disabled'}")

        if VisionEncoderDecoderModel is None or TrOCRProcessor is None:
            raise ImportError("TrOCR inference requires transformers to be installed")

        if is_huggingface:
            # Load both processor and model from HuggingFace Hub
            print(f"Downloading from HuggingFace Hub (if not cached): {model_path}")

            # Try to load processor from model first, fallback to base_model if it fails
            try:
                print(f"Attempting to load processor from {model_path}...")
                self.processor = TrOCRProcessor.from_pretrained(model_path)
                # Some models (e.g. dh-unibe/trocr-kurrent) ship a truncated tokenizer
                # with only special tokens (vocab_size=5). The model itself uses the full
                # microsoft/trocr-base-handwritten vocabulary (50265 tokens). Detect this
                # by checking vocab_size and replace only the tokenizer – keep the image
                # processor from the model so preprocessing stays correct.
                if self.processor.tokenizer.vocab_size < 100:
                    print(f"WARNING: tokenizer from '{model_path}' has vocab_size="
                          f"{self.processor.tokenizer.vocab_size} (looks broken). "
                          f"Replacing tokenizer with microsoft/trocr-base-handwritten.")
                    _fallback = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
                    self.processor.tokenizer = _fallback.tokenizer
            except Exception as e:
                print(f"Failed to load processor from model: {e}")
                print(f"Falling back to base model processor: {self.base_model}")
                self.processor = TrOCRProcessor.from_pretrained(self.base_model)

            self.model = VisionEncoderDecoderModel.from_pretrained(
                model_path, low_cpu_mem_usage=False)
            # For backwards compatibility
            self.checkpoint_path = model_path
        else:
            # Load processor from base model, model from local checkpoint
            self.checkpoint_path = Path(model_path)

            # If model_path points to a specific file (e.g., model.safetensors),
            # use the parent directory for from_pretrained()
            if self.checkpoint_path.is_file():
                model_dir = self.checkpoint_path.parent
                print(f"Model path is a file, using directory: {model_dir}")
            else:
                model_dir = self.checkpoint_path

            # Try to load processor from the local model first (correct tokenizer),
            # fall back to base_model for old checkpoints that lack processor files.
            try:
                print(f"Attempting to load processor from local model: {model_dir}")
                self.processor = TrOCRProcessor.from_pretrained(model_dir)
            except Exception as e:
                print(f"Local processor not found ({e}), falling back to base model: {self.base_model}")
                self.processor = TrOCRProcessor.from_pretrained(self.base_model)
            self.model = VisionEncoderDecoderModel.from_pretrained(
                model_dir, low_cpu_mem_usage=False)

        self.model.to(self.device)
        # mBART decoder creates _float_tensor lazily on CPU; force it to the right device now.
        for m in self.model.modules():
            if hasattr(m, '_float_tensor'):
                m._float_tensor = m._float_tensor.to(self.device)
        self.model.eval()

        print("Model loaded successfully!")

    @staticmethod
    def _token_confidences(outputs) -> List[float]:
        """Return the probability of each generated token of the first sequence.

        ``outputs.scores`` holds one tensor per generation step with one row
        per beam. With beam search the returned hypothesis is not guaranteed
        to live in row 0 at every step, so when the generate() output exposes
        ``beam_indices`` the row it names is used; otherwise (greedy decoding,
        or an output object without that field) row 0 is used.
        """
        scores = getattr(outputs, 'scores', None)
        if not scores:
            return []

        import torch.nn.functional as F

        # First (and only) sequence in the batch
        token_ids = outputs.sequences[0]
        beam_indices = getattr(outputs, 'beam_indices', None)

        confidences = []
        for step_idx, score_tensor in enumerate(scores):
            # Position 0 of the sequence is the decoder start token, so the
            # token produced at this step sits at step_idx + 1.
            token_pos = step_idx + 1
            if token_pos >= token_ids.shape[0]:
                break

            beam_row = 0
            if beam_indices is not None:
                if step_idx >= beam_indices.shape[1]:
                    break
                beam_row = int(beam_indices[0, step_idx])
                if beam_row < 0:
                    # Negative index marks steps after this hypothesis ended
                    break

            probs = F.softmax(score_tensor[beam_row], dim=-1)
            confidences.append(probs[token_ids[token_pos]].item())

        return confidences

    def transcribe_line(self, line_image: Image.Image, num_beams: int = 4,
                        max_length: int = 128, return_confidence: bool = False):
        """
        Transcribe a single line image.

        Args:
            line_image: PIL Image of text line
            num_beams: Number of beams for beam search (higher = better quality, slower)
            max_length: Maximum sequence length
            return_confidence: If True, return (text, confidence, char_confidences)

        Returns:
            If return_confidence=False: Transcribed text string
            If return_confidence=True: Tuple of (text, confidence_score, char_confidences)
            where char_confidences holds one probability per generated token
            and confidence_score is their mean (0.0 when no scores are available).
        """
        # Apply background normalization if enabled
        if self.normalize_bg:
            line_image = normalize_background(line_image)

        # Flip horizontally for RTL scripts (model trained on flipped images)
        if self.flip_rtl:
            line_image = line_image.transpose(Image.FLIP_LEFT_RIGHT)

        # Ensure image is in RGB mode (TrOCR requires 3 channels)
        if line_image.mode != 'RGB':
            line_image = line_image.convert('RGB')

        # Prepare image
        pixel_values = self.processor(
            images=line_image,
            return_tensors="pt"
        ).pixel_values.to(self.device)

        avg_confidence = None
        char_confidences = None

        with torch.no_grad():
            if return_confidence:
                # Generate with output scores for confidence
                outputs = self.model.generate(
                    pixel_values,
                    num_beams=num_beams,
                    max_length=max_length,
                    early_stopping=True,
                    output_scores=True,
                    return_dict_in_generate=True
                )
                generated_ids = outputs.sequences

                char_confidences = self._token_confidences(outputs)
                avg_confidence = (
                    sum(char_confidences) / len(char_confidences)
                    if char_confidences else 0.0
                )
            else:
                generated_ids = self.model.generate(
                    pixel_values,
                    num_beams=num_beams,
                    max_length=max_length,
                    early_stopping=True
                )

        # Decode
        text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        if return_confidence:
            return text, avg_confidence, char_confidences
        return text

    def transcribe_segments(self, segments: List[LineSegment],
                            num_beams: int = 4, max_length: int = 128,
                            show_progress: bool = True) -> List[LineSegment]:
        """
        Transcribe multiple line segments.

        Args:
            segments: List of LineSegment objects
            num_beams: Beam search parameter
            max_length: Max sequence length
            show_progress: Show progress bar

        Returns:
            The same segment objects, with their text field filled in place
        """
        if show_progress:
            from tqdm import tqdm
            iterator = tqdm(segments, desc="Transcribing lines")
        else:
            iterator = segments

        for segment in iterator:
            segment.text = self.transcribe_line(
                segment.image,
                num_beams=num_beams,
                max_length=max_length
            )

        return segments
|
| 787 |
+
|
| 788 |
+
|
| 789 |
+
def main():
    """Command-line entry point: segment a page image, transcribe every line
    with TrOCR, write the text next to the image (or to --output) and print it.
    """
    parser = argparse.ArgumentParser(
        description="Whole-page OCR inference for Ukrainian handwritten text"
    )

    # (flag, add_argument keyword arguments), registered in this order
    option_table = [
        ('--image', dict(type=str, required=True,
                         help='Path to input page image')),
        ('--checkpoint', dict(type=str, required=True,
                              help='Path to TrOCR checkpoint directory')),
        ('--xml', dict(type=str, default=None,
                       help='Optional: PAGE XML file for line segmentation (if not provided, automatic segmentation is used)')),
        ('--output', dict(type=str, default=None,
                          help='Output text file (default: <image_name>_transcription.txt)')),
        ('--num_beams', dict(type=int, default=4,
                             help='Number of beams for beam search (default: 4, higher=better quality but slower)')),
        ('--max_length', dict(type=int, default=128,
                              help='Maximum sequence length (default: 128)')),
        ('--min_line_height', dict(type=int, default=20,
                                   help='Minimum line height for automatic segmentation (default: 20)')),
        ('--debug', dict(action='store_true',
                         help='Visualize line segmentation')),
        ('--device', dict(type=str, default=None, choices=['cuda', 'cpu'],
                          help='Device to use for inference (default: auto-detect)')),
        ('--base_model', dict(type=str, default='kazars24/trocr-base-handwritten-ru',
                              help='Base model for processor (default: kazars24/trocr-base-handwritten-ru)')),
        ('--normalize-background', dict(action='store_true',
                                        help='Apply background normalization (REQUIRED if model was trained with --normalize-background)')),
        ('--flip-rtl', dict(action='store_true',
                            help='Flip line images horizontally for RTL scripts (REQUIRED if model was trained with --flip-rtl)')),
    ]
    for flag, options in option_table:
        parser.add_argument(flag, **options)

    args = parser.parse_args()

    rule = "=" * 80
    print(rule)
    print("TrOCR Whole-Page Inference")
    print(rule)
    print(f"Input image: {args.image}")
    print(f"Checkpoint: {args.checkpoint}")
    print(f"Segmentation: {'PAGE XML' if args.xml else 'Automatic'}")
    print(f"Beam search: {args.num_beams}")
    print(rule)

    # Load the page; the pixel limit is lifted so very large scans open
    print("\nLoading image...")
    Image.MAX_IMAGE_PIXELS = None
    from PIL import ImageOps
    page = ImageOps.exif_transpose(Image.open(args.image))  # honour EXIF orientation
    page = page.convert('RGB')
    print(f"Image size: {page.width}x{page.height}")

    # Line segmentation: PAGE XML when given, projection-based otherwise
    print("\nSegmenting lines...")
    if args.xml:
        segments = PageXMLSegmenter(args.xml).segment_lines(page)
        print(f"Found {len(segments)} lines in PAGE XML")
    else:
        auto_segmenter = LineSegmenter(min_line_height=args.min_line_height)
        segments = auto_segmenter.segment_lines(page, debug=args.debug)
        print(f"Detected {len(segments)} lines")

    if not segments:
        print("ERROR: No lines detected!")
        return

    # Model setup
    print("\nInitializing TrOCR model...")
    ocr = TrOCRInference(
        args.checkpoint,
        device=args.device,
        base_model=args.base_model,
        normalize_bg=args.normalize_background,
        flip_rtl=args.flip_rtl,
    )

    # Recognition
    print(f"\nTranscribing {len(segments)} lines...")
    segments = ocr.transcribe_segments(
        segments, num_beams=args.num_beams, max_length=args.max_length
    )

    # One output row per non-empty line
    transcription = "\n".join(seg.text for seg in segments if seg.text)

    # Explicit --output wins; otherwise write next to the input image
    if args.output:
        output_path = Path(args.output)
    else:
        source = Path(args.image)
        output_path = source.parent / f"{source.stem}_transcription.txt"

    print(f"\nSaving transcription to {output_path}...")
    with open(output_path, 'w', encoding='utf-8') as handle:
        handle.write(transcription)

    # Report
    print("\n" + rule)
    print("TRANSCRIPTION RESULT")
    print(rule)
    print(transcription)
    print(rule)
    print(f"\nTranscription saved to: {output_path}")
    print(f"Total lines: {len(segments)}")
    print("Average confidence: N/A (not implemented yet)")


if __name__ == '__main__':
    main()
|
inference_pylaia_native.py
ADDED
|
@@ -0,0 +1,453 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Native PyLaia Inference (No WSL)
|
| 3 |
+
|
| 4 |
+
This module provides inference for PyLaia CRNN models trained with train_pylaia.py.
|
| 5 |
+
It loads the PyTorch checkpoint directly and runs inference natively on Linux.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import torch
|
| 9 |
+
import torch.nn as nn
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import Tuple, Optional, List
|
| 12 |
+
from PIL import Image
|
| 13 |
+
import torchvision.transforms as transforms
|
| 14 |
+
import logging
|
| 15 |
+
import json
|
| 16 |
+
import os
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class CRNN(nn.Module):
    """
    CRNN architecture (same as train_pylaia.py).

    A stack of Conv-BatchNorm-LeakyReLU blocks (optionally followed by 2x2
    max pooling), a bidirectional LSTM over the width axis, and a linear
    projection to per-class log-probabilities for CTC.
    """

    def __init__(
        self,
        img_height: int = 128,
        num_channels: int = 1,
        num_classes: int = 100,
        cnn_filters: List[int] = [12, 24, 48, 48],
        cnn_poolsize: List[int] = [2, 2, 0, 2],
        rnn_hidden: int = 256,
        rnn_layers: int = 3,
        dropout: float = 0.5
    ):
        """
        Args:
            img_height: Height of the input line images.
            num_channels: Number of input image channels.
            num_classes: Size of the output alphabet.
            cnn_filters: Output channels of each convolution block.
            cnn_poolsize: One entry per convolution block; a value > 0 adds a
                2x2 max pooling after that block.
            rnn_hidden: Hidden size of each LSTM direction.
            rnn_layers: Number of stacked LSTM layers.
            dropout: Dropout before the output layer and between LSTM layers.

        Raises:
            ValueError: If cnn_filters is empty or its length differs from
                cnn_poolsize.
        """
        super(CRNN, self).__init__()

        # Copy the sequences: the defaults are shared list objects, and the
        # pooling configuration is kept as an attribute, so storing the
        # argument itself would let one instance mutate another's config.
        cnn_filters = list(cnn_filters)
        cnn_poolsize = list(cnn_poolsize)

        if not cnn_filters:
            raise ValueError("cnn_filters must contain at least one layer")
        if len(cnn_filters) != len(cnn_poolsize):
            raise ValueError(
                f"cnn_filters ({len(cnn_filters)} entries) and cnn_poolsize "
                f"({len(cnn_poolsize)} entries) must have the same length"
            )

        self.img_height = img_height
        self.num_classes = num_classes
        self.cnn_poolsize = cnn_poolsize

        # CNN layers (layer order defines the state_dict keys; keep as is)
        cnn_layers = []
        in_channels = num_channels

        for out_channels, pool in zip(cnn_filters, cnn_poolsize):
            cnn_layers.extend([
                nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, dilation=1),
                nn.BatchNorm2d(out_channels),
                nn.LeakyReLU(0.2, inplace=True)
            ])

            if pool > 0:
                cnn_layers.append(nn.MaxPool2d(kernel_size=2, stride=2))

            in_channels = out_channels

        self.cnn = nn.Sequential(*cnn_layers)

        # Each pooling halves the height; the RNN sees channels * height per column
        num_pools = sum(1 for p in cnn_poolsize if p > 0)
        cnn_output_height = img_height // (2 ** num_pools)
        rnn_input_size = cnn_filters[-1] * cnn_output_height

        # Bidirectional LSTM
        self.rnn = nn.LSTM(
            input_size=rnn_input_size,
            hidden_size=rnn_hidden,
            num_layers=rnn_layers,
            dropout=dropout if rnn_layers > 1 else 0,
            bidirectional=True,
            batch_first=False
        )

        self.lin_dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(rnn_hidden * 2, num_classes)

    def forward(self, x):
        """
        Args:
            x: [batch, channels, height, width]

        Returns:
            log_probs: [width, batch, num_classes], where width is the width
            of the CNN output (input width reduced by the pooling layers)
        """
        # CNN
        conv = self.cnn(x)

        # Reshape for RNN: one time step per image column
        batch, channels, height, width = conv.size()
        conv = conv.permute(3, 0, 1, 2)  # [width, batch, channels, height]
        conv = conv.reshape(width, batch, channels * height)

        # RNN
        rnn_out, _ = self.rnn(conv)
        rnn_out = self.lin_dropout(rnn_out)

        # Output projection
        output = self.fc(rnn_out)

        # Log softmax for CTC
        log_probs = torch.nn.functional.log_softmax(output, dim=2)

        return log_probs
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
class PyLaiaInference:
    """
    Native PyLaia inference (no WSL dependency).
    Loads PyTorch checkpoint directly and runs inference on Linux.

    Typical use: construct once with a checkpoint (and optionally a symbols
    file), then call ``transcribe`` on PIL images of single text lines.
    """

    # Input height the CRNN is built for in __init__.
    IMG_HEIGHT = 128
    # The CRNN built in __init__ pools the width three times
    # (cnn_poolsize=[2, 2, 0, 2]); inputs narrower than 2**3 pixels would
    # leave the LSTM with an empty sequence.
    MIN_WIDTH = 8

    def __init__(self, checkpoint_path: str, syms_path: str = None, enable_spaces: bool = True):
        """
        Initialize PyLaia inference.

        Args:
            checkpoint_path: Path to .ckpt checkpoint file
            syms_path: Path to symbols file. If None, symbols.txt / syms.txt
                next to the checkpoint is used, falling back to
                data/pylaia_glagolitic/syms.txt.
            enable_spaces: If True, convert <space> tokens to actual spaces. If False, keep as <space>.

        Raises:
            FileNotFoundError: If the checkpoint or the symbols file is missing.
            KeyError: If the checkpoint has no 'fc.weight' tensor.
        """
        self.enable_spaces = enable_spaces
        self.checkpoint_path = Path(checkpoint_path)

        if not self.checkpoint_path.exists():
            raise FileNotFoundError(f"Checkpoint not found: {checkpoint_path}")

        # Find symbols file
        if syms_path is None:
            # First: look alongside the checkpoint for symbols.txt or syms.txt
            model_dir = self.checkpoint_path.parent
            for _candidate in ("symbols.txt", "syms.txt"):
                _candidate_path = model_dir / _candidate
                if _candidate_path.exists():
                    syms_path = _candidate_path
                    logger.info(f"Found symbols file alongside checkpoint: {syms_path}")
                    break
            if syms_path is None:
                # Last-resort fallback
                syms_path = Path("data/pylaia_glagolitic/syms.txt")

        self.syms_path = Path(syms_path)
        if not self.syms_path.exists():
            raise FileNotFoundError(f"Symbols file not found: {syms_path}")

        # Load symbols (handles both list and KALDI formats)
        self.symbols = self._load_symbols(self.syms_path)

        # Remove <ctc> token if present (CTC blank is handled separately as index 0)
        if self.symbols and self.symbols[0] == '<ctc>':
            self.symbols = self.symbols[1:]
            logger.info("Removed <ctc> token from vocabulary (using index 0 for CTC blank)")

        # Create char-to-index mapping (0 reserved for CTC blank)
        self.char2idx = {char: idx + 1 for idx, char in enumerate(self.symbols)}
        self.idx2char = {idx: char for char, idx in self.char2idx.items()}
        self.idx2char[0] = ''  # CTC blank

        # Map <SPACE> or <space> to actual space (if enabled)
        if self.enable_spaces:
            if '<SPACE>' in self.char2idx:
                space_idx = self.char2idx['<SPACE>']
                self.idx2char[space_idx] = ' '
            elif '<space>' in self.char2idx:
                space_idx = self.char2idx['<space>']
                self.idx2char[space_idx] = ' '

        # Load checkpoint
        # SECURITY NOTE: weights_only=False unpickles arbitrary Python objects
        # and can execute code embedded in the file. Only load checkpoints
        # from sources you trust.
        logger.info(f"Loading PyLaia checkpoint: {checkpoint_path}")
        checkpoint = torch.load(self.checkpoint_path, map_location='cpu', weights_only=False)

        # CRITICAL: If checkpoint has idx2char, use it instead of vocabulary file
        # This handles models trained with different vocabulary parsing (strip vs rstrip)
        vocab_from_checkpoint = 'idx2char' in checkpoint
        if vocab_from_checkpoint:
            logger.info(f"Using idx2char from checkpoint ({len(checkpoint['idx2char'])} characters)")
            self.idx2char = checkpoint['idx2char']
            self.char2idx = checkpoint.get('char2idx', {char: idx for idx, char in self.idx2char.items()})
            # Still apply enable_spaces setting
            if self.enable_spaces:
                for idx, char in list(self.idx2char.items()):
                    if char == '<SPACE>' or char == '<space>':
                        self.idx2char[idx] = ' '

        # Extract model state dict from checkpoint
        # train_pylaia.py saves checkpoints with 'model_state_dict' key
        state_dict = checkpoint.get('model_state_dict', checkpoint.get('state_dict', checkpoint))

        # Infer number of classes from checkpoint (fc.weight shape is [num_classes, rnn_hidden*2])
        if 'fc.weight' not in state_dict:
            raise KeyError(
                f"'fc.weight' not found in checkpoint {checkpoint_path}; "
                "is this a CRNN checkpoint?"
            )
        fc_weight_shape = state_dict['fc.weight'].shape
        num_classes = fc_weight_shape[0]

        logger.info(f"Inferred {num_classes} output classes from checkpoint")
        logger.info(f"Vocabulary has {len(self.symbols)} symbols (+ 1 blank = {len(self.symbols)+1} expected)")
        if not vocab_from_checkpoint and len(self.symbols) + 1 != num_classes:
            # Classes without a symbol decode to '' and are silently dropped,
            # so a mismatch usually means the wrong symbols file was picked.
            logger.warning(
                f"Vocabulary size ({len(self.symbols)} + 1 blank) does not match the "
                f"checkpoint's {num_classes} output classes; check the symbols file {self.syms_path}"
            )

        # Initialize model
        self.model = CRNN(
            img_height=self.IMG_HEIGHT,
            num_channels=1,
            num_classes=num_classes,
            cnn_filters=[12, 24, 48, 48],
            cnn_poolsize=[2, 2, 0, 2],
            rnn_hidden=256,
            rnn_layers=3,
            dropout=0.5
        )

        # Load weights
        self.model.load_state_dict(state_dict, strict=True)

        # Set device
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = self.model.to(self.device)
        self.model.eval()

        # Image preprocessing (same as training)
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5], std=[0.5])
        ])

        logger.info(f"Loaded PyLaia model with {num_classes} output classes")
        logger.info(f"Using device: {self.device}")

    @staticmethod
    def _load_symbols(syms_path) -> List[str]:
        """
        Read a vocabulary file in list format (one symbol per line) or
        KALDI format ("symbol index" per line) and return the symbols in
        file order.

        The format is detected from the first non-empty line. In KALDI
        files the index column is only used to locate the symbol; the
        returned order is the file order. Lines are stripped of newline
        characters only, so whitespace symbols (e.g. TAB) survive.
        """
        # CRITICAL: Use rstrip('\n\r') not strip() to preserve leading/trailing whitespace in symbols (e.g., TAB)
        with open(syms_path, 'r', encoding='utf-8') as f:
            symbols_raw = [line.rstrip('\n\r') for line in f if line.rstrip('\n\r')]

        if not symbols_raw:
            return symbols_raw

        # Auto-detect format: KALDI format has "symbol index" pairs.
        # split() accepts any whitespace, so tab-separated files are detected too.
        first_tokens = symbols_raw[0].split()
        if not (len(first_tokens) == 2 and first_tokens[1].isdigit()):
            # List format (one symbol per line)
            return symbols_raw

        symbols = []
        for line in symbols_raw:
            tokens = line.split()
            # Skip whitespace-only lines and lines that do not end in an index.
            if not tokens or not tokens[-1].isdigit():
                continue
            idx_str = tokens[-1]
            # Symbol is everything before the index, minus exactly one
            # separator character (space or tab). Cutting by position keeps
            # whitespace symbols such as "\t 131" -> "\t" intact.
            symbol = line.rstrip()[:-len(idx_str)]
            if symbol[-1:] in (' ', '\t'):
                symbol = symbol[:-1]
            symbols.append(symbol)
        logger.info("Detected KALDI format vocabulary")
        return symbols

    def preprocess_image(self, image: "Image.Image") -> torch.Tensor:
        """
        Preprocess image for inference.

        Args:
            image: PIL Image (RGB or grayscale)

        Returns:
            Preprocessed tensor [1, 1, height, width]

        Raises:
            ValueError: If the image has zero width or height.
        """
        # Convert to grayscale
        if image.mode != 'L':
            image = image.convert('L')

        if image.width <= 0 or image.height <= 0:
            raise ValueError(f"Cannot preprocess empty image ({image.width}x{image.height})")

        # Resize to target height while preserving aspect ratio. Very tall,
        # narrow crops would otherwise round down to width 0, which PIL
        # rejects; clamp to the narrowest width the CNN can still pool.
        target_height = self.IMG_HEIGHT
        aspect_ratio = image.width / image.height
        new_width = max(self.MIN_WIDTH, int(target_height * aspect_ratio))
        image = image.resize((new_width, target_height), Image.LANCZOS)

        # Apply transforms
        img_tensor = self.transform(image)  # [1, height, width]
        img_tensor = img_tensor.unsqueeze(0)  # [1, 1, height, width]

        return img_tensor

    def decode_ctc(self, log_probs: torch.Tensor) -> Tuple[str, float]:
        """
        Decode CTC output using greedy decoding.

        Args:
            log_probs: [seq_len, 1, num_classes]

        Returns:
            Tuple of (decoded_text, confidence), where confidence is the mean
            probability of the emitted characters (0.0 if nothing was emitted).
        """
        # Get most likely class (and its probability) at each time step.
        # Both are moved to the host in one transfer instead of one
        # .item() call per emitted character.
        probs = torch.exp(log_probs)
        best_probs, pred_indices = torch.max(probs, dim=2)  # [seq_len, 1]
        pred_indices = pred_indices.squeeze(1).tolist()  # [seq_len]
        best_probs = best_probs.squeeze(1).tolist()  # [seq_len]

        # CTC greedy decoding: remove consecutive duplicates and blanks
        decoded_chars = []
        confidences = []
        prev_idx = -1

        for idx, prob in zip(pred_indices, best_probs):
            if idx != 0 and idx != prev_idx:  # Not blank and not duplicate
                char = self.idx2char.get(idx, '')
                if char:
                    decoded_chars.append(char)
                    confidences.append(prob)
            # Updated on every step (blanks included) so that a blank between
            # two identical characters separates them.
            prev_idx = idx

        # Join characters
        text = ''.join(decoded_chars)

        # Average confidence
        confidence = sum(confidences) / len(confidences) if confidences else 0.0

        return text, confidence

    def transcribe(self, image: "Image.Image") -> Tuple[str, float]:
        """
        Transcribe a single line image.

        Errors are logged and reported as an empty result rather than raised.

        Args:
            image: PIL Image of text line

        Returns:
            Tuple of (transcription_text, confidence_score); ("", 0.0) on failure.
        """
        try:
            # Preprocess
            img_tensor = self.preprocess_image(image).to(self.device)

            # Forward pass
            with torch.no_grad():
                log_probs = self.model(img_tensor)  # [width, 1, num_classes]

            # Decode
            text, confidence = self.decode_ctc(log_probs)

            return text, confidence

        except Exception as e:
            # logger.exception records the traceback together with the message.
            logger.exception(f"Error during PyLaia inference: {e}")
            return "", 0.0
|
| 342 |
+
|
| 343 |
+
|
| 344 |
+
# Model registry (updated for trained models)
# Maps a display name to a dict with:
#   "checkpoint":  path to the model's best_model.pt
#   "syms":        path to the symbols file for that model
#   "description": human-readable summary
# The registry is mutated at import time by _register_hf_space_demo_models()
# and _scan_pylaia_models() below.
PYLAIA_MODELS = {
    "Church Slavonic (2.89% CER)": {
        "checkpoint": "models/pylaia_church_slavonic_20251103_222215/best_model.pt",
        "syms": "models/pylaia_church_slavonic_20251103_222215/symbols.txt",
        "description": "PyLaia CRNN - Church Slavonic manuscript (2.89% CER)"
    },
    "Prosta Mova (3.77% CER)": {
        "checkpoint": "models/pylaia_prosta_mova_v4_20251121_155322/best_model.pt",
        "syms": "models/pylaia_prosta_mova_v4_20251121_155322/symbols.txt",
        "description": "PyLaia CRNN - Prosta Mova (3.77% CER)"
    },
    "Glagolitic (5.33% CER)": {
        "checkpoint": "models/pylaia_glagolitic_with_spaces_20251102_182103/best_model.pt",
        # This entry keeps its symbols under data/, not next to the checkpoint.
        "syms": "data/pylaia_glagolitic/syms.txt",
        "description": "PyLaia CRNN - Glagolitic manuscript (76 symbols, 5.33% CER)"
    },
    "Ukrainian (4.76% CER)": {
        "checkpoint": "models/pylaia_ukrainian_v2c_20251124_180634/best_model.pt",
        "syms": "models/pylaia_ukrainian_v2c_20251124_180634/symbols.txt",
        "description": "PyLaia CRNN - Ukrainian manuscript (4.76% CER)"
    },
    "Ukrainian (13.53% CER - OLD)": {
        "checkpoint": "models/pylaia_ukrainian_retrain_20251102_213431/best_model.pt",
        "syms": "models/pylaia_ukrainian_retrain_20251102_213431/symbols.txt",
        "description": "PyLaia CRNN - Ukrainian manuscript (180 symbols, 13.53% CER)"
    },
    "Glagolitic (old)": {
        "checkpoint": "models/pylaia_glagolitic_single_gpu/best_model.pt",
        "syms": "models/pylaia_glagolitic_single_gpu/symbols.txt",
        "description": "PyLaia model - old Glagolitic training (no spaces)"
    }
}
|
| 377 |
+
|
| 378 |
+
|
| 379 |
+
def _register_hf_space_demo_models() -> None:
    """Swap the registry for the public Hugging Face presets in hosted-demo mode.

    Only acts when the POLYSCRIPTOR_DEMO_MODE environment variable equals
    "hf_space". In that case every existing PYLAIA_MODELS entry is dropped
    and replaced by four presets that carry a "repo_id" plus file names
    ("checkpoint", "syms") instead of local paths.
    """
    demo_mode = os.environ.get("POLYSCRIPTOR_DEMO_MODE")
    if demo_mode == "hf_space":
        hf_presets = {
            "Ukrainian (HF, 4.76% CER)": {
                "repo_id": "achimrabus/crnn-ctc-ukrainian",
                "checkpoint": "best_model.pt",
                "syms": "symbols.txt",
                "description": "Public Hugging Face CRNN-CTC model for Ukrainian HTR",
            },
            "Prosta Mova (HF, 3.77% CER)": {
                "repo_id": "achimrabus/crnn-ctc-prosta-mova",
                "checkpoint": "best_model.pt",
                "syms": "symbols.txt",
                "description": "Public Hugging Face CRNN-CTC model for Prosta Mova HTR",
            },
            "Church Slavonic (HF, 2.89% CER)": {
                "repo_id": "achimrabus/crnn-ctc-church-slavonic",
                "checkpoint": "best_model.pt",
                "syms": "symbols.txt",
                "description": "Public Hugging Face CRNN-CTC model for Church Slavonic HTR",
            },
            "Glagolitic (HF, 5.33% CER)": {
                "repo_id": "achimrabus/crnn-ctc-glagolitic",
                "checkpoint": "best_model.pt",
                "syms": "symbols.txt",
                "description": "Public Hugging Face CRNN-CTC model for Glagolitic HTR",
            },
        }
        # Mutate in place so that modules holding a reference to the
        # registry dict observe the new contents.
        PYLAIA_MODELS.clear()
        PYLAIA_MODELS.update(hf_presets)
|
| 410 |
+
|
| 411 |
+
|
| 412 |
+
def _scan_pylaia_models(models_dir: str = "models") -> None:
    """Register CRNN-CTC checkpoints found under ``models_dir``.

    Every direct subdirectory that contains a ``best_model.pt`` is added to
    PYLAIA_MODELS under its folder name, unless that checkpoint path or that
    folder name is already registered. The first of ``symbols.txt`` /
    ``syms.txt`` present in the folder becomes the entry's symbols file;
    if neither exists the entry's "syms" is None.

    A missing ``models_dir`` is not an error; the function simply returns.
    """
    root = Path(models_dir)
    if not root.is_dir():
        return

    # Checkpoint paths known before the scan starts, normalised through Path
    # so they compare equal to the glob results below.
    known_checkpoints = set()
    for entry in PYLAIA_MODELS.values():
        raw_path = entry["checkpoint"] if isinstance(entry, dict) else entry
        known_checkpoints.add(str(Path(raw_path)))

    for ckpt in sorted(root.glob("*/best_model.pt")):
        ckpt_key = str(ckpt)
        folder = ckpt.parent
        display_name = folder.name
        if ckpt_key in known_checkpoints or display_name in PYLAIA_MODELS:
            continue

        symbol_files = (folder / fname for fname in ("symbols.txt", "syms.txt"))
        symbols_file = next((str(p) for p in symbol_files if p.exists()), None)

        PYLAIA_MODELS[display_name] = {
            "checkpoint": ckpt_key,
            "syms": symbols_file,
            "description": f"CRNN-CTC model (auto-discovered): {display_name}",
        }
        logger.debug(f"Auto-discovered CRNN-CTC model: {display_name}")
|
| 449 |
+
|
| 450 |
+
|
| 451 |
+
# Populate the registry at import time. Order matters: in hosted-demo mode the
# first call replaces the hard-coded entries with the Hugging Face presets;
# the scan then adds any checkpoints under models/ that are not yet registered.
_register_hf_space_demo_models()
_scan_pylaia_models()
|
kraken_segmenter.py
ADDED
|
@@ -0,0 +1,823 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Kraken-based line segmentation for historical document OCR.
|
| 3 |
+
|
| 4 |
+
This module provides an alternative to the classical HPP (Horizontal Projection Profile)
|
| 5 |
+
segmentation using Kraken's pre-trained neural models.
|
| 6 |
+
|
| 7 |
+
Supports two modes:
|
| 8 |
+
- Classical: pageseg.segment() — fast, lines only, no regions
|
| 9 |
+
- Neural (blla): blla.segment() — GPU-accelerated, returns regions AND baselines,
|
| 10 |
+
handles multi-column layouts
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import os
|
| 14 |
+
import time
|
| 15 |
+
from dataclasses import dataclass, field
|
| 16 |
+
from typing import Any, List, Optional, NamedTuple, Tuple, Dict
|
| 17 |
+
from PIL import Image
|
| 18 |
+
import numpy as np
|
| 19 |
+
|
| 20 |
+
# Module-level cache: maps model path -> loaded TorchVGSLModel.
|
| 21 |
+
# Shared across all KrakenLineSegmenter instances so that the model is loaded
|
| 22 |
+
# from disk only once per process, even in batch processing loops.
|
| 23 |
+
_MODEL_CACHE: Dict[str, Any] = {}
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class LineSegment(NamedTuple):
    """Represents a segmented text line."""
    # Image of the line, cropped from the page image with `bbox`.
    image: Image.Image
    bbox: tuple  # (x1, y1, x2, y2)
    # Baseline is optional: it is None when the segmenter provides no baseline.
    baseline: Optional[List[tuple]] = None  # List of (x, y) points
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
@dataclass
class SegRegion:
    """Represents a detected text region (column, marginalia, etc.)."""
    # Region identifier.
    id: str
    bbox: Tuple[int, int, int, int]  # (x1, y1, x2, y2)
    # IDs of the lines belonging to this region; default_factory gives every
    # instance its own list.
    line_ids: List[str] = field(default_factory=list)
    polygon: Optional[List[Tuple[int, int]]] = None  # Convex hull or neural polygon
    mode: str = "neural"  # "neural" or "classical"
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class KrakenLineSegmenter:
|
| 44 |
+
"""
|
| 45 |
+
Line segmentation using Kraken with pre-trained models.
|
| 46 |
+
|
| 47 |
+
Kraken is specifically designed for historical document OCR and provides:
|
| 48 |
+
- Pre-trained models that work out-of-the-box
|
| 49 |
+
- Baseline detection (not just bounding boxes)
|
| 50 |
+
- Robust handling of degraded/faded text
|
| 51 |
+
- Support for rotated and multi-column layouts
|
| 52 |
+
|
| 53 |
+
Performance: ~3-8s per page (CPU), ~1-3s (GPU)
|
| 54 |
+
Accuracy: 90-95% on historical documents
|
| 55 |
+
"""
|
| 56 |
+
|
| 57 |
+
def __init__(self, model_path: Optional[str] = None, device: str = "cpu"):
    """
    Initialize Kraken segmenter.

    Imports kraken lazily and keeps references to its ``binarization`` and
    ``pageseg`` modules on the instance.

    Args:
        model_path: Path to custom segmentation model (.mlmodel file).
                   Stored on the instance but not used by this constructor;
                   a warning is printed when one is given, because
                   pageseg.segment() does not accept models.
        device: 'cpu' or 'cuda' for GPU acceleration (not used by classical segmenter)

    Raises:
        ImportError: If kraken cannot be imported. The original import
            error is attached as ``__cause__``.
    """
    self.model_path = model_path
    self.device = device

    # Import kraken components
    try:
        from kraken import binarization, pageseg
    except ImportError as e:
        # Chain explicitly so the root cause stays visible in the traceback.
        raise ImportError(
            "Kraken is not installed. Install it with: pip install kraken\n"
            f"Original error: {e}"
        ) from e

    self.binarization = binarization
    self.pageseg = pageseg

    # Note: model_path is currently not used as pageseg.segment() doesn't accept models
    # The classical segmentation algorithm is robust and works well for most documents
    if model_path:
        print("[KrakenSegmenter] Warning: Custom model path provided but not used.")
        print("[KrakenSegmenter] Kraken 5.x pageseg.segment() uses classical algorithm.")
        print("[KrakenSegmenter] Neural baseline segmentation requires kraken.lib.models workflow.")
|
| 87 |
+
|
| 88 |
+
def segment_lines(
    self,
    image: Image.Image,
    text_direction: str = 'horizontal-lr',
    use_binarization: bool = True
) -> List[LineSegment]:
    """
    Segment image into text lines using Kraken.

    The page is converted to grayscale if necessary, binarized, and passed
    to pageseg.segment(). Line images are cropped from the grayscale page,
    not from the binarized one. Any failure is printed and reported as an
    empty list instead of being raised.

    Args:
        image: PIL Image to segment
        text_direction: Text direction - 'horizontal-lr' (left-to-right),
                       'horizontal-rl', 'vertical-lr', 'vertical-rl'
        use_binarization: If True, binarize with kraken's nlbin
                        (recommended for degraded documents). If False,
                        use a simple global median threshold instead.

    Returns:
        List of LineSegment objects sorted top to bottom ([] on failure)
    """
    print(f"[KrakenSegmenter] Segmenting image (size={image.size}, mode={image.mode}, "
          f"direction={text_direction}, binarize={use_binarization})")

    try:
        # Step 0: Convert to grayscale if needed (Kraken works better with grayscale)
        if image.mode not in ('L', '1'):
            print(f"[KrakenSegmenter] Converting from {image.mode} to grayscale...")
            image = image.convert('L')

        # Step 1: Binarize (required by pageseg.segment)
        # pageseg.segment REQUIRES binary images
        if use_binarization:
            print("[KrakenSegmenter] Applying neural binarization...")
            processed_img = self.binarization.nlbin(image)
        else:
            # Global median threshold. This is NOT Otsu's method: pixels
            # strictly brighter than the median become white.
            # NOTE(review): when more than half of the pixels share the
            # maximum value (e.g. a clean scan with a pure white
            # background) the result is entirely black - consider a real
            # Otsu threshold here.
            print("[KrakenSegmenter] Applying median-threshold binarization...")
            img_array = np.array(image)
            threshold = np.median(img_array)  # Simple threshold
            binary = img_array > threshold
            # A 2-D uint8 array maps to mode 'L' automatically; the explicit
            # mode= argument is deprecated in recent Pillow releases.
            processed_img = Image.fromarray((binary * 255).astype(np.uint8))

        # Step 2: Line segmentation using Kraken's classical algorithm
        # This is more robust than basic HPP and works well on historical documents
        print("[KrakenSegmenter] Running line segmentation...")
        seg_result = self.pageseg.segment(
            processed_img,
            text_direction=text_direction
        )

        # Handle both dict (old Kraken) and Segmentation object (new Kraken)
        if isinstance(seg_result, dict):
            print("[KrakenSegmenter] pageseg.segment returned dict (old Kraken API)")
            # Old API: seg_result is a dict with 'boxes' key
            seg_lines = seg_result.get('boxes', seg_result.get('lines', []))
        else:
            print("[KrakenSegmenter] pageseg.segment returned Segmentation object")
            seg_lines = seg_result.lines

        print(f"[KrakenSegmenter] Processing {len(seg_lines)} lines...")

        # Step 3: Extract line information
        lines = []
        for line in seg_lines:
            if isinstance(line, (list, tuple)):
                # Old dict API: each box is a plain (x_min, y_min, x_max, y_max)
                # sequence without a baseline.
                bbox = tuple(line)
                baseline = None
            elif isinstance(line, dict):
                # Dict-style entries (presumably old 'lines' results with
                # 'baseline'/'boundary' keys - TODO confirm against the
                # Kraken version in use). Derive the box from the boundary
                # polygon when no explicit bbox is present.
                baseline = line.get('baseline')
                bbox = line.get('bbox')
                if bbox is None:
                    boundary = line.get('boundary')
                    if not boundary:
                        print("[KrakenSegmenter] Skipping line without bbox/boundary")
                        continue
                    xs = [pt[0] for pt in boundary]
                    ys = [pt[1] for pt in boundary]
                    bbox = (min(xs), min(ys), max(xs), max(ys))
                else:
                    bbox = tuple(bbox)
            else:
                # New API: line objects expose .bbox and optionally .baseline
                bbox = line.bbox  # (x_min, y_min, x_max, y_max)
                baseline = getattr(line, 'baseline', None)

            # Crop line image from original (not binarized)
            line_img = image.crop(bbox)

            lines.append(LineSegment(
                image=line_img,
                bbox=bbox,
                baseline=baseline
            ))

        # Sort lines top to bottom by Y coordinate
        lines = sorted(lines, key=lambda x: x.bbox[1])

        print(f"[KrakenSegmenter] Detected {len(lines)} lines")

        return lines

    except Exception as e:
        # Deliberate best-effort: callers get an empty list on any failure.
        print(f"[KrakenSegmenter] ERROR: Segmentation failed: {e}")
        import traceback
        traceback.print_exc()
        return []
|
| 181 |
+
|
| 182 |
+
def segment_with_regions(
    self,
    image: Image.Image,
    model_path: Optional[str] = None,
    device: Optional[str] = None,
    min_line_height: int = 8,
    max_columns: int = 4,
    split_width_fraction: float = 0.40,
    min_lines_to_split: int = 10,
    text_direction: str = 'horizontal-lr',
) -> Tuple[List[SegRegion], List[LineSegment]]:
    """
    Segment a page into regions and lines, preferring the neural (blla) path.

    If the blla model file exists, ``_segment_neural`` is tried first and
    its result is returned when it contains at least one region. If the
    model file is missing, the neural path raises, or it yields no regions,
    the result of ``_segment_classical_with_regions`` is returned instead.

    Args:
        image: PIL Image to segment (RGB or grayscale)
        model_path: Path to blla .mlmodel file. Defaults to
                    ``pagexml/blla.mlmodel`` relative to this script.
        device: 'cpu' or 'cuda' / 'cuda:0'. Defaults to self.device.
        min_line_height: Discard lines shorter than this (pixels).
                    Passed to both the neural and the classical path.
        max_columns: Maximum number of columns to detect per region
                    (neural path only).
        split_width_fraction: Minimum region width as fraction of page
                    width to trigger sub-column splitting (neural path
                    only). Default 0.40 (40%).
        min_lines_to_split: Minimum number of lines in a region before
                    attempting to split it into sub-columns (neural path
                    only).
        text_direction: Text direction, forwarded to the neural path only.

    Returns:
        (regions, lines) as produced by whichever path succeeded.
    """
    effective_device = device if device else self.device

    blla_model = model_path
    if blla_model is None:
        script_dir = os.path.dirname(__file__)
        blla_model = os.path.join(script_dir, 'pagexml', 'blla.mlmodel')

    print(f"[KrakenSegmenter] Neural segmentation (blla) on {image.size}, device={effective_device}")

    # Neural (blla) attempt - only possible when the model file is present
    if not os.path.isfile(blla_model):
        print(f"[KrakenSegmenter] blla model not found at {blla_model}; using classical fallback")
    else:
        try:
            neural_result = self._segment_neural(
                image, blla_model, effective_device, min_line_height,
                max_columns=max_columns,
                split_width_fraction=split_width_fraction,
                min_lines_to_split=min_lines_to_split,
                text_direction=text_direction,
            )
            regions, lines = neural_result
            if regions:
                print(f"[KrakenSegmenter] blla: {len(regions)} regions, {len(lines)} lines")
                return regions, lines
            print("[KrakenSegmenter] blla returned no regions; falling back to classical + clustering")
        except Exception as e:
            print(f"[KrakenSegmenter] blla failed ({e}); falling back to classical + clustering")
            import traceback
            traceback.print_exc()

    # Classical pageseg + column clustering fallback
    return self._segment_classical_with_regions(image, min_line_height)
|
| 250 |
+
|
| 251 |
+
# ── internal: neural blla ────────────────────────────────────────
|
| 252 |
+
|
| 253 |
+
def _segment_neural(
    self,
    image: Image.Image,
    model_path: str,
    device: str,
    min_line_height: int,
    max_columns: int = 4,
    split_width_fraction: float = 0.40,
    min_lines_to_split: int = 10,
    text_direction: str = 'horizontal-lr',
) -> Tuple[List[SegRegion], List[LineSegment]]:
    """
    Segment *image* with kraken's neural baseline segmenter (blla).

    The model is loaded once per ``(model_path, device)`` pair and kept in
    ``_MODEL_CACHE``. Each detected line is either clipped to every blla
    region box it overlaps (one sub-line per box) or, when it overlaps none,
    assigned through ``_find_region_for_line`` with its bbox untouched.
    Afterwards ``_split_wide_regions`` is applied repeatedly (at most
    ``max_columns`` rounds) until the number of regions stops changing.

    Args:
        image: Page image; line crops are taken from this image.
        model_path: Path of the blla model file to load.
        device: Requested torch device; a ``cuda*`` request is downgraded
            to ``'cpu'`` when CUDA is unavailable.
        min_line_height: Sub-lines shorter than this (pixels) are dropped.
        max_columns: Forwarded to ``_split_wide_regions``; also the maximum
            number of splitting rounds.
        split_width_fraction: Forwarded to ``_split_wide_regions``.
        min_lines_to_split: Forwarded to ``_split_wide_regions``.
        text_direction: Passed to ``blla.segment`` and to the region
            ordering helpers.

    Returns:
        ``(regions, ordered_lines)`` as produced by ``_build_regions``.
    """
    from kraken import blla
    from kraken.lib import vgsl
    import torch

    started_at = time.time()

    # Downgrade a CUDA request when no GPU is usable.
    if device.startswith('cuda') and not torch.cuda.is_available():
        print(f"[KrakenSegmenter] WARNING: device={device} but CUDA not available, falling back to cpu")
        device = 'cpu'
    # Evaluated after the possible downgrade above.
    on_cuda = device.startswith('cuda')

    # One cached model per (path, device): a CPU and a CUDA instance never
    # share the same object.
    cache_key = (model_path, device)
    if cache_key not in _MODEL_CACHE:
        print(f"[KrakenSegmenter] Loading blla model: {model_path}")
        loaded = vgsl.TorchVGSLModel.load_model(model_path)
        # blla.segment()'s device= argument does not move the model, so it
        # is placed on the target device here, before it is cached.
        loaded.nn.to(device)
        _MODEL_CACHE[cache_key] = loaded
    model = _MODEL_CACHE[cache_key]

    # Diagnostic only: report where the parameters actually live.
    try:
        actual_device = next(model.nn.parameters()).device
        print(f"[KrakenSegmenter] blla model on: {actual_device} (requested: {device})")
        if on_cuda and actual_device.type != 'cuda':
            print(f"[KrakenSegmenter] WARNING: model is on {actual_device}, not GPU")
    except Exception:
        print(f"[KrakenSegmenter] blla running on device={device}")

    # blla wants RGB input; autocast is switched on only for CUDA.
    rgb_image = image if image.mode == 'RGB' else image.convert('RGB')
    baseline_seg = blla.segment(
        rgb_image,
        model=model,
        device=device,
        autocast=on_cuda,
        text_direction=text_direction,
    )

    page_w = image.size[0]
    seg_lines: List[LineSegment] = []
    # region key -> {'lines': [(index into seg_lines, LineSegment), ...],
    #                'blla_region': region object or None}
    regions_dict: Dict[str, dict] = {}

    # Region boxes are used to clip / split lines whose bbox crosses a
    # region (column) boundary.
    blla_boxes = self._extract_blla_region_boxes(baseline_seg, text_direction)
    if blla_boxes:
        print(f"[KrakenSegmenter] blla detected {len(blla_boxes)} text regions "
              f"— will clip lines to region boundaries")

    for line in baseline_seg.lines:
        bbox = self._extract_bbox(line)
        if bbox is None:
            continue

        raw_baseline = getattr(line, 'baseline', None)
        baseline = (
            [(int(pt[0]), int(pt[1])) for pt in raw_baseline]
            if raw_baseline
            else None
        )

        hits = self._overlapping_blla_boxes(bbox, blla_boxes) if blla_boxes else []
        if hits:
            # One clipped piece per overlapping region box.
            pieces = [
                (
                    (
                        max(bbox[0], rx1), max(bbox[1], ry1),
                        min(bbox[2], rx2), min(bbox[3], ry2),
                    ),
                    region_key,
                    region_obj,
                )
                for rx1, ry1, rx2, ry2, region_obj, region_key in hits
            ]
        else:
            # No overlap (or no regions at all): tag / centre based
            # assignment, original bbox kept.
            region_id, blla_region = self._find_region_for_line(
                bbox, line, baseline_seg
            )
            pieces = [(bbox, region_id, blla_region)]

        for piece_bbox, region_key, region_obj in pieces:
            left, top, right, bottom = piece_bbox
            # Skip degenerate boxes and lines that are too short after clipping.
            if right <= left or bottom <= top or (bottom - top) < min_line_height:
                continue

            # NOTE: every piece carries the full, unclipped baseline.
            seg_line = LineSegment(
                image=image.crop(piece_bbox), bbox=piece_bbox, baseline=baseline
            )
            seg_lines.append(seg_line)

            bucket = regions_dict.setdefault(
                region_key, {'lines': [], 'blla_region': region_obj}
            )
            bucket['lines'].append((len(seg_lines) - 1, seg_line))

    # Repeat column splitting until a round adds no region: one pass can
    # leave sub-regions that are still wide enough to split again.
    for _ in range(max_columns):
        count_before = len(regions_dict)
        regions_dict = self._split_wide_regions(
            regions_dict, page_w,
            min_lines_to_split=min_lines_to_split,
            split_width_fraction=split_width_fraction,
            max_columns=max_columns,
        )
        if len(regions_dict) == count_before:
            break

    regions, ordered_lines = self._build_regions(
        regions_dict, seg_lines, page_w, text_direction=text_direction
    )

    elapsed = time.time() - started_at
    print(f"[KrakenSegmenter] blla completed in {elapsed:.2f}s")
    return regions, ordered_lines
|
| 394 |
+
|
| 395 |
+
# ── internal: classical fallback with column clustering ──────────
|
| 396 |
+
|
| 397 |
+
def segment_classical_with_regions(
    self,
    image: Image.Image,
    min_line_height: int = 15,
    max_columns: int = 4,
) -> Tuple[List[SegRegion], List[LineSegment]]:
    """
    Public entry point for classical segmentation with column clustering.

    Forwards all arguments unchanged to ``_segment_classical_with_regions``
    and returns its ``(regions, ordered_lines)`` result.
    """
    result = self._segment_classical_with_regions(image, min_line_height, max_columns)
    return result
|
| 405 |
+
|
| 406 |
+
def _segment_classical_with_regions(
    self,
    image: Image.Image,
    min_line_height: int = 15,
    max_columns: int = 4,
) -> Tuple[List[SegRegion], List[LineSegment]]:
    """
    Segment with ``segment_lines`` and group the lines into column regions.

    Args:
        image: Page image to segment.
        min_line_height: Lines whose bbox is shorter than this are dropped.
        max_columns: Upper bound handed to the column clustering.

    Returns:
        ``(regions, ordered_lines)`` from ``_build_regions``; every region
        gets ``mode = "classical"``. ``([], [])`` when ``segment_lines``
        finds nothing.
    """
    detected = self.segment_lines(image)
    if not detected:
        return [], []

    page_width = image.size[0]

    # Drop lines that are too short to be real text lines.
    kept = [seg for seg in detected if (seg.bbox[3] - seg.bbox[1]) >= min_line_height]

    # Group the remaining lines into columns, then build region objects.
    columns = self._cluster_into_columns(kept, page_width, max_columns=max_columns)
    regions, ordered_lines = self._build_regions(columns, kept, page_width)
    for region in regions:
        region.mode = "classical"
    return regions, ordered_lines
|
| 427 |
+
|
| 428 |
+
# ── helpers ───────────────────────────────────────────────────────
|
| 429 |
+
|
| 430 |
+
@staticmethod
|
| 431 |
+
def _extract_bbox(line) -> Optional[Tuple[int, int, int, int]]:
|
| 432 |
+
"""Extract (x1,y1,x2,y2) bbox from a blla line object."""
|
| 433 |
+
if hasattr(line, 'bbox'):
|
| 434 |
+
return tuple(int(v) for v in line.bbox)
|
| 435 |
+
if hasattr(line, 'baseline') and line.baseline:
|
| 436 |
+
xs = [p[0] for p in line.baseline]
|
| 437 |
+
ys = [p[1] for p in line.baseline]
|
| 438 |
+
avg_h = 30
|
| 439 |
+
return (int(min(xs)), int(min(ys) - avg_h // 2),
|
| 440 |
+
int(max(xs)), int(max(ys) + avg_h // 2))
|
| 441 |
+
return None
|
| 442 |
+
|
| 443 |
+
@staticmethod
|
| 444 |
+
def _find_region_for_line(bbox, line, baseline_seg) -> Tuple[str, object]:
|
| 445 |
+
"""Determine which blla region a line belongs to."""
|
| 446 |
+
# Check tags first
|
| 447 |
+
if hasattr(line, 'tags') and isinstance(line.tags, dict):
|
| 448 |
+
rtype = line.tags.get('type')
|
| 449 |
+
if rtype and isinstance(rtype, str):
|
| 450 |
+
return rtype, None
|
| 451 |
+
|
| 452 |
+
# Check region boundaries
|
| 453 |
+
if hasattr(baseline_seg, 'regions') and baseline_seg.regions:
|
| 454 |
+
cx = (bbox[0] + bbox[2]) // 2
|
| 455 |
+
cy = (bbox[1] + bbox[3]) // 2
|
| 456 |
+
for rtype, region_list in baseline_seg.regions.items():
|
| 457 |
+
for ri, region in enumerate(region_list):
|
| 458 |
+
if hasattr(region, 'boundary') and region.boundary:
|
| 459 |
+
bxs = [p[0] for p in region.boundary]
|
| 460 |
+
bys = [p[1] for p in region.boundary]
|
| 461 |
+
if (min(bxs) <= cx <= max(bxs) and
|
| 462 |
+
min(bys) <= cy <= max(bys)):
|
| 463 |
+
return f"{rtype}_{ri}", region
|
| 464 |
+
|
| 465 |
+
return 'r_1', None
|
| 466 |
+
|
| 467 |
+
@staticmethod
|
| 468 |
+
def _extract_blla_region_boxes(
|
| 469 |
+
baseline_seg,
|
| 470 |
+
text_direction: str = 'horizontal-lr',
|
| 471 |
+
) -> List[Tuple[int, int, int, int, object, str]]:
|
| 472 |
+
"""
|
| 473 |
+
Build a sorted list of (x1, y1, x2, y2, region_obj, region_key) tuples
|
| 474 |
+
from blla's detected regions. Used to clip / split lines that cross
|
| 475 |
+
column boundaries. Returns an empty list when no region boundaries are
|
| 476 |
+
available.
|
| 477 |
+
"""
|
| 478 |
+
boxes: List[Tuple[int, int, int, int, object, str]] = []
|
| 479 |
+
if not (hasattr(baseline_seg, 'regions') and baseline_seg.regions):
|
| 480 |
+
return boxes
|
| 481 |
+
for rtype, region_list in baseline_seg.regions.items():
|
| 482 |
+
for ri, region in enumerate(region_list):
|
| 483 |
+
if not (hasattr(region, 'boundary') and region.boundary):
|
| 484 |
+
continue
|
| 485 |
+
bxs = [p[0] for p in region.boundary]
|
| 486 |
+
bys = [p[1] for p in region.boundary]
|
| 487 |
+
boxes.append((
|
| 488 |
+
int(min(bxs)), int(min(bys)),
|
| 489 |
+
int(max(bxs)), int(max(bys)),
|
| 490 |
+
region, f"{rtype}_{ri}",
|
| 491 |
+
))
|
| 492 |
+
rtl = text_direction.endswith('-rl')
|
| 493 |
+
boxes.sort(key=lambda t: t[0], reverse=rtl)
|
| 494 |
+
return boxes
|
| 495 |
+
|
| 496 |
+
@staticmethod
|
| 497 |
+
def _overlapping_blla_boxes(
|
| 498 |
+
bbox: Tuple[int, int, int, int],
|
| 499 |
+
blla_boxes: List[Tuple[int, int, int, int, object, str]],
|
| 500 |
+
) -> List[Tuple[int, int, int, int, object, str]]:
|
| 501 |
+
"""
|
| 502 |
+
Return the blla region boxes whose bbox overlaps with *bbox*.
|
| 503 |
+
Overlap requires intersection in both x and y.
|
| 504 |
+
"""
|
| 505 |
+
x1, y1, x2, y2 = bbox
|
| 506 |
+
result = []
|
| 507 |
+
for rb in blla_boxes:
|
| 508 |
+
rx1, ry1, rx2, ry2 = rb[0], rb[1], rb[2], rb[3]
|
| 509 |
+
if rx1 < x2 and rx2 > x1 and ry1 < y2 and ry2 > y1:
|
| 510 |
+
result.append(rb)
|
| 511 |
+
return result
|
| 512 |
+
|
| 513 |
+
@staticmethod
|
| 514 |
+
def _estimate_columns(
|
| 515 |
+
lines: list,
|
| 516 |
+
page_w: int,
|
| 517 |
+
max_columns: int = 4,
|
| 518 |
+
min_gap_fraction: float = 0.03,
|
| 519 |
+
) -> List[int]:
|
| 520 |
+
"""
|
| 521 |
+
Gap-based column clustering.
|
| 522 |
+
|
| 523 |
+
Finds natural breaks in the x-center distribution by looking for the
|
| 524 |
+
largest gaps in the sorted sequence of line x-centers. This is more
|
| 525 |
+
robust than histogram peak-finding for closely spaced columns, because
|
| 526 |
+
a column gap is a region with *no* line centers — it shows up as a large
|
| 527 |
+
jump in the sorted sequence regardless of how close the columns are.
|
| 528 |
+
|
| 529 |
+
Args:
|
| 530 |
+
lines: List of LineSegment objects.
|
| 531 |
+
page_w: Width of the region being analysed (pixels).
|
| 532 |
+
max_columns: Maximum number of columns to return (≥1).
|
| 533 |
+
min_gap_fraction: Minimum gap size as a fraction of *page_w* to be
|
| 534 |
+
considered a column boundary. Default 0.03 (3%).
|
| 535 |
+
Increase if spurious splits occur within a column.
|
| 536 |
+
"""
|
| 537 |
+
if not lines:
|
| 538 |
+
return []
|
| 539 |
+
|
| 540 |
+
# Lines wider than 60% of the region are likely headers/footers that
|
| 541 |
+
# span columns — exclude them from clustering to avoid false splits.
|
| 542 |
+
orig_centers = [((l.bbox[0] + l.bbox[2]) // 2) for l in lines]
|
| 543 |
+
line_widths = [(l.bbox[2] - l.bbox[0]) for l in lines]
|
| 544 |
+
clustering_centers = [
|
| 545 |
+
cx for cx, w in zip(orig_centers, line_widths)
|
| 546 |
+
if w < 0.60 * page_w
|
| 547 |
+
]
|
| 548 |
+
|
| 549 |
+
if not clustering_centers:
|
| 550 |
+
# All lines are wide (e.g. single full-width text block)
|
| 551 |
+
return [0] * len(lines)
|
| 552 |
+
|
| 553 |
+
min_gap_px = max(10, int(min_gap_fraction * page_w))
|
| 554 |
+
sorted_cx = sorted(clustering_centers)
|
| 555 |
+
|
| 556 |
+
# Compute gaps between consecutive sorted x-centers
|
| 557 |
+
gaps = [
|
| 558 |
+
(sorted_cx[i + 1] - sorted_cx[i], (sorted_cx[i] + sorted_cx[i + 1]) // 2)
|
| 559 |
+
for i in range(len(sorted_cx) - 1)
|
| 560 |
+
if sorted_cx[i + 1] - sorted_cx[i] >= min_gap_px
|
| 561 |
+
]
|
| 562 |
+
|
| 563 |
+
if not gaps:
|
| 564 |
+
return [0] * len(lines)
|
| 565 |
+
|
| 566 |
+
# Take the largest max_columns-1 gaps as column boundaries
|
| 567 |
+
split_midpoints = sorted(
|
| 568 |
+
mid for _, mid in sorted(gaps, reverse=True)[: max_columns - 1]
|
| 569 |
+
)
|
| 570 |
+
|
| 571 |
+
# Assign each line (using original center) to a column
|
| 572 |
+
assignments = []
|
| 573 |
+
for cx in orig_centers:
|
| 574 |
+
col = sum(1 for sp in split_midpoints if cx > sp)
|
| 575 |
+
assignments.append(col)
|
| 576 |
+
|
| 577 |
+
return assignments
|
| 578 |
+
|
| 579 |
+
def _split_wide_regions(
|
| 580 |
+
self,
|
| 581 |
+
regions_dict: Dict[str, dict],
|
| 582 |
+
page_w: int,
|
| 583 |
+
min_lines_to_split: int = 10,
|
| 584 |
+
split_width_fraction: float = 0.40,
|
| 585 |
+
max_columns: int = 4,
|
| 586 |
+
) -> Dict[str, dict]:
|
| 587 |
+
"""
|
| 588 |
+
Split blla regions that are wide enough to contain multiple columns.
|
| 589 |
+
|
| 590 |
+
A region whose width exceeds *split_width_fraction* of the page width
|
| 591 |
+
and has enough lines is run through column clustering internally.
|
| 592 |
+
|
| 593 |
+
For landscape double-page spreads, lower split_width_fraction (e.g. 0.20)
|
| 594 |
+
to trigger splitting on narrower regions.
|
| 595 |
+
"""
|
| 596 |
+
new_dict: Dict[str, dict] = {}
|
| 597 |
+
split_counter = 0
|
| 598 |
+
|
| 599 |
+
for key, rdata in regions_dict.items():
|
| 600 |
+
region_lines = rdata['lines'] # list of (idx, LineSegment)
|
| 601 |
+
if len(region_lines) < min_lines_to_split:
|
| 602 |
+
new_dict[key] = rdata
|
| 603 |
+
continue
|
| 604 |
+
|
| 605 |
+
# Compute region width from line bboxes
|
| 606 |
+
bboxes = [l.bbox for _, l in region_lines]
|
| 607 |
+
rx1 = min(b[0] for b in bboxes)
|
| 608 |
+
rx2 = max(b[2] for b in bboxes)
|
| 609 |
+
region_w = rx2 - rx1
|
| 610 |
+
|
| 611 |
+
if region_w < split_width_fraction * page_w:
|
| 612 |
+
# Narrow enough to be a single column
|
| 613 |
+
new_dict[key] = rdata
|
| 614 |
+
continue
|
| 615 |
+
|
| 616 |
+
# Wide region — try column clustering within it.
|
| 617 |
+
# _estimate_columns bins x-centers into [0, page_w), so we need to
|
| 618 |
+
# shift line coordinates so that rx1 maps to 0.
|
| 619 |
+
just_lines = [l for _, l in region_lines]
|
| 620 |
+
shifted_lines = []
|
| 621 |
+
for l in just_lines:
|
| 622 |
+
shifted_bbox = (l.bbox[0] - rx1, l.bbox[1],
|
| 623 |
+
l.bbox[2] - rx1, l.bbox[3])
|
| 624 |
+
shifted_lines.append(LineSegment(l.image, shifted_bbox, l.baseline))
|
| 625 |
+
assignments = self._estimate_columns(shifted_lines, page_w=region_w,
|
| 626 |
+
max_columns=max_columns)
|
| 627 |
+
|
| 628 |
+
n_cols = len(set(assignments))
|
| 629 |
+
if n_cols <= 1:
|
| 630 |
+
# Clustering didn't find multiple columns
|
| 631 |
+
new_dict[key] = rdata
|
| 632 |
+
continue
|
| 633 |
+
|
| 634 |
+
print(f"[KrakenSegmenter] Splitting region '{key}' ({len(region_lines)} lines, "
|
| 635 |
+
f"width={region_w}px) into {n_cols} sub-columns")
|
| 636 |
+
|
| 637 |
+
# Re-compute x-centers relative to region left edge for clustering
|
| 638 |
+
# (already done inside _estimate_columns via absolute coords, which
|
| 639 |
+
# works fine since columns are spatially separated)
|
| 640 |
+
for col_id in sorted(set(assignments)):
|
| 641 |
+
sub_key = f"{key}_col{split_counter}"
|
| 642 |
+
split_counter += 1
|
| 643 |
+
sub_lines = [
|
| 644 |
+
region_lines[i]
|
| 645 |
+
for i, a in enumerate(assignments)
|
| 646 |
+
if a == col_id
|
| 647 |
+
]
|
| 648 |
+
new_dict[sub_key] = {'lines': sub_lines, 'blla_region': None}
|
| 649 |
+
|
| 650 |
+
return new_dict
|
| 651 |
+
|
| 652 |
+
def _cluster_into_columns(
|
| 653 |
+
self,
|
| 654 |
+
lines: list,
|
| 655 |
+
page_w: int,
|
| 656 |
+
max_columns: int = 4,
|
| 657 |
+
) -> Dict[str, dict]:
|
| 658 |
+
"""Cluster lines into columns and return regions_dict."""
|
| 659 |
+
assignments = self._estimate_columns(lines, page_w, max_columns=max_columns)
|
| 660 |
+
regions_dict: Dict[str, dict] = {}
|
| 661 |
+
for idx, (col, line) in enumerate(zip(assignments, lines)):
|
| 662 |
+
key = f"col_{col}"
|
| 663 |
+
if key not in regions_dict:
|
| 664 |
+
regions_dict[key] = {'lines': [], 'blla_region': None}
|
| 665 |
+
regions_dict[key]['lines'].append((idx, line))
|
| 666 |
+
return regions_dict
|
| 667 |
+
|
| 668 |
+
@staticmethod
|
| 669 |
+
def _convex_hull(points: List[Tuple[int, int]]) -> List[Tuple[int, int]]:
|
| 670 |
+
"""Monotonic chain convex hull."""
|
| 671 |
+
pts = sorted(set(points))
|
| 672 |
+
if len(pts) <= 2:
|
| 673 |
+
return pts
|
| 674 |
+
|
| 675 |
+
def cross(o, a, b):
|
| 676 |
+
return (a[0] - o[0]) * (b[1] - o[1]) - (a[1] - o[1]) * (b[0] - o[0])
|
| 677 |
+
|
| 678 |
+
lower = []
|
| 679 |
+
for p in pts:
|
| 680 |
+
while len(lower) >= 2 and cross(lower[-2], lower[-1], p) <= 0:
|
| 681 |
+
lower.pop()
|
| 682 |
+
lower.append(p)
|
| 683 |
+
upper = []
|
| 684 |
+
for p in reversed(pts):
|
| 685 |
+
while len(upper) >= 2 and cross(upper[-2], upper[-1], p) <= 0:
|
| 686 |
+
upper.pop()
|
| 687 |
+
upper.append(p)
|
| 688 |
+
return lower[:-1] + upper[:-1]
|
| 689 |
+
|
| 690 |
+
def _build_regions(
    self,
    regions_dict: Dict[str, dict],
    all_lines: list,
    page_w: int,
    text_direction: str = 'horizontal-lr',
) -> Tuple[List[SegRegion], List[LineSegment]]:
    """
    Build SegRegion objects from regions_dict.

    Regions are ordered by the mean x-center of their lines (ascending, or
    descending when *text_direction* ends in ``-rl``) and numbered
    ``r_1``, ``r_2``, ... in that order. Within a region, lines are ordered
    top-to-bottom by the top edge of their bbox. Regions without lines are
    skipped, and the line lists inside *regions_dict* are left unmodified.

    Args:
        regions_dict: region key -> ``{'lines': [(idx, line), ...],
            'blla_region': region object or None}``.
        all_lines: Not read by this method; kept for interface
            compatibility.
        page_w: Not read by this method; kept for interface compatibility.
        text_direction: Only its ``-rl`` suffix is inspected.

    Returns:
        ``(regions, ordered_lines)``: the regions in reading order and the
        flat list of their lines in the same order. Each region's
        ``line_ids`` are ``l_<idx + 1>``, built from the ``idx`` stored
        with each line rather than from its position in ``ordered_lines``.
    """
    rtl = text_direction.endswith('-rl')

    def _region_mean_x(item):
        lines = item[1]['lines']
        return sum((l.bbox[0] + l.bbox[2]) / 2 for _, l in lines) / len(lines)

    # An empty region has no extent: it can neither be positioned nor given
    # a bbox, so it is left out instead of failing in min()/max() below.
    populated = [item for item in regions_dict.items() if item[1]['lines']]
    sorted_regions = sorted(populated, key=_region_mean_x, reverse=rtl)

    regions: List[SegRegion] = []
    ordered_lines: List[LineSegment] = []

    for ri, (_, rdata) in enumerate(sorted_regions, start=1):
        blla_region = rdata['blla_region']
        # Sorted copy: the caller's list keeps its order.
        region_lines = sorted(rdata['lines'], key=lambda item: item[1].bbox[1])

        line_ids = [f"l_{i + 1}" for i, _ in region_lines]

        bboxes = [l.bbox for _, l in region_lines]
        rbbox = (
            min(b[0] for b in bboxes),
            min(b[1] for b in bboxes),
            max(b[2] for b in bboxes),
            max(b[3] for b in bboxes),
        )

        # Polygon: prefer the blla boundary, else the convex hull of the
        # line bbox corners (None if the hull is degenerate).
        if blla_region and hasattr(blla_region, 'boundary') and blla_region.boundary:
            polygon = [(int(p[0]), int(p[1])) for p in blla_region.boundary]
        else:
            pts = []
            for x1, y1, x2, y2 in bboxes:
                pts.extend([(x1, y1), (x2, y1), (x2, y2), (x1, y2)])
            hull = self._convex_hull(pts)
            polygon = hull if len(hull) >= 3 else None

        regions.append(SegRegion(
            id=f"r_{ri}",
            bbox=rbbox,
            line_ids=line_ids,
            polygon=polygon,
        ))
        ordered_lines.extend(line for _, line in region_lines)

    return regions, ordered_lines
|
| 759 |
+
|
| 760 |
+
def segment_lines_to_dict(
    self,
    image: Image.Image,
    text_direction: str = 'horizontal-lr',
    use_binarization: bool = True
) -> List[dict]:
    """
    Run ``segment_lines`` and return plain dictionaries (for compatibility).

    Returns:
        One dict per detected line with the keys 'image', 'bbox' and
        'baseline', in the order ``segment_lines`` produced the lines.
    """
    records = []
    for seg in self.segment_lines(image, text_direction, use_binarization):
        records.append(dict(image=seg.image, bbox=seg.bbox, baseline=seg.baseline))
    return records
|
| 781 |
+
|
| 782 |
+
|
| 783 |
+
def test_kraken_segmenter():
    """
    Command-line smoke test for the line segmenter.

    Expects the image path as the first command-line argument, prints the
    detected lines and writes each line crop as a PNG into
    ``kraken_test_output``. Exits with status 1 when no path is given.
    """
    import os
    import sys

    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python kraken_segmenter.py <image_path>")
        sys.exit(1)

    image_path = cli_args[0]
    print(f"Testing Kraken segmenter on: {image_path}")

    # Load the page
    image = Image.open(image_path)
    print(f"Image size: {image.size}")

    # Segment it
    segmenter = KrakenLineSegmenter()
    lines = segmenter.segment_lines(image, use_binarization=True)

    # Report what was found
    print(f"\nDetected {len(lines)} lines:")
    for number, line in enumerate(lines, start=1):
        print(f" Line {number}: bbox={line.bbox}, "
              f"baseline_points={len(line.baseline) if line.baseline else 0}")

    # Write one PNG per line
    output_dir = "kraken_test_output"
    os.makedirs(output_dir, exist_ok=True)
    for number, line in enumerate(lines, start=1):
        line.image.save(os.path.join(output_dir, f"line_{number:03d}.png"))

    print(f"\nLine images saved to: {output_dir}/")


if __name__ == "__main__":
    test_kraken_segmenter()
|
page_xml_exporter.py
ADDED
|
@@ -0,0 +1,276 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PAGE XML Exporter
|
| 3 |
+
|
| 4 |
+
Exports line segmentation and transcription data to PAGE XML format.
|
| 5 |
+
Compatible with party and other PAGE XML processors.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import xml.etree.ElementTree as ET
|
| 9 |
+
from xml.dom import minidom
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import List, Optional
|
| 12 |
+
from datetime import datetime
|
| 13 |
+
from inference_page import LineSegment
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class PageXMLExporter:
|
| 17 |
+
"""Export line segmentation data to PAGE XML format."""
|
| 18 |
+
|
| 19 |
+
# PAGE XML namespace
|
| 20 |
+
NAMESPACE = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"
|
| 21 |
+
|
| 22 |
+
def __init__(self, image_path: str, image_width: int, image_height: int):
|
| 23 |
+
"""
|
| 24 |
+
Initialize PAGE XML exporter.
|
| 25 |
+
|
| 26 |
+
Args:
|
| 27 |
+
image_path: Path to the page image file
|
| 28 |
+
image_width: Width of the page image in pixels
|
| 29 |
+
image_height: Height of the page image in pixels
|
| 30 |
+
"""
|
| 31 |
+
self.image_path = Path(image_path)
|
| 32 |
+
self.image_width = image_width
|
| 33 |
+
self.image_height = image_height
|
| 34 |
+
|
| 35 |
+
def _make_root(self, creator: str, comments: Optional[str]) -> tuple:
|
| 36 |
+
"""Build root PcGts element with Metadata and Page. Returns (root, page)."""
|
| 37 |
+
ET.register_namespace('', self.NAMESPACE)
|
| 38 |
+
root = ET.Element('PcGts', {
|
| 39 |
+
'xmlns': self.NAMESPACE,
|
| 40 |
+
'xmlns:xsi': 'http://www.w3.org/2001/XMLSchema-instance',
|
| 41 |
+
'xsi:schemaLocation': (
|
| 42 |
+
f'{self.NAMESPACE} '
|
| 43 |
+
'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15/pagecontent.xsd'
|
| 44 |
+
),
|
| 45 |
+
'pcGtsId': f'pc-{self.image_path.stem}'
|
| 46 |
+
})
|
| 47 |
+
metadata = ET.SubElement(root, 'Metadata')
|
| 48 |
+
ET.SubElement(metadata, 'Creator').text = creator
|
| 49 |
+
ET.SubElement(metadata, 'Created').text = datetime.now().isoformat()
|
| 50 |
+
ET.SubElement(metadata, 'LastChange').text = datetime.now().isoformat()
|
| 51 |
+
if comments:
|
| 52 |
+
ET.SubElement(metadata, 'Comments').text = comments
|
| 53 |
+
page = ET.SubElement(root, 'Page', {
|
| 54 |
+
'imageFilename': str(self.image_path.name),
|
| 55 |
+
'imageWidth': str(self.image_width),
|
| 56 |
+
'imageHeight': str(self.image_height)
|
| 57 |
+
})
|
| 58 |
+
return root, page
|
| 59 |
+
|
| 60 |
+
@staticmethod
|
| 61 |
+
def _write_xml(root: ET.Element, output_path: str) -> None:
|
| 62 |
+
xml_str = ET.tostring(root, encoding='utf-8', method='xml')
|
| 63 |
+
dom = minidom.parseString(xml_str)
|
| 64 |
+
pretty_xml = dom.toprettyxml(indent=' ', encoding='utf-8')
|
| 65 |
+
with open(output_path, 'wb') as f:
|
| 66 |
+
f.write(pretty_xml)
|
| 67 |
+
|
| 68 |
+
@staticmethod
|
| 69 |
+
def _baseline_points(segment) -> str:
|
| 70 |
+
"""Return PAGE XML baseline points string for a segment."""
|
| 71 |
+
if hasattr(segment, 'baseline') and segment.baseline:
|
| 72 |
+
return ' '.join(f'{x},{y}' for x, y in segment.baseline)
|
| 73 |
+
x1, y1, x2, y2 = segment.bbox
|
| 74 |
+
bl_y = y2 - 5
|
| 75 |
+
return f'{x1},{bl_y} {x2},{bl_y}'
|
| 76 |
+
|
| 77 |
+
@staticmethod
|
| 78 |
+
def _coords_points(segment) -> str:
|
| 79 |
+
"""Return PAGE XML coords points string for a segment."""
|
| 80 |
+
if hasattr(segment, 'coords') and segment.coords:
|
| 81 |
+
return ' '.join(f'{x},{y}' for x, y in segment.coords)
|
| 82 |
+
x1, y1, x2, y2 = segment.bbox
|
| 83 |
+
return f'{x1},{y1} {x2},{y1} {x2},{y2} {x1},{y2}'
|
| 84 |
+
|
| 85 |
+
def _add_text_line(self, parent: ET.Element, line_id: str, segment,
|
| 86 |
+
text: Optional[str], line_idx: int) -> None:
|
| 87 |
+
"""Add a TextLine element to parent with coords, baseline and optional text."""
|
| 88 |
+
line_elem = ET.SubElement(parent, 'TextLine', {
|
| 89 |
+
'id': line_id,
|
| 90 |
+
'custom': f'readingOrder {{index:{line_idx};}}'
|
| 91 |
+
})
|
| 92 |
+
ET.SubElement(line_elem, 'Coords').set('points', self._coords_points(segment))
|
| 93 |
+
ET.SubElement(line_elem, 'Baseline').set('points', self._baseline_points(segment))
|
| 94 |
+
if text:
|
| 95 |
+
conf = '1.0'
|
| 96 |
+
if hasattr(segment, 'confidence') and segment.confidence is not None:
|
| 97 |
+
conf = str(segment.confidence)
|
| 98 |
+
text_equiv = ET.SubElement(line_elem, 'TextEquiv', {'conf': conf})
|
| 99 |
+
ET.SubElement(text_equiv, 'Unicode').text = text
|
| 100 |
+
|
| 101 |
+
def export(self, segments: List[LineSegment], output_path: str,
           creator: str = "TrOCR-GUI", comments: Optional[str] = None) -> None:
    """
    Export line segments to PAGE XML (single TextRegion, no region info).

    All lines are placed in one ``TextRegion`` (``region_1``) whose
    ``Coords`` is the bounding box of all segment bboxes. When there are no
    segments the region covers the whole page instead, so the region always
    carries the ``Coords`` child the PAGE schema requires.

    Args:
        segments: Line segments to export; ``None`` is treated as empty.
            A segment's non-empty ``text`` attribute, if present, is
            written as its transcription.
        output_path: Path where to save the PAGE XML file
        creator: Software/tool that created this PAGE XML
        comments: Optional comments about the document
    """
    # Materialise once: the segments are traversed several times below.
    segments = list(segments or [])
    root, page = self._make_root(creator, comments)

    # Reading order: a single region
    reading_order = ET.SubElement(page, 'ReadingOrder')
    ordered_group = ET.SubElement(reading_order, 'OrderedGroup', {
        'id': 'ro_1',
        'caption': 'Regions reading order'
    })
    ET.SubElement(ordered_group, 'RegionRefIndexed', {
        'index': '0',
        'regionRef': 'region_1'
    })

    # Single text region spanning all lines
    text_region = ET.SubElement(page, 'TextRegion', {
        'id': 'region_1',
        'type': 'paragraph',
        'custom': 'readingOrder {index:0;}'
    })
    if segments:
        x1 = min(seg.bbox[0] for seg in segments)
        y1 = min(seg.bbox[1] for seg in segments)
        x2 = max(seg.bbox[2] for seg in segments)
        y2 = max(seg.bbox[3] for seg in segments)
    else:
        # Nothing to enclose: fall back to the page rectangle.
        x1, y1, x2, y2 = 0, 0, self.image_width, self.image_height
    ET.SubElement(text_region, 'Coords').set(
        'points', f'{x1},{y1} {x2},{y1} {x2},{y2} {x1},{y2}'
    )

    for idx, segment in enumerate(segments):
        # Empty strings are exported as "no transcription".
        text = getattr(segment, 'text', None) or None
        self._add_text_line(text_region, f'line_{idx + 1}', segment, text, idx)

    self._write_xml(root, output_path)
|
| 145 |
+
|
| 146 |
+
def export_with_regions(
    self,
    regions,
    lines,
    output_path: str,
    transcriptions: Optional[List[str]] = None,
    creator: str = "TrOCR-GUI",
    comments: Optional[str] = None,
) -> None:
    """
    Write a multi-region PAGE XML file: one TextRegion per entry in *regions*.

    The flat *lines* list is consumed in order: each region takes the next
    ``len(region.line_ids)`` lines (zero if it has no ``line_ids`` attribute).
    The ReadingOrder lists the regions in the order they are given.

    Args:
        regions: Duck-typed region objects exposing ``.id`` and ``.bbox``,
            and optionally ``.line_ids`` and ``.polygon``.
        lines: Flat list of line objects, grouped region by region.
        output_path: Destination path handed to ``_write_xml``.
        transcriptions: Optional texts parallel to *lines*. When the list has
            an entry for a line it is used (an empty entry means "no text");
            otherwise the line's own truthy ``.text`` is used, if any.
        creator: Creator string handed to ``_make_root``.
        comments: Optional comments string handed to ``_make_root``.
    """
    root, page = self._make_root(creator, comments)

    # ReadingOrder: one indexed reference per region.
    order_elem = ET.SubElement(page, 'ReadingOrder')
    group_elem = ET.SubElement(
        order_elem,
        'OrderedGroup',
        {'id': 'ro_1', 'caption': 'Regions reading order'},
    )
    for position, region in enumerate(regions):
        ET.SubElement(
            group_elem,
            'RegionRefIndexed',
            {'index': str(position), 'regionRef': region.id},
        )

    # TextRegions: walk the flat line list with a moving cursor.
    cursor = 0
    for position, region in enumerate(regions):
        line_count = len(region.line_ids) if hasattr(region, 'line_ids') else 0
        first = cursor
        cursor = first + line_count

        region_elem = ET.SubElement(page, 'TextRegion', {
            'id': region.id,
            'type': 'paragraph',
            'custom': f'readingOrder {{index:{position};}}',
        })

        # Outline: the region's polygon when it has at least three points,
        # otherwise the rectangle described by its bbox.
        outline = getattr(region, 'polygon', None)
        if outline and len(outline) >= 3:
            points = ' '.join(f'{x},{y}' for x, y in outline)
        else:
            left, top, right, bottom = region.bbox
            points = f'{left},{top} {right},{top} {right},{bottom} {left},{bottom}'
        ET.SubElement(region_elem, 'Coords').set('points', points)

        for offset, line in enumerate(lines[first:cursor]):
            flat_index = first + offset  # position in the flat *lines* list
            if transcriptions and flat_index < len(transcriptions):
                line_text = transcriptions[flat_index] or None
            else:
                line_text = getattr(line, 'text', None) or None
            self._add_text_line(
                region_elem,
                f'line_{position + 1}_{offset + 1}',
                line,
                line_text,
                offset,
            )

    self._write_xml(root, output_path)
|
| 226 |
+
|
| 227 |
+
@staticmethod
def quick_export(image_path: str, segments: List[LineSegment],
                 output_path: Optional[str] = None) -> str:
    """
    Export *segments* for one page image, deriving everything else.

    The image is opened only to read its pixel dimensions, and the output
    path defaults to the image path with its suffix replaced by ``.xml``.

    Args:
        image_path: Path to the page image.
        segments: Line objects to export (passed to ``export``).
        output_path: Optional destination; derived from *image_path* if None.

    Returns:
        The path of the written PAGE XML file, as a string.
    """
    from PIL import Image

    # Only the size is needed. The context manager releases the file handle
    # immediately instead of leaving it open until garbage collection.
    with Image.open(image_path) as img:
        width, height = img.size

    if output_path is None:
        output_path = Path(image_path).with_suffix('.xml')

    exporter = PageXMLExporter(image_path, width, height)
    exporter.export(segments, str(output_path))

    return str(output_path)
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
if __name__ == "__main__":
    # Smoke test: export a single synthetic line and report the output file.
    from PIL import Image

    sample_line = LineSegment(
        image=Image.new('L', (100, 30)),
        bbox=(10, 10, 200, 40),
        text="Example text",
        confidence=0.95,
    )

    demo_exporter = PageXMLExporter("test_page.jpg", 800, 1200)
    demo_exporter.export(
        [sample_line],
        "test_output.xml",
        creator="PAGE XML Exporter Test",
        comments="This is a test export",
    )

    print("Test PAGE XML created: test_output.xml")
|
web/polyscriptor_server.py
ADDED
|
@@ -0,0 +1,2237 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Polyscriptor Web UI — FastAPI Backend
|
| 3 |
+
|
| 4 |
+
Thin wrapper around existing HTR engine code. Provides REST API + SSE
|
| 5 |
+
for browser-based transcription. All heavy lifting done by the same
|
| 6 |
+
modules the PyQt6 GUI uses.
|
| 7 |
+
|
| 8 |
+
Usage:
|
| 9 |
+
source htr_gui/bin/activate
|
| 10 |
+
python -m uvicorn web.polyscriptor_server:app --host 0.0.0.0 --port 8765
|
| 11 |
+
|
| 12 |
+
Author: Claude Code
|
| 13 |
+
Date: 2026-02-26
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import asyncio
|
| 17 |
+
import hashlib
|
| 18 |
+
import importlib
|
| 19 |
+
import json
|
| 20 |
+
import logging
|
| 21 |
+
import os
|
| 22 |
+
import sys
|
| 23 |
+
import time
|
| 24 |
+
import uuid
|
| 25 |
+
from dataclasses import dataclass, field
|
| 26 |
+
from types import SimpleNamespace
|
| 27 |
+
from pathlib import Path
|
| 28 |
+
from typing import Any, Dict, List, Optional
|
| 29 |
+
|
| 30 |
+
import numpy as np
|
| 31 |
+
from PIL import Image, ImageOps
|
| 32 |
+
from fastapi import Cookie, FastAPI, File, HTTPException, Query, Request, UploadFile
|
| 33 |
+
from fastapi.responses import FileResponse, Response, StreamingResponse
|
| 34 |
+
from fastapi.staticfiles import StaticFiles
|
| 35 |
+
from pydantic import BaseModel
|
| 36 |
+
|
| 37 |
+
log = logging.getLogger("polyscriptor")
|
| 38 |
+
DEMO_MODE = os.environ.get("POLYSCRIPTOR_DEMO_MODE", "").strip().lower()
|
| 39 |
+
|
| 40 |
+
# Add project root to path so we can import existing modules
|
| 41 |
+
PROJECT_ROOT = Path(__file__).resolve().parent.parent
|
| 42 |
+
sys.path.insert(0, str(PROJECT_ROOT))
|
| 43 |
+
|
| 44 |
+
# Load .env from project root (same as the Qt GUI does via CommercialAPIEngine)
|
| 45 |
+
try:
|
| 46 |
+
from dotenv import load_dotenv
|
| 47 |
+
_env_path = PROJECT_ROOT / ".env"
|
| 48 |
+
if _env_path.exists():
|
| 49 |
+
load_dotenv(_env_path)
|
| 50 |
+
log.info(f"Loaded environment variables from {_env_path}")
|
| 51 |
+
except ImportError:
|
| 52 |
+
pass # python-dotenv not installed — env vars must be set externally
|
| 53 |
+
|
| 54 |
+
from htr_engine_base import get_global_registry, HTREngine, TranscriptionResult
|
| 55 |
+
|
| 56 |
+
# PDF support via PyMuPDF
|
| 57 |
+
try:
|
| 58 |
+
import fitz as _fitz # PyMuPDF
|
| 59 |
+
PDF_AVAILABLE = True
|
| 60 |
+
except ImportError:
|
| 61 |
+
PDF_AVAILABLE = False
|
| 62 |
+
log.warning("PyMuPDF not installed — PDF upload disabled. Install with: pip install pymupdf")
|
| 63 |
+
|
| 64 |
+
# Lazy imports for segmentation (avoid slow startup)
|
| 65 |
+
_segmenters_imported = False
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def _import_segmenters():
    """Import the segmentation backends the first time they are needed.

    Publishes ``KrakenLineSegmenter``, ``LineSegmenter`` and ``PYLAIA_MODELS``
    as module globals and sets ``_segmenters_imported`` so later calls return
    without doing anything. ``PYLAIA_MODELS`` falls back to an empty dict when
    ``inference_pylaia_native`` cannot be imported.
    """
    global _segmenters_imported, KrakenLineSegmenter, LineSegmenter, PYLAIA_MODELS

    if not _segmenters_imported:
        from kraken_segmenter import KrakenLineSegmenter
        from inference_page import LineSegmenter
        try:
            from inference_pylaia_native import PYLAIA_MODELS
        except ImportError:
            PYLAIA_MODELS = {}
        _segmenters_imported = True
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
# ---------------------------------------------------------------------------
|
| 83 |
+
# App setup
|
| 84 |
+
# ---------------------------------------------------------------------------
|
| 85 |
+
|
| 86 |
+
app = FastAPI(title="Polyscriptor HTR", version="0.1.0")
|
| 87 |
+
|
| 88 |
+
# Serve static frontend files
|
| 89 |
+
STATIC_DIR = Path(__file__).parent / "static"
|
| 90 |
+
app.mount("/static", StaticFiles(directory=str(STATIC_DIR)), name="static")
|
| 91 |
+
|
| 92 |
+
# ---------------------------------------------------------------------------
|
| 93 |
+
# Engine pool — Phase 2: shared pool of loaded engine instances
|
| 94 |
+
# ---------------------------------------------------------------------------
|
| 95 |
+
|
| 96 |
+
@dataclass
class EngineSlot:
    """A single loaded engine held in ``engine_pool``, with its bookkeeping."""

    # Engine instance owned by this slot (typed Any; not the registry singleton).
    engine: Any
    # Engine name; used to look up VRAM estimates and in log messages.
    engine_name: str
    # Configuration dict associated with this slot.
    config: dict
    # Key under which this slot is stored in the pool.
    pool_key: str
    # Reference counter; slots at 0 are candidates for eviction.
    ref_count: int = 0
    # Timestamp (time.time()) used to order eviction candidates, oldest first.
    last_used: float = field(default_factory=time.time)
    # Per-slot asyncio lock; each slot gets its own instance.
    lock: asyncio.Lock = field(default_factory=asyncio.Lock)
|
| 106 |
+
|
| 107 |
+
engine_pool: Dict[str, EngineSlot] = {}
|
| 108 |
+
pool_lock = asyncio.Lock()
|
| 109 |
+
|
| 110 |
+
# VRAM budget estimates (GB) for eviction decisions
|
| 111 |
+
_ENGINE_VRAM_GB = {
|
| 112 |
+
"CRNN-CTC (PyLaia-inspired)": 2,
|
| 113 |
+
"TrOCR": 3,
|
| 114 |
+
"Qwen3-VL": 18,
|
| 115 |
+
"Churro VLM": 10,
|
| 116 |
+
"Kraken": 2,
|
| 117 |
+
"Party": 4,
|
| 118 |
+
"PaddleOCR": 2,
|
| 119 |
+
}
|
| 120 |
+
_NO_GPU_ENGINES = {"Commercial APIs", "OpenWebUI", "LightOnOCR", "DeepSeek-OCR"}
|
| 121 |
+
_TOTAL_VRAM_GB = 92 # 2x L40S @ 46GB each
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
# Factory: engine name -> (module, class) for creating fresh instances
|
| 125 |
+
_ENGINE_FACTORY = {
|
| 126 |
+
"TrOCR": ("engines.trocr_engine", "TrOCREngine"),
|
| 127 |
+
"CRNN-CTC (PyLaia-inspired)": ("engines.pylaia_engine", "PyLaiaEngine"),
|
| 128 |
+
"Qwen3-VL": ("engines.qwen3_engine", "Qwen3Engine"),
|
| 129 |
+
"Churro VLM": ("engines.churro_engine", "ChurroEngine"),
|
| 130 |
+
"Kraken": ("engines.kraken_engine", "KrakenEngine"),
|
| 131 |
+
"Commercial APIs": ("engines.commercial_api_engine", "CommercialAPIEngine"),
|
| 132 |
+
"Party": ("engines.party_engine", "PartyEngine"),
|
| 133 |
+
"OpenWebUI": ("engines.openwebui_engine", "OpenWebUIEngine"),
|
| 134 |
+
"DeepSeek-OCR": ("engines.deepseek_ocr_engine", "DeepSeekOCREngine"),
|
| 135 |
+
"LightOnOCR": ("engines.lighton_ocr_engine", "LightOnOCREngine"),
|
| 136 |
+
"PaddleOCR": ("engines.paddle_engine", "PaddleOCREngine"),
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def _create_engine_instance(engine_name: str):
    """Build a new engine object for *engine_name* via ``_ENGINE_FACTORY``.

    The factory maps an engine name to a ``(module, class)`` pair; the module
    is imported on demand and the class is instantiated with no arguments, so
    every call yields a separate instance.

    Returns:
        The new engine instance, or None when *engine_name* has no factory
        entry.
    """
    factory_entry = _ENGINE_FACTORY.get(engine_name)
    if factory_entry:
        module_path, class_name = factory_entry
        engine_cls = getattr(importlib.import_module(module_path), class_name)
        return engine_cls()
    return None
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def _make_pool_key(engine_name: str, config: dict) -> str:
|
| 156 |
+
"""Build a key that uniquely identifies an engine+model combination."""
|
| 157 |
+
if engine_name == "Commercial APIs":
|
| 158 |
+
provider = config.get("provider", "unknown")
|
| 159 |
+
model = config.get("model", "unknown")
|
| 160 |
+
api_key = config.get("api_key", "")
|
| 161 |
+
key_hash = hashlib.sha256(api_key.encode()).hexdigest()[:8] if api_key else "nokey"
|
| 162 |
+
return f"{engine_name}::{provider}::{model}::{key_hash}"
|
| 163 |
+
|
| 164 |
+
if engine_name == "OpenWebUI":
|
| 165 |
+
model = config.get("model", "unknown")
|
| 166 |
+
base_url = config.get("base_url", "unknown")
|
| 167 |
+
api_key = config.get("api_key", "")
|
| 168 |
+
key_hash = hashlib.sha256(api_key.encode()).hexdigest()[:8] if api_key else "nokey"
|
| 169 |
+
return f"{engine_name}::{base_url}::{model}::{key_hash}"
|
| 170 |
+
|
| 171 |
+
if engine_name == "TrOCR":
|
| 172 |
+
return f"{engine_name}::{config.get('model_path', 'default')}"
|
| 173 |
+
|
| 174 |
+
if engine_name in ("CRNN-CTC (PyLaia-inspired)", "Kraken"):
|
| 175 |
+
return f"{engine_name}::{config.get('model_path', 'default')}"
|
| 176 |
+
|
| 177 |
+
if engine_name == "Qwen3-VL":
|
| 178 |
+
base = config.get("base_model", "default")
|
| 179 |
+
adapter = config.get("adapter", "")
|
| 180 |
+
return f"{engine_name}::{base}::{adapter or 'none'}"
|
| 181 |
+
|
| 182 |
+
if engine_name == "Churro VLM":
|
| 183 |
+
return f"{engine_name}::{config.get('model_name', 'default')}"
|
| 184 |
+
|
| 185 |
+
if engine_name == "LightOnOCR":
|
| 186 |
+
return f"{engine_name}::{config.get('model_path', 'default')}"
|
| 187 |
+
|
| 188 |
+
# Fallback: hash the config
|
| 189 |
+
config_hash = hashlib.sha256(str(sorted(config.items())).encode()).hexdigest()[:12]
|
| 190 |
+
return f"{engine_name}::{config_hash}"
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
async def _maybe_evict(new_engine_name: str):
    """Free estimated VRAM for *new_engine_name* by unloading idle slots.

    Engines listed in ``_NO_GPU_ENGINES`` are ignored entirely. For the rest,
    usage is estimated from ``_ENGINE_VRAM_GB`` (4 GB for unknown engines).
    While the estimate plus the new engine's need exceeds ``_TOTAL_VRAM_GB``,
    slots with ``ref_count == 0`` are unloaded and removed, least recently
    used first. A warning is logged if the budget is still exceeded afterwards.

    The original docstring states this is called while ``pool_lock`` is held.
    """
    if new_engine_name in _NO_GPU_ENGINES:
        return

    required = _ENGINE_VRAM_GB.get(new_engine_name, 4)
    gpu_slots = {
        pool_key: slot
        for pool_key, slot in engine_pool.items()
        if slot.engine_name not in _NO_GPU_ENGINES
    }
    in_use = sum(_ENGINE_VRAM_GB.get(slot.engine_name, 4) for slot in gpu_slots.values())
    if in_use + required <= _TOTAL_VRAM_GB:
        return

    # Only unreferenced slots may be evicted; oldest last_used goes first.
    idle = [(pool_key, slot) for pool_key, slot in gpu_slots.items() if slot.ref_count == 0]
    idle.sort(key=lambda pair: pair[1].last_used)

    for pool_key, slot in idle:
        if in_use + required <= _TOTAL_VRAM_GB:
            break
        log.info(f"Evicting engine slot '{pool_key}' (last used {time.time() - slot.last_used:.0f}s ago)")
        try:
            slot.engine.unload_model()
        except Exception as e:
            # Best effort: the slot is dropped from the pool even if unloading failed.
            log.warning(f"Error unloading evicted engine: {e}")
        del engine_pool[pool_key]
        in_use -= _ENGINE_VRAM_GB.get(slot.engine_name, 4)

    if in_use + required > _TOTAL_VRAM_GB:
        log.warning(f"VRAM tight: ~{in_use}GB used + ~{required}GB needed > {_TOTAL_VRAM_GB}GB total")
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
# Compatibility shims — will be removed after full migration
|
| 224 |
+
loaded_engine: Optional[HTREngine] = None
|
| 225 |
+
loaded_engine_name: str = ""
|
| 226 |
+
loaded_config: dict = {}
|
| 227 |
+
|
| 228 |
+
# Persistent upload storage (survives server restarts)
|
| 229 |
+
UPLOAD_DIR = Path(__file__).parent / "uploads"
|
| 230 |
+
UPLOAD_DIR.mkdir(exist_ok=True)
|
| 231 |
+
|
| 232 |
+
# Upload TTL: 24 hours
|
| 233 |
+
_UPLOAD_TTL_SECONDS = 86400
|
| 234 |
+
|
| 235 |
+
# Session TTL: 2 hours of inactivity
|
| 236 |
+
_SESSION_TTL_SECONDS = 7200
|
| 237 |
+
|
| 238 |
+
# Cookie name for session tracking
|
| 239 |
+
_SESSION_COOKIE = "polyscriptor_session"
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
# ---------------------------------------------------------------------------
|
| 243 |
+
# Per-user sessions — Phase 1 of multi-user refactoring
|
| 244 |
+
# ---------------------------------------------------------------------------
|
| 245 |
+
|
| 246 |
+
@dataclass
class UserSession:
    """Per-user state, stored in ``sessions`` under ``session_id``."""

    # Identifier of this session (a UUID string when created by the server).
    session_id: str
    # Per-session image records; entries may carry "path" / "xml_path" file
    # paths that are deleted when the session expires.
    image_cache: Dict[str, dict] = field(default_factory=dict)
    # asyncio events keyed by string.
    # NOTE(review): presumably one cancel flag per running job; confirm against callers.
    cancel_events: Dict[str, asyncio.Event] = field(default_factory=dict)
    # Key of the engine_pool slot this session references, if any.
    pool_key: Optional[str] = None
    # Creation timestamp (time.time()).
    created_at: float = field(default_factory=time.time)
    # Timestamp of last activity; compared against the session TTL on cleanup.
    last_active: float = field(default_factory=time.time)
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
sessions: Dict[str, UserSession] = {}
|
| 257 |
+
global_image_cache: Dict[str, dict] = {}
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
def _get_or_create_session(session_id: Optional[str]) -> tuple[UserSession, bool]:
    """Look up the session for *session_id*, creating one when needed.

    A known session has its ``last_active`` timestamp refreshed before it is
    returned. A missing, empty or unknown id produces a brand-new session
    registered under a fresh UUID.

    Returns:
        ``(session, created)`` where ``created`` is True only for a session
        made by this call.
    """
    known = sessions.get(session_id) if session_id else None
    if known is not None:
        known.last_active = time.time()
        return known, False

    fresh = UserSession(session_id=str(uuid.uuid4()))
    sessions[fresh.session_id] = fresh
    return fresh, True
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
def _cleanup_expired_sessions() -> int:
    """Drop sessions idle for longer than ``_SESSION_TTL_SECONDS``.

    For every expired session this:

    1. removes it from ``sessions``;
    2. releases its engine-pool reference; when the slot's ``ref_count``
       reaches zero the engine is unloaded and the slot removed from
       ``engine_pool``;
    3. deletes the files named by the ``path`` and ``xml_path`` entries of its
       cached images.

    File deletion is best effort: a file that cannot be removed is logged and
    skipped, so one bad file neither leaks the session's remaining uploads
    nor stops the other expired sessions from being cleaned up.

    Returns:
        Number of sessions removed.
    """
    cutoff = time.time() - _SESSION_TTL_SECONDS
    expired = [sid for sid, s in sessions.items() if s.last_active < cutoff]
    for sid in expired:
        session = sessions.pop(sid)

        # Release pool reference
        if session.pool_key and session.pool_key in engine_pool:
            slot = engine_pool[session.pool_key]
            slot.ref_count = max(0, slot.ref_count - 1)
            if slot.ref_count == 0:
                log.info(f"Immediate eviction (session expiry): '{slot.engine_name}'")
                try:
                    slot.engine.unload_model()
                except Exception as e:
                    log.warning(f"unload_model() failed for '{slot.engine_name}': {e}")
                # pop() with a default tolerates the slot having disappeared
                # from the pool in the meantime.
                engine_pool.pop(session.pool_key, None)

        # Clean up upload files belonging to this session
        for img_data in session.image_cache.values():
            for path_key in ("path", "xml_path"):
                file_path = img_data.get(path_key)
                if not file_path:
                    continue
                try:
                    Path(file_path).unlink(missing_ok=True)
                except OSError as e:
                    # e.g. permission denied or the path is a directory;
                    # keep going with the remaining files and sessions.
                    log.warning(f"Could not delete upload file '{file_path}': {e}")

        log.info(f"Expired session {sid[:8]}... ({len(session.image_cache)} images)")
    return len(expired)
|
| 300 |
+
|
| 301 |
+
|
| 302 |
+
_SESSION_PASSTHROUGH_PATHS = {"/api/gpu", "/api/engines", "/api/kraken/presets"}


@app.middleware("http")
async def session_middleware(request: Request, call_next):
    """Inject session into request.state; set session cookie on new sessions.

    Pure status/discovery routes (``_SESSION_PASSTHROUGH_PATHS``) must not count
    as user activity, otherwise background browser polling keeps a session
    alive indefinitely and prevents session expiry.

    ``_get_or_create_session`` refreshes ``last_active`` whenever it finds an
    existing session, so for passthrough routes the previous timestamp is
    captured before the lookup and restored afterwards.

    The cookie is (re)issued when a new session was created. In ``hf_space``
    demo mode it is sent with ``SameSite=None; Secure``; otherwise with
    ``SameSite=Lax``.
    """
    session_id = request.cookies.get(_SESSION_COOKIE)

    # Remember the pre-request activity timestamp of a known session.
    known = sessions.get(session_id) if session_id else None
    previous_last_active = known.last_active if known is not None else None

    session, created = _get_or_create_session(session_id)
    request.state.session = session

    if request.url.path in _SESSION_PASSTHROUGH_PATHS:
        # Polling-only route: undo the refresh done by _get_or_create_session.
        if previous_last_active is not None:
            session.last_active = previous_last_active
    else:
        session.last_active = time.time()

    response = await call_next(request)

    if created or session_id != session.session_id:
        cookie_kwargs = {
            "key": _SESSION_COOKIE,
            "value": session.session_id,
            "httponly": True,
            "max_age": _SESSION_TTL_SECONDS,
        }
        if DEMO_MODE == "hf_space":
            cookie_kwargs.update({"samesite": "none", "secure": True})
        else:
            cookie_kwargs.update({"samesite": "lax"})
        response.set_cookie(**cookie_kwargs)
    return response
|
| 340 |
+
|
| 341 |
+
|
| 342 |
+
def _get_session(request: Request) -> UserSession:
    """Return the session object that ``session_middleware`` stored on the request.

    Intended for use inside route handlers; it only reads
    ``request.state.session`` and performs no lookup of its own.
    """
    state = request.state
    return state.session
|
| 345 |
+
|
| 346 |
+
|
| 347 |
+
def _cleanup_old_uploads() -> int:
    """Purge aged upload files, then drop cache entries that point at missing files.

    Pass 1 walks ``UPLOAD_DIR`` and removes every regular file whose mtime is
    older than ``_UPLOAD_TTL_SECONDS``; ``OSError`` on an individual file is
    ignored. Pass 2 visits every session and removes ``image_cache`` entries
    whose ``"path"`` no longer exists on disk.

    Returns:
        The number of files deleted in pass 1.
    """
    threshold = time.time() - _UPLOAD_TTL_SECONDS
    removed = 0
    for entry in list(UPLOAD_DIR.iterdir()):
        if not entry.is_file():
            continue
        try:
            if entry.stat().st_mtime < threshold:
                entry.unlink(missing_ok=True)
                removed += 1
        except OSError:
            pass

    # Evict stale image_cache entries whose file no longer exists (all sessions)
    for session in sessions.values():
        cache = session.image_cache
        stale_ids = []
        for image_id, image_data in cache.items():
            file_path = image_data.get("path")
            if file_path and not Path(file_path).exists():
                stale_ids.append(image_id)
        for image_id in stale_ids:
            del cache[image_id]
    return removed
|
| 366 |
+
|
| 367 |
+
|
| 368 |
+
_SLOT_IDLE_TTL_SECONDS = 6 * 3600  # evict loaded engines idle for 6h, regardless of ref_count


def _evict_idle_slots() -> int:
    """Unload and drop engine slots whose ``last_used`` is older than the idle TTL.

    Slots whose engine name is listed in ``_NO_GPU_ENGINES`` are never evicted
    here. Eviction ignores ``ref_count``: every session still pointing at an
    evicted slot has its ``pool_key`` reset to ``None``.

    No lock is taken; per the original author's note this must only be called
    from ``_periodic_cleanup``.

    Returns:
        The number of slots evicted.
    """
    cutoff = time.time() - _SLOT_IDLE_TTL_SECONDS

    stale_keys = []
    for pool_key, candidate in engine_pool.items():
        if candidate.last_used < cutoff and candidate.engine_name not in _NO_GPU_ENGINES:
            stale_keys.append(pool_key)

    for pool_key in stale_keys:
        slot = engine_pool.pop(pool_key)
        log.info(f"Idle eviction: '{slot.engine_name}' (idle {(time.time() - slot.last_used)/3600:.1f}h)")
        try:
            slot.engine.unload_model()
        except Exception as e:
            log.warning(f"unload_model() failed for '{slot.engine_name}': {e}")
        # Invalidate all sessions pointing at this slot
        for owner in sessions.values():
            if owner.pool_key == pool_key:
                owner.pool_key = None

    return len(stale_keys)
|
| 393 |
+
|
| 394 |
+
|
| 395 |
+
async def _periodic_cleanup():
    """Background task: clean up uploads + expired sessions + idle engine slots every hour.

    Each step is guarded individually. An unexpected exception in one step is
    logged and counted as zero, so the other steps still run and the loop
    survives. Without the guard, one failure would end this task and stop all
    cleanup until the server restarts. ``asyncio.CancelledError`` is not an
    ``Exception`` subclass, so task cancellation still propagates.
    """
    steps = (_cleanup_old_uploads, _cleanup_expired_sessions, _evict_idle_slots)
    while True:
        await asyncio.sleep(3600)
        counts = []
        for step in steps:
            try:
                counts.append(step())
            except Exception as e:
                log.warning(f"Periodic cleanup step '{step.__name__}' failed: {e}")
                counts.append(0)
        n, m, p = counts
        if n or m or p:
            log.info(f"Periodic cleanup: {n} upload(s), {m} session(s), {p} idle engine slot(s).")
|
| 404 |
+
|
| 405 |
+
|
| 406 |
+
# ---------------------------------------------------------------------------
|
| 407 |
+
# API key resolution — keys never stored or shared server-side (Phase 3)
|
| 408 |
+
# Web UI users MUST provide their own keys via browser localStorage.
|
| 409 |
+
# Server env vars (.env) are NOT used by the web UI — they exist only for
|
| 410 |
+
# the PyQt GUI and CLI tools which run locally on the admin's machine.
|
| 411 |
+
# ---------------------------------------------------------------------------
|
| 412 |
+
|
| 413 |
+
# Known key slots (for validation only — env vars are NOT consulted)
|
| 414 |
+
_KEY_SLOTS = {"openai", "gemini", "claude", "openwebui"}
|
| 415 |
+
|
| 416 |
+
|
| 417 |
+
def _resolve_api_key(slot: str, request_value: str) -> str:
|
| 418 |
+
"""
|
| 419 |
+
Return the API key from the browser request, or empty string.
|
| 420 |
+
Server env vars are deliberately NOT used as fallback — each web user
|
| 421 |
+
must supply their own key via browser localStorage.
|
| 422 |
+
"""
|
| 423 |
+
if request_value and request_value.strip():
|
| 424 |
+
return request_value.strip()
|
| 425 |
+
return ""
|
| 426 |
+
|
| 427 |
+
|
| 428 |
+
# ---------------------------------------------------------------------------
|
| 429 |
+
# Startup config (web/server_config.yaml) — optional, auto-load an engine
|
| 430 |
+
# ---------------------------------------------------------------------------
|
| 431 |
+
|
| 432 |
+
def _load_startup_config() -> dict:
|
| 433 |
+
cfg_path = Path(__file__).parent / "server_config.yaml"
|
| 434 |
+
if not cfg_path.exists():
|
| 435 |
+
return {}
|
| 436 |
+
try:
|
| 437 |
+
import yaml
|
| 438 |
+
with open(cfg_path) as f:
|
| 439 |
+
return yaml.safe_load(f) or {}
|
| 440 |
+
except Exception as e:
|
| 441 |
+
log.warning(f"Could not read server_config.yaml: {e}")
|
| 442 |
+
return {}
|
| 443 |
+
|
| 444 |
+
|
| 445 |
+
# Strong references to fire-and-forget background tasks. The event loop only
# keeps weak references, so an unreferenced task may be garbage-collected
# before it finishes.
_BACKGROUND_TASKS: set = set()


@app.on_event("startup")
async def startup_event():
    """Clean old uploads, start periodic cleanup, auto-load engine.

    Steps:
      1. Remove upload files left over from previous server runs.
      2. Start the hourly ``_periodic_cleanup`` task and keep a reference to it.
      3. If ``server_config.yaml`` names a ``default_engine``, load it in a
         worker thread and register it in ``engine_pool`` with ``ref_count=0``
         (no session owns it yet). The legacy ``loaded_engine*`` globals are
         updated as well.

    Auto-load failures are logged as warnings and never prevent startup.
    """
    global loaded_engine, loaded_engine_name, loaded_config

    # Clean up uploads left over from previous server runs
    n = _cleanup_old_uploads()
    if n:
        log.info(f"Startup cleanup: removed {n} old upload file(s).")

    # Schedule periodic cleanup (every hour)
    cleanup_task = asyncio.create_task(_periodic_cleanup())
    _BACKGROUND_TASKS.add(cleanup_task)
    cleanup_task.add_done_callback(_BACKGROUND_TASKS.discard)

    # Auto-load default engine from server_config.yaml if present
    cfg = _load_startup_config()
    if not cfg.get("default_engine"):
        return
    engine_name = cfg["default_engine"]
    # A bare "default_config:" key parses as None; treat it as an empty config.
    engine_config = cfg.get("default_config") or {}
    log.info(f"Auto-loading engine '{engine_name}' from server_config.yaml ...")
    try:
        registry = get_global_registry()
        reg_engine = registry.get_engine_by_name(engine_name)
        if not (reg_engine and reg_engine.is_available()):
            log.warning(f"Auto-load: engine '{engine_name}' not found or not available.")
            return

        engine = _create_engine_instance(engine_name)
        if not engine:
            log.warning(f"Auto-load: cannot create instance for '{engine_name}'.")
            return

        # load_model is run in a worker thread so the event loop is not blocked.
        ok = await asyncio.to_thread(engine.load_model, engine_config)
        if not ok:
            log.warning(f"Auto-load of '{engine_name}' failed (load_model returned False).")
            return

        pool_key = _make_pool_key(engine_name, engine_config)
        slot = EngineSlot(
            engine=engine, engine_name=engine_name,
            config=engine_config, pool_key=pool_key,
            ref_count=0,  # No session owns it yet
        )
        engine_pool[pool_key] = slot
        # Update compat shims
        loaded_engine = engine
        loaded_engine_name = engine_name
        loaded_config = engine_config
        log.info(f"Auto-loaded '{engine_name}' into pool as '{pool_key}'.")
    except Exception as e:
        log.warning(f"Auto-load error: {e}")
|
| 492 |
+
|
| 493 |
+
|
| 494 |
+
# ---------------------------------------------------------------------------
|
| 495 |
+
# Config schemas — replaces Qt config widgets for the web UI
|
| 496 |
+
# ---------------------------------------------------------------------------
|
| 497 |
+
|
| 498 |
+
def _get_pylaia_model_options() -> list:
    """Build the select options for the CRNN-CTC (PyLaia-inspired) model field.

    Runs ``_import_segmenters()``, then ``_scan_pylaia_models`` on the
    project-level ``models`` directory, and lists every key of
    ``PYLAIA_MODELS`` followed by a ``__custom__`` entry for a manual path.
    """
    _import_segmenters()
    from inference_pylaia_native import _scan_pylaia_models

    models_dir = Path(__file__).resolve().parents[1] / "models"
    _scan_pylaia_models(str(models_dir))

    custom_entry = {"label": "Custom / local path…", "value": "__custom__"}
    known_entries = [{"label": name, "value": name} for name in PYLAIA_MODELS.keys()]
    return known_entries + [custom_entry]
|
| 505 |
+
|
| 506 |
+
|
| 507 |
+
def _scan_kraken_models() -> list:
|
| 508 |
+
"""Scan models/ directory for local Kraken .mlmodel files and build select options."""
|
| 509 |
+
options = []
|
| 510 |
+
models_root = Path(__file__).resolve().parents[1] / "models"
|
| 511 |
+
if models_root.exists():
|
| 512 |
+
for p in sorted(models_root.rglob("*.mlmodel")):
|
| 513 |
+
rel = str(p.relative_to(models_root.parent)) # e.g. models/kraken_cs/best.mlmodel
|
| 514 |
+
label = f"{p.parent.name}/{p.name}"
|
| 515 |
+
options.append({"label": label, "value": rel, "source": "local"})
|
| 516 |
+
# Zenodo presets from kraken_engine (auto-download on load)
|
| 517 |
+
try:
|
| 518 |
+
from engines.kraken_engine import KRAKEN_MODELS
|
| 519 |
+
for preset_id, info in KRAKEN_MODELS.items():
|
| 520 |
+
if info.get("source") == "zenodo":
|
| 521 |
+
options.append({
|
| 522 |
+
"label": f"{info.get('label', preset_id)} [Zenodo, auto-download]",
|
| 523 |
+
"value": f"__zenodo__{preset_id}",
|
| 524 |
+
"source": "zenodo",
|
| 525 |
+
})
|
| 526 |
+
except Exception:
|
| 527 |
+
pass
|
| 528 |
+
return options
|
| 529 |
+
|
| 530 |
+
|
| 531 |
+
def _scan_trocr_models() -> list:
    """Scan models/ directory for TrOCR checkpoints.

    A directory is considered a TrOCR model if it contains
    preprocessor_config.json (TrOCR/ViT-specific) AND config.json
    with model_type == 'vision-encoder-decoder'.
    This avoids picking up PyLaia/CRNN-CTC directories that also
    contain a config.json with training parameters.

    Returns:
        A list of option dicts: the ``__custom__`` sentinel, the built-in
        HuggingFace presets, then one ``"source": "local"`` entry per matching
        directory (sorted by directory name).
    """
    import json as _json
    models_dir = PROJECT_ROOT / "models"
    options = [
        {"label": "Custom HuggingFace ID or local path…", "value": "__custom__"},
        {"label": "kazars24/trocr-base-handwritten-ru (HuggingFace)",
         "value": "kazars24/trocr-base-handwritten-ru",
         "source": "huggingface"},
        {"label": "microsoft/trocr-base-printed — printed text, base",
         "value": "microsoft/trocr-base-printed",
         "source": "huggingface"},
        {"label": "microsoft/trocr-large-printed — printed text, large",
         "value": "microsoft/trocr-large-printed",
         "source": "huggingface"},
        {"label": "dh-unibe/trocr-kurrent — German Kurrent 19th c. (CER 2.66%)",
         "value": "dh-unibe/trocr-kurrent",
         "source": "huggingface"},
        {"label": "dh-unibe/trocr-kurrent-XVI-XVII — German Kurrent 16th–18th c. (CER 5.42%)",
         "value": "dh-unibe/trocr-kurrent-XVI-XVII",
         "source": "huggingface"},
    ]
    if models_dir.exists():
        for d in sorted(models_dir.iterdir()):
            if not d.is_dir():
                continue
            # Require BOTH preprocessor_config.json AND config.json with
            # model_type == 'vision-encoder-decoder'.
            # preprocessor_config.json is ViT/TrOCR-specific (not in PyLaia).
            # config.json model_type disambiguates from Qwen3 adapters that
            # also ship a preprocessor_config but have no config.json.
            if not (d / "preprocessor_config.json").exists():
                continue
            cfg_path = d / "config.json"
            if not cfg_path.exists():
                continue
            try:
                # Context manager so the handle is closed for every scanned
                # directory instead of being left to the garbage collector.
                with open(cfg_path, encoding="utf-8") as fh:
                    cfg = _json.load(fh)
                if cfg.get("model_type") != "vision-encoder-decoder":
                    continue
            except Exception:
                # Unreadable or malformed config.json: not a usable checkpoint.
                continue
            options.append({
                "label": d.name,
                "value": str(d),
                "source": "local",
            })
    return options
|
| 586 |
+
|
| 587 |
+
|
| 588 |
+
def _vlm_adapter_option(adapter_dir, owner_name, engine_type):
    """Return the select option for one LoRA adapter directory, or None.

    ``adapter_dir`` must contain ``adapter_config.json``. ``owner_name`` is the
    name of the top-level model directory; it is used for the label and for the
    qwen/churro name matching. Any error while reading or interpreting the
    adapter config yields None.
    """
    import json as _json
    try:
        with open(adapter_dir / "adapter_config.json") as f:
            adapter_cfg = _json.load(f)
        base = adapter_cfg.get("base_model_name_or_path", "")
        lowered_base = base.lower()
        lowered_name = owner_name.lower()
        is_qwen = "qwen" in lowered_base or "qwen" in lowered_name
        is_churro = "churro" in lowered_base or "churro" in lowered_name
        if engine_type == "qwen3":
            matches = is_qwen and not is_churro
        elif engine_type == "churro":
            matches = is_churro
        else:
            matches = False
        if not matches:
            return None
        return {
            "label": f"{owner_name} (LoRA → {base})",
            "value": str(adapter_dir),
            "base_model": base,
            "adapter": str(adapter_dir),
        }
    except Exception:
        return None


def _scan_vlm_models(engine_type: str = "qwen3") -> list:
    """List local LoRA adapters usable by the given VLM engine.

    Each directory directly under ``PROJECT_ROOT / "models"`` is inspected. If
    it holds ``adapter_config.json`` itself, that adapter is considered and the
    ``final_model`` subdirectory is not examined. Otherwise a
    ``final_model/adapter_config.json`` is considered when present.

    For ``engine_type == "qwen3"`` an adapter is listed when "qwen" occurs in
    its base model name or directory name and "churro" occurs in neither. For
    ``engine_type == "churro"`` it is listed when "churro" occurs in either.

    Returns:
        Option dicts in directory-name order, always ending with the
        ``__custom__`` sentinel for manual entry.
    """
    models_dir = PROJECT_ROOT / "models"
    options = []

    if models_dir.exists():
        for d in sorted(models_dir.iterdir()):
            if not d.is_dir():
                continue

            if (d / "adapter_config.json").exists():
                # Top-level adapter wins; don't also check final_model subdirs.
                adapter_dir = d
            else:
                final = d / "final_model"
                if not (final.is_dir() and (final / "adapter_config.json").exists()):
                    continue
                adapter_dir = final

            option = _vlm_adapter_option(adapter_dir, d.name, engine_type)
            if option is not None:
                options.append(option)

    # Always append a "Custom / HuggingFace" sentinel as the last option
    options.append({
        "label": "Custom / HuggingFace model ID...",
        "value": "__custom__",
    })
    return options
|
| 664 |
+
|
| 665 |
+
|
| 666 |
+
def _flip_rtl_field() -> dict:
    """Fresh copy of the shared "flip line images for RTL scripts" checkbox field."""
    return {
        "key": "flip_rtl",
        "type": "checkbox",
        "label": "RTL manuscript (flip line images)",
        "default": False,
        "hint": "Flip line images horizontally for RTL scripts (Ottoman, Arabic, Hebrew)",
    }


def _custom_prompt_field(rows: int) -> dict:
    """Fresh copy of the shared optional custom-prompt textarea field."""
    return {
        "key": "custom_prompt",
        "type": "textarea",
        "label": "Custom Prompt (optional)",
        "default": "",
        "rows": rows,
        "placeholder": "Transcribe all handwritten text in this manuscript image. Preserve the original language (Cyrillic, Latin, etc.) and layout. Output only the transcribed text without any additional commentary.",
        "hint": "Leave blank to use the default prompt shown above",
    }


def _lighton_model_options() -> list:
    """Select options for LightOnOCR: every LIGHTON_MODELS entry plus a custom sentinel."""
    from lighton_models import LIGHTON_MODELS

    presets = [
        {"label": f"{name} — {info.get('description','')}", "value": info["id"]}
        for name, info in LIGHTON_MODELS.items()
    ]
    return presets + [{"label": "Custom HuggingFace ID…", "value": "__custom__"}]


# Maps engine display name -> zero-argument factory returning that engine's
# config-form schema ({"fields": [...]}). Factories are evaluated per request,
# so every call yields new dict objects and model lists are rescanned each time.
ENGINE_SCHEMAS = {
    "CRNN-CTC (PyLaia-inspired)": lambda: {
        "fields": [
            {
                "key": "model_path",
                "type": "select",
                "label": "Model",
                "options": _get_pylaia_model_options(),
                "custom_key": "custom_model_path",
                "custom_placeholder": "Absolute path to best_model.pt (e.g. /home/…/models/pylaia_yiddish_20260326/best_model.pt)",
            },
            {
                "key": "enable_spaces",
                "type": "checkbox",
                "label": "Convert <space> tokens",
                "default": True,
            },
            _flip_rtl_field(),
        ]
    },
    "TrOCR": lambda: {
        "fields": [
            {
                "key": "model_path",
                "type": "select",
                "label": "Model",
                "options": _scan_trocr_models(),
                "custom_key": "custom_model_path",
                "custom_placeholder": "HuggingFace model ID (e.g. microsoft/trocr-base-handwritten) or absolute local path",
            },
            {
                "key": "num_beams",
                "type": "number",
                "label": "Beam Search",
                "min": 1,
                "max": 10,
                "default": 4,
            },
            {
                "key": "normalize_background",
                "type": "checkbox",
                "label": "Normalize Background",
                "default": False,
            },
            _flip_rtl_field(),
        ]
    },
    "Qwen3-VL": lambda: {
        "fields": [
            {
                "key": "model_preset",
                "type": "select",
                "label": "Model",
                "options": _scan_vlm_models("qwen3"),
                "custom_key": "base_model",
                "custom_placeholder": "HuggingFace model ID, e.g. Qwen/Qwen3-VL-8B-Instruct",
            },
            {
                "key": "max_image_size",
                "type": "number",
                "label": "Max Image Size (px)",
                "min": 512,
                "max": 4096,
                "default": 1536,
            },
        ]
    },
    "Churro VLM": lambda: {
        "fields": [
            {
                "key": "model_preset",
                "type": "select",
                "label": "Model",
                "options": _scan_vlm_models("churro"),
                "custom_key": "model_name",
                "custom_placeholder": "HuggingFace model ID, e.g. stanford-oval/churro-3B",
            },
            {
                "key": "device",
                "type": "select",
                "label": "Device",
                "options": [
                    {"label": "Auto", "value": "auto"},
                    {"label": "GPU 0", "value": "cuda:0"},
                    {"label": "GPU 1", "value": "cuda:1"},
                    {"label": "CPU", "value": "cpu"},
                ],
            },
            {
                "key": "max_image_size",
                "type": "number",
                "label": "Max Image Size (px)",
                "min": 512,
                "max": 4096,
                "default": 2048,
            },
        ]
    },
    "Kraken": lambda: {
        "fields": [
            {
                "key": "model_path",
                "type": "select",
                "label": "Model",
                "options": _scan_kraken_models(),
                "custom_key": "custom_model_path",
                "custom_placeholder": "Absolute path on server, e.g. /home/user/models/my.mlmodel",
                "upload": True,
            },
        ]
    },
    "Commercial APIs": lambda: {
        "fields": [
            {
                "key": "provider",
                "type": "select",
                "label": "Provider",
                "options": [
                    {"label": "OpenAI (GPT-4o, o1, …)", "value": "OpenAI"},
                    {"label": "Google Gemini", "value": "Gemini"},
                    {"label": "Anthropic Claude", "value": "Claude"},
                ],
            },
            {
                "key": "model",
                "type": "select",
                "label": "Model",
                "dynamic": True,
                "dynamic_hint": "Enter API key, then ↻ to load available models",
                # No static lists — always fetch live from the provider API
                "per_provider_options": {},
                "options": [],
                "custom_key": "custom_model_id",
                "custom_placeholder": "e.g. gpt-4.5, gemini-exp-1206, claude-opus-4",
            },
            {
                "key": "api_key",
                "type": "password",
                "label": "API Key",
                "default": "",
                "placeholder": "Paste your API key here",
            },
            {
                "key": "temperature",
                "type": "number",
                "label": "Temperature",
                "min": 0.0,
                "max": 2.0,
                "default": 0.0,
                "placeholder": "0.0 = deterministic (recommended for transcription)",
            },
            {
                "key": "max_output_tokens",
                "type": "number",
                "label": "Max output tokens (optional)",
                "min": 512,
                "max": 65536,
                "default": None,
                "placeholder": "Leave blank = model maximum",
            },
            _custom_prompt_field(4),
            {
                "key": "thinking_mode",
                "type": "select",
                "label": "Thinking Mode (Gemini only)",
                "options": [
                    {"label": "Auto (model decides, no cap)", "value": ""},
                    {"label": "Low (budget: 8k tokens)", "value": "low"},
                    {"label": "High (no cap, max reasoning)", "value": "high"},
                ],
                "default": "",
            },
        ]
    },
    "OpenWebUI": lambda: {
        "fields": [
            {
                "key": "base_url",
                "type": "text",
                "label": "Base URL",
                "default": "",
                "placeholder": "https://your-openwebui-instance/api or .../api/v1",
            },
            {
                "key": "api_key",
                "type": "password",
                "label": "API Key",
                "default": "",
                "placeholder": "Your OpenWebUI API key",
            },
            {
                "key": "model",
                "type": "select",
                "label": "Model",
                "dynamic": True,
                "dynamic_hint": "Enter API key & base URL, then ↻ to load available models",
                "options": [{"label": "Custom model ID…", "value": "__custom__"}],
                "default": "__custom__",
                "custom_key": "model_custom",
                "custom_placeholder": "e.g. llama3.1, qwen2.5vl, gemma3, ...",
            },
            {
                "key": "temperature",
                "type": "number",
                "label": "Temperature",
                "min": 0.0,
                "max": 2.0,
                "default": 0.1,
            },
            {
                "key": "max_tokens",
                "type": "number",
                "label": "Max output tokens (optional)",
                "min": 512,
                "max": 65536,
                "default": None,
                "placeholder": "Leave blank = model maximum",
            },
            _custom_prompt_field(3),
        ]
    },
    "LightOnOCR": lambda: {
        "fields": [
            {
                "key": "model_path",
                "type": "select",
                "label": "Model",
                "options": _lighton_model_options(),
                "custom_key": "custom_model_path",
                "custom_placeholder": "HuggingFace model ID, e.g. lightonai/LightOnOCR-2-1B-base",
            },
            {
                "key": "max_new_tokens",
                "type": "number",
                "label": "Max new tokens",
                "min": 32,
                "max": 512,
                "default": 128,
            },
        ]
    },
    "PaddleOCR": lambda: {
        "fields": [
            {
                "key": "lang",
                "type": "select",
                "label": "Language / Script",
                "default": "ch",
                "options": [
                    {"label": "Chinese + English (mixed, recommended default)", "value": "ch"},
                    {"label": "English", "value": "en"},
                    {"label": "German", "value": "german"},
                    {"label": "French", "value": "french"},
                    {"label": "Japanese", "value": "japan"},
                    {"label": "Korean", "value": "korean"},
                    {"label": "Arabic", "value": "arabic"},
                    {"label": "Cyrillic (Russian/Ukrainian/Bulgarian)", "value": "cyrillic"},
                    {"label": "Latin script (generic)", "value": "latin"},
                    {"label": "Custom (enter code below)", "value": "__custom__"},
                ],
                "custom_key": "custom_lang",
                "custom_placeholder": "PaddleOCR lang code, e.g. ru, uk, fr, es, it, pt, …",
                "hint": "One language model per run. 'ch' is bilingual (Chinese+English) and PaddleOCR's strongest model. For mixed-script documents outside this list, run separate passes.",
            },
            {
                "key": "use_angle_cls",
                "type": "checkbox",
                "label": "Text-angle classifier (correct 180° rotation)",
                "default": True,
            },
            {
                "key": "use_gpu",
                "type": "checkbox",
                "label": "Use GPU (requires paddlepaddle-gpu)",
                "default": False,
            },
        ]
    },
}
|
| 831 |
+
|
| 832 |
+
|
| 833 |
+
# ---------------------------------------------------------------------------
|
| 834 |
+
# Request/response models
|
| 835 |
+
# ---------------------------------------------------------------------------
|
| 836 |
+
|
| 837 |
+
class EngineLoadRequest(BaseModel):
    """Request body for loading an engine."""

    # Display name of the engine to load.
    engine_name: str
    # Engine-specific configuration values; empty when omitted.
    config: Dict[str, Any] = {}
|
| 840 |
+
|
| 841 |
+
|
| 842 |
+
class TranscribeRequest(BaseModel):
    """Request body for transcribing one previously uploaded image."""

    # Identifier of the uploaded image to transcribe.
    image_id: str
    # Line segmentation method: kraken, kraken-blla, hpp
    seg_method: str = "kraken"
    # Device used for segmentation.
    seg_device: str = "cpu"
    # blla: max sub-columns per region (iterative splitting)
    max_columns: int = 6
    # blla: min region width (fraction of page) to trigger sub-split
    split_width_fraction: float = 0.40
    # use attached PAGE XML for segmentation when available
    use_pagexml: bool = True
    # reading order for Kraken: horizontal-lr, horizontal-rl, vertical-lr, vertical-rl
    text_direction: str = "horizontal-lr"
    # live form values merged into stored config at transcription time
    engine_config_overrides: Dict[str, Any] = {}
|
| 851 |
+
|
| 852 |
+
|
| 853 |
+
# ---------------------------------------------------------------------------
|
| 854 |
+
# Routes
|
| 855 |
+
# ---------------------------------------------------------------------------
|
| 856 |
+
|
| 857 |
+
@app.get("/")
async def index():
    """Serve the main web UI page (``index.html`` from the static directory)."""
    page = STATIC_DIR / "index.html"
    return FileResponse(str(page))
|
| 860 |
+
|
| 861 |
+
|
| 862 |
+
@app.get("/demo")
async def pwa_demo():
    """Serve the PWA demo page (``pwa/demo.html`` from the static directory)."""
    page = STATIC_DIR / "pwa" / "demo.html"
    return FileResponse(str(page))
|
| 865 |
+
|
| 866 |
+
|
| 867 |
+
@app.get("/manifest.json")
async def pwa_manifest():
    """Serve the PWA manifest from root so scope / start_url are valid."""
    from fastapi.responses import FileResponse as _FR

    manifest_path = STATIC_DIR / "pwa" / "manifest.json"
    return _FR(str(manifest_path), media_type="application/manifest+json")
|
| 872 |
+
|
| 873 |
+
|
| 874 |
+
@app.get("/sw.js")
async def pwa_service_worker():
    """Serve the PWA service worker from root scope so it can control /demo.

    The ``Service-Worker-Allowed: /`` header is set on the response.
    """
    from fastapi.responses import FileResponse as _FR

    worker_path = STATIC_DIR / "pwa" / "sw.js"
    response = _FR(str(worker_path), media_type="application/javascript")
    response.headers["Service-Worker-Allowed"] = "/"
    return response
|
| 881 |
+
|
| 882 |
+
|
| 883 |
+
@app.get("/api/engines")
async def list_engines():
    """Describe every engine in the global registry.

    Returns:
        A list with one dict per engine: name, description, availability (with
        the reason when unavailable), whether it needs line segmentation, and
        whether a config schema exists for it in ``ENGINE_SCHEMAS``.
    """
    def describe(engine):
        available = engine.is_available()
        name = engine.get_name()
        return {
            "name": name,
            "description": engine.get_description(),
            "available": available,
            "unavailable_reason": None if available else engine.get_unavailable_reason(),
            "requires_line_segmentation": engine.requires_line_segmentation(),
            "has_config_schema": name in ENGINE_SCHEMAS,
        }

    registry = get_global_registry()
    return [describe(engine) for engine in registry.get_all_engines()]
|
| 898 |
+
|
| 899 |
+
|
| 900 |
+
@app.get("/api/engine/{name}/config-schema")
async def get_config_schema(name: str):
    """Return the config-form schema for engine ``name``.

    Unknown engines get an empty schema (``{"fields": []}``). Every password
    field is tagged ``key_status = "missing"``: the server never stores keys,
    so from its point of view a key is always absent; the frontend checks
    browser localStorage on its own.
    """
    build_schema = ENGINE_SCHEMAS.get(name)
    if build_schema is None:
        return {"fields": []}

    schema = build_schema()
    password_fields = [
        field for field in schema.get("fields", [])
        if field.get("type") == "password"
    ]
    for field in password_fields:
        field["key_status"] = "missing"
    return schema
|
| 913 |
+
|
| 914 |
+
|
| 915 |
+
def _openwebui_model_urls(base_url: str) -> list[str]:
|
| 916 |
+
base = base_url.strip().rstrip("/")
|
| 917 |
+
if not base:
|
| 918 |
+
return []
|
| 919 |
+
urls = [f"{base}/models"]
|
| 920 |
+
if base.endswith("/api"):
|
| 921 |
+
urls.append(f"{base}/v1/models")
|
| 922 |
+
urls.append(f"{base[:-4]}/v1/models")
|
| 923 |
+
elif base.endswith("/api/v1"):
|
| 924 |
+
urls.append(f"{base[:-3]}/models")
|
| 925 |
+
urls.append(f"{base}/models")
|
| 926 |
+
elif base.endswith("/v1"):
|
| 927 |
+
urls.append(f"{base[:-3]}/api/models")
|
| 928 |
+
else:
|
| 929 |
+
urls.append(f"{base}/api/models")
|
| 930 |
+
urls.append(f"{base}/api/v1/models")
|
| 931 |
+
urls.append(f"{base}/v1/models")
|
| 932 |
+
return list(dict.fromkeys(urls))
|
| 933 |
+
|
| 934 |
+
|
| 935 |
+
def _extract_openwebui_model_ids(payload: Any) -> list[str]:
|
| 936 |
+
if isinstance(payload, dict):
|
| 937 |
+
for key in ("data", "models"):
|
| 938 |
+
items = payload.get(key)
|
| 939 |
+
if isinstance(items, list):
|
| 940 |
+
return _extract_openwebui_model_ids(items)
|
| 941 |
+
return [
|
| 942 |
+
str(value.get("id") or value.get("name"))
|
| 943 |
+
for value in payload.values()
|
| 944 |
+
if isinstance(value, dict) and (value.get("id") or value.get("name"))
|
| 945 |
+
]
|
| 946 |
+
|
| 947 |
+
if isinstance(payload, list):
|
| 948 |
+
models = []
|
| 949 |
+
for item in payload:
|
| 950 |
+
if isinstance(item, str):
|
| 951 |
+
models.append(item)
|
| 952 |
+
elif isinstance(item, dict):
|
| 953 |
+
model_id = item.get("id") or item.get("name") or item.get("model")
|
| 954 |
+
if model_id:
|
| 955 |
+
models.append(str(model_id))
|
| 956 |
+
return sorted(set(models))
|
| 957 |
+
|
| 958 |
+
return []
|
| 959 |
+
|
| 960 |
+
|
| 961 |
+
def _fetch_openwebui_models(base_url: str, api_key: str) -> list[str]:
    """Query an OpenWebUI-compatible server for its available model ids.

    Every candidate URL from ``_openwebui_model_urls`` is tried in order with
    a 20 s timeout; the first response that yields at least one model id wins.

    Args:
        base_url: Server base URL supplied by the user. Only ``http`` and
            ``https`` are accepted. ``urlopen`` would otherwise also serve
            other schemes such as ``file:``, and part of a non-JSON body is
            echoed back in the error text.
        api_key: Key sent both as a Bearer token and as ``x-api-key``.

    Returns:
        The model ids reported by the first endpoint that answered usefully.

    Raises:
        RuntimeError: If the URL scheme is not http(s), or no endpoint
            returned model ids (the message concatenates the per-URL failures).
    """
    import urllib.error
    import urllib.parse
    import urllib.request

    # Security: base_url is untrusted input; refuse anything but http(s).
    try:
        scheme = urllib.parse.urlsplit(base_url.strip()).scheme.lower()
    except ValueError:
        scheme = ""
    if scheme not in ("http", "https"):
        raise RuntimeError("OpenWebUI base URL must start with http:// or https://")

    errors = []
    for url in _openwebui_model_urls(base_url):
        req = urllib.request.Request(
            url,
            headers={
                "Authorization": f"Bearer {api_key}",
                "x-api-key": api_key,
                "Accept": "application/json",
                "Content-Type": "application/json",
                "User-Agent": "Polyscriptor-HTR-Demo/1.0",
            },
        )
        try:
            with urllib.request.urlopen(req, timeout=20) as resp:
                status = resp.status
                content_type = resp.headers.get("Content-Type", "")
                body = resp.read().decode("utf-8", errors="replace")
            try:
                payload = json.loads(body)
            except json.JSONDecodeError:
                # Keep a short sample so the user can see what answered.
                sample = body.strip().replace("\n", " ")[:120] or "<empty response>"
                errors.append(f"{url}: HTTP {status}, non-JSON response ({content_type}): {sample}")
                continue
            models = _extract_openwebui_model_ids(payload)
            if models:
                return models
            errors.append(f"{url}: no model ids in response")
        except urllib.error.HTTPError as exc:
            body = exc.read().decode("utf-8", errors="replace")[:200]
            errors.append(f"{url}: HTTP {exc.code} {body}")
        except Exception as exc:
            errors.append(f"{url}: {exc}")
    raise RuntimeError("; ".join(errors) if errors else "No OpenWebUI model endpoint tried")
|
| 998 |
+
|
| 999 |
+
|
| 1000 |
+
@app.get("/api/engine/status")
async def engine_status(request: Request):
    """Report whether an engine is loaded for the calling session.

    Returns a dict with ``loaded``, ``engine_name`` and ``config``. When the
    session holds no pool slot, the module-level compat shims
    (``loaded_engine`` / ``loaded_engine_name`` / ``loaded_config``) are
    reported instead.
    """

    def _without_secrets(config) -> dict:
        # load_engine() stores the resolved API key inside the slot config, and
        # the compat shims are shared by every client. Never send the key back.
        return {k: v for k, v in dict(config or {}).items() if k != "api_key"}

    session = _get_session(request)
    if session.pool_key and session.pool_key in engine_pool:
        slot = engine_pool[session.pool_key]
        return {
            "loaded": slot.engine.is_model_loaded(),
            "engine_name": slot.engine_name,
            "config": _without_secrets(slot.config),
        }
    # Fallback: compat shim for tests / startup
    return {
        "loaded": loaded_engine is not None and loaded_engine.is_model_loaded(),
        "engine_name": loaded_engine_name,
        "config": _without_secrets(loaded_config),
    }
|
| 1016 |
+
|
| 1017 |
+
|
| 1018 |
+
@app.get("/api/engine/{name}/models")
async def get_engine_models(
    name: str,
    api_key: str = "",
    provider: str = "openai",
    base_url: str = "",
):
    """
    Fetch available models for engines whose model list is dynamic.

    - OpenWebUI: queries the server via ``_fetch_openwebui_models``
    - Commercial APIs: uses the ``fetch_*_models`` helpers from
      ``inference_commercial_api``

    Always returns ``{"models": [...]}``; failures are reported through an
    additional ``"error"`` string rather than an HTTP error status.
    """
    if name == "OpenWebUI":
        resolved = _resolve_api_key("openwebui", api_key)
        if not resolved:
            return {"models": [], "error": "No API key — paste one in the form"}
        effective_url = base_url.strip().rstrip("/")
        if not effective_url:
            return {"models": [], "error": "Enter your OpenWebUI base URL"}
        try:
            models = await asyncio.to_thread(_fetch_openwebui_models, effective_url, resolved)
            return {"models": models}
        except Exception as e:
            return {"models": [], "error": str(e)}

    if name == "Commercial APIs":
        prov = provider.lower()
        resolved = _resolve_api_key(prov, api_key)
        if not resolved:
            return {"models": [], "error": "No API key — paste one in the form"}

        fetcher_names = {
            "openai": "fetch_openai_models",
            "gemini": "fetch_gemini_models",
            "claude": "fetch_claude_models",
        }
        if prov not in fetcher_names:
            return {"models": [], "error": f"Unknown provider: {provider}"}
        try:
            # Make the project root importable once; inserting it on every
            # request would grow sys.path without bound.
            root = str(PROJECT_ROOT)
            if root not in sys.path:
                sys.path.insert(0, root)
            import inference_commercial_api as _commercial

            fetch = getattr(_commercial, fetcher_names[prov])
            models = await asyncio.to_thread(fetch, resolved)
            return {"models": models}
        except Exception as e:
            return {"models": [], "error": str(e)}

    return {"models": [], "error": f"Dynamic model listing not supported for '{name}'"}
|
| 1069 |
+
|
| 1070 |
+
|
| 1071 |
+
@app.post("/api/engine/load")
async def load_engine(request: Request, req: EngineLoadRequest):
    """Load (or reuse) an engine for the calling session.

    Steps:
      1. Validate that the engine exists in the registry and is available.
      2. Translate the web-form config (``__custom__`` selections, presets,
         API keys) into the config the engine expects.
      3. Release the session's previous pool slot, then either reuse a slot
         with the same pool key or load a new engine outside the pool lock.

    Returns:
        ``{"success": True, "load_time_s": float, "engine_name": str,
        "reused": bool}``.

    Raises:
        HTTPException: 404 for an unknown engine, 400 for an unavailable
            engine or incomplete form input, 500 when the engine cannot be
            instantiated or its model fails to load.
    """
    global loaded_engine, loaded_engine_name, loaded_config
    session = _get_session(request)

    registry = get_global_registry()
    reg_engine = registry.get_engine_by_name(req.engine_name)
    if not reg_engine:
        raise HTTPException(404, f"Engine '{req.engine_name}' not found")
    if not reg_engine.is_available():
        raise HTTPException(400, f"Engine not available: {reg_engine.get_unavailable_reason()}")

    def _text(value) -> str:
        # Form fields may arrive as JSON null; treat that like an empty string
        # instead of crashing on None.strip().
        return str(value).strip() if value else ""

    # --- Config resolution ---
    config = dict(req.config)

    if req.engine_name == "CRNN-CTC (PyLaia-inspired)" and "model_path" in config:
        custom_val = _text(config.pop("custom_model_path", ""))
        if config["model_path"] == "__custom__":
            if not custom_val:
                raise HTTPException(400, "Please enter an absolute path to a best_model.pt file")
            config["model_path"] = custom_val
        # else: named preset from PYLAIA_MODELS — engine resolves it

    elif req.engine_name == "Kraken" and "model_path" in config:
        custom_val = _text(config.pop("custom_model_path", ""))
        val = config["model_path"]
        if val == "__custom__":
            if not custom_val:
                raise HTTPException(400, "Please enter a path to a local .mlmodel file")
            config["model_path"] = custom_val
        elif isinstance(val, str) and val.startswith("__zenodo__"):
            # Zenodo preset: pass preset_id, let engine handle download
            config["preset_id"] = val[len("__zenodo__"):]
            config["model_path"] = None
        # else: relative local path from select (e.g. "models/kraken_cs/best.mlmodel") — use as-is

    elif req.engine_name == "TrOCR" and "model_path" in config:
        custom_val = _text(config.pop("custom_model_path", ""))
        if config["model_path"] == "__custom__":
            if not custom_val:
                raise HTTPException(400, "Please enter a HuggingFace model ID or local path")
            config["model_path"] = custom_val
        # NOTE(review): applied to presets and custom entries alike — confirm
        # this matches the intended nesting of the original.
        from pathlib import Path as _P
        if _P(config["model_path"]).exists():
            config["model_source"] = "local"
        else:
            config["model_source"] = "huggingface"

    elif req.engine_name == "Qwen3-VL" and "model_preset" in config:
        preset_val = config.pop("model_preset")
        custom_val = _text(config.pop("base_model", ""))
        if preset_val == "__custom__":
            config["base_model"] = custom_val or "Qwen/Qwen3-VL-8B-Instruct"
            config["adapter"] = None
        else:
            vlm_opts = _scan_vlm_models("qwen3")
            matched = next((o for o in vlm_opts if o["value"] == preset_val), None)
            if matched:
                config["base_model"] = matched.get("base_model", preset_val)
                config["adapter"] = matched.get("adapter")
            else:
                config["base_model"] = preset_val
                config["adapter"] = None

    elif req.engine_name == "Churro VLM" and "model_preset" in config:
        preset_val = config.pop("model_preset")
        custom_val = _text(config.pop("model_name", ""))
        if preset_val == "__custom__":
            config["model_name"] = custom_val or "stanford-oval/churro-3B"
            config["adapter_path"] = None
        else:
            vlm_opts = _scan_vlm_models("churro")
            matched = next((o for o in vlm_opts if o["value"] == preset_val), None)
            if matched:
                config["model_name"] = matched.get("base_model", preset_val)
                config["adapter_path"] = matched.get("adapter")
            else:
                config["model_name"] = preset_val
                config["adapter_path"] = None

    elif req.engine_name == "LightOnOCR" and "model_path" in config:
        custom_val = _text(config.pop("custom_model_path", ""))
        if config["model_path"] == "__custom__":
            if not custom_val:
                raise HTTPException(400, "Please enter a HuggingFace model ID for LightOnOCR")
            config["model_path"] = custom_val

    elif req.engine_name == "PaddleOCR" and "lang" in config:
        if config["lang"] == "__custom__":
            custom_lang = _text(config.pop("custom_lang", ""))
            if not custom_lang:
                raise HTTPException(400, "Please enter a PaddleOCR language code")
            config["lang"] = custom_lang
        else:
            config.pop("custom_lang", None)

    elif req.engine_name == "Commercial APIs":
        if config.get("model") == "__custom__":
            config["model"] = _text(config.pop("model_custom", "")) or "gpt-4o"

    elif req.engine_name == "OpenWebUI":
        if config.get("model") == "__custom__":
            custom_model = _text(config.pop("model_custom", ""))
            if not custom_model:
                raise HTTPException(400, "Please enter an OpenWebUI model ID")
            config["model"] = custom_model

    # Resolve API keys
    if req.engine_name == "Commercial APIs":
        provider_slot = config.get("provider", "openai").lower()
        raw_key = config.get("api_key", "")
        resolved = _resolve_api_key(provider_slot, raw_key)
        if not resolved:
            raise HTTPException(400, f"No API key for {config.get('provider')}. "
                                     "Paste your API key in the field.")
        config["api_key"] = resolved

    elif req.engine_name == "OpenWebUI":
        base_url = _text(config.get("base_url", "")).rstrip("/")
        if not base_url:
            raise HTTPException(400, "No OpenWebUI base URL. "
                                     "Enter your own OpenWebUI API base URL.")
        config["base_url"] = base_url
        raw_key = config.get("api_key", "")
        resolved = _resolve_api_key("openwebui", raw_key)
        if not resolved:
            raise HTTPException(400, "No API key for OpenWebUI. "
                                     "Paste your API key in the field.")
        config["api_key"] = resolved

    # Strip empty custom_prompt for API engines (use engine default)
    if req.engine_name in ("Commercial APIs", "OpenWebUI"):
        if not _text(config.get("custom_prompt", "")):
            config["custom_prompt"] = None

    # --- Engine pool logic ---
    pool_key = _make_pool_key(req.engine_name, config)

    async with pool_lock:
        # Release previous engine reference for this session
        previous_key = session.pool_key
        if previous_key and previous_key in engine_pool:
            prev_slot = engine_pool[previous_key]
            prev_slot.ref_count = max(0, prev_slot.ref_count - 1)
            if prev_slot.ref_count == 0:
                log.info(f"Immediate eviction (engine switch): '{prev_slot.engine_name}'")
                try:
                    prev_slot.engine.unload_model()
                except Exception as e:
                    log.warning(f"unload_model() failed for '{prev_slot.engine_name}': {e}")
                engine_pool.pop(previous_key, None)
        # The reference is given back, so forget the key right away. If the
        # load below fails, a stale key would let this session release the same
        # slot a second time and evict it from under another session.
        session.pool_key = None

        # Check if this exact engine+model is already loaded
        if pool_key in engine_pool:
            slot = engine_pool[pool_key]
            slot.ref_count += 1
            slot.last_used = time.time()
            session.pool_key = pool_key
            # Update compat shims
            loaded_engine = slot.engine
            loaded_engine_name = slot.engine_name
            loaded_config = slot.config
            log.info(f"Pool hit: reusing '{pool_key}' (ref_count={slot.ref_count})")
            return {"success": True, "load_time_s": 0.0,
                    "engine_name": req.engine_name, "reused": True}

        # Need new slot — evict if VRAM tight
        await _maybe_evict(req.engine_name)

    # Load model OUTSIDE pool_lock (blocking I/O)
    engine = _create_engine_instance(req.engine_name)
    if not engine:
        raise HTTPException(500, f"Cannot create engine instance for '{req.engine_name}'")

    start = time.time()
    success = await asyncio.to_thread(engine.load_model, config)
    elapsed = time.time() - start

    if not success:
        raise HTTPException(500, "Failed to load model")

    slot = EngineSlot(
        engine=engine,
        engine_name=req.engine_name,
        config=config,
        pool_key=pool_key,
        ref_count=1,
        last_used=time.time(),
    )

    async with pool_lock:
        # Double-check: another request may have loaded the same key concurrently
        if pool_key in engine_pool:
            # Discard our duplicate; a failing unload must not fail the request.
            try:
                engine.unload_model()
            except Exception as e:
                log.warning(f"unload_model() failed for '{req.engine_name}': {e}")
            slot = engine_pool[pool_key]
            slot.ref_count += 1
            slot.last_used = time.time()
        else:
            engine_pool[pool_key] = slot

        session.pool_key = pool_key
        # Update compat shims
        loaded_engine = slot.engine
        loaded_engine_name = slot.engine_name
        loaded_config = slot.config

    log.info(f"Pool miss: loaded '{pool_key}' in {elapsed:.1f}s (pool size={len(engine_pool)})")
    return {"success": True, "load_time_s": round(elapsed, 2),
            "engine_name": req.engine_name, "reused": False}
|
| 1280 |
+
|
| 1281 |
+
|
| 1282 |
+
@app.get("/api/keys")
async def list_keys():
    """Backwards-compatibility endpoint that always answers with an empty dict.

    API keys live in the browser's localStorage only; the server holds no key
    information to report.
    """
    no_keys: dict = {}
    return no_keys
|
| 1289 |
+
|
| 1290 |
+
|
| 1291 |
+
@app.post("/api/admin/evict-all")
async def admin_evict_all(request: Request):
    """Force-evict all engine slots from VRAM (localhost admin only).

    Unloads every pooled engine, clears each session's pool reference and
    resets the compat shims.

    Returns:
        ``{"evicted": [pool_key, ...]}``.

    Raises:
        HTTPException: 403 unless the request demonstrably comes from
            127.0.0.1 or ::1.
    """
    global loaded_engine, loaded_engine_name, loaded_config

    # Fail closed: when the server cannot tell who is calling
    # (request.client is None) the request must not be treated as local.
    client_host = request.client.host if request.client else None
    if client_host not in ("127.0.0.1", "::1"):
        raise HTTPException(status_code=403, detail="localhost only")

    async with pool_lock:
        evicted = []
        for key, slot in list(engine_pool.items()):
            try:
                slot.engine.unload_model()
            except Exception as e:
                # Drop the slot anyway so the pool ends up empty.
                log.warning(f"admin evict failed for '{key}': {e}")
            del engine_pool[key]
            evicted.append(key)
        for session in sessions.values():
            session.pool_key = None
        loaded_engine = None
        loaded_engine_name = ""
        loaded_config = {}
    log.info(f"Admin force-evict: cleared {len(evicted)} slot(s): {evicted}")
    return {"evicted": evicted}
|
| 1314 |
+
|
| 1315 |
+
|
| 1316 |
+
@app.post("/api/engine/unload")
async def unload_engine(request: Request):
    """Give back the calling session's engine reference.

    The slot is unloaded and removed from the pool once no session references
    it any more. The compat shims are reset in every case.
    """
    global loaded_engine, loaded_engine_name, loaded_config
    session = _get_session(request)

    async with pool_lock:
        held_key = session.pool_key
        if held_key and held_key in engine_pool:
            held = engine_pool[held_key]
            held.ref_count = max(0, held.ref_count - 1)
            if held.ref_count == 0:
                log.info(f"Immediate eviction (explicit unload): '{held.engine_name}'")
                try:
                    held.engine.unload_model()
                except Exception as e:
                    log.warning(f"unload_model() failed for '{held.engine_name}': {e}")
                engine_pool.pop(held_key, None)
        session.pool_key = None
        # Update compat shims
        loaded_engine, loaded_engine_name, loaded_config = None, "", {}

    return {"success": True}
|
| 1340 |
+
|
| 1341 |
+
|
| 1342 |
+
def _register_image(session: UserSession, pil_image: Image.Image, filename: str, save_path: Path) -> str:
    """Cache a decoded image under a fresh UUID and return that ID.

    The same record object is stored in the session cache and in
    ``global_image_cache``.
    """
    image_id = str(uuid.uuid4())
    record = dict(
        path=save_path,
        xml_path=None,
        pil_image=pil_image,
        width=pil_image.width,
        height=pil_image.height,
        filename=filename,
        lines=None,
    )
    for cache in (session.image_cache, global_image_cache):
        cache[image_id] = record
    return image_id
|
| 1357 |
+
|
| 1358 |
+
|
| 1359 |
+
def _get_image_data(session: UserSession, image_id: str) -> Optional[dict]:
    """Look up an image record, tolerating missing cookies in embedded Space contexts.

    The session cache is consulted first. On a miss the global cache is tried
    and, if it has the record, the record is copied into the session cache.
    Returns ``None`` when neither cache knows the ID.
    """
    own_cache = session.image_cache
    if image_id not in own_cache:
        shared = global_image_cache.get(image_id)
        if shared is not None:
            own_cache[image_id] = shared
        return shared
    return own_cache[image_id]
|
| 1367 |
+
|
| 1368 |
+
|
| 1369 |
+
@app.post("/api/image/upload")
async def upload_image(
    request: Request,
    file: UploadFile = File(...),
    max_dim: Optional[int] = Query(default=None, ge=100, description="Resize long edge to this many pixels (mobile upload only)"),
):
    """Accept an uploaded image or PDF and register it for the session.

    A PDF is rendered page by page (150 dpi) and each page is registered as
    its own image. A regular image is EXIF-rotated, converted to RGB and,
    when ``max_dim`` is given and exceeded, downscaled in place.

    Returns:
        For images: ``image_id``, ``width``, ``height``, ``filename``.
        For PDFs: ``is_pdf``, ``filename``, ``num_pages`` and ``pages`` (one
        entry per page with ``image_id``, ``filename``, ``width``,
        ``height``, ``page``).

    Raises:
        HTTPException: 400 for oversized files, unsupported types, missing
            PDF support, or files that cannot be decoded.
    """
    session = _get_session(request)
    filename = file.filename or "upload"
    content_type = file.content_type or ""
    is_pdf = filename.lower().endswith(".pdf") or content_type.startswith("application/pdf")
    image_exts = {
        ".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp", ".gif", ".webp"
    }
    is_image = content_type.startswith("image/") or Path(filename).suffix.lower() in image_exts

    # Read at most one byte past the limit: enough to detect an oversized
    # upload without first pulling the whole file into memory.
    max_bytes = 200 * 1024 * 1024
    content = await file.read(max_bytes + 1)
    if len(content) > max_bytes:
        raise HTTPException(400, "File too large (max 200MB)")

    # ── PDF: render each page as a separate image ──────────────────────────
    if is_pdf:
        if not PDF_AVAILABLE:
            raise HTTPException(400, "PDF support requires PyMuPDF. Install with: pip install pymupdf")

        def _render_pdf(data: bytes, stem: str, sess: UserSession) -> list:
            """Render every page to a PNG in UPLOAD_DIR and register it."""
            mat = _fitz.Matrix(150 / 72, 150 / 72)
            doc = _fitz.open(stream=data, filetype="pdf")
            results = []
            try:
                for i, page in enumerate(doc):
                    pix = page.get_pixmap(matrix=mat, colorspace=_fitz.csRGB)
                    pil_page = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                    page_filename = f"{stem}_page{i+1:03d}.png"
                    save_path = UPLOAD_DIR / f"{uuid.uuid4()}.png"
                    pil_page.save(save_path)
                    pid = _register_image(sess, pil_page, page_filename, save_path)
                    results.append({
                        "image_id": pid,
                        "filename": page_filename,
                        "width": pil_page.width,
                        "height": pil_page.height,
                        "page": i + 1,
                    })
            finally:
                # Release the document even when a page fails to render.
                doc.close()
            return results

        try:
            # Rendering is CPU-bound; keep it off the event loop.
            pages_out = await asyncio.to_thread(_render_pdf, content, Path(filename).stem, session)
        except Exception as e:
            raise HTTPException(400, f"Failed to render PDF: {e}")
        return {
            "is_pdf": True,
            "filename": filename,
            "num_pages": len(pages_out),
            "pages": pages_out,
        }

    # ── Regular image ───────────────────────────────────────────────────────
    if not is_image:
        raise HTTPException(400, "File must be an image or PDF")

    ext = Path(filename).suffix or ".jpg"
    save_path = UPLOAD_DIR / f"{uuid.uuid4()}{ext}"
    save_path.write_bytes(content)

    try:
        # convert() yields an independent in-memory copy, so the file handle
        # can be closed before the file is possibly overwritten below.
        with Image.open(save_path) as raw:
            pil_image = ImageOps.exif_transpose(raw).convert("RGB")
        if max_dim and max(pil_image.width, pil_image.height) > max_dim:
            pil_image.thumbnail((max_dim, max_dim), Image.LANCZOS)
            pil_image.save(save_path)
    except Exception as e:
        save_path.unlink(missing_ok=True)
        raise HTTPException(400, f"Invalid image: {e}")

    image_id = _register_image(session, pil_image, filename, save_path)
    return {
        "image_id": image_id,
        "width": pil_image.width,
        "height": pil_image.height,
        "filename": filename,
    }
|
| 1461 |
+
|
| 1462 |
+
|
| 1463 |
+
@app.post("/api/image/{image_id}/xml")
async def upload_xml(request: Request, image_id: str, file: UploadFile = File(...)):
    """Attach a PAGE XML file to an already-uploaded image.

    The XML is written to ``UPLOAD_DIR/<image_id>.xml`` and its path recorded
    on the image record.

    Raises:
        HTTPException: 404 if the image is unknown, 400 if the XML exceeds
            10MB.
    """
    session = _get_session(request)
    img_data = _get_image_data(session, image_id)
    if img_data is None:
        raise HTTPException(404, "Image not found — upload image first")

    # Read one byte past the limit at most, so an oversized upload is detected
    # without loading it completely into memory.
    max_bytes = 10 * 1024 * 1024
    content = await file.read(max_bytes + 1)
    if len(content) > max_bytes:
        raise HTTPException(400, "XML too large (max 10MB)")

    xml_path = UPLOAD_DIR / f"{image_id}.xml"
    xml_path.write_bytes(content)
    img_data["xml_path"] = xml_path
    return {"success": True, "filename": file.filename}
|
| 1477 |
+
|
| 1478 |
+
|
| 1479 |
+
@app.get("/api/image/{image_id}")
async def get_image(request: Request, image_id: str):
    """Serve the stored file of a previously uploaded image (404 if unknown)."""
    record = _get_image_data(_get_session(request), image_id)
    if record is None:
        raise HTTPException(404, "Image not found")
    stored_path = str(record["path"])
    return FileResponse(stored_path)
|
| 1486 |
+
|
| 1487 |
+
|
| 1488 |
+
@app.get("/api/image/{image_id}/info")
async def image_info(request: Request, image_id: str):
    """Return basic metadata for an uploaded image (404 if unknown)."""
    record = _get_image_data(_get_session(request), image_id)
    if record is None:
        raise HTTPException(404, "Image not found")
    xml_attached = record["xml_path"] is not None
    return dict(
        image_id=image_id,
        filename=record["filename"],
        width=record["width"],
        height=record["height"],
        has_xml=xml_attached,
    )
|
| 1501 |
+
|
| 1502 |
+
|
| 1503 |
+
async def _run_segmentation(img_data: dict, method: str, device: str = "cpu",
                            max_columns: int = 6,
                            split_width_fraction: float = 0.40,
                            text_direction: str = "horizontal-lr") -> dict:
    """
    Shared segmentation helper. Runs the appropriate segmenter, stores
    results in img_data, and returns a serialisable dict ready for SSE or JSON.

    An attached PAGE XML takes precedence over ``method``. Otherwise
    ``method`` selects "kraken-blla", "kraken" or (any other value) the HPP
    line segmenter. In the hosted demo (``DEMO_MODE == "hf_space"``)
    "kraken-blla" is downgraded to "kraken" on CPU, and Kraken failures or
    empty results fall back to the built-in HPP segmenter.

    Side effects on ``img_data``: sets ``lines``, ``line_regions`` (one region
    index per line, used by the transcription loop for the column view),
    ``seg_source`` and ``seg_regions``.

    Returns:
        ``{"num_lines", "bboxes", "source"}`` plus ``"regions"`` when region
        information is available.
    """
    if DEMO_MODE == "hf_space" and method == "kraken-blla":
        method = "kraken"
        device = "cpu"
    pil_image = img_data["pil_image"]
    xml_path = img_data.get("xml_path")

    if DEMO_MODE == "hf_space" and xml_path is None and method == "hpp":
        return await asyncio.to_thread(_run_demo_hpp_segmentation, img_data)

    _import_segmenters()

    regions: list = []
    lines: list = []

    xml_region_data: list = []  # TextRegion bboxes from PAGE XML (for visualization)
    if xml_path is not None:
        from inference_page import PageXMLSegmenter as _PXSeg
        segmenter = _PXSeg(str(xml_path))
        lines = await asyncio.to_thread(segmenter.segment_lines, pil_image)
        source = "pagexml"
        xml_region_data = getattr(segmenter, 'region_data', []) or []

    elif method == "kraken-blla":
        segmenter = KrakenLineSegmenter(device=device)
        regions, lines = await asyncio.to_thread(
            segmenter.segment_with_regions, pil_image,
            device=device,
            max_columns=max_columns,
            split_width_fraction=split_width_fraction,
            text_direction=text_direction,
        )
        source = "kraken-blla"

    elif method == "kraken":
        try:
            segmenter = KrakenLineSegmenter()
            # Use column-aware segmentation so multi-column pages read correctly
            regions, lines = await asyncio.to_thread(
                segmenter.segment_classical_with_regions, pil_image,
                max_columns=max_columns,
            )
            source = "kraken"
        except Exception as exc:
            if DEMO_MODE == "hf_space":
                log.warning("Kraken segmentation failed in HF Space; falling back to HPP: %s", exc)
                return await asyncio.to_thread(_run_demo_hpp_segmentation, img_data, "hpp-fallback")
            raise

    else:  # hpp
        segmenter = LineSegmenter()
        lines = await asyncio.to_thread(segmenter.segment_lines, pil_image)
        source = "hpp"

    if DEMO_MODE == "hf_space" and method == "kraken" and not lines:
        log.warning("Kraken returned no lines in HF Space; falling back to HPP")
        return await asyncio.to_thread(_run_demo_hpp_segmentation, img_data, "hpp-fallback")

    # Build per-line region index (used by transcription loop for column view).
    # presumably lines are ordered region by region — TODO confirm against the segmenters.
    line_regions: list[int] = []
    if regions:
        for ri, r in enumerate(regions):
            line_regions.extend([ri] * len(r.line_ids))
    else:
        line_regions = [0] * len(lines)

    img_data["lines"] = lines
    img_data["line_regions"] = line_regions
    img_data["seg_source"] = source
    # PAGE XML provides region bboxes directly; Kraken/blla provide SegRegion objects
    if xml_region_data:
        img_data["seg_regions"] = xml_region_data
    elif regions:
        img_data["seg_regions"] = [
            {"id": r.id, "bbox": list(r.bbox), "num_lines": len(r.line_ids)}
            for r in regions
        ]
    else:
        img_data["seg_regions"] = []

    result: dict = {
        "num_lines": len(lines),
        "bboxes": [list(l.bbox) for l in lines],
        "source": source,
    }
    if img_data["seg_regions"]:
        result["regions"] = img_data["seg_regions"]
    return result
|
| 1604 |
+
|
| 1605 |
+
|
| 1606 |
+
def _run_demo_hpp_segmentation(img_data: dict, source: str = "hpp") -> dict:
|
| 1607 |
+
"""Small dependency-light line segmenter for the hosted CPU demo fallback."""
|
| 1608 |
+
pil_image = img_data["pil_image"]
|
| 1609 |
+
gray = np.array(pil_image.convert("L"))
|
| 1610 |
+
if gray.size == 0:
|
| 1611 |
+
lines = []
|
| 1612 |
+
else:
|
| 1613 |
+
threshold = min(220, max(90, float(np.percentile(gray, 42))))
|
| 1614 |
+
ink = gray < threshold
|
| 1615 |
+
row_density = ink.mean(axis=1)
|
| 1616 |
+
kernel = np.ones(9, dtype=np.float32) / 9.0
|
| 1617 |
+
smooth = np.convolve(row_density, kernel, mode="same")
|
| 1618 |
+
active_threshold = max(0.01, float(smooth.max()) * 0.13)
|
| 1619 |
+
min_height = max(10, int(pil_image.height * 0.008))
|
| 1620 |
+
|
| 1621 |
+
bands = []
|
| 1622 |
+
start = None
|
| 1623 |
+
for y, value in enumerate(smooth):
|
| 1624 |
+
if value > active_threshold and start is None:
|
| 1625 |
+
start = y
|
| 1626 |
+
elif (value <= active_threshold or y == len(smooth) - 1) and start is not None:
|
| 1627 |
+
end = y if y == len(smooth) - 1 else y - 1
|
| 1628 |
+
if end - start + 1 >= min_height:
|
| 1629 |
+
bands.append((start, end))
|
| 1630 |
+
start = None
|
| 1631 |
+
|
| 1632 |
+
lines = []
|
| 1633 |
+
for y1, y2 in bands[:100]:
|
| 1634 |
+
pad_y = max(3, int((y2 - y1 + 1) * 0.25))
|
| 1635 |
+
top = max(0, y1 - pad_y)
|
| 1636 |
+
bottom = min(pil_image.height, y2 + pad_y + 1)
|
| 1637 |
+
band_ink = ink[top:bottom, :]
|
| 1638 |
+
cols = np.where(band_ink.any(axis=0))[0]
|
| 1639 |
+
if cols.size:
|
| 1640 |
+
left = max(0, int(cols[0]) - 8)
|
| 1641 |
+
right = min(pil_image.width, int(cols[-1]) + 9)
|
| 1642 |
+
else:
|
| 1643 |
+
left = 0
|
| 1644 |
+
right = pil_image.width
|
| 1645 |
+
bbox = (left, top, right, bottom)
|
| 1646 |
+
lines.append(SimpleNamespace(
|
| 1647 |
+
image=pil_image.crop(bbox),
|
| 1648 |
+
bbox=bbox,
|
| 1649 |
+
coords=None,
|
| 1650 |
+
))
|
| 1651 |
+
|
| 1652 |
+
img_data["lines"] = lines
|
| 1653 |
+
img_data["line_regions"] = [0] * len(lines)
|
| 1654 |
+
img_data["seg_source"] = source
|
| 1655 |
+
img_data["seg_regions"] = []
|
| 1656 |
+
return {
|
| 1657 |
+
"num_lines": len(lines),
|
| 1658 |
+
"bboxes": [list(line.bbox) for line in lines],
|
| 1659 |
+
"source": source,
|
| 1660 |
+
}
|
| 1661 |
+
|
| 1662 |
+
|
| 1663 |
+
@app.delete("/api/image/{image_id}/region/{region_index}")
async def delete_region(request: Request, image_id: str, region_index: int):
    """
    Drop one detected region, together with its lines, from the cached
    segmentation of an image.

    The response has the same shape as /segment so the client can redraw the
    canvas. Raises 404 for an unknown image and 400 when there is no
    segmentation or the index is out of range.
    """
    img_data = _get_image_data(_get_session(request), image_id)
    if img_data is None:
        raise HTTPException(404, "Image not found")

    seg_regions = img_data.get("seg_regions") or []
    if not seg_regions:
        raise HTTPException(400, "No segmentation data — run Segment first")
    if not 0 <= region_index < len(seg_regions):
        raise HTTPException(400, f"Region index out of range (0–{len(seg_regions)-1})")

    lines = img_data.get("lines") or []
    line_regions = img_data.get("line_regions") or ([0] * len(lines))

    # Survivors keep their order; regions after the removed one shift down by one.
    survivors = [(ln, idx) for ln, idx in zip(lines, line_regions) if idx != region_index]
    new_lines = [ln for ln, _ in survivors]
    new_line_regions = [idx - 1 if idx > region_index else idx for _, idx in survivors]
    new_regions = [region for pos, region in enumerate(seg_regions) if pos != region_index]

    img_data["lines"] = new_lines
    img_data["line_regions"] = new_line_regions
    img_data["seg_regions"] = new_regions

    result: dict = {
        "num_lines": len(new_lines),
        "bboxes": [list(entry.bbox) for entry in new_lines],
        "source": img_data.get("seg_source", "modified"),
    }
    if new_regions:
        result["regions"] = new_regions
    return result
|
| 1707 |
+
|
| 1708 |
+
|
| 1709 |
+
@app.get("/api/image/{image_id}/segment")
async def segment_image(
    request: Request,
    image_id: str,
    method: str = "kraken",
    device: str = "cpu",
    max_columns: int = 6,
    split_width_fraction: float = 0.40,
    text_direction: str = "horizontal-lr",
):
    """
    Segment an uploaded image without transcribing it and return the line
    bounding boxes as JSON, e.g. to preview the layout before transcription.

    Raises 404 for an unknown image and 500 when segmentation fails.
    """
    img_data = _get_image_data(_get_session(request), image_id)
    if img_data is None:
        raise HTTPException(404, "Image not found — upload first")

    try:
        outcome = await _run_segmentation(
            img_data,
            method,
            device,
            max_columns=max_columns,
            split_width_fraction=split_width_fraction,
            text_direction=text_direction,
        )
    except Exception as e:
        raise HTTPException(500, f"Segmentation failed: {e}")
    return outcome
|
| 1733 |
+
|
| 1734 |
+
|
| 1735 |
+
@app.post("/api/transcribe")
async def transcribe(request: Request, req: TranscribeRequest):
    """
    Transcribe a cached image and stream the outcome as Server-Sent Events.

    The engine is taken from the session's pool slot; when the session has no
    usable slot the module-level ``loaded_engine`` / ``loaded_config`` shims are
    used instead.

    Events emitted by the stream: optional ``status`` messages, a
    ``segmentation`` event, one ``progress`` event per transcribed line, and
    finally ``complete``, ``cancelled`` or ``error``.

    Raises:
        HTTPException(400): no engine is loaded.
        HTTPException(404): ``req.image_id`` is not cached for this session.
    """
    session = _get_session(request)

    # Resolve engine from session's pool slot
    if not session.pool_key or session.pool_key not in engine_pool:
        # Fallback: check compat shims (e.g. auto-loaded engine, no session yet)
        if not loaded_engine or not loaded_engine.is_model_loaded():
            raise HTTPException(400, "No engine loaded")
    slot = engine_pool.get(session.pool_key) if session.pool_key else None
    # Build effective engine/config references
    eff_engine = slot.engine if slot else loaded_engine
    _base_config = slot.config if slot else loaded_config
    # Merge live form overrides into a copy of the stored config so changes to
    # runtime-only fields (custom_prompt, thinking_mode, temperature, …) take
    # effect without requiring a model reload.  Never overwrite security-sensitive
    # keys that were set during load (api_key, provider, model, model_path, …).
    _RELOAD_ONLY_KEYS = {"api_key", "provider", "model", "model_path", "model_source",
                         "base_model", "adapter", "model_name", "preset_id", "lang",
                         "use_gpu", "venv_path"}
    if req.engine_config_overrides:
        # ``or {}`` keeps the merge working when no base config is stored.
        eff_config = dict(_base_config or {})
        for k, v in req.engine_config_overrides.items():
            if k not in _RELOAD_ONLY_KEYS:
                eff_config[k] = v
    else:
        eff_config = _base_config
    eff_engine_name = slot.engine_name if slot else loaded_engine_name

    if not eff_engine or not eff_engine.is_model_loaded():
        raise HTTPException(400, "No engine loaded")

    img_data = _get_image_data(session, req.image_id)
    if img_data is None:
        raise HTTPException(404, "Image not found — upload first")

    pil_image = img_data["pil_image"]

    # Per-request cancel event (replaces global cancel_event)
    request_id = str(uuid.uuid4())
    cancel_evt = asyncio.Event()
    session.cancel_events[request_id] = cancel_evt

    async def _transcribe_one(img_array):
        """Run the engine on one line image in a worker thread."""
        if slot:
            # Use slot lock to serialize access to this engine instance
            async with slot.lock:
                slot.last_used = time.time()
                return await asyncio.to_thread(
                    eff_engine.transcribe_line, img_array, eff_config
                )
        return await asyncio.to_thread(
            eff_engine.transcribe_line, img_array, eff_config
        )

    async def event_stream():
        try:
            # Inside the try so an import failure is reported as an SSE
            # "error" event and the cancel event is still cleaned up.
            _import_segmenters()

            # --- Segmentation ---
            xml_path = img_data.get("xml_path") if req.use_pagexml else None

            if not eff_engine.requires_line_segmentation() and not xml_path:
                # Page-level engine with no PAGE XML — send whole page as single line
                from inference_page import LineSegment
                lines = [LineSegment(
                    image=pil_image,
                    bbox=(0, 0, pil_image.width, pil_image.height),
                    coords=None,
                )]
                img_data["lines"] = lines
                img_data["line_regions"] = [0]
                img_data["seg_source"] = "page"
                img_data["seg_regions"] = []
                yield _sse("segmentation", {
                    "num_lines": 1,
                    "bboxes": [[0, 0, pil_image.width, pil_image.height]],
                    "source": "page",
                })
            else:
                # Reuse cached segmentation if method matches (e.g. user clicked Segment first)
                cached_lines = img_data.get("lines")
                cached_source = img_data.get("seg_source")
                # xml_path is only set when req.use_pagexml is true (see above).
                desired_source = "pagexml" if xml_path else req.seg_method

                if cached_lines and cached_source == desired_source:
                    lines = cached_lines
                    yield _sse("status", {"message": "Using cached segmentation..."})
                    seg_event: dict = {
                        "num_lines": len(lines),
                        "bboxes": [list(l.bbox) for l in lines],
                        "source": cached_source,
                    }
                    if img_data.get("seg_regions"):
                        seg_event["regions"] = img_data["seg_regions"]
                    yield _sse("segmentation", seg_event)
                else:
                    if xml_path is not None:
                        seg_method = "pagexml"
                        status_msg = "Reading line layout from PAGE XML..."
                    else:
                        seg_method = req.seg_method
                        status_msg = f"Segmenting with {req.seg_method}..."
                    yield _sse("status", {"message": status_msg})
                    seg_result = await _run_segmentation(img_data, seg_method,
                                                         req.seg_device, req.max_columns,
                                                         req.split_width_fraction,
                                                         req.text_direction)
                    # The line list is read back from the image cache entry.
                    lines = img_data["lines"]
                    yield _sse("segmentation", seg_result)

            # --- Transcription ---
            results = []
            token_usage: Dict[str, Any] = {}
            start_time = time.time()
            line_regions = img_data.get("line_regions") or ([0] * len(lines))

            for i, line in enumerate(lines):
                # Check for cancellation before each line
                if cancel_evt.is_set():
                    yield _sse("cancelled", {})
                    return

                line_img = line.image if line.image is not None else pil_image.crop(line.bbox)
                img_array = np.array(line_img.convert("RGB"))

                result = await _transcribe_one(img_array)

                text = str(result.text) if hasattr(result, "text") else str(result)
                confidence = None
                if hasattr(result, "confidence") and result.confidence is not None:
                    confidence = float(result.confidence)
                    # Values above 1 are treated as percentages and rescaled to 0–1.
                    if confidence > 1:
                        confidence = confidence / 100.0
                # Accumulate token usage and extract thinking text from API engines (e.g. Gemini)
                thinking_text = None
                if hasattr(result, "metadata") and isinstance(result.metadata, dict):
                    tu = result.metadata.get("token_usage")
                    if tu:
                        for k, v in tu.items():
                            if v is not None:
                                token_usage[k] = token_usage.get(k, 0) + v
                    thinking_text = result.metadata.get("thinking_text")

                line_data = {
                    "index": i,
                    "text": text,
                    "confidence": confidence,
                    "bbox": list(line.bbox),
                    "region": line_regions[i] if i < len(line_regions) else 0,
                }
                if thinking_text:
                    line_data["thinking_text"] = thinking_text
                results.append(line_data)
                progress_data: Dict[str, Any] = {
                    "current": i + 1,
                    "total": len(lines),
                    "line": line_data,
                }
                if token_usage:
                    # Copy so later accumulation does not alter this event's payload.
                    progress_data["token_usage"] = dict(token_usage)
                yield _sse("progress", progress_data)

            # Check for cancellation after each line's progress event
            if cancel_evt.is_set():
                yield _sse("cancelled", {})
                return

            # Store completed results in session image_cache for export
            img_data["results"] = results

            elapsed = time.time() - start_time
            complete_data: Dict[str, Any] = {
                "lines": results,
                "total_time_s": round(elapsed, 2),
                "engine": eff_engine_name,
            }
            if token_usage:
                complete_data["token_usage"] = token_usage
            yield _sse("complete", complete_data)

        except Exception as e:
            log.exception("Transcription error")
            yield _sse("error", {"message": str(e)})
        finally:
            # Clean up this request's cancel event
            session.cancel_events.pop(request_id, None)

    return StreamingResponse(
        event_stream(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "X-Accel-Buffering": "no",  # Disable nginx buffering if behind proxy
        },
    )
|
| 1931 |
+
|
| 1932 |
+
|
| 1933 |
+
@app.post("/api/transcribe/cancel")
async def cancel_transcription(request: Request):
    """Set every cancel event registered on the caller's session.

    Running transcription streams check their event between lines and stop
    when they find it set.
    """
    pending = _get_session(request).cancel_events
    for cancel_flag in pending.values():
        cancel_flag.set()
    return {"success": True}
|
| 1940 |
+
|
| 1941 |
+
|
| 1942 |
+
@app.post("/api/image/{image_id}/export-xml")
async def export_xml(request: Request, image_id: str):
    """Export transcription results for image_id as PAGE XML.

    The download name is derived from the cached image's filename stem. It is
    sent twice in ``Content-Disposition``: as an ASCII-only quoted ``filename``
    fallback and as a percent-encoded UTF-8 ``filename*`` parameter, so names
    with non-Latin characters, quotes or control characters cannot break the
    header.

    Raises:
        HTTPException: propagated from ``_build_xml_bytes`` when the image or
            its results are missing.
    """
    from urllib.parse import quote

    session = _get_session(request)
    pretty, stem = _build_xml_bytes(session, image_id)

    download_name = f"{stem}.xml"
    # Printable ASCII only, and no characters that would end the quoted string.
    ascii_fallback = "".join(
        ch if 32 <= ord(ch) < 127 and ch not in '"\\' else "_"
        for ch in download_name
    )
    disposition = (
        f'attachment; filename="{ascii_fallback}"; '
        f"filename*=UTF-8''{quote(download_name, safe='')}"
    )
    return Response(
        content=pretty,
        media_type="application/xml",
        headers={"Content-Disposition": disposition},
    )
|
| 1952 |
+
|
| 1953 |
+
|
| 1954 |
+
def _build_xml_bytes(session: UserSession, image_id: str) -> tuple[bytes, str]:
    """Serialize an image's cached transcription results to pretty-printed PAGE XML.

    All lines are written into a single ``TextRegion`` (``region_1``) whose
    coordinates are the bounding box enclosing every line bbox.

    Returns:
        ``(xml_bytes, stem)`` where ``stem`` is the image filename without extension.

    Raises:
        HTTPException(404): the image is not cached for this session.
        HTTPException(400): the image has no transcription results.
    """
    import xml.etree.ElementTree as ET
    from xml.dom import minidom
    from page_xml_exporter import PageXMLExporter

    img_data = _get_image_data(session, image_id)
    if img_data is None:
        raise HTTPException(404, f"Image {image_id} not found")
    results = img_data.get("results")
    if not results:
        raise HTTPException(400, f"No results for {image_id}")

    filename = img_data.get("filename", img_data["path"].name)
    width, height = img_data["width"], img_data["height"]

    class _SegProxy:
        """Attribute-style view of one result dict, as consumed by the exporter."""
        __slots__ = ("bbox", "coords", "text", "confidence")

        def __init__(self, r):
            raw_bbox = r.get("bbox")
            # A missing bbox falls back to the full page.
            self.bbox = (0, 0, width, height) if not raw_bbox else tuple(raw_bbox)
            self.coords = None
            self.text = r.get("text", "")
            self.confidence = r.get("confidence")

    segments = list(map(_SegProxy, results))
    exporter = PageXMLExporter(str(filename), width, height)
    root, page = exporter._make_root("Polyscriptor Web UI", None)

    # Reading order: one ordered group that references the single region.
    reading_order = ET.SubElement(page, 'ReadingOrder')
    ordered_group = ET.SubElement(
        reading_order, 'OrderedGroup',
        {'id': 'ro_1', 'caption': 'Regions reading order'},
    )
    ET.SubElement(ordered_group, 'RegionRefIndexed', {'index': '0', 'regionRef': 'region_1'})

    text_region = ET.SubElement(
        page, 'TextRegion',
        {'id': 'region_1', 'type': 'paragraph', 'custom': 'readingOrder {index:0;}'},
    )
    if segments:
        x1 = min(seg.bbox[0] for seg in segments)
        y1 = min(seg.bbox[1] for seg in segments)
        x2 = max(seg.bbox[2] for seg in segments)
        y2 = max(seg.bbox[3] for seg in segments)
        region_points = f'{x1},{y1} {x2},{y1} {x2},{y2} {x1},{y2}'
        ET.SubElement(text_region, 'Coords').set('points', region_points)
    for line_no, seg in enumerate(segments, start=1):
        # Line ids are 1-based; the trailing index argument stays 0-based.
        exporter._add_text_line(text_region, f'line_{line_no}', seg, seg.text, line_no - 1)

    raw_xml = ET.tostring(root, encoding='utf-8', method='xml')
    pretty = minidom.parseString(raw_xml).toprettyxml(indent='  ', encoding='utf-8')
    return pretty, Path(filename).stem
|
| 2003 |
+
|
| 2004 |
+
|
| 2005 |
+
def _build_thinking_bytes(session: UserSession, image_id: str) -> tuple[bytes, str]:
    """Collect the per-line "thinking" text stored with an image's results.

    When the image has more than one result line, each block is prefixed with
    a ``=== Line N ===`` header (1-based); blocks are separated by a blank line.

    Returns:
        ``(utf8_bytes, stem)`` where ``stem`` is the image filename without extension.

    Raises:
        HTTPException(404): the image is not cached, or no line has thinking text.
        HTTPException(400): the image has no transcription results.
    """
    img_data = _get_image_data(session, image_id)
    if img_data is None:
        raise HTTPException(404, f"Image {image_id} not found")
    results = img_data.get("results")
    if not results:
        raise HTTPException(400, f"No results for {image_id}")
    filename = img_data.get("filename", img_data["path"].name)
    stem = Path(filename).stem

    labelled = len(results) > 1
    blocks = []
    for line_no, entry in enumerate(results, start=1):
        thought = entry.get("thinking_text", "")
        if not thought:
            continue
        blocks.append(f"=== Line {line_no} ===\n{thought}" if labelled else thought)

    if not blocks:
        raise HTTPException(404, f"No thinking text for {image_id}")
    joined = "\n\n".join(blocks)
    return joined.encode("utf-8"), stem
|
| 2026 |
+
|
| 2027 |
+
|
| 2028 |
+
def _build_txt_bytes(session: UserSession, image_id: str) -> tuple[bytes, str]:
    """Join an image's transcribed lines into newline-separated UTF-8 text.

    Returns:
        ``(utf8_bytes, stem)`` where ``stem`` is the image filename without extension.

    Raises:
        HTTPException(404): the image is not cached for this session.
        HTTPException(400): the image has no transcription results.
    """
    img_data = _get_image_data(session, image_id)
    if img_data is None:
        raise HTTPException(404, f"Image {image_id} not found")
    results = img_data.get("results")
    if not results:
        raise HTTPException(400, f"No results for {image_id}")
    filename = img_data.get("filename", img_data["path"].name)

    line_texts = [entry.get("text", "") for entry in results]
    payload = "\n".join(line_texts).encode("utf-8")
    return payload, Path(filename).stem
|
| 2039 |
+
|
| 2040 |
+
|
| 2041 |
+
class BatchXMLRequest(BaseModel):
    """Request body shared by the batch export endpoints (PAGE XML, plain text, thinking text)."""
    # IDs of the cached images to include in the exported archive.
    image_ids: list[str]
|
| 2043 |
+
|
| 2044 |
+
|
| 2045 |
+
@app.post("/api/batch/export-thinking")
async def batch_export_thinking(request: Request, req: BatchXMLRequest):
    """Return a ZIP archive containing one thinking-text file per image (skips pages without thinking).

    Each member is named ``<stem>_thinking.txt``. If several images share a
    filename stem, later members get a numeric suffix (``<stem>_2_thinking.txt``, …)
    so that no entry overwrites another on extraction. Repeated ids in the
    request are exported once.
    """
    import io
    import zipfile

    session = _get_session(request)
    used_names: set[str] = set()
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, 'w', zipfile.ZIP_DEFLATED) as zf:
        # dict.fromkeys drops repeated ids while keeping the requested order.
        for image_id in dict.fromkeys(req.image_ids):
            try:
                thinking_bytes, stem = _build_thinking_bytes(session, image_id)
            except HTTPException:
                continue  # skip pages without thinking
            arcname = f"{stem}_thinking.txt"
            suffix = 2
            while arcname in used_names:
                arcname = f"{stem}_{suffix}_thinking.txt"
                suffix += 1
            used_names.add(arcname)
            zf.writestr(arcname, thinking_bytes)
    return Response(
        content=buf.getvalue(),
        media_type="application/zip",
        headers={"Content-Disposition": 'attachment; filename="batch_thinking.zip"'},
    )
|
| 2064 |
+
|
| 2065 |
+
|
| 2066 |
+
@app.post("/api/batch/export-txt")
async def batch_export_txt(request: Request, req: BatchXMLRequest):
    """Return a ZIP archive containing one plain-text file per image.

    Images without results are skipped. Each member is named ``<stem>.txt``;
    if several images share a filename stem, later members get a numeric
    suffix (``<stem>_2.txt``, …) so that no entry overwrites another on
    extraction. Repeated ids in the request are exported once.
    """
    import io
    import zipfile

    session = _get_session(request)
    used_names: set[str] = set()
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, 'w', zipfile.ZIP_DEFLATED) as zf:
        # dict.fromkeys drops repeated ids while keeping the requested order.
        for image_id in dict.fromkeys(req.image_ids):
            try:
                txt_bytes, stem = _build_txt_bytes(session, image_id)
            except HTTPException:
                continue  # skip images without results
            arcname = f"{stem}.txt"
            suffix = 2
            while arcname in used_names:
                arcname = f"{stem}_{suffix}.txt"
                suffix += 1
            used_names.add(arcname)
            zf.writestr(arcname, txt_bytes)
    return Response(
        content=buf.getvalue(),
        media_type="application/zip",
        headers={"Content-Disposition": 'attachment; filename="batch_export_txt.zip"'},
    )
|
| 2085 |
+
|
| 2086 |
+
|
| 2087 |
+
@app.post("/api/batch/export-xml")
async def batch_export_xml(request: Request, req: BatchXMLRequest):
    """Return a ZIP archive containing one PAGE XML file per image.

    Images without results are skipped. Each member is named ``<stem>.xml``;
    if several images share a filename stem, later members get a numeric
    suffix (``<stem>_2.xml``, …) so that no entry overwrites another on
    extraction. Repeated ids in the request are exported once.
    """
    import io
    import zipfile

    session = _get_session(request)
    used_names: set[str] = set()
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, 'w', zipfile.ZIP_DEFLATED) as zf:
        # dict.fromkeys drops repeated ids while keeping the requested order.
        for image_id in dict.fromkeys(req.image_ids):
            try:
                xml_bytes, stem = _build_xml_bytes(session, image_id)
            except HTTPException:
                continue  # skip images without results
            arcname = f"{stem}.xml"
            suffix = 2
            while arcname in used_names:
                arcname = f"{stem}_{suffix}.xml"
                suffix += 1
            used_names.add(arcname)
            zf.writestr(arcname, xml_bytes)
    return Response(
        content=buf.getvalue(),
        media_type="application/zip",
        headers={"Content-Disposition": 'attachment; filename="batch_export.zip"'},
    )
|
| 2106 |
+
|
| 2107 |
+
|
| 2108 |
+
@app.get("/api/session")
async def session_info(request: Request):
    """Summarize the caller's session for debugging.

    Only the first eight characters of the session id are included, followed
    by ``...``.
    """
    session = _get_session(request)
    summary = {}
    summary["session_id"] = session.session_id[:8] + "..."
    summary["images"] = len(session.image_cache)
    summary["active_transcriptions"] = len(session.cancel_events)
    summary["pool_key"] = session.pool_key
    summary["created_at"] = session.created_at
    summary["last_active"] = session.last_active
    summary["total_sessions"] = len(sessions)
    return summary
|
| 2121 |
+
|
| 2122 |
+
|
| 2123 |
+
@app.get("/api/engine/pool")
async def pool_status():
    """Describe every engine pool slot (admin/debug endpoint).

    ``age_s`` is the number of seconds since the slot was last used, rounded
    to a whole number.
    """
    slots = [
        {
            "pool_key": key,
            "engine_name": slot.engine_name,
            "ref_count": slot.ref_count,
            "loaded": slot.engine.is_model_loaded(),
            "last_used": slot.last_used,
            "age_s": round(time.time() - slot.last_used, 0),
        }
        for key, slot in engine_pool.items()
    ]
    return {
        "pool_size": len(engine_pool),
        "slots": slots,
        "total_sessions": len(sessions),
    }
|
| 2141 |
+
|
| 2142 |
+
|
| 2143 |
+
@app.get("/api/kraken/presets")
async def kraken_presets():
    """List the Kraken model presets defined in ``engines.kraken_engine.KRAKEN_MODELS``.

    Returns an empty preset list when the Kraken engine module cannot be
    imported. A preset's label falls back to its id when it has no description.
    """
    try:
        from engines.kraken_engine import KRAKEN_MODELS
    except ImportError:
        return {"presets": []}

    def _describe(model_id, info):
        return {
            "id": model_id,
            "label": info.get("description", model_id),
            "language": info.get("language", ""),
            "source": info.get("source", ""),
        }

    return {"presets": [_describe(mid, meta) for mid, meta in KRAKEN_MODELS.items()]}
|
| 2159 |
+
|
| 2160 |
+
|
| 2161 |
+
@app.post("/api/models/upload")
async def upload_model(file: UploadFile = File(...)):
    """Upload a Kraken .mlmodel file to the server's models/kraken_uploads/ directory.

    The upload is read in 1 MiB chunks and rejected as soon as it exceeds
    500 MB, so an oversized file is never held in memory in full. The stored
    name keeps only alphanumerics and ``._- `` from the client's base filename.
    An existing file with the same sanitized name is overwritten.

    Returns:
        dict with the stored path relative to ``PROJECT_ROOT``, the sanitized
        filename, the size in bytes, and the refreshed model list from
        ``_scan_kraken_models()``.

    Raises:
        HTTPException(400): wrong file extension, or file larger than 500 MB.
    """
    filename = file.filename or "model.mlmodel"
    if not filename.lower().endswith(".mlmodel"):
        raise HTTPException(400, "Only .mlmodel files are accepted")

    max_bytes = 500 * 1024 * 1024
    chunk_size = 1024 * 1024
    chunks = []
    received = 0
    while True:
        chunk = await file.read(chunk_size)
        if not chunk:
            break
        received += len(chunk)
        if received > max_bytes:
            # Stop reading immediately instead of buffering the rest.
            raise HTTPException(400, "File too large (max 500 MB)")
        chunks.append(chunk)
    content = b"".join(chunks)

    upload_dir = PROJECT_ROOT / "models" / "kraken_uploads"
    upload_dir.mkdir(parents=True, exist_ok=True)

    # Sanitize filename — keep only safe characters
    safe_name = Path(filename).name
    safe_name = "".join(c for c in safe_name if c.isalnum() or c in "._- ")
    safe_name = safe_name.strip() or "uploaded.mlmodel"

    dest = upload_dir / safe_name
    # Write in a worker thread so a large file does not block the event loop.
    await asyncio.to_thread(dest.write_bytes, content)
    log.info(f"Uploaded Kraken model: {dest} ({len(content)} bytes)")

    rel_path = str(dest.relative_to(PROJECT_ROOT))  # e.g. models/kraken_uploads/foo.mlmodel
    return {
        "path": rel_path,
        "filename": safe_name,
        "size": len(content),
        "options": _scan_kraken_models(),  # refreshed list for frontend to repopulate select
    }
|
| 2191 |
+
|
| 2192 |
+
|
| 2193 |
+
@app.get("/api/gpu")
async def gpu_status():
    """Report CUDA availability and per-GPU memory (plus utilization when pynvml works).

    Returns:
        ``{"available": False, "gpus": []}`` when torch is missing, CUDA is
        unavailable, or any query fails; otherwise ``{"available": True,
        "gpus": [...]}`` with one entry per CUDA device. Memory figures are in
        units of 10^6 bytes. The ``utilization_*`` fields are present only
        when NVML reported the device.
    """
    try:
        import torch
        if not torch.cuda.is_available():
            return {"available": False, "gpus": []}

        # pynvml (nvidia-ml-py) for utilization %; graceful fallback if missing
        nvml_utils: dict[int, dict] = {}
        try:
            import pynvml
            pynvml.nvmlInit()
            try:
                for _i in range(pynvml.nvmlDeviceGetCount()):
                    h = pynvml.nvmlDeviceGetHandleByIndex(_i)
                    u = pynvml.nvmlDeviceGetUtilizationRates(h)
                    nvml_utils[_i] = {"gpu_pct": u.gpu, "mem_pct": u.memory}
            finally:
                # Pair every nvmlInit with nvmlShutdown so repeated polling
                # of this endpoint does not accumulate NVML references.
                pynvml.nvmlShutdown()
        except Exception:
            pass  # pynvml unavailable — utilization fields omitted

        gpus = []
        for i in range(torch.cuda.device_count()):
            free, total = torch.cuda.mem_get_info(i)
            entry: dict = {
                "index": i,
                "name": torch.cuda.get_device_name(i),
                "memory_total_mb": round(total / 1e6),
                "memory_used_mb": round((total - free) / 1e6),
                "memory_free_mb": round(free / 1e6),
            }
            # NOTE(review): assumes NVML device indices match torch's CUDA
            # indices; this may not hold when CUDA_VISIBLE_DEVICES remaps
            # devices — confirm.
            if i in nvml_utils:
                entry["utilization_gpu_pct"] = nvml_utils[i]["gpu_pct"]
                entry["utilization_mem_pct"] = nvml_utils[i]["mem_pct"]
            gpus.append(entry)
        return {"available": True, "gpus": gpus}
    except Exception:
        return {"available": False, "gpus": []}
|
| 2229 |
+
|
| 2230 |
+
|
| 2231 |
+
# ---------------------------------------------------------------------------
|
| 2232 |
+
# Helpers
|
| 2233 |
+
# ---------------------------------------------------------------------------
|
| 2234 |
+
|
| 2235 |
+
def _sse(event: str, data: dict) -> str:
|
| 2236 |
+
"""Format a Server-Sent Event."""
|
| 2237 |
+
return f"event: {event}\ndata: {json.dumps(data, ensure_ascii=False)}\n\n"
|
web/server_config.yaml
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Polyscriptor Web UI — server startup configuration
|
| 2 |
+
# Uncomment and adjust to auto-load an engine on server start.
|
| 3 |
+
#
|
| 4 |
+
# Usage:
|
| 5 |
+
# python -m uvicorn web.polyscriptor_server:app --host 0.0.0.0 --port 8765
|
| 6 |
+
#
|
| 7 |
+
# The server will load the specified engine at startup so the first
|
| 8 |
+
# transcription request doesn't need to wait for model loading.
|
| 9 |
+
|
| 10 |
+
# --- Auto-load (optional) ---
|
| 11 |
+
# Set default_engine to the engine name shown in the UI dropdown.
|
| 12 |
+
# Leave blank or comment out to start without a loaded model.
|
| 13 |
+
|
| 14 |
+
# Example: auto-load Church Slavonic CRNN-CTC model
|
| 15 |
+
# default_engine: "CRNN-CTC (PyLaia-inspired)"
|
| 16 |
+
# default_config:
|
| 17 |
+
# model_path: "Church Slavonic (2.89% CER)"
|
| 18 |
+
# enable_spaces: true
|
| 19 |
+
|
| 20 |
+
# Example: auto-load TrOCR from HuggingFace
|
| 21 |
+
# default_engine: "TrOCR"
|
| 22 |
+
# default_config:
|
| 23 |
+
# model_path: "kazars24/trocr-base-handwritten-ru"
|
| 24 |
+
# num_beams: 4
|
| 25 |
+
# normalize_background: false
|
web/static/app.css
ADDED
|
@@ -0,0 +1,1269 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* ── Self-hosted fonts ─── */
|
| 2 |
+
@font-face {
|
| 3 |
+
font-family: 'Monomakh';
|
| 4 |
+
src: url('fonts/MonomakhUnicode-Regular.woff2') format('woff2');
|
| 5 |
+
font-weight: normal;
|
| 6 |
+
font-style: normal;
|
| 7 |
+
font-display: swap;
|
| 8 |
+
unicode-range: U+0000-007F, U+0080-00FF, U+0300-036F, U+0400-04FF,
|
| 9 |
+
U+0500-052F, U+1C80-1C8F, U+2DE0-2DFF, U+A640-A69F, U+FE2E-FE2F;
|
| 10 |
+
}
|
| 11 |
+
|
| 12 |
+
/* ── Design tokens ───────────────────────────────────────────────────── */
|
| 13 |
+
:root {
|
| 14 |
+
--bg: #111827;
|
| 15 |
+
--bg-panel: #1f2937;
|
| 16 |
+
--bg-section: #1a2333;
|
| 17 |
+
--bg-input: #111827;
|
| 18 |
+
--bg-hover: #2a3a52;
|
| 19 |
+
--text: #e2e8f0;
|
| 20 |
+
--text-muted: #64748b;
|
| 21 |
+
--text-dim: #94a3b8;
|
| 22 |
+
--accent: #e94560;
|
| 23 |
+
--accent-hover:#ff6b81;
|
| 24 |
+
--primary: #3b82f6;
|
| 25 |
+
--primary-hover:#60a5fa;
|
| 26 |
+
--success: #22c55e;
|
| 27 |
+
--warning: #f59e0b;
|
| 28 |
+
--danger: #ef4444;
|
| 29 |
+
--border: #2d3f59;
|
| 30 |
+
--border-light:#3a4f6e;
|
| 31 |
+
--radius: 6px;
|
| 32 |
+
--radius-lg: 10px;
|
| 33 |
+
--font: 'Segoe UI', system-ui, -apple-system, sans-serif;
|
| 34 |
+
--font-mono: 'Consolas', 'Fira Code', 'Cascadia Code', monospace;
|
| 35 |
+
--header-h: 44px;
|
| 36 |
+
--tabs-h: 56px;
|
| 37 |
+
--shadow: 0 4px 20px rgba(0,0,0,0.4);
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
/* ── Reset & base ───────────────────────────────────────────────────── */
|
| 41 |
+
*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
|
| 42 |
+
|
| 43 |
+
html { height: 100%; }
|
| 44 |
+
|
| 45 |
+
body {
|
| 46 |
+
font-family: var(--font);
|
| 47 |
+
background: var(--bg);
|
| 48 |
+
color: var(--text);
|
| 49 |
+
height: 100%;
|
| 50 |
+
display: flex;
|
| 51 |
+
flex-direction: column;
|
| 52 |
+
overflow: hidden;
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
/* ── Header ─────────────────────────────────────────────────────────── */
|
| 56 |
+
#header {
|
| 57 |
+
height: var(--header-h);
|
| 58 |
+
display: flex;
|
| 59 |
+
align-items: center;
|
| 60 |
+
justify-content: space-between;
|
| 61 |
+
padding: 0 14px;
|
| 62 |
+
background: var(--bg-panel);
|
| 63 |
+
border-bottom: 1px solid var(--border);
|
| 64 |
+
flex-shrink: 0;
|
| 65 |
+
gap: 12px;
|
| 66 |
+
}
|
| 67 |
+
|
| 68 |
+
.header-left {
|
| 69 |
+
display: flex;
|
| 70 |
+
align-items: center;
|
| 71 |
+
gap: 8px;
|
| 72 |
+
min-width: 0;
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
.header-logo {
|
| 76 |
+
font-size: 1.3rem;
|
| 77 |
+
color: var(--primary);
|
| 78 |
+
line-height: 1;
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
#header h1 {
|
| 82 |
+
font-size: 1rem;
|
| 83 |
+
font-weight: 700;
|
| 84 |
+
letter-spacing: 0.3px;
|
| 85 |
+
white-space: nowrap;
|
| 86 |
+
}
|
| 87 |
+
|
| 88 |
+
.header-sub {
|
| 89 |
+
font-weight: 400;
|
| 90 |
+
color: var(--text-muted);
|
| 91 |
+
font-size: 0.9rem;
|
| 92 |
+
letter-spacing: 2px;
|
| 93 |
+
margin-left: 2px;
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
.header-right {
|
| 97 |
+
display: flex;
|
| 98 |
+
align-items: center;
|
| 99 |
+
gap: 8px;
|
| 100 |
+
flex-shrink: 0;
|
| 101 |
+
}
|
| 102 |
+
|
| 103 |
+
.gpu-badge {
|
| 104 |
+
font-size: 0.75rem;
|
| 105 |
+
padding: 3px 10px;
|
| 106 |
+
border-radius: 12px;
|
| 107 |
+
background: var(--bg-input);
|
| 108 |
+
color: var(--text-muted);
|
| 109 |
+
border: 1px solid var(--border);
|
| 110 |
+
white-space: nowrap;
|
| 111 |
+
max-width: 280px;
|
| 112 |
+
overflow: hidden;
|
| 113 |
+
text-overflow: ellipsis;
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
.btn-icon {
|
| 117 |
+
width: 30px;
|
| 118 |
+
height: 30px;
|
| 119 |
+
border: 1px solid var(--border);
|
| 120 |
+
border-radius: 50%;
|
| 121 |
+
background: var(--bg-input);
|
| 122 |
+
color: var(--text-muted);
|
| 123 |
+
font-size: 0.85rem;
|
| 124 |
+
font-weight: 700;
|
| 125 |
+
cursor: pointer;
|
| 126 |
+
display: flex;
|
| 127 |
+
align-items: center;
|
| 128 |
+
justify-content: center;
|
| 129 |
+
flex-shrink: 0;
|
| 130 |
+
transition: border-color 0.15s, color 0.15s;
|
| 131 |
+
}
|
| 132 |
+
.btn-icon:hover { border-color: var(--primary); color: var(--primary); }
|
| 133 |
+
|
| 134 |
+
/* ── GPU widget ─────────────────────────────────────────────────────── */
|
| 135 |
+
.gpu-widget {
|
| 136 |
+
display: flex;
|
| 137 |
+
gap: 8px;
|
| 138 |
+
align-items: center;
|
| 139 |
+
}
|
| 140 |
+
.gpu-card {
|
| 141 |
+
display: flex;
|
| 142 |
+
flex-direction: column;
|
| 143 |
+
gap: 3px;
|
| 144 |
+
font-size: 0.7rem;
|
| 145 |
+
color: var(--text-muted);
|
| 146 |
+
min-width: 90px;
|
| 147 |
+
max-width: 160px;
|
| 148 |
+
}
|
| 149 |
+
.gpu-card-name {
|
| 150 |
+
display: flex;
|
| 151 |
+
justify-content: space-between;
|
| 152 |
+
align-items: center;
|
| 153 |
+
gap: 4px;
|
| 154 |
+
white-space: nowrap;
|
| 155 |
+
overflow: hidden;
|
| 156 |
+
}
|
| 157 |
+
.gpu-card-name span { overflow: hidden; text-overflow: ellipsis; }
|
| 158 |
+
.gpu-util-pct {
|
| 159 |
+
font-size: 0.68rem;
|
| 160 |
+
color: var(--text-dim);
|
| 161 |
+
flex-shrink: 0;
|
| 162 |
+
}
|
| 163 |
+
.gpu-mem-bar {
|
| 164 |
+
height: 4px;
|
| 165 |
+
background: var(--bg-input);
|
| 166 |
+
border-radius: 2px;
|
| 167 |
+
overflow: hidden;
|
| 168 |
+
}
|
| 169 |
+
.gpu-mem-fill {
|
| 170 |
+
height: 100%;
|
| 171 |
+
border-radius: 2px;
|
| 172 |
+
background: var(--primary);
|
| 173 |
+
transition: width 0.5s ease;
|
| 174 |
+
}
|
| 175 |
+
.gpu-mem-fill.warm { background: var(--warning); }
|
| 176 |
+
.gpu-mem-fill.hot { background: var(--danger); }
|
| 177 |
+
.gpu-mem-label {
|
| 178 |
+
font-size: 0.65rem;
|
| 179 |
+
color: var(--text-muted);
|
| 180 |
+
}
|
| 181 |
+
|
| 182 |
+
/* ── Toast notifications ─────────────────────────────────────────────── */
|
| 183 |
+
#toast-container {
|
| 184 |
+
position: fixed;
|
| 185 |
+
bottom: 20px;
|
| 186 |
+
right: 20px;
|
| 187 |
+
display: flex;
|
| 188 |
+
flex-direction: column;
|
| 189 |
+
gap: 8px;
|
| 190 |
+
z-index: 9999;
|
| 191 |
+
pointer-events: none;
|
| 192 |
+
}
|
| 193 |
+
.toast {
|
| 194 |
+
padding: 10px 16px;
|
| 195 |
+
border-radius: var(--radius);
|
| 196 |
+
font-size: 0.85rem;
|
| 197 |
+
box-shadow: var(--shadow);
|
| 198 |
+
pointer-events: auto;
|
| 199 |
+
animation: toast-in 0.2s ease;
|
| 200 |
+
max-width: 320px;
|
| 201 |
+
}
|
| 202 |
+
.toast-error { background: #7f1d1d; color: #fca5a5; border: 1px solid #991b1b; }
|
| 203 |
+
.toast-success { background: #14532d; color: #86efac; border: 1px solid #15803d; }
|
| 204 |
+
.toast-info { background: #1e3a5f; color: #93c5fd; border: 1px solid #1d4ed8; }
|
| 205 |
+
@keyframes toast-in { from { opacity: 0; transform: translateY(8px); } to { opacity: 1; transform: none; } }
|
| 206 |
+
|
| 207 |
+
/* ── Main layout (3 columns) ────────────────────────────────────────── */
|
| 208 |
+
#app {
|
| 209 |
+
display: grid;
|
| 210 |
+
grid-template-columns: var(--panel-left, 260px) 5px 1fr 5px var(--panel-right, 360px);
|
| 211 |
+
grid-template-rows: 1fr;
|
| 212 |
+
gap: 0;
|
| 213 |
+
flex: 1;
|
| 214 |
+
min-height: 0;
|
| 215 |
+
background: var(--border);
|
| 216 |
+
}
|
| 217 |
+
.panel-resize-handle {
|
| 218 |
+
background: var(--border);
|
| 219 |
+
cursor: col-resize;
|
| 220 |
+
transition: background 0.15s;
|
| 221 |
+
z-index: 10;
|
| 222 |
+
position: relative;
|
| 223 |
+
}
|
| 224 |
+
.panel-resize-handle:hover,
|
| 225 |
+
.panel-resize-handle.dragging {
|
| 226 |
+
background: var(--primary);
|
| 227 |
+
}
|
| 228 |
+
.panel-resize-handle::after {
|
| 229 |
+
content: '';
|
| 230 |
+
position: absolute;
|
| 231 |
+
inset: 0 -4px; /* wider hit area */
|
| 232 |
+
}
|
| 233 |
+
|
| 234 |
+
.panel {
|
| 235 |
+
background: var(--bg-panel);
|
| 236 |
+
overflow-y: auto;
|
| 237 |
+
overflow-x: hidden;
|
| 238 |
+
min-height: 0;
|
| 239 |
+
}
|
| 240 |
+
|
| 241 |
+
/* Left panel flex column */
|
| 242 |
+
#engine-panel {
|
| 243 |
+
display: flex;
|
| 244 |
+
flex-direction: column;
|
| 245 |
+
gap: 0;
|
| 246 |
+
padding: 0;
|
| 247 |
+
}
|
| 248 |
+
|
| 249 |
+
.panel-section {
|
| 250 |
+
padding: 12px 12px 8px;
|
| 251 |
+
display: flex;
|
| 252 |
+
flex-direction: column;
|
| 253 |
+
gap: 7px;
|
| 254 |
+
}
|
| 255 |
+
|
| 256 |
+
.panel-footer {
|
| 257 |
+
padding: 10px 12px;
|
| 258 |
+
border-top: 1px solid var(--border);
|
| 259 |
+
margin-top: auto;
|
| 260 |
+
}
|
| 261 |
+
.footer-btn-row {
|
| 262 |
+
display: flex;
|
| 263 |
+
gap: 6px;
|
| 264 |
+
}
|
| 265 |
+
.footer-btn-row .btn {
|
| 266 |
+
flex: 1;
|
| 267 |
+
}
|
| 268 |
+
|
| 269 |
+
.panel h2 {
|
| 270 |
+
font-size: 0.7rem;
|
| 271 |
+
text-transform: uppercase;
|
| 272 |
+
letter-spacing: 1.2px;
|
| 273 |
+
color: var(--text-muted);
|
| 274 |
+
margin-bottom: 2px;
|
| 275 |
+
}
|
| 276 |
+
|
| 277 |
+
#engine-panel hr {
|
| 278 |
+
border: none;
|
| 279 |
+
border-top: 1px solid var(--border);
|
| 280 |
+
flex-shrink: 0;
|
| 281 |
+
}
|
| 282 |
+
|
| 283 |
+
/* ── Form elements ──────────────────────────────────────────────────── */
|
| 284 |
+
label {
|
| 285 |
+
font-size: 0.78rem;
|
| 286 |
+
color: var(--text-dim);
|
| 287 |
+
}
|
| 288 |
+
|
| 289 |
+
select,
|
| 290 |
+
input[type="text"],
|
| 291 |
+
input[type="number"],
|
| 292 |
+
input[type="password"] {
|
| 293 |
+
width: 100%;
|
| 294 |
+
padding: 6px 9px;
|
| 295 |
+
background: var(--bg-input);
|
| 296 |
+
color: var(--text);
|
| 297 |
+
border: 1px solid var(--border);
|
| 298 |
+
border-radius: var(--radius);
|
| 299 |
+
font-size: 0.83rem;
|
| 300 |
+
font-family: var(--font);
|
| 301 |
+
transition: border-color 0.15s;
|
| 302 |
+
}
|
| 303 |
+
|
| 304 |
+
select:focus, input:focus, textarea:focus {
|
| 305 |
+
outline: none;
|
| 306 |
+
border-color: var(--primary);
|
| 307 |
+
box-shadow: 0 0 0 2px rgba(59,130,246,0.12);
|
| 308 |
+
}
|
| 309 |
+
|
| 310 |
+
textarea {
|
| 311 |
+
width: 100%;
|
| 312 |
+
padding: 6px 9px;
|
| 313 |
+
background: var(--bg-input);
|
| 314 |
+
color: var(--text);
|
| 315 |
+
border: 1px solid var(--border);
|
| 316 |
+
border-radius: var(--radius);
|
| 317 |
+
font-size: 0.83rem;
|
| 318 |
+
font-family: var(--font);
|
| 319 |
+
transition: border-color 0.15s;
|
| 320 |
+
box-sizing: border-box;
|
| 321 |
+
}
|
| 322 |
+
|
| 323 |
+
input::placeholder,
|
| 324 |
+
textarea::placeholder {
|
| 325 |
+
color: var(--text-dim);
|
| 326 |
+
font-style: italic;
|
| 327 |
+
opacity: 0.65;
|
| 328 |
+
}
|
| 329 |
+
|
| 330 |
+
select option { background: var(--bg-panel); color: var(--text); }
|
| 331 |
+
|
| 332 |
+
/* Config form fields */
|
| 333 |
+
.config-field {
|
| 334 |
+
display: flex;
|
| 335 |
+
flex-direction: column;
|
| 336 |
+
gap: 3px;
|
| 337 |
+
}
|
| 338 |
+
|
| 339 |
+
/* Select + refresh button row */
|
| 340 |
+
.select-row {
|
| 341 |
+
display: flex;
|
| 342 |
+
gap: 4px;
|
| 343 |
+
align-items: center;
|
| 344 |
+
}
|
| 345 |
+
.select-row select { flex: 1; min-width: 0; width: auto; }
|
| 346 |
+
|
| 347 |
+
.btn-refresh {
|
| 348 |
+
flex-shrink: 0;
|
| 349 |
+
width: 28px;
|
| 350 |
+
height: 28px;
|
| 351 |
+
border: 1px solid var(--border);
|
| 352 |
+
border-radius: var(--radius);
|
| 353 |
+
background: var(--bg-input);
|
| 354 |
+
color: var(--text-muted);
|
| 355 |
+
font-size: 1rem;
|
| 356 |
+
cursor: pointer;
|
| 357 |
+
display: flex;
|
| 358 |
+
align-items: center;
|
| 359 |
+
justify-content: center;
|
| 360 |
+
transition: background 0.15s, border-color 0.15s, color 0.15s;
|
| 361 |
+
}
|
| 362 |
+
.btn-refresh:hover:not(:disabled) { background: var(--bg-hover); border-color: var(--primary); color: var(--primary); }
|
| 363 |
+
.btn-refresh:disabled { opacity: 0.5; cursor: not-allowed; }
|
| 364 |
+
|
| 365 |
+
.dynamic-hint {
|
| 366 |
+
font-size: 0.7rem;
|
| 367 |
+
color: var(--text-muted);
|
| 368 |
+
min-height: 1em;
|
| 369 |
+
}
|
| 370 |
+
|
| 371 |
+
.config-field label { font-size: 0.75rem; color: var(--text-muted); }
|
| 372 |
+
|
| 373 |
+
.config-field-checkbox {
|
| 374 |
+
flex-direction: row;
|
| 375 |
+
align-items: center;
|
| 376 |
+
gap: 7px;
|
| 377 |
+
}
|
| 378 |
+
.config-field-checkbox input[type="checkbox"] {
|
| 379 |
+
width: auto;
|
| 380 |
+
accent-color: var(--primary);
|
| 381 |
+
cursor: pointer;
|
| 382 |
+
}
|
| 383 |
+
.config-field-checkbox label { font-size: 0.82rem; color: var(--text); cursor: pointer; }
|
| 384 |
+
|
| 385 |
+
#blla-options {
|
| 386 |
+
display: flex;
|
| 387 |
+
align-items: center;
|
| 388 |
+
gap: 8px;
|
| 389 |
+
}
|
| 390 |
+
#blla-options label { flex-shrink: 0; }
|
| 391 |
+
#blla-options input { width: 64px; }
|
| 392 |
+
|
| 393 |
+
/* ── Buttons ────────────────────────────────────────────────────────── */
|
| 394 |
+
.btn {
|
| 395 |
+
padding: 8px 14px;
|
| 396 |
+
border: none;
|
| 397 |
+
border-radius: var(--radius);
|
| 398 |
+
font-size: 0.83rem;
|
| 399 |
+
font-family: var(--font);
|
| 400 |
+
cursor: pointer;
|
| 401 |
+
transition: background 0.15s, transform 0.1s;
|
| 402 |
+
white-space: nowrap;
|
| 403 |
+
}
|
| 404 |
+
.btn:active:not(:disabled) { transform: translateY(1px); }
|
| 405 |
+
.btn:disabled { opacity: 0.38; cursor: not-allowed; }
|
| 406 |
+
|
| 407 |
+
.btn-full { width: 100%; }
|
| 408 |
+
|
| 409 |
+
.btn-primary { background: var(--primary); color: white; }
|
| 410 |
+
.btn-primary:hover:not(:disabled) { background: var(--primary-hover); }
|
| 411 |
+
|
| 412 |
+
.btn-accent { background: var(--accent); color: white; }
|
| 413 |
+
.btn-accent:hover:not(:disabled) { background: var(--accent-hover); }
|
| 414 |
+
|
| 415 |
+
.btn-small {
|
| 416 |
+
padding: 5px 10px;
|
| 417 |
+
font-size: 0.78rem;
|
| 418 |
+
background: var(--bg-input);
|
| 419 |
+
color: var(--text-dim);
|
| 420 |
+
border: 1px solid var(--border);
|
| 421 |
+
}
|
| 422 |
+
.btn-small:hover:not(:disabled) { background: var(--bg-hover); border-color: var(--border-light); color: var(--text); }
|
| 423 |
+
|
| 424 |
+
.btn-outline {
|
| 425 |
+
background: transparent;
|
| 426 |
+
border: 1px solid var(--border);
|
| 427 |
+
cursor: pointer;
|
| 428 |
+
border-radius: var(--radius);
|
| 429 |
+
display: inline-flex;
|
| 430 |
+
align-items: center;
|
| 431 |
+
font-size: 0.78rem;
|
| 432 |
+
color: var(--text-dim);
|
| 433 |
+
padding: 4px 8px;
|
| 434 |
+
transition: background 0.15s, border-color 0.15s;
|
| 435 |
+
}
|
| 436 |
+
.btn-outline:hover { background: var(--bg-hover); border-color: var(--primary); color: var(--text); }
|
| 437 |
+
|
| 438 |
+
.btn-row {
|
| 439 |
+
display: flex;
|
| 440 |
+
gap: 6px;
|
| 441 |
+
flex-wrap: wrap;
|
| 442 |
+
margin-top: 6px;
|
| 443 |
+
}
|
| 444 |
+
|
| 445 |
+
/* Save key row */
|
| 446 |
+
.key-save-row {
|
| 447 |
+
display: flex;
|
| 448 |
+
align-items: center;
|
| 449 |
+
gap: 6px;
|
| 450 |
+
margin-top: 4px;
|
| 451 |
+
font-size: 0.76rem;
|
| 452 |
+
color: var(--text-muted);
|
| 453 |
+
}
|
| 454 |
+
.key-save-row input[type="checkbox"] { width: auto; margin: 0; accent-color: var(--primary); }
|
| 455 |
+
.key-save-row label { cursor: pointer; }
|
| 456 |
+
|
| 457 |
+
input[disabled] { opacity: 0.45; cursor: not-allowed; }
|
| 458 |
+
|
| 459 |
+
/* ── Upload area ────────────────────────────────────────────────────── */
|
| 460 |
+
.upload-area {
|
| 461 |
+
border: 2px dashed var(--border);
|
| 462 |
+
border-radius: var(--radius-lg);
|
| 463 |
+
padding: 18px 12px;
|
| 464 |
+
text-align: center;
|
| 465 |
+
cursor: pointer;
|
| 466 |
+
transition: border-color 0.2s, background 0.2s;
|
| 467 |
+
font-size: 0.83rem;
|
| 468 |
+
color: var(--text-muted);
|
| 469 |
+
display: flex;
|
| 470 |
+
flex-direction: column;
|
| 471 |
+
align-items: center;
|
| 472 |
+
gap: 8px;
|
| 473 |
+
}
|
| 474 |
+
.upload-area:hover, .upload-area.dragover {
|
| 475 |
+
border-color: var(--primary);
|
| 476 |
+
background: rgba(59,130,246,0.06);
|
| 477 |
+
color: var(--text-dim);
|
| 478 |
+
}
|
| 479 |
+
.upload-icon {
|
| 480 |
+
width: 28px;
|
| 481 |
+
height: 28px;
|
| 482 |
+
opacity: 0.5;
|
| 483 |
+
}
|
| 484 |
+
.upload-area:hover .upload-icon,
|
| 485 |
+
.upload-area.dragover .upload-icon { opacity: 0.8; }
|
| 486 |
+
|
| 487 |
+
/* XML row */
|
| 488 |
+
.xml-row {
|
| 489 |
+
display: flex;
|
| 490 |
+
align-items: center;
|
| 491 |
+
gap: 8px;
|
| 492 |
+
}
|
| 493 |
+
.xml-row .muted {
|
| 494 |
+
flex: 1;
|
| 495 |
+
font-size: 0.78rem;
|
| 496 |
+
overflow: hidden;
|
| 497 |
+
text-overflow: ellipsis;
|
| 498 |
+
white-space: nowrap;
|
| 499 |
+
}
|
| 500 |
+
.xml-ok { color: var(--success) !important; }
|
| 501 |
+
|
| 502 |
+
/* ── Image viewer (center) ──────────────────────────────────────────── */
|
| 503 |
+
#viewer-panel {
|
| 504 |
+
padding: 0;
|
| 505 |
+
position: relative;
|
| 506 |
+
overflow: hidden;
|
| 507 |
+
display: flex;
|
| 508 |
+
flex-direction: column;
|
| 509 |
+
}
|
| 510 |
+
|
| 511 |
+
/* Zoom toolbar */
|
| 512 |
+
.zoom-toolbar {
|
| 513 |
+
display: flex;
|
| 514 |
+
align-items: center;
|
| 515 |
+
gap: 4px;
|
| 516 |
+
padding: 5px 8px;
|
| 517 |
+
background: var(--bg-panel);
|
| 518 |
+
border-bottom: 1px solid var(--border);
|
| 519 |
+
flex-shrink: 0;
|
| 520 |
+
z-index: 2;
|
| 521 |
+
}
|
| 522 |
+
.zoom-btn {
|
| 523 |
+
width: 26px;
|
| 524 |
+
height: 26px;
|
| 525 |
+
border: 1px solid var(--border);
|
| 526 |
+
border-radius: var(--radius);
|
| 527 |
+
background: var(--bg-input);
|
| 528 |
+
color: var(--text-dim);
|
| 529 |
+
font-size: 1rem;
|
| 530 |
+
line-height: 1;
|
| 531 |
+
cursor: pointer;
|
| 532 |
+
display: flex;
|
| 533 |
+
align-items: center;
|
| 534 |
+
justify-content: center;
|
| 535 |
+
transition: background 0.15s, border-color 0.15s;
|
| 536 |
+
}
|
| 537 |
+
.zoom-btn:hover { background: var(--bg-hover); border-color: var(--border-light); color: var(--text); }
|
| 538 |
+
.zoom-fit { font-size: 0.8rem; width: auto; padding: 0 7px; }
|
| 539 |
+
.zoom-toolbar-sep { width: 1px; background: var(--border); margin: 0 4px; align-self: stretch; }
|
| 540 |
+
.nav-btn { padding: 2px 8px; font-size: .8rem; line-height: 1.6; }
|
| 541 |
+
.nav-btn:disabled { opacity: 0.3; cursor: default; }
|
| 542 |
+
.batch-nav-label-toolbar { font-size: .78rem; color: var(--text-muted); min-width: 36px; text-align: center; }
|
| 543 |
+
.zoom-level {
|
| 544 |
+
font-size: 0.75rem;
|
| 545 |
+
color: var(--text-muted);
|
| 546 |
+
min-width: 3.5em;
|
| 547 |
+
text-align: center;
|
| 548 |
+
font-family: var(--font-mono);
|
| 549 |
+
}
|
| 550 |
+
|
| 551 |
+
/* Scrollable image area */
|
| 552 |
+
#viewer-scroll {
|
| 553 |
+
flex: 1;
|
| 554 |
+
overflow: auto;
|
| 555 |
+
display: flex;
|
| 556 |
+
align-items: flex-start;
|
| 557 |
+
justify-content: flex-start;
|
| 558 |
+
min-height: 0;
|
| 559 |
+
position: relative;
|
| 560 |
+
}
|
| 561 |
+
|
| 562 |
+
/* Placeholder — fills scroll area and centers content */
|
| 563 |
+
.viewer-placeholder {
|
| 564 |
+
width: 100%;
|
| 565 |
+
height: 100%;
|
| 566 |
+
min-height: 200px;
|
| 567 |
+
display: flex;
|
| 568 |
+
flex-direction: column;
|
| 569 |
+
align-items: center;
|
| 570 |
+
justify-content: center;
|
| 571 |
+
gap: 14px;
|
| 572 |
+
color: var(--text-muted);
|
| 573 |
+
font-size: 0.9rem;
|
| 574 |
+
user-select: none;
|
| 575 |
+
}
|
| 576 |
+
.viewer-placeholder.dragover {
|
| 577 |
+
color: var(--primary);
|
| 578 |
+
background: rgba(59, 130, 246, 0.08);
|
| 579 |
+
}
|
| 580 |
+
.viewer-placeholder svg {
|
| 581 |
+
width: 56px;
|
| 582 |
+
height: 56px;
|
| 583 |
+
opacity: 0.25;
|
| 584 |
+
}
|
| 585 |
+
.viewer-placeholder p { opacity: 0.6; }
|
| 586 |
+
|
| 587 |
+
/* Image container — only shows when image is loaded */
|
| 588 |
+
#image-container {
|
| 589 |
+
position: relative;
|
| 590 |
+
flex-shrink: 0;
|
| 591 |
+
line-height: 0;
|
| 592 |
+
}
|
| 593 |
+
|
| 594 |
+
#page-image {
|
| 595 |
+
display: block;
|
| 596 |
+
/* width controlled by zoom JS; height auto */
|
| 597 |
+
transition: width 0.08s ease-out, height 0.08s ease-out;
|
| 598 |
+
}
|
| 599 |
+
|
| 600 |
+
#overlay-canvas {
|
| 601 |
+
position: absolute;
|
| 602 |
+
top: 0;
|
| 603 |
+
left: 0;
|
| 604 |
+
pointer-events: auto;
|
| 605 |
+
cursor: crosshair;
|
| 606 |
+
transition: width 0.08s ease-out, height 0.08s ease-out;
|
| 607 |
+
}
|
| 608 |
+
|
| 609 |
+
/* ── Results panel (right) ──────────────────────────────────────────── */
|
| 610 |
+
#results-panel {
|
| 611 |
+
display: flex;
|
| 612 |
+
flex-direction: column;
|
| 613 |
+
}
|
| 614 |
+
|
| 615 |
+
.results-header {
|
| 616 |
+
padding: 12px 12px 8px;
|
| 617 |
+
border-bottom: 1px solid var(--border);
|
| 618 |
+
flex-shrink: 0;
|
| 619 |
+
}
|
| 620 |
+
.results-header-row {
|
| 621 |
+
display: flex;
|
| 622 |
+
align-items: center;
|
| 623 |
+
justify-content: space-between;
|
| 624 |
+
margin-bottom: 0;
|
| 625 |
+
}
|
| 626 |
+
.results-header-row h2 { margin-bottom: 0; }
|
| 627 |
+
.results-header-controls {
|
| 628 |
+
display: flex;
|
| 629 |
+
align-items: center;
|
| 630 |
+
gap: 5px;
|
| 631 |
+
}
|
| 632 |
+
.btn-icon.active { border-color: var(--primary); color: var(--primary); background: rgba(59,130,246,0.1); }
|
| 633 |
+
|
| 634 |
+
/* Font selector in results header */
|
| 635 |
+
.font-select {
|
| 636 |
+
width: auto !important;
|
| 637 |
+
padding: 3px 5px !important;
|
| 638 |
+
font-size: 0.72rem !important;
|
| 639 |
+
height: 26px;
|
| 640 |
+
border-radius: var(--radius);
|
| 641 |
+
color: var(--text-muted);
|
| 642 |
+
max-width: 140px;
|
| 643 |
+
}
|
| 644 |
+
|
| 645 |
+
#transcription-lines {
|
| 646 |
+
flex: 1;
|
| 647 |
+
overflow-y: auto;
|
| 648 |
+
font-family: var(--font-results, var(--font-mono));
|
| 649 |
+
font-size: 0.83rem;
|
| 650 |
+
line-height: 1.5;
|
| 651 |
+
padding: 4px 0;
|
| 652 |
+
}
|
| 653 |
+
|
| 654 |
+
.line-result {
|
| 655 |
+
padding: 5px 10px;
|
| 656 |
+
border-bottom: 1px solid rgba(45,63,89,0.5);
|
| 657 |
+
cursor: pointer;
|
| 658 |
+
transition: background 0.1s;
|
| 659 |
+
}
|
| 660 |
+
.line-result:last-child { border-bottom: none; }
|
| 661 |
+
.line-result:hover { background: var(--bg-hover); }
|
| 662 |
+
|
| 663 |
+
.line-num {
|
| 664 |
+
color: var(--text-muted);
|
| 665 |
+
font-size: 0.68rem;
|
| 666 |
+
margin-right: 7px;
|
| 667 |
+
user-select: none;
|
| 668 |
+
display: inline-block;
|
| 669 |
+
min-width: 2.2em;
|
| 670 |
+
text-align: right;
|
| 671 |
+
}
|
| 672 |
+
|
| 673 |
+
.confidence {
|
| 674 |
+
float: right;
|
| 675 |
+
font-size: 0.68rem;
|
| 676 |
+
padding: 1px 6px;
|
| 677 |
+
border-radius: 8px;
|
| 678 |
+
margin-left: 6px;
|
| 679 |
+
margin-top: 2px;
|
| 680 |
+
}
|
| 681 |
+
.conf-high { background: rgba(34,197,94,0.15); color: var(--success); }
|
| 682 |
+
.conf-mid { background: rgba(245,158,11,0.15); color: var(--warning); }
|
| 683 |
+
.conf-low { background: rgba(239,68,68,0.15); color: var(--danger); }
|
| 684 |
+
|
| 685 |
+
.line-result.line-active {
|
| 686 |
+
background: rgba(233,69,96,0.12);
|
| 687 |
+
border-left: 3px solid var(--accent);
|
| 688 |
+
}
|
| 689 |
+
.line-result.highlight {
|
| 690 |
+
background: rgba(59,130,246,0.12);
|
| 691 |
+
border-left: 3px solid var(--primary);
|
| 692 |
+
}
|
| 693 |
+
|
| 694 |
+
/* Dimmed lines (below confidence threshold) */
|
| 695 |
+
.line-result.line-dimmed {
|
| 696 |
+
opacity: 0.28;
|
| 697 |
+
}
|
| 698 |
+
|
| 699 |
+
/* Inline editing */
|
| 700 |
+
.line-text {
|
| 701 |
+
display: inline;
|
| 702 |
+
outline: none;
|
| 703 |
+
border-radius: 2px;
|
| 704 |
+
}
|
| 705 |
+
.line-text[contenteditable="true"] {
|
| 706 |
+
background: rgba(58, 134, 255, 0.08);
|
| 707 |
+
outline: 1px dashed var(--primary);
|
| 708 |
+
padding: 0 3px;
|
| 709 |
+
cursor: text;
|
| 710 |
+
}
|
| 711 |
+
/* Gemini thinking/reasoning block */
|
| 712 |
+
.thinking-block {
|
| 713 |
+
display: block;
|
| 714 |
+
width: 100%;
|
| 715 |
+
margin-top: 4px;
|
| 716 |
+
}
|
| 717 |
+
.thinking-toggle {
|
| 718 |
+
font-size: 0.7rem;
|
| 719 |
+
color: var(--text-dim);
|
| 720 |
+
cursor: pointer;
|
| 721 |
+
user-select: none;
|
| 722 |
+
letter-spacing: 0.04em;
|
| 723 |
+
text-transform: uppercase;
|
| 724 |
+
}
|
| 725 |
+
.thinking-toggle:hover { color: var(--primary); }
|
| 726 |
+
.thinking-text {
|
| 727 |
+
margin: 4px 0 0 0;
|
| 728 |
+
padding: 6px 10px;
|
| 729 |
+
font-size: 0.72rem;
|
| 730 |
+
font-family: monospace;
|
| 731 |
+
white-space: pre-wrap;
|
| 732 |
+
word-break: break-word;
|
| 733 |
+
background: var(--bg-input);
|
| 734 |
+
border-left: 2px solid var(--border);
|
| 735 |
+
color: var(--text-dim);
|
| 736 |
+
border-radius: 0 3px 3px 0;
|
| 737 |
+
max-height: 300px;
|
| 738 |
+
overflow-y: auto;
|
| 739 |
+
}
|
| 740 |
+
|
| 741 |
+
.line-result.line-edited .line-num::after {
|
| 742 |
+
content: '✎';
|
| 743 |
+
color: var(--primary);
|
| 744 |
+
font-size: 0.6rem;
|
| 745 |
+
margin-left: 2px;
|
| 746 |
+
}
|
| 747 |
+
|
| 748 |
+
/* Results search row */
|
| 749 |
+
.results-search-row {
|
| 750 |
+
display: flex;
|
| 751 |
+
align-items: center;
|
| 752 |
+
gap: 6px;
|
| 753 |
+
padding: 4px 0;
|
| 754 |
+
}
|
| 755 |
+
.results-search-row input[type="search"] {
|
| 756 |
+
flex: 1;
|
| 757 |
+
min-width: 0;
|
| 758 |
+
background: var(--bg-input);
|
| 759 |
+
border: 1px solid var(--border);
|
| 760 |
+
border-radius: var(--radius);
|
| 761 |
+
color: var(--text);
|
| 762 |
+
font-size: 0.8rem;
|
| 763 |
+
padding: 3px 8px;
|
| 764 |
+
}
|
| 765 |
+
.results-search-row input[type="search"]:focus {
|
| 766 |
+
outline: none;
|
| 767 |
+
border-color: var(--primary);
|
| 768 |
+
}
|
| 769 |
+
#results-search-count {
|
| 770 |
+
font-size: 0.72rem;
|
| 771 |
+
white-space: nowrap;
|
| 772 |
+
}
|
| 773 |
+
.line-result.line-hidden { display: none; }
|
| 774 |
+
.line-result mark { background: color-mix(in srgb, var(--accent) 35%, transparent); border-radius: 2px; }
|
| 775 |
+
|
| 776 |
+
/* Thinking / reasoning block (Gemini, Claude) */
|
| 777 |
+
.thinking-block {
|
| 778 |
+
margin-top: 4px;
|
| 779 |
+
border-left: 2px solid var(--accent);
|
| 780 |
+
/* --radius-sm is not declared in the :root token block of this file; the 3px fallback keeps the declaration valid. */
border-radius: 0 var(--radius-sm, 3px) var(--radius-sm, 3px) 0;
|
| 781 |
+
/* --bg-secondary is not declared in the :root token block of this file; fall back to --bg-input so the accent tint still renders. */
background: color-mix(in srgb, var(--accent) 6%, var(--bg-secondary, var(--bg-input)));
|
| 782 |
+
font-size: 0.75rem;
|
| 783 |
+
}
|
| 784 |
+
.thinking-toggle {
|
| 785 |
+
cursor: pointer;
|
| 786 |
+
padding: 2px 6px;
|
| 787 |
+
color: var(--accent);
|
| 788 |
+
user-select: none;
|
| 789 |
+
font-style: italic;
|
| 790 |
+
list-style: none;
|
| 791 |
+
}
|
| 792 |
+
.thinking-toggle::marker,
|
| 793 |
+
.thinking-toggle::-webkit-details-marker { display: none; }
|
| 794 |
+
.thinking-toggle::before {
|
| 795 |
+
content: '▶ ';
|
| 796 |
+
font-style: normal;
|
| 797 |
+
font-size: 0.65rem;
|
| 798 |
+
transition: transform 0.15s;
|
| 799 |
+
}
|
| 800 |
+
details[open] > .thinking-toggle::before { content: '▼ '; }
|
| 801 |
+
.thinking-text {
|
| 802 |
+
margin: 0;
|
| 803 |
+
padding: 4px 8px 6px;
|
| 804 |
+
white-space: pre-wrap;
|
| 805 |
+
word-break: break-word;
|
| 806 |
+
/* --text-secondary is not declared in the :root token block of this file; fall back to --text-dim instead of silently inheriting the parent colour. */
color: var(--text-secondary, var(--text-dim));
|
| 807 |
+
font-family: inherit;
|
| 808 |
+
line-height: 1.45;
|
| 809 |
+
max-height: 200px;
|
| 810 |
+
overflow-y: auto;
|
| 811 |
+
}
|
| 812 |
+
|
| 813 |
+
.conf-filter-row {
|
| 814 |
+
display: flex;
|
| 815 |
+
align-items: center;
|
| 816 |
+
gap: 8px;
|
| 817 |
+
padding: 4px 0 6px;
|
| 818 |
+
font-size: 0.75rem;
|
| 819 |
+
color: var(--text-muted);
|
| 820 |
+
}
|
| 821 |
+
.conf-filter-row input[type="range"] {
|
| 822 |
+
flex: 1;
|
| 823 |
+
width: auto;
|
| 824 |
+
height: 3px;
|
| 825 |
+
cursor: pointer;
|
| 826 |
+
accent-color: var(--primary);
|
| 827 |
+
padding: 0;
|
| 828 |
+
background: none;
|
| 829 |
+
border: none;
|
| 830 |
+
}
|
| 831 |
+
|
| 832 |
+
/* Batch queue */
|
| 833 |
+
#batch-queue-section {
|
| 834 |
+
margin-top: 6px;
|
| 835 |
+
border-top: 1px solid var(--border);
|
| 836 |
+
padding-top: 8px;
|
| 837 |
+
}
|
| 838 |
+
.batch-queue-header {
|
| 839 |
+
display: flex;
|
| 840 |
+
align-items: center;
|
| 841 |
+
justify-content: space-between;
|
| 842 |
+
margin-bottom: 4px;
|
| 843 |
+
}
|
| 844 |
+
.section-label {
|
| 845 |
+
font-size: 0.7rem;
|
| 846 |
+
font-weight: 600;
|
| 847 |
+
text-transform: uppercase;
|
| 848 |
+
letter-spacing: .06em;
|
| 849 |
+
color: var(--text-muted);
|
| 850 |
+
}
|
| 851 |
+
.batch-overall-progress {
|
| 852 |
+
font-size: 0.72rem;
|
| 853 |
+
font-family: var(--font-mono);
|
| 854 |
+
color: var(--accent);
|
| 855 |
+
background: color-mix(in srgb, var(--accent) 12%, transparent);
|
| 856 |
+
padding: 1px 7px;
|
| 857 |
+
border-radius: 10px;
|
| 858 |
+
}
|
| 859 |
+
.batch-item {
|
| 860 |
+
display: flex;
|
| 861 |
+
align-items: center;
|
| 862 |
+
gap: 6px;
|
| 863 |
+
padding: 4px 2px;
|
| 864 |
+
font-size: 0.78rem;
|
| 865 |
+
border-bottom: 1px solid rgba(45,63,89,0.4);
|
| 866 |
+
}
|
| 867 |
+
.batch-item:last-child { border-bottom: none; }
|
| 868 |
+
.batch-drag-handle {
|
| 869 |
+
cursor: grab;
|
| 870 |
+
color: var(--text-muted);
|
| 871 |
+
font-size: 0.9rem;
|
| 872 |
+
line-height: 1;
|
| 873 |
+
padding: 0 2px;
|
| 874 |
+
flex-shrink: 0;
|
| 875 |
+
user-select: none;
|
| 876 |
+
}
|
| 877 |
+
.batch-drag-handle:active { cursor: grabbing; }
|
| 878 |
+
.batch-item.batch-dragging { opacity: 0.4; }
|
| 879 |
+
.batch-item.batch-drag-over {
|
| 880 |
+
border-top: 2px solid var(--accent);
|
| 881 |
+
margin-top: -1px;
|
| 882 |
+
}
|
| 883 |
+
.batch-item-name {
|
| 884 |
+
flex: 1;
|
| 885 |
+
overflow: hidden;
|
| 886 |
+
text-overflow: ellipsis;
|
| 887 |
+
white-space: nowrap;
|
| 888 |
+
color: var(--text);
|
| 889 |
+
}
|
| 890 |
+
.batch-status {
|
| 891 |
+
font-size: 0.68rem;
|
| 892 |
+
flex-shrink: 0;
|
| 893 |
+
min-width: 56px;
|
| 894 |
+
text-align: right;
|
| 895 |
+
color: var(--text-muted);
|
| 896 |
+
}
|
| 897 |
+
.batch-status.done { color: var(--success); }
|
| 898 |
+
.batch-status.error { color: var(--danger); }
|
| 899 |
+
.batch-status.active { color: var(--primary); }
|
| 900 |
+
|
| 901 |
+
.batch-nav-row {
|
| 902 |
+
display: flex;
|
| 903 |
+
align-items: center;
|
| 904 |
+
gap: 4px;
|
| 905 |
+
margin-top: 6px;
|
| 906 |
+
}
|
| 907 |
+
.batch-options-row {
|
| 908 |
+
display: flex;
|
| 909 |
+
gap: 14px;
|
| 910 |
+
align-items: center;
|
| 911 |
+
margin-top: 6px;
|
| 912 |
+
flex-wrap: wrap;
|
| 913 |
+
}
|
| 914 |
+
.checkbox-label {
|
| 915 |
+
display: flex;
|
| 916 |
+
align-items: center;
|
| 917 |
+
gap: 5px;
|
| 918 |
+
font-size: .8rem;
|
| 919 |
+
color: var(--text-muted);
|
| 920 |
+
cursor: pointer;
|
| 921 |
+
user-select: none;
|
| 922 |
+
}
|
| 923 |
+
.checkbox-label input[type="checkbox"] { cursor: pointer; }
|
| 924 |
+
|
| 925 |
+
/* Column layout (multi-region side-by-side) */
|
| 926 |
+
#transcription-lines.col-layout {
|
| 927 |
+
display: flex;
|
| 928 |
+
flex-direction: row;
|
| 929 |
+
align-items: flex-start; /* columns grow to their content height */
|
| 930 |
+
overflow-x: auto;
|
| 931 |
+
/* overflow-y stays 'auto' from the base rule — unified scroll */
|
| 932 |
+
padding: 0;
|
| 933 |
+
gap: 0;
|
| 934 |
+
}
|
| 935 |
+
.region-column {
|
| 936 |
+
flex: 0 0 auto; /* don't shrink; grow to content */
|
| 937 |
+
min-width: 220px;
|
| 938 |
+
width: max-content; /* each column as wide as its widest line */
|
| 939 |
+
max-width: min(520px, 75vw);
|
| 940 |
+
display: flex;
|
| 941 |
+
flex-direction: column;
|
| 942 |
+
border-right: 1px solid var(--border);
|
| 943 |
+
/* No overflow-y — parent handles the single scrollbar */
|
| 944 |
+
}
|
| 945 |
+
.region-column:last-child { border-right: none; }
|
| 946 |
+
/* Prevent line text from wrapping inside column cells */
|
| 947 |
+
.region-column .line-result { white-space: nowrap; }
|
| 948 |
+
.region-col-header {
|
| 949 |
+
display: flex;
|
| 950 |
+
align-items: center;
|
| 951 |
+
justify-content: space-between;
|
| 952 |
+
gap: 4px;
|
| 953 |
+
padding: 4px 8px;
|
| 954 |
+
font-size: 0.68rem;
|
| 955 |
+
font-weight: 600;
|
| 956 |
+
text-transform: uppercase;
|
| 957 |
+
letter-spacing: 0.8px;
|
| 958 |
+
color: var(--text-muted);
|
| 959 |
+
background: var(--bg-section);
|
| 960 |
+
border-bottom: 1px solid var(--border);
|
| 961 |
+
position: sticky;
|
| 962 |
+
top: 0;
|
| 963 |
+
z-index: 1;
|
| 964 |
+
}
|
| 965 |
+
.region-col-close {
|
| 966 |
+
flex-shrink: 0;
|
| 967 |
+
background: none;
|
| 968 |
+
border: none;
|
| 969 |
+
color: var(--text-muted);
|
| 970 |
+
cursor: pointer;
|
| 971 |
+
font-size: 1rem;
|
| 972 |
+
line-height: 1;
|
| 973 |
+
padding: 0 2px;
|
| 974 |
+
border-radius: 3px;
|
| 975 |
+
transition: color 0.1s, background 0.1s;
|
| 976 |
+
}
|
| 977 |
+
.region-col-close:hover { color: var(--danger); background: rgba(239,68,68,0.1); }
|
| 978 |
+
|
| 979 |
+
/* Detected region list (below segmentation controls) */
|
| 980 |
+
#seg-regions-list {
|
| 981 |
+
margin: 0 12px 8px;
|
| 982 |
+
border: 1px solid var(--border);
|
| 983 |
+
border-radius: var(--radius);
|
| 984 |
+
background: var(--bg-section);
|
| 985 |
+
overflow: hidden;
|
| 986 |
+
}
|
| 987 |
+
.seg-regions-header {
|
| 988 |
+
padding: 5px 10px;
|
| 989 |
+
font-size: 0.68rem;
|
| 990 |
+
font-weight: 600;
|
| 991 |
+
text-transform: uppercase;
|
| 992 |
+
letter-spacing: 0.8px;
|
| 993 |
+
color: var(--text-muted);
|
| 994 |
+
border-bottom: 1px solid var(--border);
|
| 995 |
+
}
|
| 996 |
+
.seg-region-row {
|
| 997 |
+
display: flex;
|
| 998 |
+
align-items: center;
|
| 999 |
+
gap: 7px;
|
| 1000 |
+
padding: 5px 10px;
|
| 1001 |
+
border-bottom: 1px solid rgba(45,63,89,0.4);
|
| 1002 |
+
font-size: 0.78rem;
|
| 1003 |
+
}
|
| 1004 |
+
.seg-region-row:last-child { border-bottom: none; }
|
| 1005 |
+
.seg-region-dot {
|
| 1006 |
+
width: 10px;
|
| 1007 |
+
height: 10px;
|
| 1008 |
+
border-radius: 50%;
|
| 1009 |
+
flex-shrink: 0;
|
| 1010 |
+
}
|
| 1011 |
+
.seg-region-label { font-weight: 600; color: var(--text); min-width: 2em; }
|
| 1012 |
+
.seg-region-count { flex: 1; color: var(--text-muted); }
|
| 1013 |
+
.seg-region-del {
|
| 1014 |
+
width: 22px !important;
|
| 1015 |
+
height: 22px !important;
|
| 1016 |
+
font-size: 0.9rem !important;
|
| 1017 |
+
flex-shrink: 0;
|
| 1018 |
+
}
|
| 1019 |
+
|
| 1020 |
+
/* Region separator */
|
| 1021 |
+
.region-separator {
|
| 1022 |
+
padding: 4px 10px;
|
| 1023 |
+
font-size: 0.68rem;
|
| 1024 |
+
color: var(--text-muted);
|
| 1025 |
+
background: var(--bg-section);
|
| 1026 |
+
border-bottom: 1px solid var(--border);
|
| 1027 |
+
letter-spacing: 0.5px;
|
| 1028 |
+
text-transform: uppercase;
|
| 1029 |
+
}
|
| 1030 |
+
|
| 1031 |
+
#results-footer {
|
| 1032 |
+
padding: 8px 12px;
|
| 1033 |
+
border-top: 1px solid var(--border);
|
| 1034 |
+
flex-shrink: 0;
|
| 1035 |
+
}
|
| 1036 |
+
|
| 1037 |
+
/* ── Progress bar ───────────────────────────────────────────────────── */
|
| 1038 |
+
.progress-row {
|
| 1039 |
+
display: flex;
|
| 1040 |
+
align-items: center;
|
| 1041 |
+
justify-content: space-between;
|
| 1042 |
+
gap: 8px;
|
| 1043 |
+
margin-top: 4px;
|
| 1044 |
+
}
|
| 1045 |
+
|
| 1046 |
+
#progress-bar {
|
| 1047 |
+
height: 4px;
|
| 1048 |
+
background: var(--bg-input);
|
| 1049 |
+
border-radius: 2px;
|
| 1050 |
+
overflow: hidden;
|
| 1051 |
+
margin-top: 8px;
|
| 1052 |
+
}
|
| 1053 |
+
#progress-fill {
|
| 1054 |
+
height: 100%;
|
| 1055 |
+
width: 0%;
|
| 1056 |
+
background: linear-gradient(90deg, var(--primary), var(--accent));
|
| 1057 |
+
transition: width 0.25s ease;
|
| 1058 |
+
border-radius: 2px;
|
| 1059 |
+
}
|
| 1060 |
+
|
| 1061 |
+
/* ── Status badges ──────────────────────────────────────────────────── */
|
| 1062 |
+
.status-badge {
|
| 1063 |
+
font-size: 0.78rem;
|
| 1064 |
+
padding: 4px 10px;
|
| 1065 |
+
border-radius: var(--radius);
|
| 1066 |
+
text-align: center;
|
| 1067 |
+
}
|
| 1068 |
+
.status-loaded { background: rgba(34,197,94,0.12); color: var(--success); border: 1px solid rgba(34,197,94,0.25); }
|
| 1069 |
+
.status-loading { background: rgba(59,130,246,0.12); color: var(--primary); border: 1px solid rgba(59,130,246,0.25); }
|
| 1070 |
+
|
| 1071 |
+
/* ── Spinner on buttons ─────────────────────────────────────────────── */
|
| 1072 |
+
.btn.loading { pointer-events: none; opacity: 0.7; }
|
| 1073 |
+
.btn.loading::after {
|
| 1074 |
+
content: '';
|
| 1075 |
+
display: inline-block;
|
| 1076 |
+
width: 11px;
|
| 1077 |
+
height: 11px;
|
| 1078 |
+
margin-left: 7px;
|
| 1079 |
+
border: 2px solid transparent;
|
| 1080 |
+
border-top-color: currentColor;
|
| 1081 |
+
border-radius: 50%;
|
| 1082 |
+
animation: spin 0.65s linear infinite;
|
| 1083 |
+
vertical-align: middle;
|
| 1084 |
+
}
|
| 1085 |
+
@keyframes spin { to { transform: rotate(360deg); } }
|
| 1086 |
+
|
| 1087 |
+
/* ── Utilities ──────────────────────────────────────────────────────── */
|
| 1088 |
+
.muted { color: var(--text-muted); font-size: 0.8rem; }
|
| 1089 |
+
.hidden { display: none !important; }
|
| 1090 |
+
|
| 1091 |
+
/* ── Help modal ─────────────────────────────────────────────────────── */
|
| 1092 |
+
#help-modal {
|
| 1093 |
+
background: var(--bg-panel);
|
| 1094 |
+
color: var(--text);
|
| 1095 |
+
border: 1px solid var(--border);
|
| 1096 |
+
border-radius: var(--radius-lg);
|
| 1097 |
+
box-shadow: var(--shadow);
|
| 1098 |
+
padding: 0;
|
| 1099 |
+
width: min(680px, 96vw);
|
| 1100 |
+
max-height: 82vh;
|
| 1101 |
+
overflow: hidden;
|
| 1102 |
+
}
|
| 1103 |
+
#help-modal[open] {
|
| 1104 |
+
display: flex;
|
| 1105 |
+
flex-direction: column;
|
| 1106 |
+
}
|
| 1107 |
+
#help-modal::backdrop {
|
| 1108 |
+
background: rgba(0,0,0,0.65);
|
| 1109 |
+
backdrop-filter: blur(2px);
|
| 1110 |
+
}
|
| 1111 |
+
|
| 1112 |
+
.modal-header {
|
| 1113 |
+
display: flex;
|
| 1114 |
+
align-items: center;
|
| 1115 |
+
justify-content: space-between;
|
| 1116 |
+
padding: 14px 18px;
|
| 1117 |
+
border-bottom: 1px solid var(--border);
|
| 1118 |
+
flex-shrink: 0;
|
| 1119 |
+
}
|
| 1120 |
+
.modal-header h2 { font-size: 1rem; font-weight: 600; }
|
| 1121 |
+
|
| 1122 |
+
.modal-body {
|
| 1123 |
+
overflow-y: auto;
|
| 1124 |
+
padding: 18px;
|
| 1125 |
+
display: flex;
|
| 1126 |
+
flex-direction: column;
|
| 1127 |
+
gap: 16px;
|
| 1128 |
+
font-size: 0.88rem;
|
| 1129 |
+
line-height: 1.6;
|
| 1130 |
+
}
|
| 1131 |
+
|
| 1132 |
+
.modal-body h3 {
|
| 1133 |
+
font-size: 0.8rem;
|
| 1134 |
+
text-transform: uppercase;
|
| 1135 |
+
letter-spacing: 0.8px;
|
| 1136 |
+
color: var(--text-muted);
|
| 1137 |
+
border-bottom: 1px solid var(--border);
|
| 1138 |
+
padding-bottom: 4px;
|
| 1139 |
+
margin-top: 4px;
|
| 1140 |
+
}
|
| 1141 |
+
|
| 1142 |
+
.modal-body ol, .modal-body ul { padding-left: 1.4em; display: flex; flex-direction: column; gap: 5px; }
|
| 1143 |
+
.modal-body li { color: var(--text-dim); }
|
| 1144 |
+
.modal-body strong { color: var(--text); font-weight: 600; }
|
| 1145 |
+
|
| 1146 |
+
.modal-body table {
|
| 1147 |
+
width: 100%;
|
| 1148 |
+
border-collapse: collapse;
|
| 1149 |
+
font-size: 0.83rem;
|
| 1150 |
+
}
|
| 1151 |
+
.modal-body th, .modal-body td {
|
| 1152 |
+
padding: 5px 10px;
|
| 1153 |
+
text-align: left;
|
| 1154 |
+
border-bottom: 1px solid var(--border);
|
| 1155 |
+
}
|
| 1156 |
+
.modal-body th { color: var(--text-muted); font-weight: 600; font-size: 0.75rem; text-transform: uppercase; letter-spacing: 0.5px; }
|
| 1157 |
+
.modal-body td:first-child { color: var(--text); font-weight: 500; white-space: nowrap; }
|
| 1158 |
+
.modal-body tr:last-child td { border-bottom: none; }
|
| 1159 |
+
.modal-body tr:hover td { background: var(--bg-hover); }
|
| 1160 |
+
|
| 1161 |
+
.modal-body kbd {
|
| 1162 |
+
display: inline-block;
|
| 1163 |
+
padding: 1px 6px;
|
| 1164 |
+
background: var(--bg-input);
|
| 1165 |
+
border: 1px solid var(--border);
|
| 1166 |
+
border-radius: 4px;
|
| 1167 |
+
font-family: var(--font-mono);
|
| 1168 |
+
font-size: 0.78rem;
|
| 1169 |
+
color: var(--text-dim);
|
| 1170 |
+
}
|
| 1171 |
+
|
| 1172 |
+
.demo-badge {
|
| 1173 |
+
font-size: 0.72rem;
|
| 1174 |
+
padding: 1px 6px;
|
| 1175 |
+
border-radius: 8px;
|
| 1176 |
+
display: inline-block;
|
| 1177 |
+
margin: 0 2px;
|
| 1178 |
+
}
|
| 1179 |
+
|
| 1180 |
+
/* ── Mobile tab bar ─────────────────────────────────────────────────── */
|
| 1181 |
+
#mobile-tabs {
|
| 1182 |
+
display: none; /* hidden on desktop */
|
| 1183 |
+
height: var(--tabs-h);
|
| 1184 |
+
background: var(--bg-panel);
|
| 1185 |
+
border-top: 1px solid var(--border);
|
| 1186 |
+
flex-shrink: 0;
|
| 1187 |
+
}
|
| 1188 |
+
|
| 1189 |
+
.tab-btn {
|
| 1190 |
+
flex: 1;
|
| 1191 |
+
display: flex;
|
| 1192 |
+
flex-direction: column;
|
| 1193 |
+
align-items: center;
|
| 1194 |
+
justify-content: center;
|
| 1195 |
+
gap: 3px;
|
| 1196 |
+
background: none;
|
| 1197 |
+
border: none;
|
| 1198 |
+
color: var(--text-muted);
|
| 1199 |
+
font-size: 0.68rem;
|
| 1200 |
+
cursor: pointer;
|
| 1201 |
+
padding: 6px 4px;
|
| 1202 |
+
transition: color 0.15s;
|
| 1203 |
+
}
|
| 1204 |
+
.tab-btn svg { width: 20px; height: 20px; }
|
| 1205 |
+
.tab-btn.active { color: var(--primary); }
|
| 1206 |
+
.tab-btn:hover { color: var(--text-dim); }
|
| 1207 |
+
.tab-btn.active:hover { color: var(--primary-hover); }
|
| 1208 |
+
|
| 1209 |
+
/* ── Responsive — tablet (≤ 960px) ─────────────────────────────────── */
|
| 1210 |
+
@media (max-width: 960px) {
|
| 1211 |
+
#app { grid-template-columns: var(--panel-left, 240px) 5px 1fr 5px var(--panel-right, 300px); }
|
| 1212 |
+
.gpu-badge { max-width: 160px; font-size: 0.7rem; }
|
| 1213 |
+
}
|
| 1214 |
+
|
| 1215 |
+
@media (max-width: 780px) and (min-width: 641px) {
|
| 1216 |
+
#app { grid-template-columns: var(--panel-left, 200px) 5px 1fr 5px var(--panel-right, 240px); }
|
| 1217 |
+
.gpu-badge { max-width: 120px; font-size: 0.68rem; }
|
| 1218 |
+
}
|
| 1219 |
+
|
| 1220 |
+
/* ── Responsive — mobile (≤ 640px) ─────────────────────────────────── */
|
| 1221 |
+
@media (max-width: 640px) {
|
| 1222 |
+
:root { --header-h: 48px; }
|
| 1223 |
+
|
| 1224 |
+
#header h1 { font-size: 0.9rem; }
|
| 1225 |
+
.gpu-badge { display: none; } /* too little space */
|
| 1226 |
+
|
| 1227 |
+
/* Single-column; tab bar controls which panel is visible */
|
| 1228 |
+
#app {
|
| 1229 |
+
grid-template-columns: 1fr;
|
| 1230 |
+
grid-template-rows: 1fr;
|
| 1231 |
+
}
|
| 1232 |
+
.panel-resize-handle { display: none; }
|
| 1233 |
+
|
| 1234 |
+
#mobile-tabs { display: flex; }
|
| 1235 |
+
|
| 1236 |
+
/* All panels are hidden by default; JS adds panel-active */
|
| 1237 |
+
[data-panel] {
|
| 1238 |
+
display: none;
|
| 1239 |
+
}
|
| 1240 |
+
[data-panel].panel-active {
|
| 1241 |
+
display: flex;
|
| 1242 |
+
flex-direction: column;
|
| 1243 |
+
}
|
| 1244 |
+
/* Engine panel needs special treatment (flex column) */
|
| 1245 |
+
[data-panel="settings"].panel-active {
|
| 1246 |
+
overflow-y: auto;
|
| 1247 |
+
}
|
| 1248 |
+
|
| 1249 |
+
body { overflow: hidden; }
|
| 1250 |
+
/* Account for tab bar height */
|
| 1251 |
+
#app { height: calc(100vh - var(--header-h) - var(--tabs-h)); }
|
| 1252 |
+
|
| 1253 |
+
/* Results panel: stack vertically */
|
| 1254 |
+
#results-panel.panel-active { gap: 0; }
|
| 1255 |
+
|
| 1256 |
+
/* Larger touch targets */
|
| 1257 |
+
.btn { padding: 10px 16px; font-size: 0.9rem; }
|
| 1258 |
+
.btn-small { padding: 7px 12px; font-size: 0.82rem; }
|
| 1259 |
+
select, input[type="text"], input[type="number"], input[type="password"] {
|
| 1260 |
+
padding: 8px 10px;
|
| 1261 |
+
font-size: 0.9rem;
|
| 1262 |
+
}
|
| 1263 |
+
|
| 1264 |
+
/* Upload area takes less vertical space */
|
| 1265 |
+
.upload-area { padding: 14px 10px; }
|
| 1266 |
+
|
| 1267 |
+
/* Full-width help modal */
|
| 1268 |
+
#help-modal { width: 100vw; max-height: 90vh; border-radius: var(--radius-lg) var(--radius-lg) 0 0; }
|
| 1269 |
+
}
|
web/static/app.js
ADDED
|
@@ -0,0 +1,298 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* Polyscriptor Web UI — Main application entry point
|
| 3 |
+
*
|
| 4 |
+
* Central state + event bus, wires up components.
|
| 5 |
+
* No framework, no build step — native ES modules.
|
| 6 |
+
*/
|
| 7 |
+
|
| 8 |
+
import { initEnginePanel } from './components/engine-panel.js';
|
| 9 |
+
import { initImageViewer } from './components/image-viewer.js';
|
| 10 |
+
import { initTranscriptionPanel } from './components/transcription-panel.js';
|
| 11 |
+
import { initBatchPanel } from './components/batch-panel.js';
|
| 12 |
+
|
| 13 |
+
// ── Global state ───────────────────────────────────────────────────────
|
| 14 |
+
/**
 * Exported application state object shared between the UI modules.
 *
 * Every field starts out empty / null / false.
 */
export const state = {
  // Engine selection
  engines: [],
  currentEngine: null,
  engineLoaded: false,

  // Current image
  imageId: null,
  imageInfo: null,

  // Results
  lines: [],   // [{index, text, confidence, bbox, region}]
  regions: [], // [{id, bbox, num_lines}] — from latest segmentation

  // Busy flag
  isProcessing: false,
};
|
| 24 |
+
|
| 25 |
+
// ── Event bus ──────────────────────────────────────────────────────────
|
| 26 |
+
/** Shared event bus; `emit` dispatches on it and `on` subscribes to it. */
export const events = new EventTarget();

/**
 * Dispatch a named event on the bus.
 *
 * @param {string} name - Event name.
 * @param {*} [detail] - Payload handed to every subscriber.
 */
export function emit(name, detail) {
  events.dispatchEvent(new CustomEvent(name, { detail }));
}

/**
 * Subscribe to a named event on the bus.
 *
 * The callback receives the event's `detail` payload, not the Event object.
 *
 * @param {string} name - Event name.
 * @param {(detail: *) => void} fn - Callback invoked with the payload.
 * @returns {() => void} Function that removes this subscription. Existing
 *   callers that ignore the return value are unaffected.
 */
export function on(name, fn) {
  // Keep a reference to the wrapper so the subscription can be removed again;
  // an anonymous inline wrapper could never be passed to removeEventListener.
  const listener = (e) => fn(e.detail);
  events.addEventListener(name, listener);
  return () => events.removeEventListener(name, listener);
}
|
| 33 |
+
|
| 34 |
+
// ── API helper ─────────────────────────────────────────────────────────
|
| 35 |
+
/**
 * Thin `fetch` wrapper used for backend calls.
 *
 * - Sends `Content-Type: application/json` by default; caller-supplied headers
 *   are merged on top and may override it.
 * - Skips the JSON default when the body is a `FormData`, so the runtime can
 *   set the multipart boundary itself.
 * - Rejects on non-2xx responses with an `Error` built from the response's
 *   `detail` / `message` field, falling back to the HTTP status text.
 *
 * @param {string} path - Request URL.
 * @param {RequestInit} [options] - Options forwarded to `fetch`.
 * @returns {Promise<Response>} The successful response (body not consumed).
 * @throws {Error} When the response status is not OK.
 */
export async function api(path, options = {}) {
  const { headers: callerHeaders, ...rest } = options;

  const isFormData = typeof FormData !== 'undefined' && rest.body instanceof FormData;
  const headers = {
    ...(isFormData ? {} : { 'Content-Type': 'application/json' }),
    ...callerHeaders,
  };

  // `headers` goes last: spreading the caller's options after it would replace
  // the merged headers with the caller's raw `headers` object.
  const resp = await fetch(path, { ...rest, headers });

  if (!resp.ok) {
    const err = await resp.json().catch(() => ({ detail: resp.statusText }));
    const reason = err?.detail || err?.message || 'API error';
    // `detail` may be structured (object / array); serialise it so the message
    // is not "[object Object]".
    throw new Error(typeof reason === 'string' ? reason : JSON.stringify(reason));
  }
  return resp;
}
|
| 46 |
+
|
| 47 |
+
// ── Toast notifications ────────────────────────────────────────────────
|
| 48 |
+
/**
 * Show a transient notification inside `#toast-container`.
 *
 * @param {string} message - Text to display (inserted as text, not HTML).
 * @param {string} [type='info'] - Suffix for the `toast-<type>` CSS class.
 * @param {number} [durationMs=4000] - Delay before the toast is removed.
 */
export function toast(message, type = 'info', durationMs = 4000) {
  const container = document.getElementById('toast-container');
  if (!container) {
    // No container in this page: surface the message in the console instead
    // of throwing from whatever code path tried to notify the user.
    console.warn(`[toast:${type}] ${message}`);
    return;
  }

  const el = document.createElement('div');
  el.className = `toast toast-${type}`;
  el.textContent = message;
  container.appendChild(el);
  setTimeout(() => el.remove(), durationMs);
}
|
| 56 |
+
|
| 57 |
+
// ── GPU status widget ──────────────────────────────────────────────────
|
| 58 |
+
/**
 * Abbreviate a long GPU name for the header by dropping vendor / product-line
 * prefixes (first occurrence of each).
 *
 * @param {string} name - Full GPU name.
 * @returns {string} Shortened name; an empty string when `name` is null or
 *   undefined, so the caller's template never throws on a missing name.
 */
function shortName(name) {
  let label = String(name ?? '');
  for (const prefix of ['NVIDIA ', 'GeForce ', 'Tesla ', 'Quadro ']) {
    label = label.replace(prefix, '');
  }
  return label;
}
|
| 66 |
+
|
| 67 |
+
/**
 * Escape a value for safe interpolation into HTML text or attribute values.
 *
 * @param {*} value - Value to escape (null / undefined become '').
 * @returns {string} HTML-safe string.
 */
function escapeHtml(value) {
  const entities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
  return String(value ?? '').replace(/[&<>"']/g, (ch) => entities[ch]);
}

/**
 * Build the HTML for one GPU card from an entry of the `/api/gpu` response.
 *
 * @param {object} g - GPU entry; reads `name`, `memory_used_mb`,
 *   `memory_total_mb` and the optional `utilization_gpu_pct`.
 * @returns {string} Card markup.
 */
function renderGpuCard(g) {
  const totalMb = Number(g.memory_total_mb);
  // Guard the division: a zero or missing total would otherwise yield "NaN%"
  // as the bar width.
  const rawPct = totalMb > 0 ? Math.round((g.memory_used_mb / totalMb) * 100) : 0;
  const usedPct = Number.isFinite(rawPct) ? Math.min(100, Math.max(0, rawPct)) : 0;
  const fillClass = usedPct >= 85 ? 'hot' : usedPct >= 60 ? 'warm' : '';
  const usedGb = (g.memory_used_mb / 1000).toFixed(1);
  const totalGb = (g.memory_total_mb / 1000).toFixed(0);
  const utilHtml = g.utilization_gpu_pct != null
    ? `<span class="gpu-util-pct">${escapeHtml(g.utilization_gpu_pct)}%</span>`
    : '';

  // The name comes from the server response; escape it for both the attribute
  // and the text node so quotes or markup in it cannot break the widget.
  return `<div class="gpu-card">
<div class="gpu-card-name">
<span title="${escapeHtml(g.name)}">${escapeHtml(shortName(g.name))}</span>${utilHtml}
</div>
<div class="gpu-mem-bar">
<div class="gpu-mem-fill ${fillClass}" style="width:${usedPct}%"></div>
</div>
<div class="gpu-mem-label">${usedGb}/${totalGb} GB VRAM</div>
</div>`;
}

/**
 * Fetch `/api/gpu` and render the result into `#gpu-status`.
 *
 * Shows "GPU: N/A" when the response reports no available GPU and
 * "GPU: error" when the request or rendering fails. Does nothing when the
 * widget element is absent.
 *
 * @returns {Promise<void>}
 */
async function updateGpuStatus() {
  const widget = document.getElementById('gpu-status');
  if (!widget) return; // nothing to render into; also keeps the catch below from throwing

  try {
    const resp = await api('/api/gpu');
    const data = await resp.json();

    if (!data.available || data.gpus.length === 0) {
      widget.innerHTML = '<span class="gpu-card-name"><span>GPU: N/A</span></span>';
      return;
    }

    widget.innerHTML = data.gpus.map(renderGpuCard).join('');
  } catch {
    widget.innerHTML = '<span style="font-size:.75rem;color:var(--text-muted)">GPU: error</span>';
  }
}
|
| 100 |
+
|
| 101 |
+
// ── Zoom controls ──────────────────────────────────────────────────────
|
| 102 |
+
let zoomLevel = 1.0;
const ZOOM_STEP = 0.25;
let ZOOM_MIN = 0.25; // updated per image in fitZoom() so large images are always reachable
const ZOOM_MAX = 4.0;

/**
 * Set the viewer zoom: clamp `level` to [ZOOM_MIN, ZOOM_MAX], resize the page
 * image and the overlay canvas to the scaled natural size, and update the
 * percentage label.
 *
 * No-op when the image is missing or not yet loaded, or when `level` is not a
 * finite number (a NaN would otherwise stick in `zoomLevel` and make every
 * later relative zoom NaN as well).
 *
 * @param {number} level - Requested zoom factor (1.0 = 100%).
 */
function applyZoom(level) {
  const img = document.getElementById('page-image');
  if (!img || !img.naturalWidth) return;
  if (!Number.isFinite(level)) return;

  zoomLevel = Math.max(ZOOM_MIN, Math.min(ZOOM_MAX, level));
  const width = `${Math.round(img.naturalWidth * zoomLevel)}px`;
  const height = `${Math.round(img.naturalHeight * zoomLevel)}px`;

  // The canvas and the label are optional: a missing element must not leave
  // the image resized while the rest of the update is skipped by a TypeError.
  const canvas = document.getElementById('overlay-canvas');
  for (const el of [img, canvas]) {
    if (!el) continue;
    el.style.width = width;
    el.style.height = height;
  }

  const label = document.getElementById('zoom-level');
  if (label) label.textContent = `${Math.round(zoomLevel * 100)}%`;
}
|
| 124 |
+
|
| 125 |
+
/**
 * Zoom so the whole page image fits inside `#viewer-scroll`, never enlarging
 * beyond 100%.
 *
 * Also lowers `ZOOM_MIN` (down to 5%) when the fit level is below the default
 * minimum, so the fit level stays reachable for large images.
 *
 * No-op when the image is missing / not loaded, or when the viewport currently
 * has no size (e.g. its panel is hidden): the computed fit would be 0, which
 * would lower ZOOM_MIN and shrink the image to 5%.
 */
export function fitZoom() {
  const img = document.getElementById('page-image');
  const scroll = document.getElementById('viewer-scroll');
  if (!img || !img.naturalWidth || !scroll) return;
  if (scroll.clientWidth === 0 || scroll.clientHeight === 0) return;

  const scaleW = scroll.clientWidth / img.naturalWidth;
  const scaleH = scroll.clientHeight / img.naturalHeight;
  const fit = Math.min(scaleW, scaleH, 1.0); // never zoom in beyond 100% on fit

  // Ensure the fit level is always reachable: lower ZOOM_MIN for large images (min 5%)
  ZOOM_MIN = Math.min(0.25, Math.max(0.05, fit));
  applyZoom(fit);
}
|
| 136 |
+
|
| 137 |
+
/**
 * Wire up the zoom toolbar buttons, Ctrl/Cmd + mouse-wheel zooming in the
 * viewer, and the bus events that reveal the toolbar.
 */
function initZoomControls() {
  const byId = (id) => document.getElementById(id);
  const revealToolbar = () => {
    byId('zoom-toolbar').classList.remove('hidden');
  };

  // Toolbar buttons: fixed-step in / out, plus fit-to-viewport.
  byId('btn-zoom-in').addEventListener('click', () => {
    applyZoom(zoomLevel + ZOOM_STEP);
  });
  byId('btn-zoom-out').addEventListener('click', () => {
    applyZoom(zoomLevel - ZOOM_STEP);
  });
  byId('btn-zoom-fit').addEventListener('click', fitZoom);

  // Mouse-wheel zoom in viewer — multiplicative for smooth feel.
  // Registered as non-passive because the handler calls preventDefault().
  const onWheel = (e) => {
    const modifierHeld = e.ctrlKey || e.metaKey;
    if (!modifierHeld) return;
    e.preventDefault();
    const factor = e.deltaY < 0 ? 1.10 : 1 / 1.10;
    applyZoom(zoomLevel * factor);
  };
  byId('viewer-scroll').addEventListener('wheel', onWheel, { passive: false });

  on('image-uploaded', () => {
    revealToolbar();
    // fit after short delay to let image render
    setTimeout(fitZoom, 80);
  });

  // Also show toolbar when a batch item is displayed in the viewer
  on('batch-item-start', revealToolbar);
}
|
| 161 |
+
|
| 162 |
+
// ── Sticky engine config (localStorage) ───────────────────────────────
|
| 163 |
+
// localStorage key of the last saved engine name.
const LS_ENGINE = 'polyscriptor_last_engine';
// localStorage key of the saved config for a given engine name.
const LS_CONFIG = name => `polyscriptor_config_${name}`;

/**
 * Remember `engineName` as the last used engine and store its config as JSON.
 * Storage failures are ignored (best effort).
 *
 * @param {string} engineName - Engine identifier.
 * @param {object} configObj - JSON-serialisable engine config.
 */
export function saveEngineConfig(engineName, configObj) {
  try {
    localStorage.setItem(LS_ENGINE, engineName);
    localStorage.setItem(LS_CONFIG(engineName), JSON.stringify(configObj));
  } catch { /* storage full or private mode */ }
}

/**
 * @returns {string|null} The last saved engine name, or null when nothing is
 *   stored or storage cannot be read.
 */
export function loadSavedEngineName() {
  try {
    return localStorage.getItem(LS_ENGINE);
  } catch {
    return null;
  }
}

/**
 * Load the saved config for `engineName`.
 *
 * @param {string} engineName - Engine identifier.
 * @returns {object|null} The parsed config, or null when nothing is stored,
 *   storage cannot be read, or the stored text is not a JSON object / array.
 */
export function loadSavedEngineConfig(engineName) {
  try {
    const raw = localStorage.getItem(LS_CONFIG(engineName));
    if (!raw) return null;
    const parsed = JSON.parse(raw);
    // A stored primitive ("42", "true", a quoted string) is not a config;
    // treat it like corrupt data instead of handing it to the caller.
    return parsed !== null && typeof parsed === 'object' ? parsed : null;
  } catch {
    return null;
  }
}
|
| 183 |
+
|
| 184 |
+
// ── Mobile tab helper ───────────────────────────────────────────────────
|
| 185 |
+
/**
 * Activate the tab button and panel matching `target`.
 *
 * Toggles `active` on every `.tab-btn` (matched on `data-target`) and
 * `panel-active` on every `[data-panel]` element (matched on `data-panel`).
 * Does nothing when the page has no tab buttons, or when no panel matches
 * `target` — switching to an unknown target would otherwise deactivate every
 * panel and leave nothing visible.
 *
 * @param {string} target - Panel name to show.
 */
function mobileActivateTab(target) {
  const tabBtns = document.querySelectorAll('.tab-btn');
  if (!tabBtns.length) return;

  const panels = document.querySelectorAll('[data-panel]');
  const targetExists = Array.from(panels).some((p) => p.dataset.panel === target);
  if (!targetExists) return;

  tabBtns.forEach((b) => b.classList.toggle('active', b.dataset.target === target));
  panels.forEach((p) => p.classList.toggle('panel-active', p.dataset.panel === target));
}
|
| 192 |
+
|
| 193 |
+
// ── Resizable panels ───────────────────────────────────────────────────
|
| 194 |
+
const LS_PANEL_LEFT  = 'polyscriptor_panel_left';
const LS_PANEL_RIGHT = 'polyscriptor_panel_right';

/**
 * Make the left and right side panels resizable by dragging `#resize-left` /
 * `#resize-right`, with mouse or touch.
 *
 * Widths are applied through the `--panel-left` / `--panel-right` custom
 * properties on the root element, and restored from localStorage on startup.
 * The left panel is clamped to [160px, 40% of #app]; the right panel to
 * [200px, 50% of #app].
 *
 * Storage access is best effort: a blocked or full localStorage must not
 * abort initialisation or a drag in progress.
 */
function initResizablePanels() {
  const app = document.getElementById('app');
  const handleLeft = document.getElementById('resize-left');
  const handleRight = document.getElementById('resize-right');
  if (!app || !handleLeft || !handleRight) return;

  const rootStyle = document.documentElement.style;

  const readSaved = (key) => {
    try {
      return localStorage.getItem(key);
    } catch {
      return null; // storage unavailable: fall back to the stylesheet defaults
    }
  };
  const writeSaved = (key, value) => {
    try {
      localStorage.setItem(key, value);
    } catch { /* storage full or unavailable: the width just isn't remembered */ }
  };

  // Restore saved widths
  const savedLeft = readSaved(LS_PANEL_LEFT);
  const savedRight = readSaved(LS_PANEL_RIGHT);
  if (savedLeft) rootStyle.setProperty('--panel-left', savedLeft);
  if (savedRight) rootStyle.setProperty('--panel-right', savedRight);

  function startDrag(handle, isLeft) {
    const cssVar = isLeft ? '--panel-left' : '--panel-right';
    const storageKey = isLeft ? LS_PANEL_LEFT : LS_PANEL_RIGHT;
    let latest = null; // last width applied during this drag

    handle.classList.add('dragging');
    document.body.style.cursor = 'col-resize';
    document.body.style.userSelect = 'none';

    const onMove = (e) => {
      const point = e.touches ? e.touches[0] : e;
      if (!point) return; // touch event without an active touch point

      const appRect = app.getBoundingClientRect();
      const x = point.clientX - appRect.left;
      const totalW = appRect.width;
      const w = isLeft
        ? Math.max(160, Math.min(x, totalW * 0.4))
        : Math.max(200, Math.min(totalW - x, totalW * 0.5));

      latest = `${Math.round(w)}px`;
      rootStyle.setProperty(cssVar, latest);
    };

    const onUp = () => {
      handle.classList.remove('dragging');
      document.body.style.cursor = '';
      document.body.style.userSelect = '';
      document.removeEventListener('mousemove', onMove);
      document.removeEventListener('mouseup', onUp);
      document.removeEventListener('touchmove', onMove);
      document.removeEventListener('touchend', onUp);
      document.removeEventListener('touchcancel', onUp);

      // Persist once per drag rather than on every pointer move.
      if (latest !== null) writeSaved(storageKey, latest);
    };

    document.addEventListener('mousemove', onMove);
    document.addEventListener('mouseup', onUp);
    document.addEventListener('touchmove', onMove, { passive: true });
    document.addEventListener('touchend', onUp);
    // An interrupted touch never fires touchend; end the drag here too so the
    // cursor / selection overrides and the move listeners are not left behind.
    document.addEventListener('touchcancel', onUp);
  }

  handleLeft.addEventListener('mousedown', (e) => { e.preventDefault(); startDrag(handleLeft, true); });
  handleRight.addEventListener('mousedown', (e) => { e.preventDefault(); startDrag(handleRight, false); });
  handleLeft.addEventListener('touchstart', () => startDrag(handleLeft, true), { passive: true });
  handleRight.addEventListener('touchstart', () => startDrag(handleRight, false), { passive: true });
}
|
| 253 |
+
|
| 254 |
+
// ── Keyboard shortcuts ─────────────────────────────────────────────────
|
| 255 |
+
/**
 * Install the global keyboard shortcuts:
 *
 * - Ctrl/Cmd + Enter clicks `#btn-transcribe`.
 * - ArrowLeft / ArrowRight (without modifiers) click `#btn-nav-prev` /
 *   `#btn-nav-next`.
 *
 * Shortcuts are ignored while focus is in an input, textarea, select or
 * contenteditable element.
 */
function initKeyboardShortcuts() {
  const NAV_BUTTON_FOR_KEY = new Map([
    ['ArrowLeft', 'btn-nav-prev'],
    ['ArrowRight', 'btn-nav-next'],
  ]);

  document.addEventListener('keydown', (e) => {
    // Ignore when typing in an input / textarea / contenteditable
    const tag = e.target.tagName;
    const editable = e.target.isContentEditable;
    if (tag === 'INPUT' || tag === 'TEXTAREA' || tag === 'SELECT' || editable) return;

    // Ctrl+Enter — transcribe
    if ((e.ctrlKey || e.metaKey) && e.key === 'Enter') {
      e.preventDefault();
      document.getElementById('btn-transcribe')?.click();
      return;
    }

    // ArrowLeft / ArrowRight — batch prev/next.
    // Leave modified arrows alone: Alt+Arrow is browser history navigation and
    // Shift/Ctrl+Arrow extend or move the selection.
    if (e.ctrlKey || e.metaKey || e.altKey || e.shiftKey) return;

    const navId = NAV_BUTTON_FOR_KEY.get(e.key);
    if (!navId) return;

    // Only take over the key when there is a button to trigger; otherwise the
    // arrow keeps its default behaviour (e.g. scrolling).
    const navBtn = document.getElementById(navId);
    if (!navBtn) return;
    e.preventDefault();
    navBtn.click();
  });
}
|
| 274 |
+
|
| 275 |
+
// ── Prevent browser from opening dropped files in a new tab ────────────
|
| 276 |
+
/**
 * Cancel the default action of `dragover` and `drop` at document level, so a
 * file dropped outside a drop zone is not opened by the browser.
 */
function initGlobalDropBlocker() {
  const cancelDefault = (e) => e.preventDefault();
  for (const type of ['dragover', 'drop']) {
    document.addEventListener(type, cancelDefault);
  }
}
|
| 280 |
+
|
| 281 |
+
// ── Init ───────────────────────────────────────────────────────────────
|
| 282 |
+
// Application bootstrap: runs once the DOM is parsed.
document.addEventListener('DOMContentLoaded', () => {
  // Component and feature initialisers, in the order they must run.
  const initializers = [
    initEnginePanel,
    initImageViewer,
    initTranscriptionPanel,
    initBatchPanel,
    initZoomControls,
    initResizablePanels,
    initKeyboardShortcuts,
    initGlobalDropBlocker,
  ];
  for (const init of initializers) {
    init();
  }

  // GPU widget: render now, then refresh every 15s.
  updateGpuStatus();
  setInterval(updateGpuStatus, 15000);

  // On mobile: auto-switch tab after key events
  const tabForEvent = [
    ['image-uploaded', 'image'],
    ['segment-preview', 'image'],
    ['transcription-start', 'results'],
  ];
  for (const [eventName, tab] of tabForEvent) {
    on(eventName, () => mobileActivateTab(tab));
  }
});
|
web/static/components/batch-panel.js
ADDED
|
@@ -0,0 +1,735 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* Batch Panel — multi-image queue, sequential processing, combined export
|
| 3 |
+
*
|
| 4 |
+
* Activated when the user selects/drops multiple images.
|
| 5 |
+
* Each item is processed using the existing upload + transcribe flow.
|
| 6 |
+
* Results are stored per-item and can be exported as combined TXT or CSV.
|
| 7 |
+
*/
|
| 8 |
+
|
| 9 |
+
import { state, emit, on, api, toast } from '../app.js';
|
| 10 |
+
|
| 11 |
+
/** Shorthand for `document.getElementById`. */
const $ = (id) => {
  return document.getElementById(id);
};
|
| 12 |
+
|
| 13 |
+
// Batch state (separate from state.lines which tracks the current single image)
|
| 14 |
+
/**
 * Mutable module-level state for the batch queue.
 *
 * Each entry of `items` is a plain object. Fields written by this module:
 *   file, imageId, status ('pending' | 'active' | 'done' | 'error'), lines,
 *   filename, and optionally preUploaded, xmlFile, xmlUploaded, bboxes,
 *   regions, time_s, token_usage.
 */
const batch = {
|
| 15 |
+
items: [], // { file, imageId, status, lines, filename } (+ optional fields listed above)
|
| 16 |
+
running: false, // true while processBatch() is working through the queue
|
| 17 |
+
cancelled: false, // set by the cancel button; checked at the top of each loop iteration
|
| 18 |
+
currentIndex: -1, // item currently shown in the viewer (-1 = none)
|
| 19 |
+
processingIndex: -1, // item currently being transcribed (may differ when user navigates away)
|
| 20 |
+
userNavigated: false, // user manually navigated away from auto-advance
|
| 21 |
+
abortController: null, // AbortController for the in-flight transcription request, if any
|
| 22 |
+
};
|
| 23 |
+
|
| 24 |
+
/**
 * Wire the batch panel into the page.
 *
 * Registers capture-phase listeners on the file input, the XML input and the
 * upload drop area so that multi-file selections, PDFs, and "second image"
 * uploads are routed to the batch queue before any bubble-phase listener sees
 * them. Also hooks the batch buttons, the prev/next navigation, the persisted
 * checkboxes and the cancel button.
 */
export function initBatchPanel() {
  const hasExt = (file, ext) => file.name.toLowerCase().endsWith(ext);
  const isPdf = file => hasExt(file, '.pdf');
  const isXml = file => hasExt(file, '.xml');

  // Image/PDF picker. Capture phase lets us stop the event before the
  // image-viewer's own (non-capture) change listener runs.
  const picker = $('file-input');
  picker.addEventListener('change', evt => {
    const chosen = Array.from(picker.files);
    const containsPdf = chosen.some(isPdf);
    const secondImage = chosen.length === 1 && !containsPdf && state.imageId;
    // A single non-PDF image with nothing loaded yet is left to image-viewer.js.
    if (!(chosen.length > 1 || containsPdf || secondImage)) return;
    evt.stopImmediatePropagation();
    handleMultipleFiles(chosen);
    picker.value = '';
  }, true);

  // XML picker: only a multi-file selection is taken over here.
  const xmlPicker = $('xml-input');
  xmlPicker.addEventListener('change', evt => {
    if (xmlPicker.files.length > 1) {
      evt.stopImmediatePropagation();
      uploadXmlFiles(Array.from(xmlPicker.files));
      xmlPicker.value = '';
    }
  }, true);

  // Drag-and-drop onto the upload area (capture phase, same reasoning as above).
  $('upload-area').addEventListener('drop', evt => {
    const xmlFiles = [];
    const nonXml = [];
    for (const dropped of Array.from(evt.dataTransfer.files)) {
      (isXml(dropped) ? xmlFiles : nonXml).push(dropped);
    }

    // Take over for: several images, any PDF, an image on top of a loaded one,
    // several XMLs, or a single XML while a batch queue exists.
    const takeBatch = nonXml.length > 1 || nonXml.some(isPdf) || (nonXml.length === 1 && state.imageId);
    const takeXml = xmlFiles.length > 1 || (xmlFiles.length === 1 && batch.items.length > 0);
    if (!takeBatch && !takeXml) return;

    evt.preventDefault();
    evt.stopImmediatePropagation();
    if (nonXml.length > 0) handleMultipleFiles(nonXml);
    if (xmlFiles.length > 0) uploadXmlFiles(xmlFiles);
  }, true);

  // Pages of a PDF that was uploaded elsewhere are appended to the queue.
  on('pdf-pages-ready', data => {
    const known = new Set(batch.items.map(entry => entry.filename));
    for (const page of data.pages) {
      if (known.has(page.filename)) continue;
      known.add(page.filename);
      batch.items.push({
        file: null,
        imageId: page.image_id,
        status: 'pending',
        lines: [],
        filename: page.filename,
        preUploaded: true,
      });
    }
    if (batch.items.length === 0) return;

    renderQueue();
    // The pages are already on the server, so show the first queue entry
    // directly instead of going through previewFirstBatchItem().
    const [head] = batch.items;
    if (head && head.preUploaded && head.imageId) {
      batch.currentIndex = 0;
      emit('batch-item-start', { imageId: head.imageId, filename: head.filename });
      updateNavButtons();
    }
  });

  const clickHandlers = [
    ['btn-process-batch', processBatch],
    ['btn-clear-batch', clearBatch],
    ['btn-export-batch-txt', exportAllTxt],
    ['btn-export-batch-csv', exportAllCsv],
    ['btn-export-batch-txt-zip', exportAllTxtZip],
    ['btn-export-batch-thinking-zip', exportAllThinkingZip],
    ['btn-export-batch-xml', exportAllXml],
    ['btn-nav-prev', () => navigate(-1)],
    ['btn-nav-next', () => navigate(+1)],
  ];
  for (const [id, handler] of clickHandlers) {
    $(id).addEventListener('click', handler);
  }

  // Restore the two checkboxes from localStorage and keep them persisted.
  const usePageXmlBox = $('batch-use-pagexml');
  const resumeBox = $('batch-resume');
  const storedPageXml = localStorage.getItem('batch_use_pagexml');
  const storedResume = localStorage.getItem('batch_resume');
  if (storedPageXml !== null) usePageXmlBox.checked = storedPageXml === 'true';
  if (storedResume !== null) resumeBox.checked = storedResume === 'true';
  usePageXmlBox.addEventListener('change', () => {
    localStorage.setItem('batch_use_pagexml', usePageXmlBox.checked);
  });
  resumeBox.addEventListener('change', () => {
    localStorage.setItem('batch_resume', resumeBox.checked);
  });

  // Cancel while a batch runs: flag the loop and abort the in-flight request.
  $('btn-cancel').addEventListener('click', () => {
    if (batch.running) {
      batch.cancelled = true;
      batch.abortController?.abort();
    }
  }, { capture: true });
}
|
| 127 |
+
|
| 128 |
+
// ── XML matching for batch ────────────────────────────────────────────────────
|
| 129 |
+
|
| 130 |
+
/**
 * Attach PAGE XML files to queued batch items by filename stem
 * (e.g. page001.xml → page001.jpg, case-insensitive).
 *
 * Items that already have an imageId get their XML POSTed immediately; for
 * the rest the File is stored on `item.xmlFile`. A summary toast reports how
 * many files were uploaded, queued, failed and unmatched.
 *
 * @param {File[]} xmlFiles - XML files selected or dropped by the user.
 * @returns {Promise<void>}
 */
async function uploadXmlFiles(xmlFiles) {
  if (!xmlFiles.length) return;
  const stem = name => name.replace(/\.[^/.]+$/, '').toLowerCase();

  // Index the queue once instead of scanning it for every XML file.
  // The first item wins for duplicate stems, matching Array.prototype.find.
  const itemByStem = new Map();
  for (const it of batch.items) {
    const key = stem(it.filename);
    if (!itemByStem.has(key)) itemByStem.set(key, it);
  }

  let matched = 0;
  let deferred = 0;
  let skipped = 0;
  let failed = 0;

  for (const xml of xmlFiles) {
    const item = itemByStem.get(stem(xml.name));
    if (!item) { skipped++; continue; }

    if (!item.imageId) {
      // Image not yet uploaded — keep the XML on the item for later.
      item.xmlFile = xml;
      deferred++;
      continue;
    }

    // Already uploaded → send to server immediately
    try {
      const fd = new FormData();
      fd.append('file', xml);
      const resp = await fetch(`/api/image/${item.imageId}/xml`, { method: 'POST', body: fd });
      if (!resp.ok) {
        let detail = null;
        try {
          detail = (await resp.json())?.detail;
        } catch {
          // Error body was not JSON; fall back to the status code below.
        }
        throw new Error(typeof detail === 'string' && detail ? detail : `HTTP ${resp.status}`);
      }
      item.xmlUploaded = true;
      matched++;
    } catch (err) {
      failed++;
      toast(`XML ${xml.name}: ${err.message}`, 'error');
    }
  }

  const parts = [];
  if (matched > 0) parts.push(`${matched} uploaded`);
  if (deferred > 0) parts.push(`${deferred} queued for batch`);
  if (failed > 0) parts.push(`${failed} failed`);
  if (skipped > 0) parts.push(`${skipped} unmatched`);
  toast(`XML files: ${parts.join(', ')}`, matched + deferred > 0 ? 'success' : 'error');
}
|
| 167 |
+
|
| 168 |
+
// ── Queue management ─────────────────────────────────────────────────────────
|
| 169 |
+
|
| 170 |
+
/**
 * Append files to the batch queue and kick off pre-upload/preview.
 *
 * If the queue is empty but a single image is already loaded in the viewer,
 * that image becomes the first queue entry. Files whose name already exists
 * in the queue are skipped.
 *
 * @param {File[]} files - Images and/or PDFs to enqueue.
 */
function handleMultipleFiles(files) {
  const queue = batch.items;

  // Carry the currently loaded single image over into the batch.
  if (queue.length === 0 && state.imageId) {
    const loadedName = (state.imageInfo && state.imageInfo.filename) || 'current image';
    queue.push({
      file: null,
      imageId: state.imageId,
      status: 'pending',
      lines: state.lines.length ? state.lines : [],
      filename: loadedName,
      preUploaded: true,
    });
  }

  // Names present before this call; new files are compared against these only.
  const namesInQueue = new Set(queue.map(entry => entry.filename));
  for (const candidate of files) {
    if (namesInQueue.has(candidate.name)) continue;
    queue.push({ file: candidate, imageId: null, status: 'pending', lines: [], filename: candidate.name });
  }

  if (queue.length > 0) {
    renderQueue();
    previewFirstBatchItem();
  }
}
|
| 190 |
+
|
| 191 |
+
/**
 * Pre-upload every queued file that is not on the server yet.
 *
 * A PDF upload is replaced in the queue by one entry per returned page. The
 * first queue entry is shown in the viewer when no image is loaded yet. Does
 * nothing while a batch run is in progress; at most 100 loop iterations are
 * performed per call.
 *
 * @returns {Promise<void>}
 */
async function previewFirstBatchItem() {
  if (batch.running) return;

  let pos = 0;
  for (let guard = 0; guard < 100 && pos < batch.items.length; guard++) {
    const entry = batch.items[pos];
    const alreadyOnServer = entry.preUploaded && entry.imageId;

    // Nothing to upload for this entry — move on.
    if (alreadyOnServer || !entry.file) {
      pos++;
      continue;
    }

    try {
      const form = new FormData();
      form.append('file', entry.file);
      const resp = await fetch('/api/image/upload', { method: 'POST', body: form });

      if (resp.ok) {
        const payload = await resp.json();

        if (payload.is_pdf) {
          const pageEntries = payload.pages.map(page => ({
            file: null,
            imageId: page.image_id,
            status: 'pending',
            lines: [],
            filename: page.filename,
            preUploaded: true,
          }));
          batch.items.splice(pos, 1, ...pageEntries);
          renderQueue();
          // Stay on the same position: it now holds the first page (or the
          // next entry when the PDF had no pages).
          continue;
        }

        entry.imageId = payload.image_id;
        entry.preUploaded = true;
        renderQueue();

        if (pos === 0 && !state.imageId) {
          batch.currentIndex = 0;
          emit('batch-item-start', { imageId: entry.imageId, filename: entry.filename });
          updateNavButtons();
        }
      }
    } catch (err) {
      console.error('Error pre-uploading batch item:', err);
    }

    pos++;
  }
}
|
| 243 |
+
|
| 244 |
+
/**
 * Empty the batch queue and hide the queue/export sections.
 * Ignored while a batch run is in progress.
 */
function clearBatch() {
  if (batch.running) return;

  Object.assign(batch, { items: [], currentIndex: -1 });
  for (const sectionId of ['batch-queue-section', 'batch-export-row']) {
    $(sectionId).classList.add('hidden');
  }
  updateNavButtons();
}
|
| 252 |
+
|
| 253 |
+
let _dragSrcIndex = null;
|
| 254 |
+
|
| 255 |
+
/**
 * Rebuild the batch queue list from `batch.items`.
 *
 * Each row shows a drag handle, the filename and a status badge. Rows for
 * done items (or pre-uploaded items with an imageId) are clickable. Rows are
 * draggable for reordering when rendered outside a run; the drag handlers
 * additionally re-check `batch.running` at event time, because rows rendered
 * before a run starts stay in the DOM while the first item is processed.
 */
function renderQueue() {
  const section = $('batch-queue-section');
  const list = $('batch-list');
  section.classList.remove('hidden');
  list.innerHTML = '';

  const clearDragOver = () => {
    list.querySelectorAll('.batch-item').forEach(r => r.classList.remove('batch-drag-over'));
  };

  batch.items.forEach((item, i) => {
    const row = document.createElement('div');
    row.className = 'batch-item';
    row.id = `batch-item-${i}`;
    row.dataset.index = i;

    // Drag handle
    const handle = document.createElement('span');
    handle.className = 'batch-drag-handle';
    handle.textContent = '⠿';
    handle.title = 'Drag to reorder';

    const name = document.createElement('span');
    name.className = 'batch-item-name';
    name.title = item.filename;
    name.textContent = item.filename;

    const status = document.createElement('span');
    status.className = 'batch-status';
    status.id = `batch-status-${i}`;
    _setStatusEl(status, item.status, item.lines.length);

    row.appendChild(handle);
    row.appendChild(name);
    row.appendChild(status);

    // Click a done item to reload it, or a preUploaded pending item to load for manual transcription
    const canPreview = item.status === 'done' || (item.preUploaded && item.imageId);
    if (canPreview) {
      row.style.cursor = 'pointer';
      row.addEventListener('click', e => {
        if (e.target === handle) return; // don't trigger on drag handle click
        if (item.status === 'done') {
          loadBatchItem(i);
        } else {
          // Load preUploaded pending page so user can manually segment/transcribe it
          batch.currentIndex = i;
          emit('batch-item-start', { imageId: item.imageId, filename: item.filename });
          updateNavButtons();
        }
      });
    }

    // Drag-to-reorder (only when not running)
    if (!batch.running) {
      row.draggable = true;
      row.addEventListener('dragstart', e => {
        // A run may have started after this row was rendered; reordering now
        // would shift items under the processing loop.
        if (batch.running) {
          e.preventDefault();
          return;
        }
        _dragSrcIndex = i;
        e.dataTransfer.effectAllowed = 'move';
        row.classList.add('batch-dragging');
      });
      row.addEventListener('dragend', () => {
        // Drag finished (dropped or aborted): forget the source so a later
        // unrelated drop on a row cannot trigger a reorder.
        _dragSrcIndex = null;
        row.classList.remove('batch-dragging');
        clearDragOver();
      });
      row.addEventListener('dragover', e => {
        e.preventDefault();
        e.dataTransfer.dropEffect = 'move';
        clearDragOver();
        row.classList.add('batch-drag-over');
      });
      row.addEventListener('dragleave', () => row.classList.remove('batch-drag-over'));
      row.addEventListener('drop', e => {
        e.preventDefault();
        row.classList.remove('batch-drag-over');
        const destIndex = parseInt(row.dataset.index, 10);
        const srcIndex = _dragSrcIndex;
        _dragSrcIndex = null;
        if (batch.running || srcIndex == null || srcIndex === destIndex) return;

        // Reorder batch.items
        const [moved] = batch.items.splice(srcIndex, 1);
        batch.items.splice(destIndex, 0, moved);

        // Keep currentIndex pointing at the same item after the move.
        if (batch.currentIndex === srcIndex) {
          batch.currentIndex = destIndex;
        } else if (srcIndex < destIndex) {
          if (batch.currentIndex > srcIndex && batch.currentIndex <= destIndex) batch.currentIndex--;
        } else if (batch.currentIndex >= destIndex && batch.currentIndex < srcIndex) {
          batch.currentIndex++;
        }

        renderQueue();
      });
    }

    list.appendChild(row);
  });

  // Show export row if any item is done
  const anyDone = batch.items.some(it => it.status === 'done');
  $('batch-export-row').classList.toggle('hidden', !anyDone);
  updateNavButtons();
}
|
| 354 |
+
|
| 355 |
+
/**
 * Render a queue item's status into its badge element.
 *
 * Resets the element's class to 'batch-status', then sets the text and adds
 * a modifier class for 'active', 'done' and 'error'. An unrecognised status
 * is shown verbatim so the badge never keeps text from a previous state.
 *
 * @param {HTMLElement} el - Badge element to update.
 * @param {string} status - 'pending' | 'active' | 'done' | 'error'.
 * @param {number} lineCount - Number of lines, shown for 'done'.
 */
function _setStatusEl(el, status, lineCount) {
  el.className = 'batch-status';
  switch (status) {
    case 'pending':
      el.textContent = 'pending';
      break;
    case 'active':
      el.textContent = 'running…';
      el.classList.add('active');
      break;
    case 'done':
      el.textContent = `✓ ${lineCount} lines`;
      el.classList.add('done');
      break;
    case 'error':
      el.textContent = 'error';
      el.classList.add('error');
      break;
    default:
      el.textContent = String(status ?? '');
  }
}
|
| 362 |
+
|
| 363 |
+
/**
 * Store a new status on a queue item and refresh its badge if it is rendered.
 *
 * @param {number} index - Position of the item in `batch.items`.
 * @param {string} status - New status value.
 * @param {number} [lineCount=0] - Line count passed on to the badge renderer.
 */
function updateItemStatus(index, status, lineCount = 0) {
  batch.items[index].status = status;

  const badge = $(`batch-status-${index}`);
  if (!badge) return;
  _setStatusEl(badge, status, lineCount);
}
|
| 368 |
+
|
| 369 |
+
/**
 * Show "current / total" in the overall progress element, or hide and clear
 * it when called without a current value.
 *
 * @param {?number} [current=null] - Items finished so far; null/undefined hides the element.
 * @param {?number} [total=null] - Items expected in this run.
 */
function updateOverallProgress(current = null, total = null) {
  const indicator = $('batch-overall-progress');
  const idle = current == null;

  indicator.textContent = idle ? '' : `${current} / ${total}`;
  indicator.classList.toggle('hidden', idle);
}
|
| 379 |
+
|
| 380 |
+
/**
 * Enable/disable the prev/next buttons and refresh the "pos/total" label.
 *
 * "Prev" is enabled when a done item exists before the viewed item, "next"
 * when one exists after it; this also holds while a batch is running. With
 * no viewed item (currentIndex < 0) nothing counts as "before". The label is
 * shown only when the viewed item itself is done.
 */
function updateNavButtons() {
  const isDone = it => it.status === 'done';
  const doneItems = batch.items.filter(isDone);
  const idx = batch.currentIndex;

  // Guard idx <= 0: slice(0, -1) would otherwise scan almost the whole queue.
  const prevDone = idx > 0 && batch.items.slice(0, idx).some(isDone);
  const nextDone = batch.items.slice(Math.max(idx, -1) + 1).some(isDone);
  $('btn-nav-prev').disabled = !prevDone;
  $('btn-nav-next').disabled = !nextDone;

  // 1-based position among done items; 0 when the viewed item is not done.
  const pos = idx >= 0 ? doneItems.indexOf(batch.items[idx]) + 1 : 0;
  $('batch-nav-label').textContent = pos > 0 ? `${pos}/${doneItems.length}` : '';
}
|
| 397 |
+
|
| 398 |
+
/**
 * Move the viewer to another done item.
 *
 * Walks from the viewed item in the direction of `delta` and loads the
 * |delta|-th done item found. Works from any starting position, including a
 * pending preview or no selection, which matches how the nav buttons are
 * enabled. Does nothing when there are not enough done items that way.
 *
 * @param {number} delta - Negative for previous, positive for next.
 */
function navigate(delta) {
  if (!delta) return;
  const step = delta < 0 ? -1 : 1;
  let remaining = Math.abs(delta);

  for (let i = batch.currentIndex + step; i >= 0 && i < batch.items.length; i += step) {
    if (batch.items[i].status !== 'done') continue;
    remaining -= 1;
    if (remaining === 0) {
      loadBatchItem(i);
      return;
    }
  }
}
|
| 407 |
+
|
| 408 |
+
// ── Processing ───────────────────────────────────────────────────────────────
|
| 409 |
+
|
| 410 |
+
/**
 * Run the batch: upload (if needed) and transcribe every queue item that is
 * not done yet, one after another.
 *
 * Items already marked 'done' are always skipped. A PDF uploaded here is
 * replaced in the queue by its pages. The run stops at the next item
 * boundary after the cancel button sets `batch.cancelled`; an aborted or
 * server-cancelled item goes back to 'pending'. Run state and buttons are
 * restored even if an unexpected error escapes the loop.
 *
 * @returns {Promise<void>}
 */
async function processBatch() {
  if (batch.running || !state.engineLoaded) {
    if (!state.engineLoaded) toast('Load an engine first', 'error');
    return;
  }
  batch.running = true;
  batch.cancelled = false;
  batch.userNavigated = false; // reset: auto-advance viewer from scratch
  $('btn-process-batch').disabled = true;
  $('btn-cancel').classList.remove('hidden');

  try {
    // Re-render so rows lose their drag handlers for the duration of the run.
    renderQueue();

    const segMethod = $('seg-method').value;
    const segDevice = $('seg-device').value;
    const maxColumns = parseInt($('seg-max-columns')?.value || '6', 10);
    const splitWidth = parseFloat($('seg-split-width')?.value || '40') / 100;
    const textDirection = $('seg-text-direction')?.value || 'horizontal-lr';
    const usePageXml = $('batch-use-pagexml').checked;

    // The loop below processes every item that is not 'done' (including
    // earlier errors), so that is what the progress total must count.
    // NOTE(review): the 'batch-resume' checkbox does not change which items
    // are processed here — confirm whether it is meant to.
    let total = batch.items.filter(it => it.status !== 'done').length;
    let doneThisRun = 0;
    updateOverallProgress(0, total);

    for (let i = 0; i < batch.items.length; i++) {
      if (batch.cancelled) break; // remaining items simply stay pending

      const item = batch.items[i];
      if (item.status === 'done') continue;

      batch.processingIndex = i;
      updateItemStatus(i, 'active');
      updateNavButtons();

      try {
        // 1. Upload image (skip if already uploaded, e.g. PDF page pre-rendered by server)
        if (!(item.preUploaded && item.imageId)) {
          const fd = new FormData();
          fd.append('file', item.file);
          const upResp = await fetch('/api/image/upload', { method: 'POST', body: fd });
          if (!upResp.ok) throw new Error(`Upload failed: ${upResp.statusText}`);
          const upData = await upResp.json();

          // PDF uploaded directly: replace the placeholder with its pages,
          // the same way previewFirstBatchItem() does, so no empty "done"
          // entry ends up in the queue or the exports.
          if (upData.is_pdf) {
            const newItems = upData.pages.map(p => ({
              file: null, imageId: p.image_id, status: 'pending',
              lines: [], filename: p.filename, preUploaded: true,
            }));
            batch.items.splice(i, 1, ...newItems);
            if (batch.currentIndex > i) batch.currentIndex += newItems.length - 1;
            total += newItems.length - 1;
            updateOverallProgress(doneThisRun, total);
            renderQueue();
            i--; // revisit this position: it now holds the first page
            continue;
          }
          item.imageId = upData.image_id;
          item.preUploaded = true; // avoid re-uploading on a later retry
        }

        // Upload deferred XML if one was matched earlier (non-fatal on failure)
        if (item.xmlFile && item.imageId) {
          try {
            const fd = new FormData();
            fd.append('file', item.xmlFile);
            const xmlResp = await fetch(`/api/image/${item.imageId}/xml`, { method: 'POST', body: fd });
            if (xmlResp.ok) {
              item.xmlUploaded = true;
              item.xmlFile = null;
            } else {
              console.warn(`XML upload for ${item.filename} failed: HTTP ${xmlResp.status}`);
            }
          } catch (xmlErr) {
            console.warn(`XML upload for ${item.filename} failed:`, xmlErr);
          }
        }

        // Show in viewer — skip if user manually navigated to a different item
        if (!batch.userNavigated) {
          batch.currentIndex = i;
          emit('batch-item-start', { imageId: item.imageId, filename: item.filename });
        }

        // 2. Transcribe via SSE (abortable)
        batch.abortController = new AbortController();
        const result = await transcribeSSE(
          item.imageId, segMethod, segDevice, maxColumns, splitWidth, usePageXml,
          batch.abortController.signal, textDirection
        );

        if (result.cancelled) {
          // The server stopped early; partial output must not count as done.
          updateItemStatus(i, 'pending');
        } else {
          item.lines = result.lines;
          item.time_s = result.time_s;
          item.token_usage = result.token_usage;
          updateItemStatus(i, 'done', result.lines.length);
          doneThisRun++;
          updateOverallProgress(doneThisRun, total);
          // Fire sse-complete so the panel shows footer, column toggle, confidence filter, etc.
          if (batch.currentIndex === i) {
            emit('sse-complete', {
              lines: item.lines, total_time_s: item.time_s, engine: '(batch)', token_usage: item.token_usage,
            });
          }
        }
      } catch (err) {
        if (err.name === 'AbortError' || batch.cancelled) {
          updateItemStatus(i, 'pending');
        } else {
          updateItemStatus(i, 'error');
          toast(`${item.filename}: ${err.message}`, 'error');
        }
      }

      // Re-render to make done items clickable
      renderQueue();
    }
  } finally {
    // Always leave the panel usable, even if something unexpected threw.
    batch.running = false;
    batch.processingIndex = -1;
    batch.userNavigated = false;
    batch.abortController = null;
    $('btn-process-batch').disabled = false;
    $('btn-cancel').classList.add('hidden');
  }

  // Rows rendered during the run have no drag handlers; render once more now
  // that the run is over, then show the export row.
  renderQueue();
  $('batch-export-row').classList.remove('hidden');
  updateOverallProgress(null);
  updateNavButtons();

  const doneCount = batch.items.filter(it => it.status === 'done').length;
  if (batch.cancelled) {
    toast(`Batch cancelled — ${doneCount} image(s) done`, 'info', 4000);
  } else {
    toast(`Batch complete: ${doneCount}/${batch.items.length} images`, 'success', 5000);
  }
  emit('batch-complete', { items: batch.items });
}
|
| 535 |
+
|
| 536 |
+
/**
 * Read the current values of the engine config form.
 *
 * Every element inside #config-form that carries a data-key attribute
 * contributes one entry, except elements marked with data-save-for or
 * data-password-field. Checkboxes yield booleans, number inputs yield
 * numbers, everything else yields the raw string value.
 *
 * @returns {Object} Map of config key to current value; empty when the form is absent.
 */
function _collectLiveOverrides() {
  const form = document.getElementById('config-form');
  if (!form) return {};

  const readValue = field => {
    switch (field.type) {
      case 'checkbox': return field.checked;
      case 'number': return Number(field.value);
      default: return field.value;
    }
  };

  const overrides = {};
  form.querySelectorAll('[data-key]').forEach(field => {
    const { saveFor, passwordField, key } = field.dataset;
    if (saveFor || passwordField) return;
    overrides[key] = readValue(field);
  });
  return overrides;
}
|
| 550 |
+
|
| 551 |
+
/**
 * POST a transcription request and consume the server-sent event stream.
 *
 * Handled events: 'progress' (collects `data.line`), 'segmentation' (stores
 * bboxes/regions on the item being processed), 'complete', 'cancelled' and
 * 'error'. Progress and segmentation events are forwarded to the UI only
 * while the viewer shows the item being processed.
 *
 * @param {string} imageId - Server-side id of the image to transcribe.
 * @param {string} segMethod - Sent as `seg_method`.
 * @param {string} segDevice - Sent as `seg_device`.
 * @param {number} maxColumns - Sent as `max_columns`.
 * @param {number} [splitWidthFraction=0.4] - Sent as `split_width_fraction`.
 * @param {boolean} [usePageXml=true] - Sent as `use_pagexml`.
 * @param {?AbortSignal} [signal=null] - Aborts the request when triggered.
 * @param {string} [textDirection='horizontal-lr'] - Sent as `text_direction`.
 * @returns {Promise<{lines: Array, time_s: number, token_usage: *, cancelled: boolean}>}
 *   Resolves on 'complete', 'cancelled' or end of stream. Rejects on HTTP
 *   errors, 'error' events, malformed event data, or abort.
 */
async function transcribeSSE(imageId, segMethod, segDevice, maxColumns, splitWidthFraction = 0.4, usePageXml = true, signal = null, textDirection = 'horizontal-lr') {
  const lines = [];
  let startTime = null;
  let lastTokenUsage = null;

  // Elapsed time is measured from the first progress event, rounded to 0.1 s.
  const buildResult = (cancelled = false) => ({
    lines,
    time_s: startTime ? Math.round((Date.now() - startTime) / 100) / 10 : 0,
    token_usage: lastTokenUsage,
    cancelled,
  });

  const body = JSON.stringify({
    image_id: imageId, seg_method: segMethod,
    seg_device: segDevice, max_columns: maxColumns,
    split_width_fraction: splitWidthFraction,
    text_direction: textDirection,
    use_pagexml: usePageXml,
    engine_config_overrides: _collectLiveOverrides(),
  });

  const resp = await fetch('/api/transcribe', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body,
    signal,
  });
  // statusText can be empty, so fall back to the numeric code.
  if (!resp.ok) throw new Error(resp.statusText || `HTTP ${resp.status}`);

  const reader = resp.body.getReader();
  const decoder = new TextDecoder();
  let buf = '';

  try {
    for (;;) {
      const { done, value } = await reader.read();
      if (done) return buildResult();

      buf += decoder.decode(value, { stream: true });
      const chunks = buf.split('\n\n');
      buf = chunks.pop(); // keep the trailing partial event for the next read

      for (const chunk of chunks) {
        const fields = chunk.split('\n');
        const evLine = fields.find(l => l.startsWith('event:'));
        const dataLine = fields.find(l => l.startsWith('data:'));
        if (!evLine || !dataLine) continue;

        // The space after the colon is optional in SSE, so cut exactly the
        // field name and trim, rather than assuming "event: ".
        const event = evLine.slice('event:'.length).trim();
        const data = JSON.parse(dataLine.slice('data:'.length).trim());

        if (event === 'progress') {
          if (!startTime) startTime = Date.now();
          if (data.token_usage) lastTokenUsage = data.token_usage;
          lines.push(data.line);
          // Only stream to panel when user is watching this item
          if (batch.currentIndex === batch.processingIndex) emit('sse-progress', data);
        } else if (event === 'segmentation') {
          // Store bboxes/regions so loadBatchItem can restore them later
          const processing = batch.items[batch.processingIndex];
          if (processing) {
            processing.bboxes = data.bboxes || [];
            processing.regions = data.regions || [];
          }
          if (batch.currentIndex === batch.processingIndex) emit('sse-segmentation', data);
        } else if (event === 'complete') {
          if (data.token_usage) lastTokenUsage = data.token_usage;
          return buildResult();
        } else if (event === 'error') {
          throw new Error(data.message);
        } else if (event === 'cancelled') {
          return buildResult(true);
        }
      }
    }
  } finally {
    // Release the stream. Rejections are expected when it is already closed
    // or errored (e.g. after an abort) and carry no extra information.
    reader.cancel().catch(() => {});
  }
}
|
| 620 |
+
|
| 621 |
+
/**
 * Show a finished batch item in the viewer and rebuild the results panel.
 *
 * Marks the viewer as user-navigated, re-emits the stored segmentation,
 * copies the item's lines into `state.lines` (with a fresh `index` each) and
 * replays them as 'sse-progress' events followed by one 'sse-complete'.
 * Items that are not 'done' are ignored.
 *
 * @param {number} index - Position of the item in `batch.items`.
 */
function loadBatchItem(index) {
  const entry = batch.items[index];
  if (entry.status !== 'done') return;

  batch.currentIndex = index;
  batch.userNavigated = true; // user left auto-advance mode
  emit('batch-item-start', { imageId: entry.imageId, filename: entry.filename });
  updateNavButtons();

  // 'batch-item-start' is followed by a segmentation replay so that the
  // stored boxes/regions are available to listeners again.
  const segmentation = {
    num_lines: entry.lines.length,
    bboxes: entry.bboxes || [],
    regions: entry.regions || [],
    source: 'batch-restore',
  };
  emit('sse-segmentation', segmentation);

  // Re-populate state.lines so exports and confidence filter work
  state.lines = entry.lines.map((line, pos) => ({ ...line, index: pos }));

  // Reset the panel, then replay every line through the normal progress path.
  $('transcription-lines').innerHTML = '';
  $('conf-filter-row').classList.add('hidden');
  state.lines.forEach(line => {
    emit('sse-progress', { current: line.index + 1, total: state.lines.length, line });
  });

  emit('sse-complete', {
    lines: state.lines,
    total_time_s: entry.time_s || 0,
    engine: '(batch)',
    token_usage: entry.token_usage || null,
  });
}
|
| 644 |
+
|
| 645 |
+
// ── Export ────────────────────────────────────────────────────────────────────
|
| 646 |
+
|
| 647 |
+
/**
 * Download all done items as one text file: a "=== filename ===" header per
 * item followed by its line texts, items separated by a blank line.
 * Does nothing when no item is done.
 */
function exportAllTxt() {
  const sections = [];
  for (const entry of batch.items) {
    if (entry.status !== 'done') continue;
    const bodyText = entry.lines.map(line => line.text).join('\n');
    sections.push(`=== ${entry.filename} ===\n` + bodyText);
  }
  if (sections.length === 0) return;

  downloadFile('batch_transcription.txt', sections.join('\n\n'), 'text/plain');
}
|
| 655 |
+
|
| 656 |
+
/**
 * Download all done items as one CSV file with the columns
 * File, Line, Text, Confidence.
 *
 * The line number uses the line's own `index` when it is an integer and
 * otherwise its position within the item. Missing text becomes an empty
 * field, and confidence is written with four decimals only when it is a
 * finite number. Does nothing when no item is done.
 */
function exportAllCsv() {
  const done = batch.items.filter(it => it.status === 'done');
  if (!done.length) return;

  // Quote a field and double any embedded quotes.
  const quote = value => `"${String(value ?? '').replace(/"/g, '""')}"`;

  const header = 'File,Line,Text,Confidence\n';
  const rows = done.flatMap(item =>
    (item.lines || []).map((l, pos) => {
      const lineNo = Number.isInteger(l.index) ? l.index + 1 : pos + 1;
      const conf = Number.isFinite(l.confidence) ? l.confidence.toFixed(4) : '';
      return `${quote(item.filename)},${lineNo},${quote(l.text)},${conf}`;
    })
  );
  downloadFile('batch_transcription.csv', header + rows.join('\n'), 'text/csv');
}
|
| 668 |
+
|
| 669 |
+
/**
 * Shared implementation of the server-side ZIP exports.
 *
 * POSTs the image ids of all done items that have an imageId to `endpoint`
 * and offers the response body as a download named `filename`. Failures are
 * reported with a toast prefixed by `failureLabel`. Does nothing when no
 * item qualifies.
 *
 * @param {string} endpoint - Export API path.
 * @param {string} filename - Suggested download name.
 * @param {string} failureLabel - Prefix of the error toast, e.g. 'XML export'.
 * @returns {Promise<void>}
 */
async function _downloadBatchZip(endpoint, filename, failureLabel) {
  const done = batch.items.filter(it => it.status === 'done' && it.imageId);
  if (!done.length) return;
  try {
    const resp = await fetch(endpoint, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ image_ids: done.map(it => it.imageId) }),
    });
    if (!resp.ok) {
      // An empty error body would otherwise produce an empty toast message.
      throw new Error((await resp.text()) || `HTTP ${resp.status}`);
    }
    const blob = await resp.blob();
    const url = URL.createObjectURL(blob);
    const a = document.createElement('a');
    a.href = url;
    a.download = filename;
    a.click();
    // Revoke on a later task so the download triggered by click() can start.
    setTimeout(() => URL.revokeObjectURL(url), 0);
  } catch (err) {
    toast(`${failureLabel} failed: ${err.message}`, 'error');
  }
}

/** Download the "thinking" export of all done items as batch_thinking.zip. */
async function exportAllThinkingZip() {
  await _downloadBatchZip('/api/batch/export-thinking', 'batch_thinking.zip', 'Thinking export');
}

/** Download the per-image TXT export of all done items as batch_export_txt.zip. */
async function exportAllTxtZip() {
  await _downloadBatchZip('/api/batch/export-txt', 'batch_export_txt.zip', 'TXT ZIP export');
}

/** Download the XML export of all done items as batch_export.zip. */
async function exportAllXml() {
  await _downloadBatchZip('/api/batch/export-xml', 'batch_export.zip', 'XML export');
}
|
| 728 |
+
|
| 729 |
+
function downloadFile(filename, content, mime) {
|
| 730 |
+
const blob = new Blob([content], { type: mime });
|
| 731 |
+
const url = URL.createObjectURL(blob);
|
| 732 |
+
const a = document.createElement('a');
|
| 733 |
+
a.href = url; a.download = filename; a.click();
|
| 734 |
+
URL.revokeObjectURL(url);
|
| 735 |
+
}
|
web/static/components/engine-panel.js
ADDED
|
@@ -0,0 +1,1091 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* Engine Panel — engine selection, dynamic config form, model loading
|
| 3 |
+
*/
|
| 4 |
+
|
| 5 |
+
import { state, emit, on, api, saveEngineConfig, loadSavedEngineName, loadSavedEngineConfig, toast } from '../app.js';
|
| 6 |
+
|
| 7 |
+
const $ = id => document.getElementById(id);
|
| 8 |
+
|
| 9 |
+
// --- API Key localStorage helpers (keys never stored on server) ---
|
| 10 |
+
const _KEY_PREFIX = 'polyscriptor_key_';
|
| 11 |
+
let _browserOpenWebUIConfig = null;
|
| 12 |
+
let _browserOpenWebUIAbort = null;
|
| 13 |
+
|
| 14 |
+
/**
 * Read the API key stored in localStorage under `_KEY_PREFIX + slot`.
 *
 * @param {string} slot - Key slot name appended to the storage prefix.
 * @returns {string} The stored key, or '' when nothing is stored or
 *   localStorage access throws.
 */
function _loadBrowserKey(slot) {
  let stored;
  try {
    stored = localStorage.getItem(_KEY_PREFIX + slot);
  } catch (_) {
    // Storage access failed; behave as if no key is saved.
    return '';
  }
  return stored ? stored : '';
}
|
| 18 |
+
|
| 19 |
+
/**
 * Store `key` in localStorage under `_KEY_PREFIX + slot`, or remove the entry
 * when `key` is empty/falsy.
 *
 * @param {string} slot - Key slot name appended to the storage prefix.
 * @param {string} key - API key to store; a falsy value deletes the entry.
 * @returns {boolean} true when the storage call succeeded, false when it threw.
 */
function _saveBrowserKey(slot, key) {
  let persisted = false;
  try {
    const storageKey = _KEY_PREFIX + slot;
    if (!key) {
      localStorage.removeItem(storageKey);
    } else {
      localStorage.setItem(storageKey, key);
    }
    persisted = true;
  } catch (_) {
    // private browsing etc. — report failure through the return value
  }
  return persisted;
}
|
| 27 |
+
|
| 28 |
+
/**
 * Report whether a non-empty key is stored in the browser for `slot`.
 *
 * @param {string} slot - Key slot name.
 * @returns {boolean} true when `_loadBrowserKey(slot)` yields a truthy value.
 */
function _hasBrowserKey(slot) {
  const storedKey = _loadBrowserKey(slot);
  return Boolean(storedKey);
}
|
| 31 |
+
|
| 32 |
+
/**
 * Normalise a user-entered base URL: treat a falsy value as '', trim
 * surrounding whitespace, and strip every trailing '/'.
 *
 * @param {string} [baseUrl] - Raw base URL.
 * @returns {string} The cleaned URL (possibly '').
 */
function _normalizeBaseUrl(baseUrl) {
  const raw = baseUrl ? baseUrl : '';
  const trimmed = raw.trim();
  return trimmed.replace(/\/+$/, '');
}
|
| 35 |
+
|
| 36 |
+
/**
 * Build the ordered, de-duplicated list of candidate model-list URLs to try
 * for an OpenWebUI base URL.
 *
 * `<base>/models` is always tried first; further candidates depend on whether
 * the normalised base ends in '/api', '/api/v1', '/v1', or none of these.
 *
 * @param {string} baseUrl - Base URL as entered by the user.
 * @returns {string[]} Candidate URLs in the order they should be tried;
 *   empty when the normalised base URL is empty.
 */
function _openWebUIModelUrls(baseUrl) {
  const root = _normalizeBaseUrl(baseUrl);
  if (!root) return [];

  let extras;
  if (root.endsWith('/api')) {
    extras = [`${root}/v1/models`, `${root.slice(0, -4)}/v1/models`];
  } else if (root.endsWith('/api/v1')) {
    // The second entry repeats the primary URL; the Set below drops it.
    extras = [`${root.slice(0, -3)}/models`, `${root}/models`];
  } else if (root.endsWith('/v1')) {
    extras = [`${root.slice(0, -3)}/api/models`];
  } else {
    extras = [`${root}/api/models`, `${root}/api/v1/models`, `${root}/v1/models`];
  }

  // A Set keeps first-insertion order while removing duplicates.
  const candidates = new Set([`${root}/models`]);
  for (const url of extras) candidates.add(url);
  return Array.from(candidates);
}
|
| 55 |
+
|
| 56 |
+
/**
 * Pull model identifiers out of a model-list response of unknown shape.
 *
 * - Array: each string entry is taken as-is; each object entry contributes its
 *   `id`, else `name`, else `model`. Falsy results are dropped, duplicates are
 *   removed and the result is sorted.
 * - Object: the first array found under `data` or `models` is processed;
 *   otherwise the object's own values are processed as an array.
 * - Anything else: [].
 *
 * @param {*} payload - Parsed JSON response.
 * @returns {Array} Sorted, unique model ids.
 */
function _extractModelIds(payload) {
  if (Array.isArray(payload)) {
    const unique = new Set();
    for (const entry of payload) {
      let id = null;
      if (typeof entry === 'string') {
        id = entry;
      } else if (entry && typeof entry === 'object') {
        id = entry.id || entry.name || entry.model;
      }
      if (id) unique.add(id);
    }
    return [...unique].sort();
  }

  if (!payload || typeof payload !== 'object') return [];

  const listKey = ['data', 'models'].find(key => Array.isArray(payload[key]));
  if (listKey !== undefined) return _extractModelIds(payload[listKey]);
  return _extractModelIds(Object.values(payload));
}
|
| 72 |
+
|
| 73 |
+
/**
 * Fetch the model list directly from the browser, trying each candidate URL
 * from `_openWebUIModelUrls(baseUrl)` in order until one yields model ids.
 *
 * A candidate is rejected (and the reason recorded) when the request throws,
 * the status is not OK, the content type does not mention 'json', or no model
 * ids can be extracted from the parsed body.
 *
 * @param {string} baseUrl - OpenWebUI base URL.
 * @param {string} apiKey - Key sent as a Bearer token.
 * @returns {Promise<Array>} Model ids from the first candidate that has any.
 * @throws {Error} When every candidate fails; the message joins the
 *   per-URL reasons with '; '.
 */
async function _fetchOpenWebUIModelsInBrowser(baseUrl, apiKey) {
  const failures = [];

  for (const endpoint of _openWebUIModelUrls(baseUrl)) {
    const reject = reason => { failures.push(`${endpoint}: ${reason}`); };
    try {
      const response = await fetch(endpoint, {
        headers: {
          'Authorization': `Bearer ${apiKey}`,
          'Content-Type': 'application/json',
          'Accept': 'application/json',
        },
      });
      const mimeType = response.headers.get('content-type') || '';
      // The body is read before the status check, as text, so a non-JSON
      // reply can be quoted in the error.
      const body = await response.text();

      if (!response.ok) {
        reject(`HTTP ${response.status}`);
      } else if (!mimeType.includes('json')) {
        const preview = body.trim().replace(/\s+/g, ' ').slice(0, 120) || '<empty response>';
        reject(`non-JSON response: ${preview}`);
      } else {
        const ids = _extractModelIds(JSON.parse(body));
        if (ids.length) return ids;
        reject('no model ids in response');
      }
    } catch (err) {
      reject(err.message);
    }
  }

  throw new Error(failures.join('; ') || 'No OpenWebUI model endpoint tried');
}
|
| 104 |
+
|
| 105 |
+
/**
 * Read `blob` with a FileReader and resolve with the reader's result
 * produced by `readAsDataURL`.
 *
 * @param {Blob} blob - Data to read.
 * @returns {Promise<*>} Resolves with `reader.result` on load; rejects with
 *   `reader.error`, or with Error('Could not read image') when the reader
 *   reports no error object.
 */
async function _blobToDataUrl(blob) {
  const reader = new FileReader();
  const outcome = new Promise((resolve, reject) => {
    reader.onerror = () => {
      reject(reader.error || new Error('Could not read image'));
    };
    reader.onload = () => {
      resolve(reader.result);
    };
  });
  reader.readAsDataURL(blob);
  return outcome;
}
|
| 113 |
+
|
| 114 |
+
/**
 * Determine the model id to use from an engine config: the custom text field
 * when the select holds the '__custom__' sentinel, otherwise the select value.
 *
 * @param {{model?: string, model_custom?: string}} config - Engine config.
 * @returns {string} Trimmed model id ('' when unset).
 */
function _resolveOpenWebUIModel(config) {
  const chosen = config.model === '__custom__' ? config.model_custom : config.model;
  return (chosen || '').trim();
}
|
| 118 |
+
|
| 119 |
+
/**
 * Initialise the engine panel: start loading the engine list, attach the
 * handlers of the panel's controls, and subscribe to the app events that
 * affect the transcribe/segment buttons and the region list.
 */
export function initEnginePanel() {
  loadEngines();

  // Controls whose handlers are plain module functions.
  const domBindings = [
    ['engine-select', 'change', onEngineSelected],
    ['btn-load-model', 'click', onLoadModel],
    ['btn-transcribe', 'click', onTranscribe],
    ['btn-segment', 'click', onSegment],
  ];
  for (const [elementId, eventName, handler] of domBindings) {
    $(elementId).addEventListener(eventName, handler);
  }

  // Show/hide blla-specific options
  const methodSelect = $('seg-method');
  const bllaPanel = $('blla-options');
  function syncBllaOpts() {
    if (!bllaPanel) return;
    bllaPanel.style.display = methodSelect.value === 'kraken-blla' ? '' : 'none';
  }
  methodSelect.addEventListener('change', syncBllaOpts);
  syncBllaOpts();

  // Cancel button — visible during transcription.
  // An in-browser OpenWebUI request is aborted locally; anything else is
  // cancelled through the server endpoint (best effort, errors ignored).
  $('btn-cancel').addEventListener('click', async () => {
    const abortLocally = $('engine-select')?.value === 'OpenWebUI' && _browserOpenWebUIAbort;
    if (abortLocally) {
      _browserOpenWebUIAbort.abort();
      return;
    }
    try {
      await fetch('/api/transcribe/cancel', { method: 'POST' });
    } catch (_) { /* ignore */ }
  });

  // Enable transcribe/segment buttons when image is ready
  const refreshActionButtons = () => {
    updateTranscribeBtn();
    updateSegmentBtn();
  };
  for (const eventName of ['engine-loaded', 'image-uploaded', 'batch-item-start']) {
    on(eventName, refreshActionButtons);
  }
  on('transcription-complete', () => {
    state.isProcessing = false;
    const transcribeBtn = $('btn-transcribe');
    transcribeBtn.classList.remove('loading');
    transcribeBtn.textContent = 'Transcribe';
    $('btn-cancel').classList.add('hidden');
    refreshActionButtons();
  });

  // Region list — rendered after segmentation, hidden and emptied when a new
  // image is uploaded.
  on('sse-segmentation', data => renderRegionList(data.regions || []));
  on('image-uploaded', () => {
    const regionList = $('seg-regions-list');
    regionList.classList.add('hidden');
    regionList.innerHTML = '';
  });
}
|
| 164 |
+
|
| 165 |
+
/**
 * Fetch the engine list from `/api/engines`, store it in `state.engines`, and
 * rebuild the engine `<select>`.
 *
 * Available engines become regular options; unavailable ones are listed,
 * disabled, in an "Unavailable" optgroup together with the reason. The last
 * used engine is re-selected when it is still available, the select is
 * enabled, and `onEngineSelected()` is invoked for the current choice.
 * Any error is shown in the engine description element.
 *
 * @returns {Promise<void>}
 */
async function loadEngines() {
  try {
    const resp = await api('/api/engines');
    state.engines = await resp.json();

    const select = $('engine-select');
    select.innerHTML = '';

    const usable = [];
    const missing = [];
    for (const engine of state.engines) {
      (engine.available ? usable : missing).push(engine);
    }

    if (usable.length === 0) {
      select.innerHTML = '<option>No engines available</option>';
      return;
    }

    const savedEngine = loadSavedEngineName();

    const makeOption = (value, text) => {
      const option = document.createElement('option');
      option.value = value;
      option.textContent = text;
      return option;
    };

    usable.forEach(engine => {
      select.appendChild(makeOption(engine.name, engine.name));
    });

    if (missing.length > 0) {
      const group = document.createElement('optgroup');
      group.label = 'Unavailable';
      missing.forEach(engine => {
        const option = makeOption(
          engine.name,
          `${engine.name} (${engine.unavailable_reason || 'missing deps'})`,
        );
        option.disabled = true;
        group.appendChild(option);
      });
      select.appendChild(group);
    }

    // Restore last used engine if available
    const canRestore = savedEngine && usable.some(engine => engine.name === savedEngine);
    if (canRestore) {
      select.value = savedEngine;
    }
    select.disabled = false;
    onEngineSelected();
  } catch (err) {
    $('engine-description').textContent = `Error loading engines: ${err.message}`;
  }
}
|
| 213 |
+
|
| 214 |
+
// Incremented on every onEngineSelected() call. A call whose captured value
// no longer matches has been superseded by a newer selection and must not
// touch the config form once its schema request finally returns.
let _engineSelectSeq = 0;

/**
 * React to a change of the selected engine.
 *
 * Synchronously updates `state.currentEngine`, the description text, the
 * segmentation controls, and empties the config form. Then fetches the
 * engine's config schema and, for the most recent selection only:
 *  - renders one form field per schema field,
 *  - restores saved (non-password) config values,
 *  - enables the "load model" button,
 *  - wires the provider select to the model list and to the API-key hint,
 *  - shows the Kraken preset row for the 'Kraken' engine,
 *  - auto-loads the model when a saved config exists and the schema has no
 *    dynamic fields.
 *
 * Errors are shown as plain text inside the config form.
 *
 * @returns {Promise<void>}
 */
async function onEngineSelected() {
  const seq = ++_engineSelectSeq;
  const isStale = () => seq !== _engineSelectSeq;

  const name = $('engine-select').value;
  const eng = state.engines.find(e => e.name === name);
  state.currentEngine = eng;

  // Description
  $('engine-description').textContent = eng?.description || '';

  // Show/hide segmentation controls based on engine capability
  updateSegmentationVisibility(eng);

  // Load config schema
  const configForm = $('config-form');
  configForm.innerHTML = '';

  if (!eng) return;

  try {
    const resp = await api(`/api/engine/${encodeURIComponent(name)}/config-schema`);
    const schema = await resp.json();

    // Another engine was selected while this request was in flight: the form
    // now belongs to that newer call, so leave it alone.
    if (isStale()) return;

    for (const field of schema.fields || []) {
      configForm.appendChild(createField(field));
    }

    // Restore saved config values for this engine (skip password fields for security)
    const savedCfg = loadSavedEngineConfig(name);
    if (savedCfg) {
      for (const el of configForm.querySelectorAll('[data-key]')) {
        if (el.dataset.passwordField) continue; // never prefill secrets
        const val = savedCfg[el.dataset.key];
        if (val == null) continue;
        if (el.type === 'checkbox') el.checked = !!val;
        else el.value = val;
      }
    }

    $('btn-load-model').disabled = false;

    // For Commercial APIs: when provider changes, swap model list and update key hint
    const providerSel = $('cfg-provider');
    const modelSel = $('cfg-model');
    if (providerSel && modelSel) {
      const syncModelList = async () => {
        // Clear model list and auto-fetch from live API if a key is available
        _populateSelect(modelSel, []); // show "— click ↻ to load —"
        modelSel.dispatchEvent(new Event('change'));

        // Auto-trigger fetch if we have a browser key for this provider
        const prov = providerSel.value.toLowerCase();
        const keyEl = $('cfg-api_key');
        const hasBrowser = _hasBrowserKey(prov);
        const hasTyped = keyEl?.value?.trim().length > 0;
        if (hasBrowser || hasTyped) {
          const refreshBtn = modelSel.closest('.config-field')?.querySelector('.btn-refresh');
          if (refreshBtn) refreshBtn.click();
        }
      };
      providerSel.addEventListener('change', syncModelList);
      syncModelList(); // run once on load to match default provider
    }

    const keyInput = $('cfg-api_key');
    if (providerSel && keyInput) {
      const updateKeyHint = () => {
        const slot = providerSel.value.toLowerCase();
        const hasBrowser = _hasBrowserKey(slot);
        const saveRow = keyInput.closest('.config-field')?.querySelector('.key-save-row');
        const saveBox = saveRow?.querySelector('input[type="checkbox"]');
        if (hasBrowser) {
          keyInput.placeholder = '•••••••• (saved in browser — leave blank to keep)';
          keyInput.dataset.hasBrowser = 'true';
          keyInput.disabled = false;
          if (saveRow) { saveRow.style.display = ''; saveRow.querySelector('label').textContent = 'Key saved in browser'; }
          if (saveBox) saveBox.checked = true;
        } else {
          keyInput.placeholder = 'Paste API key here';
          keyInput.disabled = false;
          delete keyInput.dataset.hasBrowser;
          if (saveRow) { saveRow.style.display = ''; saveRow.querySelector('label').textContent = 'Save key in browser'; }
          if (saveBox) saveBox.checked = false;
        }
      };
      providerSel.addEventListener('change', updateKeyHint);
      updateKeyHint(); // run once on load
    }

    // Kraken: show preset dropdown and load preset list
    const krakenPresetRow = $('kraken-preset-row');
    if (krakenPresetRow) {
      if (name === 'Kraken') {
        krakenPresetRow.classList.remove('hidden');
        _loadKrakenPresets();
      } else {
        krakenPresetRow.classList.add('hidden');
      }
    }

    // Auto-load model if this engine was previously configured.
    // Skip engines with dynamic model lists (need live fetch first — user loads manually).
    const hasDynamic = schema.fields?.some(f => f.dynamic);
    if (savedCfg && !hasDynamic) {
      onLoadModel();
    }
  } catch (err) {
    // A failure of a superseded request must not overwrite the newer form.
    if (isStale()) return;
    // Render the message as text: it may contain markup-like characters and
    // must not be parsed as HTML.
    configForm.innerHTML = '';
    const notice = document.createElement('p');
    notice.className = 'muted';
    notice.textContent = `Error: ${err.message}`;
    configForm.appendChild(notice);
  }
}
|
| 322 |
+
|
| 323 |
+
// True once the preset list has been fetched and rendered successfully.
let _krakenPresetsLoaded = false;
// True once the preset select's 'change' listener has been attached, so that
// retries after a failed fetch do not stack duplicate listeners.
let _krakenPresetListenerBound = false;

/**
 * Fill the Kraken preset dropdown from `/api/kraken/presets`.
 *
 * Does nothing when the presets were already loaded or the select element is
 * missing. On success the select is rebuilt with a blank "use model path"
 * entry followed by one option per preset. On failure (network error, non-OK
 * status, unparsable body) the status element shows 'Could not load presets'
 * and a later call retries.
 *
 * The select's 'change' listener is attached exactly once; choosing a preset
 * updates the status hint and clears the model-path field.
 *
 * @returns {Promise<void>}
 */
async function _loadKrakenPresets() {
  if (_krakenPresetsLoaded) return;
  const sel = $('kraken-preset-select');
  if (!sel) return;

  if (!_krakenPresetListenerBound) {
    _krakenPresetListenerBound = true;
    sel.addEventListener('change', () => {
      const hint = $('kraken-preset-status');
      const modelPathEl = $('cfg-model_path');
      const presetId = sel.value;
      if (!presetId) {
        if (hint) hint.textContent = '';
        return;
      }
      if (hint) {
        hint.textContent = presetId === 'blla-local'
          ? '📁 Local model — loads instantly'
          : '⬇️ Auto-downloads from Zenodo on first use (~30–120s)';
      }
      // Clear the manual model path — preset_id takes priority
      if (modelPathEl) modelPathEl.value = '';
    });
  }

  const status = $('kraken-preset-status');
  try {
    const resp = await fetch('/api/kraken/presets');
    // Treat an error status as a failure instead of rendering an empty list
    // and marking the presets as loaded.
    if (!resp.ok) throw new Error(`HTTP ${resp.status}`);
    const data = await resp.json();

    sel.innerHTML = '';
    const blank = document.createElement('option');
    blank.value = '';
    blank.textContent = '— use model path above —';
    sel.appendChild(blank);
    for (const preset of data.presets || []) {
      const opt = document.createElement('option');
      opt.value = preset.id;
      const icon = preset.source === 'local' ? '📁' : '⬇️';
      opt.textContent = `${icon} ${preset.label} (${preset.language})`;
      sel.appendChild(opt);
    }
    _krakenPresetsLoaded = true;
    // Drop a 'Could not load presets' message left by an earlier attempt.
    if (status) status.textContent = '';
  } catch (_) {
    if (status) status.textContent = 'Could not load presets';
  }
}
|
| 365 |
+
|
| 366 |
+
/**
 * Toggle the segmentation controls for the selected engine.
 *
 * The controls are shown when the engine's `requires_line_segmentation` flag
 * is truthy, or when no engine is given; otherwise they are hidden.
 * Page-level engines (VLMs, Commercial APIs, etc.) do their own segmentation
 * internally — showing these controls for them is misleading.
 *
 * @param {object} [eng] - Engine descriptor, if any.
 */
function updateSegmentationVisibility(eng) {
  const panel = $('seg-controls');
  if (!panel) return;

  const showControls = eng ? eng.requires_line_segmentation : true;
  panel.style.display = showControls ? '' : 'none';
}
|
| 378 |
+
|
| 379 |
+
function createField(field) {
|
| 380 |
+
const wrapper = document.createElement('div');
|
| 381 |
+
|
| 382 |
+
if (field.type === 'checkbox') {
|
| 383 |
+
wrapper.className = 'config-field config-field-checkbox';
|
| 384 |
+
const input = document.createElement('input');
|
| 385 |
+
input.type = 'checkbox';
|
| 386 |
+
input.id = `cfg-${field.key}`;
|
| 387 |
+
input.dataset.key = field.key;
|
| 388 |
+
input.checked = field.default ?? false;
|
| 389 |
+
|
| 390 |
+
const label = document.createElement('label');
|
| 391 |
+
label.htmlFor = input.id;
|
| 392 |
+
label.textContent = field.label;
|
| 393 |
+
|
| 394 |
+
wrapper.appendChild(input);
|
| 395 |
+
wrapper.appendChild(label);
|
| 396 |
+
} else {
|
| 397 |
+
wrapper.className = 'config-field';
|
| 398 |
+
const label = document.createElement('label');
|
| 399 |
+
label.htmlFor = `cfg-${field.key}`;
|
| 400 |
+
label.textContent = field.label;
|
| 401 |
+
wrapper.appendChild(label);
|
| 402 |
+
|
| 403 |
+
if (field.type === 'select') {
|
| 404 |
+
// Row: select + optional refresh button
|
| 405 |
+
const selectRow = document.createElement('div');
|
| 406 |
+
selectRow.className = 'select-row';
|
| 407 |
+
|
| 408 |
+
const select = document.createElement('select');
|
| 409 |
+
select.id = `cfg-${field.key}`;
|
| 410 |
+
select.dataset.key = field.key;
|
| 411 |
+
if (field.per_provider_options) {
|
| 412 |
+
// Store for later use when provider changes
|
| 413 |
+
select.dataset.perProviderOptions = JSON.stringify(field.per_provider_options);
|
| 414 |
+
}
|
| 415 |
+
_populateSelect(select, field.options || [], field.default);
|
| 416 |
+
selectRow.appendChild(select);
|
| 417 |
+
|
| 418 |
+
// Dynamic refresh button — fetches live model list from server
|
| 419 |
+
if (field.dynamic) {
|
| 420 |
+
const hint = document.createElement('span');
|
| 421 |
+
hint.className = 'dynamic-hint muted';
|
| 422 |
+
hint.textContent = field.dynamic_hint || 'Click ↻ to load models';
|
| 423 |
+
|
| 424 |
+
const refreshBtn = document.createElement('button');
|
| 425 |
+
refreshBtn.type = 'button';
|
| 426 |
+
refreshBtn.className = 'btn-refresh';
|
| 427 |
+
refreshBtn.title = 'Refresh model list from server';
|
| 428 |
+
refreshBtn.textContent = '↻';
|
| 429 |
+
refreshBtn.addEventListener('click', async () => {
|
| 430 |
+
const engineName = $('engine-select').value;
|
| 431 |
+
const providerEl = $('cfg-provider');
|
| 432 |
+
const keyEl = $('cfg-api_key');
|
| 433 |
+
const provider = providerEl?.value?.toLowerCase() || 'openai';
|
| 434 |
+
const keySlot = engineName === 'OpenWebUI' ? 'openwebui' : provider;
|
| 435 |
+
const apiKey = keyEl?.value?.trim() || _loadBrowserKey(keySlot);
|
| 436 |
+
|
| 437 |
+
refreshBtn.textContent = '…';
|
| 438 |
+
refreshBtn.disabled = true;
|
| 439 |
+
try {
|
| 440 |
+
const baseUrlEl = $('cfg-base_url');
|
| 441 |
+
const baseUrl = baseUrlEl?.value?.trim() || '';
|
| 442 |
+
let data;
|
| 443 |
+
if (engineName === 'OpenWebUI') {
|
| 444 |
+
if (!baseUrl) throw new Error('Enter your OpenWebUI base URL');
|
| 445 |
+
if (!apiKey) throw new Error('Enter your OpenWebUI API key');
|
| 446 |
+
const models = await _fetchOpenWebUIModelsInBrowser(baseUrl, apiKey);
|
| 447 |
+
data = { models };
|
| 448 |
+
} else {
|
| 449 |
+
const params = new URLSearchParams({ provider, api_key: apiKey, base_url: baseUrl });
|
| 450 |
+
const resp = await fetch(
|
| 451 |
+
`/api/engine/${encodeURIComponent(engineName)}/models?${params}`
|
| 452 |
+
);
|
| 453 |
+
data = await resp.json();
|
| 454 |
+
}
|
| 455 |
+
if (data.error) {
|
| 456 |
+
hint.textContent = `Error: ${data.error}`;
|
| 457 |
+
} else if (data.models.length === 0) {
|
| 458 |
+
hint.textContent = 'No models found';
|
| 459 |
+
} else {
|
| 460 |
+
const current = select.value;
|
| 461 |
+
// Build options, keep __custom__ at the end if present
|
| 462 |
+
const newOpts = data.models.map(m => ({ label: m, value: m }));
|
| 463 |
+
if (field.custom_key) newOpts.push({ label: 'Custom model ID…', value: '__custom__' });
|
| 464 |
+
_populateSelect(select, newOpts, current);
|
| 465 |
+
hint.textContent = `${data.models.length} models loaded`;
|
| 466 |
+
}
|
| 467 |
+
} catch (e) {
|
| 468 |
+
hint.textContent = `Error: ${e.message}`;
|
| 469 |
+
} finally {
|
| 470 |
+
refreshBtn.textContent = '↻';
|
| 471 |
+
refreshBtn.disabled = false;
|
| 472 |
+
}
|
| 473 |
+
});
|
| 474 |
+
selectRow.appendChild(refreshBtn);
|
| 475 |
+
wrapper.appendChild(selectRow);
|
| 476 |
+
wrapper.appendChild(hint);
|
| 477 |
+
} else {
|
| 478 |
+
wrapper.appendChild(selectRow);
|
| 479 |
+
}
|
| 480 |
+
|
| 481 |
+
// If this select can have a __custom__ sentinel, wire up a
|
| 482 |
+
// hidden text input that appears when "__custom__" is chosen.
|
| 483 |
+
if (field.custom_key) {
|
| 484 |
+
const customInput = document.createElement('input');
|
| 485 |
+
customInput.type = 'text';
|
| 486 |
+
customInput.id = `cfg-${field.custom_key}`;
|
| 487 |
+
customInput.dataset.key = field.custom_key;
|
| 488 |
+
customInput.placeholder = field.custom_placeholder || 'Enter custom value';
|
| 489 |
+
customInput.style.marginTop = '4px';
|
| 490 |
+
|
| 491 |
+
// Show/hide based on current select value
|
| 492 |
+
const syncCustomVisibility = () => {
|
| 493 |
+
const isCustom = select.value === '__custom__';
|
| 494 |
+
customInput.style.display = isCustom ? '' : 'none';
|
| 495 |
+
customInput.required = isCustom;
|
| 496 |
+
};
|
| 497 |
+
select.addEventListener('change', syncCustomVisibility);
|
| 498 |
+
syncCustomVisibility(); // run once on creation
|
| 499 |
+
|
| 500 |
+
wrapper.appendChild(customInput);
|
| 501 |
+
}
|
| 502 |
+
|
| 503 |
+
// Upload button — lets users upload a local .mlmodel file from their machine
|
| 504 |
+
if (field.upload) {
|
| 505 |
+
const uploadRow = document.createElement('div');
|
| 506 |
+
uploadRow.className = 'upload-model-row';
|
| 507 |
+
uploadRow.style.cssText = 'display:flex;align-items:center;gap:6px;margin-top:6px;';
|
| 508 |
+
|
| 509 |
+
const fileInput = document.createElement('input');
|
| 510 |
+
fileInput.type = 'file';
|
| 511 |
+
fileInput.accept = '.mlmodel';
|
| 512 |
+
fileInput.style.display = 'none';
|
| 513 |
+
|
| 514 |
+
const uploadBtn = document.createElement('button');
|
| 515 |
+
uploadBtn.type = 'button';
|
| 516 |
+
uploadBtn.className = 'btn-secondary btn-sm';
|
| 517 |
+
uploadBtn.textContent = 'Upload .mlmodel…';
|
| 518 |
+
uploadBtn.title = 'Upload a Kraken model file from your computer';
|
| 519 |
+
|
| 520 |
+
const uploadStatus = document.createElement('span');
|
| 521 |
+
uploadStatus.className = 'muted';
|
| 522 |
+
uploadStatus.style.fontSize = '0.85em';
|
| 523 |
+
|
| 524 |
+
uploadBtn.addEventListener('click', () => fileInput.click());
|
| 525 |
+
|
| 526 |
+
fileInput.addEventListener('change', async () => {
|
| 527 |
+
const f = fileInput.files[0];
|
| 528 |
+
if (!f) return;
|
| 529 |
+
uploadStatus.textContent = `Uploading ${f.name}…`;
|
| 530 |
+
uploadBtn.disabled = true;
|
| 531 |
+
try {
|
| 532 |
+
const fd = new FormData();
|
| 533 |
+
fd.append('file', f);
|
| 534 |
+
const resp = await fetch('/api/models/upload', { method: 'POST', body: fd });
|
| 535 |
+
if (!resp.ok) {
|
| 536 |
+
const err = await resp.json().catch(() => ({ detail: resp.statusText }));
|
| 537 |
+
throw new Error(err.detail || resp.statusText);
|
| 538 |
+
}
|
| 539 |
+
const data = await resp.json();
|
| 540 |
+
// Repopulate select with fresh options returned by server
|
| 541 |
+
const newPath = data.path;
|
| 542 |
+
_populateSelect(select, data.options, newPath);
|
| 543 |
+
uploadStatus.textContent = `Uploaded: ${data.filename}`;
|
| 544 |
+
// Re-run custom visibility sync (new value might not be __custom__)
|
| 545 |
+
if (field.custom_key) {
|
| 546 |
+
const isCustom = select.value === '__custom__';
|
| 547 |
+
const ci = document.getElementById(`cfg-${field.custom_key}`);
|
| 548 |
+
if (ci) { ci.style.display = isCustom ? '' : 'none'; ci.required = isCustom; }
|
| 549 |
+
}
|
| 550 |
+
} catch (e) {
|
| 551 |
+
uploadStatus.textContent = `Upload failed: ${e.message}`;
|
| 552 |
+
} finally {
|
| 553 |
+
uploadBtn.disabled = false;
|
| 554 |
+
fileInput.value = '';
|
| 555 |
+
}
|
| 556 |
+
});
|
| 557 |
+
|
| 558 |
+
uploadRow.appendChild(fileInput);
|
| 559 |
+
uploadRow.appendChild(uploadBtn);
|
| 560 |
+
uploadRow.appendChild(uploadStatus);
|
| 561 |
+
wrapper.appendChild(uploadRow);
|
| 562 |
+
}
|
| 563 |
+
} else if (field.type === 'number') {
|
| 564 |
+
const input = document.createElement('input');
|
| 565 |
+
input.type = 'number';
|
| 566 |
+
input.id = `cfg-${field.key}`;
|
| 567 |
+
input.dataset.key = field.key;
|
| 568 |
+
if (field.min != null) input.min = field.min;
|
| 569 |
+
if (field.max != null) input.max = field.max;
|
| 570 |
+
input.value = field.default ?? '';
|
| 571 |
+
wrapper.appendChild(input);
|
| 572 |
+
} else if (field.type === 'password') {
|
| 573 |
+
const input = document.createElement('input');
|
| 574 |
+
input.type = 'password';
|
| 575 |
+
input.id = `cfg-${field.key}`;
|
| 576 |
+
input.dataset.key = field.key;
|
| 577 |
+
input.dataset.passwordField = 'true';
|
| 578 |
+
|
| 579 |
+
// Determine effective key slot for localStorage lookup
|
| 580 |
+
function _getKeySlot() {
|
| 581 |
+
const providerEl = $('cfg-provider');
|
| 582 |
+
if (providerEl) return providerEl.value.toLowerCase();
|
| 583 |
+
const engineEl = $('engine-select');
|
| 584 |
+
if (engineEl?.value === 'OpenWebUI') return 'openwebui';
|
| 585 |
+
return field.key;
|
| 586 |
+
}
|
| 587 |
+
|
| 588 |
+
function applyKeyHint() {
|
| 589 |
+
const slot = _getKeySlot();
|
| 590 |
+
const hasBrowser = _hasBrowserKey(slot);
|
| 591 |
+
if (hasBrowser) {
|
| 592 |
+
input.placeholder = '•••••••• (saved in browser — leave blank to keep)';
|
| 593 |
+
input.dataset.hasBrowser = 'true';
|
| 594 |
+
} else {
|
| 595 |
+
input.placeholder = field.placeholder || 'Paste API key here';
|
| 596 |
+
delete input.dataset.hasBrowser;
|
| 597 |
+
}
|
| 598 |
+
input.disabled = false;
|
| 599 |
+
}
|
| 600 |
+
applyKeyHint();
|
| 601 |
+
wrapper.appendChild(input);
|
| 602 |
+
|
| 603 |
+
// "Save key in browser" checkbox
|
| 604 |
+
const saveRow = document.createElement('div');
|
| 605 |
+
saveRow.className = 'key-save-row';
|
| 606 |
+
const saveBox = document.createElement('input');
|
| 607 |
+
saveBox.type = 'checkbox';
|
| 608 |
+
saveBox.id = `cfg-${field.key}-save`;
|
| 609 |
+
saveBox.dataset.saveFor = field.key;
|
| 610 |
+
const slot = _getKeySlot();
|
| 611 |
+
saveBox.checked = _hasBrowserKey(slot);
|
| 612 |
+
const saveLabel = document.createElement('label');
|
| 613 |
+
saveLabel.htmlFor = saveBox.id;
|
| 614 |
+
saveLabel.textContent = _hasBrowserKey(slot)
|
| 615 |
+
? 'Key saved in browser' : 'Save key in browser';
|
| 616 |
+
saveRow.appendChild(saveBox);
|
| 617 |
+
saveRow.appendChild(saveLabel);
|
| 618 |
+
wrapper.appendChild(saveRow);
|
| 619 |
+
} else if (field.type === 'textarea') {
|
| 620 |
+
const ta = document.createElement('textarea');
|
| 621 |
+
ta.id = `cfg-${field.key}`;
|
| 622 |
+
ta.dataset.key = field.key;
|
| 623 |
+
ta.rows = field.rows || 3;
|
| 624 |
+
ta.value = field.default ?? '';
|
| 625 |
+
if (field.placeholder) ta.placeholder = field.placeholder;
|
| 626 |
+
ta.style.width = '100%';
|
| 627 |
+
ta.style.resize = 'vertical';
|
| 628 |
+
wrapper.appendChild(ta);
|
| 629 |
+
if (field.hint) {
|
| 630 |
+
const hint = document.createElement('small');
|
| 631 |
+
hint.textContent = field.hint;
|
| 632 |
+
hint.style.color = 'var(--text-muted, #888)';
|
| 633 |
+
wrapper.appendChild(hint);
|
| 634 |
+
}
|
| 635 |
+
} else {
|
| 636 |
+
// text
|
| 637 |
+
const input = document.createElement('input');
|
| 638 |
+
input.type = 'text';
|
| 639 |
+
input.id = `cfg-${field.key}`;
|
| 640 |
+
input.dataset.key = field.key;
|
| 641 |
+
input.value = field.default ?? '';
|
| 642 |
+
if (field.placeholder) input.placeholder = field.placeholder;
|
| 643 |
+
wrapper.appendChild(input);
|
| 644 |
+
}
|
| 645 |
+
}
|
| 646 |
+
return wrapper;
|
| 647 |
+
}
|
| 648 |
+
|
| 649 |
+
/**
 * Read every engine-config control in the config form into a plain object.
 *
 * Controls are the elements carrying a `data-key` attribute; the attribute
 * value becomes the property name. Checkboxes yield booleans, number inputs
 * yield `Number(value)`, everything else yields the raw string value.
 * A blank password field is filled from the key stored in the browser
 * (the result may still be empty — the server then checks its environment).
 *
 * @returns {Object} config keyed by each control's `data-key`
 */
function collectConfig() {
  // Resolve the value a single control contributes to the config.
  const readValue = (control, name) => {
    if (control.type === 'checkbox') return control.checked;
    if (control.type === 'number') return Number(control.value);

    const isBlankSecret = control.dataset.passwordField && !control.value.trim();
    if (!isBlankSecret) return control.value;

    // Blank password field — inject key from browser localStorage.
    // Slot: selected provider, else 'openwebui', else the field's own key.
    const provider = $('cfg-provider');
    if (provider) return _loadBrowserKey(provider.value.toLowerCase());
    const slot = $('engine-select')?.value === 'OpenWebUI' ? 'openwebui' : name;
    return _loadBrowserKey(slot);
  };

  const result = {};
  $('config-form').querySelectorAll('[data-key]').forEach((control) => {
    // "save key" checkboxes are not config
    if (control.dataset.saveFor) return;
    const name = control.dataset.key;
    result[name] = readValue(control, name);
  });
  return result;
}
|
| 673 |
+
|
| 674 |
+
/**
 * Persist (or forget) API keys in browser storage after a successful load.
 *
 * Any typed API key is saved automatically. Unchecking "Save key" with an
 * empty key field is the explicit opt-out and deletes the saved key.
 * Engines other than 'OpenWebUI' and 'Commercial APIs' have no key slot and
 * are ignored.
 *
 * @param {string} engineName - name of the engine that was just loaded
 */
function _persistNewKeys(engineName) {
  // The slot depends only on the engine (and the selected provider), so it is
  // resolved once rather than once per checkbox.
  let slot = null;
  if (engineName === 'OpenWebUI') {
    slot = 'openwebui';
  } else if (engineName === 'Commercial APIs') {
    slot = $('cfg-provider')?.value?.toLowerCase() || 'openai';
  }
  if (!slot) return;

  const savedHint = '•••••••• (saved in browser — leave blank to keep)';

  for (const box of $('config-form').querySelectorAll('[data-save-for]')) {
    const keyField = $(`cfg-${box.dataset.saveFor}`);
    const newKey = keyField?.value?.trim();
    const label = box.nextElementSibling;

    if (newKey) {
      if (_saveBrowserKey(slot, newKey)) {
        keyField.value = ''; // clear field; hint shows key is saved
        keyField.placeholder = savedHint;
        keyField.dataset.hasBrowser = 'true';
        box.checked = true;
        if (label) label.textContent = 'Key saved in browser';
      } else {
        box.checked = false;
        if (label) label.textContent = 'Could not save key in browser';
      }
    } else if (!box.checked && _hasBrowserKey(slot)) {
      // Explicit opt-out: unchecked + no typed key → delete saved key
      _saveBrowserKey(slot, '');
      if (keyField) {
        delete keyField.dataset.hasBrowser;
        // The key is gone, so stop telling the user that leaving the field
        // blank will keep it.
        if (keyField.placeholder === savedHint) keyField.placeholder = 'Paste API key here';
      }
      if (label) label.textContent = 'Save key in browser';
    }
  }
}
|
| 713 |
+
|
| 714 |
+
/**
 * Handle the "Load Model" button: collect the form config and load the engine.
 *
 * OpenWebUI is configured entirely in the browser (no server call); every
 * other engine is loaded through POST /api/engine/load. On success the typed
 * API key is persisted, the config (minus `api_key`) is stored for the next
 * session and 'engine-loaded' is emitted. On failure the status badge shows
 * the error and `state.engineLoaded` is reset.
 */
async function onLoadModel() {
  const name = $('engine-select').value;
  const config = collectConfig();
  // Attach Kraken preset ID if one is selected
  if (name === 'Kraken') {
    const presetSel = $('kraken-preset-select');
    if (presetSel?.value) config.preset_id = presetSel.value;
  }
  const btn = $('btn-load-model');
  const status = $('engine-status');

  btn.classList.add('loading');
  btn.textContent = 'Loading...';
  // A previous failure leaves an inline danger colour on the badge; clear it
  // so the loading/loaded classes control the colour again.
  status.style.color = '';
  status.className = 'status-badge status-loading';
  status.textContent = `Loading ${name}...`;
  status.classList.remove('hidden');

  // Save keys only after the typed key was actually used for loading, then
  // persist engine + config (never the secret) for the next session.
  const rememberSuccessfulLoad = () => {
    _persistNewKeys(name);
    const storedConfig = { ...config };
    delete storedConfig.api_key;
    saveEngineConfig(name, storedConfig);
  };

  try {
    if (name === 'OpenWebUI') {
      config.base_url = _normalizeBaseUrl(config.base_url);
      config.model = _resolveOpenWebUIModel(config);
      if (!config.base_url) throw new Error('Enter your OpenWebUI base URL');
      if (!config.api_key) throw new Error('Enter your OpenWebUI API key');
      if (!config.model) throw new Error('Load the model list or enter an OpenWebUI model ID');

      _browserOpenWebUIConfig = { ...config };
      state.engineLoaded = true;
      status.className = 'status-badge status-loaded';
      status.textContent = `${name} ready in browser (${config.model})`;

      rememberSuccessfulLoad();
      emit('engine-loaded', {
        success: true,
        load_time_s: 0,
        engine_name: name,
        browser_direct: true,
      });
      return;
    }

    const resp = await api('/api/engine/load', {
      method: 'POST',
      body: JSON.stringify({ engine_name: name, config }),
    });
    // Same guard as the segmentation request: never treat an error response
    // as a successful load.
    if (!resp.ok) {
      const err = await resp.json().catch(() => ({ detail: resp.statusText }));
      throw new Error(err.detail || resp.statusText);
    }
    const data = await resp.json();

    state.engineLoaded = true;
    status.className = 'status-badge status-loaded';
    status.textContent = `${name} loaded (${data.load_time_s}s)`;

    rememberSuccessfulLoad();
    emit('engine-loaded', data);
  } catch (err) {
    status.className = 'status-badge';
    status.style.color = 'var(--danger)';
    status.textContent = `Error: ${err.message}`;
    state.engineLoaded = false;
  } finally {
    btn.classList.remove('loading');
    btn.textContent = 'Load Model';
  }
}
|
| 785 |
+
|
| 786 |
+
/**
 * Handle the "Transcribe" button.
 *
 * OpenWebUI is transcribed directly from the browser; every other engine is
 * driven through POST /api/transcribe, whose response is a Server-Sent-Events
 * stream. Each received event `<name>` is re-emitted on the app bus as
 * `sse-<name>` with its JSON payload. Failures are reported through
 * 'transcription-error' (or 'sse-cancelled' for an aborted request).
 */
async function onTranscribe() {
  if (state.isProcessing) return;
  if (!state.engineLoaded || !state.imageId) return;

  state.isProcessing = true;
  const btn = $('btn-transcribe');
  btn.classList.add('loading');
  btn.textContent = 'Transcribing...';
  btn.disabled = true;
  $('btn-cancel').classList.remove('hidden');

  const segMethod = $('seg-method').value;
  const segDevice = $('seg-device').value;
  const maxColumns = parseInt($('seg-max-columns')?.value || '6', 10);
  const splitWidth = parseFloat($('seg-split-width')?.value || '40') / 100;
  const textDirection = $('seg-text-direction')?.value || 'horizontal-lr';

  emit('transcription-start');

  // Parse one SSE block ("event: x" + "data: {...}") and re-emit it.
  const dispatchSseBlock = (block) => {
    if (!block.trim()) return;
    const eventMatch = block.match(/event: (\w+)/);
    const dataMatch = block.match(/data: (.+)/s);
    if (eventMatch && dataMatch) {
      emit(`sse-${eventMatch[1]}`, JSON.parse(dataMatch[1]));
    }
  };

  try {
    if ($('engine-select').value === 'OpenWebUI') {
      await transcribeOpenWebUIInBrowser();
      return;
    }

    // Collect live config overrides — non-password form fields are sent at
    // transcription time so changes (e.g. custom_prompt, thinking_mode) take
    // effect immediately without requiring a model reload.
    const liveOverrides = {};
    for (const el of $('config-form').querySelectorAll('[data-key]')) {
      if (el.dataset.saveFor) continue;       // skip "save key" checkboxes
      if (el.dataset.passwordField) continue; // never resend secrets
      const key = el.dataset.key;
      if (el.type === 'checkbox') liveOverrides[key] = el.checked;
      else if (el.type === 'number') liveOverrides[key] = Number(el.value);
      else liveOverrides[key] = el.value;
    }

    const resp = await fetch('/api/transcribe', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        image_id: state.imageId,
        seg_method: segMethod,
        seg_device: segDevice,
        max_columns: maxColumns,
        split_width_fraction: splitWidth,
        text_direction: textDirection,
        engine_config_overrides: liveOverrides,
      }),
    });

    if (!resp.ok) {
      const err = await resp.json().catch(() => ({ detail: resp.statusText }));
      throw new Error(err.detail || 'Transcription failed');
    }

    const reader = resp.body.getReader();
    const decoder = new TextDecoder();
    let buffer = '';

    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      buffer += decoder.decode(value, { stream: true });

      // Events are separated by a blank line; accept LF and CRLF line ends.
      const parts = buffer.split(/\r?\n\r?\n/);
      buffer = parts.pop(); // keep incomplete

      for (const part of parts) dispatchSseBlock(part);
    }

    // Flush the decoder and deliver a final event that was not followed by a
    // blank line — otherwise the last event of the stream would be lost.
    buffer += decoder.decode();
    dispatchSseBlock(buffer);
  } catch (err) {
    if (err.name === 'AbortError') emit('sse-cancelled', {});
    else emit('transcription-error', { message: err.message });
  } finally {
    _browserOpenWebUIAbort = null;
  }
}
|
| 873 |
+
|
| 874 |
+
/**
 * Transcribe the current image by calling an OpenWebUI server straight from
 * the browser (POST `<base_url>/chat/completions`).
 *
 * The whole page is sent as a single data-URL image and reported back as one
 * "line" covering the full page, using the same 'sse-segmentation',
 * 'sse-progress' and 'sse-complete' events the server-side flow emits.
 *
 * @throws {Error} when base URL, API key or model is missing, when the
 *   uploaded image cannot be fetched, or when OpenWebUI answers with a
 *   non-2xx status or a non-JSON body.
 */
async function transcribeOpenWebUIInBrowser() {
  const settings = { ...(_browserOpenWebUIConfig || collectConfig()) };
  settings.base_url = _normalizeBaseUrl(settings.base_url);
  settings.api_key = settings.api_key || _loadBrowserKey('openwebui');
  settings.model = _resolveOpenWebUIModel(settings);
  if (!settings.base_url) throw new Error('Enter your OpenWebUI base URL');
  if (!settings.api_key) throw new Error('Enter your OpenWebUI API key');
  if (!settings.model) throw new Error('Load the model list or enter an OpenWebUI model ID');

  // Pull the uploaded image back from our own server and inline it.
  const imageResp = await fetch(`/api/image/${state.imageId}`);
  if (!imageResp.ok) throw new Error('Could not load uploaded image');
  const imageBlob = await imageResp.blob();
  const dataUrl = await _blobToDataUrl(imageBlob);

  // Bounding box spanning the whole page (zeros when the size is unknown).
  const fullPageBox = () => [0, 0, state.imageInfo?.width || 0, state.imageInfo?.height || 0];

  emit('sse-segmentation', {
    num_lines: 1,
    bboxes: [fullPageBox()],
    source: 'page',
  });

  const customPrompt = (settings.custom_prompt || '').trim();
  const prompt = customPrompt ||
    'Transcribe all handwritten text in this manuscript image. Preserve the original language and layout. Output only the transcribed text without any additional commentary.';

  const requestBody = {
    model: settings.model,
    messages: [{
      role: 'user',
      content: [
        { type: 'text', text: prompt },
        { type: 'image_url', image_url: { url: dataUrl } },
      ],
    }],
    temperature: Number.isFinite(settings.temperature) ? settings.temperature : 0.1,
  };
  if (settings.max_tokens && settings.max_tokens > 0) {
    requestBody.max_tokens = settings.max_tokens;
  }

  _browserOpenWebUIAbort = new AbortController();
  const startedAt = Date.now();
  const resp = await fetch(`${settings.base_url}/chat/completions`, {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${settings.api_key}`,
      'Content-Type': 'application/json',
      'Accept': 'application/json',
    },
    body: JSON.stringify(requestBody),
    signal: _browserOpenWebUIAbort.signal,
  });

  // Read as text first so error and non-JSON bodies can be quoted verbatim.
  const rawText = await resp.text();
  if (!resp.ok) {
    throw new Error(`OpenWebUI HTTP ${resp.status}: ${rawText.slice(0, 240)}`);
  }
  let payload;
  try {
    payload = JSON.parse(rawText);
  } catch (_) {
    throw new Error(`OpenWebUI returned non-JSON response: ${rawText.slice(0, 240)}`);
  }

  const transcript = (payload.choices?.[0]?.message?.content || '').trim();
  let tokenUsage = null;
  if (payload.usage) {
    tokenUsage = {
      prompt_tokens: payload.usage.prompt_tokens,
      output_tokens: payload.usage.completion_tokens,
      total_tokens: payload.usage.total_tokens,
    };
  }

  const line = {
    index: 0,
    text: transcript,
    confidence: null,
    bbox: fullPageBox(),
    region: 0,
  };

  const progress = { current: 1, total: 1, line };
  if (tokenUsage) progress.token_usage = tokenUsage;
  emit('sse-progress', progress);

  const complete = {
    lines: [line],
    // elapsed seconds, rounded to two decimals
    total_time_s: Math.round((Date.now() - startedAt) / 10) / 100,
    engine: 'OpenWebUI',
    browser_direct: true,
  };
  if (tokenUsage) complete.token_usage = tokenUsage;
  emit('sse-complete', complete);
}
|
| 957 |
+
|
| 958 |
+
/**
 * Enable the Transcribe button only while an engine is loaded, an image is
 * present and no job is currently running.
 */
function updateTranscribeBtn() {
  const canTranscribe = Boolean(state.engineLoaded && state.imageId) && !state.isProcessing;
  $('btn-transcribe').disabled = !canTranscribe;
}
|
| 961 |
+
|
| 962 |
+
/**
 * Enable the Segment button only while an image is present and no job is
 * currently running.
 */
function updateSegmentBtn() {
  const canSegment = Boolean(state.imageId) && !state.isProcessing;
  $('btn-segment').disabled = !canSegment;
}
|
| 965 |
+
|
| 966 |
+
/**
 * Handle the "Segment" button: request a segmentation preview for the current
 * image and publish it through the same 'sse-segmentation' event that the
 * transcription flow uses. Failures are shown as an error toast.
 */
async function onSegment() {
  if (state.isProcessing || !state.imageId) return;

  const segmentBtn = $('btn-segment');

  // Query parameters, in the order the request has always sent them.
  const query = new URLSearchParams({
    method: $('seg-method').value,
    device: $('seg-device').value,
    max_columns: parseInt($('seg-max-columns')?.value || '6', 10),
    split_width_fraction: parseFloat($('seg-split-width')?.value || '40') / 100,
    text_direction: $('seg-text-direction')?.value || 'horizontal-lr',
  });

  segmentBtn.classList.add('loading');
  segmentBtn.textContent = 'Segmenting…';
  segmentBtn.disabled = true;

  try {
    const resp = await api(`/api/image/${state.imageId}/segment?${query}`);
    if (!resp.ok) {
      const failure = await resp.json().catch(() => ({ detail: resp.statusText }));
      throw new Error(failure.detail || resp.statusText);
    }
    const result = await resp.json();

    // Reuse the same event the transcription flow uses — draws bboxes on canvas
    emit('sse-segmentation', result);
    const isPageLevel = result.source === 'page';
    if (!isPageLevel) {
      toast(`${result.num_lines} lines found (${result.source})`, 'success', 3000);
    }
    emit('segment-preview'); // switch mobile tab to image view
  } catch (err) {
    toast(`Segmentation failed: ${err.message}`, 'error');
  } finally {
    segmentBtn.classList.remove('loading');
    segmentBtn.textContent = 'Segment';
    updateSegmentBtn();
  }
}
|
| 1006 |
+
|
| 1007 |
+
/**
 * Populate a <select> with an array of options.
 *
 * Each option may be a string or a `{label, value}` object; when `label` is
 * missing the value is shown instead. A missing or non-array `options`
 * argument and null entries are tolerated: with nothing to show, a single
 * placeholder option is inserted.
 *
 * @param {HTMLSelectElement} select - element to (re)fill
 * @param {Array<string|{label?: string, value: string}>} options - new options
 * @param {?string} previousValue - value to re-select if it is still offered
 */
function _populateSelect(select, options, previousValue) {
  select.innerHTML = '';

  // A server response without an option list must not crash the caller.
  const entries = Array.isArray(options) ? options.filter((opt) => opt != null) : [];
  if (entries.length === 0) {
    const o = document.createElement('option');
    o.value = '';
    o.textContent = '— click ↻ to load —';
    select.appendChild(o);
    return;
  }

  for (const opt of entries) {
    const isRecord = typeof opt === 'object';
    const value = isRecord ? opt.value : opt;
    const o = document.createElement('option');
    o.value = value;
    o.textContent = isRecord ? (opt.label ?? value) : opt;
    select.appendChild(o);
  }

  if (previousValue != null) {
    // Restore previous selection if it still exists. Option values are
    // strings in the DOM, so compare against the string form.
    const wanted = String(previousValue);
    const stillOffered = Array.from(select.options).some((o) => o.value === wanted);
    if (stillOffered) select.value = wanted;
  }
}
|
| 1033 |
+
|
| 1034 |
+
// Colours for the region dots, cycled by region index.
// Same palette as image-viewer.js REGION_COLORS.
const _REGION_COLORS = [
  'rgba(255,160,30,0.9)',
  'rgba(46,213,115,0.9)',
  'rgba(232,65,24,0.9)',
  'rgba(52,172,224,0.9)',
  'rgba(162,16,213,0.9)',
  'rgba(255,211,42,0.9)',
  'rgba(18,203,196,0.9)',
  'rgba(253,89,166,0.9)',
];
|
| 1040 |
+
|
| 1041 |
+
/**
 * Render the list of segmentation regions (colour dot, "R<n>" label, line
 * count and a delete button per region) into #seg-regions-list.
 *
 * The list is hidden when there are no regions; a missing or non-array
 * argument is treated as "no regions". Deleting a region calls
 * DELETE /api/image/<id>/region/<index> and re-publishes the returned
 * segmentation through 'sse-segmentation'.
 *
 * @param {Array<{num_lines: number}>} regions - regions to list
 */
function renderRegionList(regions) {
  const list = $('seg-regions-list');
  list.innerHTML = '';

  const items = Array.isArray(regions) ? regions : [];
  if (!items.length) { list.classList.add('hidden'); return; }
  list.classList.remove('hidden');

  const hdr = document.createElement('div');
  hdr.className = 'seg-regions-header';
  hdr.textContent = `Regions (${items.length})`;
  list.appendChild(hdr);

  items.forEach((r, i) => {
    const row = document.createElement('div');
    row.className = 'seg-region-row';

    const dot = document.createElement('span');
    dot.className = 'seg-region-dot';
    dot.style.background = _REGION_COLORS[i % _REGION_COLORS.length];

    const label = document.createElement('span');
    label.className = 'seg-region-label';
    label.textContent = `R${i + 1}`;

    const count = document.createElement('span');
    count.className = 'seg-region-count';
    count.textContent = `${r.num_lines} line${r.num_lines !== 1 ? 's' : ''}`;

    const delBtn = document.createElement('button');
    delBtn.type = 'button'; // never act as a submit button inside a form
    delBtn.className = 'seg-region-del btn-icon';
    delBtn.textContent = '×';
    delBtn.title = 'Delete this region';
    delBtn.addEventListener('click', async () => {
      delBtn.disabled = true;
      try {
        const resp = await api(`/api/image/${state.imageId}/region/${i}`, { method: 'DELETE' });
        // Do not publish an error body as if it were a segmentation result.
        if (!resp.ok) {
          const failure = await resp.json().catch(() => ({ detail: resp.statusText }));
          throw new Error(failure.detail || resp.statusText);
        }
        const data = await resp.json();
        emit('sse-segmentation', data);
        toast(`Region R${i + 1} removed`, 'info', 2000);
      } catch (err) {
        toast(`Delete failed: ${err.message}`, 'error');
        delBtn.disabled = false;
      }
    });

    row.appendChild(dot);
    row.appendChild(label);
    row.appendChild(count);
    row.appendChild(delBtn);
    list.appendChild(row);
  });
}
|
web/static/components/image-viewer.js
ADDED
|
@@ -0,0 +1,294 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* Image Viewer — upload, display, bbox overlay
|
| 3 |
+
*/
|
| 4 |
+
|
| 5 |
+
import { state, emit, on, api, fitZoom, toast } from '../app.js';
|
| 6 |
+
|
| 7 |
+
// Shorthand for document.getElementById.
const $ = (id) => document.getElementById(id);

// Lower-case file extensions (with leading dot) accepted as images.
const IMAGE_EXTENSIONS = new Set([
  '.jpg',
  '.jpeg',
  '.png',
  '.tif',
  '.tiff',
  '.bmp',
  '.gif',
  '.webp',
]);
|
| 9 |
+
|
| 10 |
+
/**
 * Return the lower-cased extension of a file's name, including the dot.
 *
 * @param {?{name?: string}} file - File-like object; may be null/undefined
 * @returns {string} e.g. '.png', or '' when the name contains no dot
 */
function extensionOf(file) {
  const filename = file?.name || '';
  const lastDot = filename.lastIndexOf('.');
  if (lastDot < 0) return '';
  return filename.slice(lastDot).toLowerCase();
}
|
| 15 |
+
|
| 16 |
+
/**
 * Decide whether a dropped/selected file can be shown in the viewer:
 * anything with an image MIME type, a '.pdf' extension, or one of the known
 * image extensions.
 *
 * @param {{name: string, type: string}} file - File-like object
 * @returns {boolean}
 */
function isImageOrPdf(file) {
  const ext = extensionOf(file);
  if (file.type.startsWith('image/')) return true;
  if (ext === '.pdf') return true;
  return IMAGE_EXTENSIONS.has(ext);
}
|
| 20 |
+
|
| 21 |
+
/**
 * Wire up the image viewer: file picker, drag & drop (image, PDF, PAGE XML),
 * batch-item display, bounding-box drawing after segmentation, and
 * click-to-highlight on the overlay canvas.
 */
export function initImageViewer() {
  const uploadArea = $('upload-area');
  const fileInput = $('file-input');
  const xmlInput = $('xml-input');
  const viewerScroll = $('viewer-scroll');
  const viewerPlaceholder = $('viewer-placeholder');

  // Upload a dropped image first and only then its XML. Starting both at once
  // would attach the XML to the image that was loaded before the drop,
  // because state.imageId still points at it while the new upload is running.
  const handleDroppedFiles = async files => {
    const img = files.find(isImageOrPdf);
    const xml = files.find(f => f.name.toLowerCase().endsWith('.xml'));
    if (img) await uploadFile(img);
    if (xml) uploadXml(xml);
  };

  // Click to browse image
  uploadArea.addEventListener('click', () => fileInput.click());

  // File selected
  fileInput.addEventListener('change', () => {
    if (fileInput.files.length > 0) uploadFile(fileInput.files[0]);
  });

  // Drag & drop — accept image, PDF, and XML on the upload box or viewer.
  const dropTargets = [uploadArea, viewerScroll, viewerPlaceholder].filter(Boolean);
  dropTargets.forEach(target => {
    target.addEventListener('dragover', e => {
      e.preventDefault();
      uploadArea.classList.add('dragover');
      if (viewerPlaceholder && !state.imageId) viewerPlaceholder.classList.add('dragover');
    });
    target.addEventListener('dragleave', e => {
      // Ignore dragleave events fired when moving onto a child element.
      if (!e.currentTarget.contains(e.relatedTarget)) {
        uploadArea.classList.remove('dragover');
        viewerPlaceholder?.classList.remove('dragover');
      }
    });
    target.addEventListener('drop', e => {
      e.preventDefault();
      uploadArea.classList.remove('dragover');
      viewerPlaceholder?.classList.remove('dragover');
      handleDroppedFiles(Array.from(e.dataTransfer.files));
    });
  });

  // Keep the explicit upload area compatible with batch-panel's capture-phase
  // drop interception for multi-image queues.
  uploadArea.addEventListener('drop', e => {
    e.preventDefault();
  });

  // XML file picker
  xmlInput.addEventListener('change', () => {
    if (xmlInput.files.length > 0) uploadXml(xmlInput.files[0]);
  });

  // Batch panel: load a completed item's image into the viewer
  on('batch-item-start', ({ imageId, filename }) => {
    state.imageId = imageId;
    // Clear bboxes immediately for the new item
    currentBboxes = [];
    currentRegions = [];
    const img = $('page-image');
    img.src = `/api/image/${imageId}`;
    $('image-container').classList.remove('hidden');
    $('viewer-placeholder').classList.add('hidden');
    img.onload = () => {
      const canvas = $('overlay-canvas');
      canvas.width = img.naturalWidth;
      canvas.height = img.naturalHeight;
      fitZoom();
      // Redraw any bboxes that arrived before the image finished loading
      if (currentBboxes.length > 0) {
        drawBboxes(currentBboxes, -1, currentRegions);
      } else {
        const ctx = canvas.getContext('2d');
        ctx.clearRect(0, 0, canvas.width, canvas.height);
      }
    };
    $('image-info').textContent = filename;
    $('xml-upload-row').classList.remove('hidden');
    $('xml-status').textContent = 'No PAGE XML';
    $('xml-status').classList.remove('xml-ok');
    emit('transcription-start', {});
  });

  // Draw bboxes after segmentation; keep state.regions in sync
  on('sse-segmentation', data => {
    state.regions = data.regions || [];
    if (data.source === 'page') {
      // Page-level engine: clear any old line bboxes, don't draw full-page box
      drawBboxes([], -1, []);
    } else {
      drawBboxes(data.bboxes, -1, state.regions);
    }
    if (data.source === 'pagexml') {
      $('xml-status').textContent = `PAGE XML: ${data.num_lines} lines`;
    }
  });

  // Highlight line on click from transcription panel
  on('highlight-line', ({ index }) => highlightBbox(index));

  // Click on canvas → highlight the clicked bbox and emit highlight-line
  const canvas = $('overlay-canvas');
  canvas.addEventListener('click', e => {
    if (currentBboxes.length === 0) return;

    const img = $('page-image');
    // Scale factor: natural image coords / displayed canvas coords
    const scaleX = img.naturalWidth / img.clientWidth;
    const scaleY = img.naturalHeight / img.clientHeight;

    const rect = canvas.getBoundingClientRect();
    const clickX = (e.clientX - rect.left) * scaleX;
    const clickY = (e.clientY - rect.top) * scaleY;

    // First box containing the click wins.
    for (let i = 0; i < currentBboxes.length; i++) {
      const [x1, y1, x2, y2] = currentBboxes[i];
      if (clickX >= x1 && clickX <= x2 && clickY >= y1 && clickY <= y2) {
        emit('highlight-line', { index: i });
        break;
      }
    }
  });
}
|
| 146 |
+
|
| 147 |
+
/**
 * Upload an image or PDF to POST /api/image/upload and show it in the viewer.
 *
 * A PDF response is handed to the batch panel via 'pdf-pages-ready' and
 * nothing is displayed. For an image, `state.imageId` / `state.imageInfo` are
 * updated, the image and overlay canvas are set up and 'image-uploaded' is
 * emitted. Errors are reported in #image-info and as a toast; the function
 * itself does not reject on upload failure.
 *
 * @param {File|Blob} file - file to upload
 */
async function uploadFile(file) {
  const formData = new FormData();
  formData.append('file', file);

  $('image-info').textContent = 'Uploading...';

  try {
    const resp = await fetch('/api/image/upload', {
      method: 'POST',
      body: formData,
    });
    if (!resp.ok) {
      // The error body may not be JSON (e.g. an HTML page from a proxy); fall
      // back to the HTTP status instead of surfacing a JSON parse error.
      const err = await resp.json().catch(() => ({ detail: resp.statusText }));
      throw new Error(err?.detail || resp.statusText || `HTTP ${resp.status}`);
    }
    const data = await resp.json();

    // PDF: redirect all pages to batch panel
    if (data.is_pdf) {
      $('image-info').textContent = `PDF: ${data.num_pages} page(s) — added to batch queue`;
      emit('pdf-pages-ready', data);
      return;
    }

    state.imageId = data.image_id;
    state.imageInfo = data;

    // Display image — show container, hide placeholder
    const img = $('page-image');
    // Register the handler before starting the load so it cannot be missed.
    img.onload = () => {
      // Size the canvas to the image and fit the zoom once it has loaded.
      const canvas = $('overlay-canvas');
      canvas.width = img.naturalWidth;
      canvas.height = img.naturalHeight;
      fitZoom(); // sets img.style.width/height and canvas display size
    };
    img.src = `/api/image/${data.image_id}`;
    $('image-container').classList.remove('hidden');
    $('viewer-placeholder').classList.add('hidden');

    $('image-info').textContent = `${data.filename} (${data.width}×${data.height})`;
    // Show XML upload row
    $('xml-upload-row').classList.remove('hidden');
    $('xml-status').textContent = 'No PAGE XML';
    $('xml-status').classList.remove('xml-ok');
    emit('image-uploaded', data);
  } catch (err) {
    $('image-info').textContent = `Error: ${err.message}`;
    toast(`Upload failed: ${err.message}`, 'error', 7000);
  }
}
|
| 199 |
+
|
| 200 |
+
/**
 * Upload a PAGE XML file for the currently loaded image.
 *
 * If no image id is set on `state` yet, the upload is deferred by registering
 * a handler for the next `image-uploaded` event. Progress and errors are
 * shown in `#xml-status`; on success `xml-uploaded` is emitted.
 *
 * @param {File} file - XML file sent as the `file` multipart field.
 * @returns {Promise<void>}
 */
async function uploadXml(file) {
  if (!state.imageId) {
    // Will retry after image upload finishes
    on('image-uploaded', () => uploadXml(file), { once: true });
    return;
  }
  const xmlStatus = $('xml-status');
  xmlStatus.textContent = 'Uploading XML...';
  xmlStatus.classList.remove('xml-ok');
  try {
    const formData = new FormData();
    formData.append('file', file);
    const resp = await fetch(`/api/image/${state.imageId}/xml`, {
      method: 'POST',
      body: formData,
    });
    if (!resp.ok) {
      // The error body is not guaranteed to be JSON; fall back to the HTTP
      // status instead of surfacing a JSON parse error to the user.
      const err = await resp.json().catch(() => null);
      const detail = err?.detail;
      let message = resp.statusText || `HTTP ${resp.status}`;
      if (detail) {
        message = typeof detail === 'string' ? detail : JSON.stringify(detail);
      }
      throw new Error(message);
    }
    xmlStatus.textContent = `✓ ${file.name}`;
    xmlStatus.classList.add('xml-ok');
    emit('xml-uploaded', { filename: file.name });
  } catch (err) {
    xmlStatus.textContent = `XML error: ${err.message}`;
  }
}
|
| 227 |
+
|
| 228 |
+
// Overlay state remembered from the most recent drawBboxes() call so the
// overlay can be redrawn with a different highlighted line.
let currentBboxes = [];
let currentRegions = [];

// Outline colours for regions; drawBboxes cycles through them by region index.
const REGION_COLORS = [
  'rgba(255, 160, 30, 0.55)', // orange
  'rgba( 46, 213, 115, 0.55)', // green
  'rgba(232, 65, 24, 0.55)', // red
  'rgba( 52, 172, 224, 0.55)', // blue
  'rgba(162, 16, 213, 0.55)', // purple
  'rgba(255, 211, 42, 0.55)', // yellow
  'rgba( 18, 203, 196, 0.55)', // teal
  'rgba(253, 89, 166, 0.55)', // pink
];
|
| 242 |
+
|
| 243 |
+
/**
 * Redraw the overlay canvas: dashed region outlines with labels first, then
 * line boxes on top, optionally highlighting one line.
 *
 * The given boxes/regions are remembered in the module-level
 * `currentBboxes` / `currentRegions` so a later call can redraw them.
 *
 * @param {Array<number[]>|null} bboxes - Line boxes as `[x1, y1, x2, y2]`;
 *   `null`/`undefined` is treated as "no boxes".
 * @param {number} [highlightIndex=-1] - Index of the box to emphasise, -1 for none.
 * @param {Array<{bbox: number[], num_lines: number}>|null} [regions=[]] -
 *   Regions to outline; `null`/`undefined` is treated as "no regions".
 */
function drawBboxes(bboxes, highlightIndex = -1, regions = []) {
  // Default parameters only cover `undefined`; normalise `null` too so the
  // remembered state is always an array (other code reads `.length` on it).
  const lineBoxes = bboxes ?? [];
  const regionList = regions ?? [];
  currentBboxes = lineBoxes;
  currentRegions = regionList;

  const canvas = $('overlay-canvas');
  const img = $('page-image');
  const ctx = canvas.getContext('2d');

  // Keep canvas display size in sync with zoom-controlled img size
  canvas.style.width = img.style.width || img.clientWidth + 'px';
  canvas.style.height = img.style.height || img.clientHeight + 'px';

  ctx.clearRect(0, 0, canvas.width, canvas.height);

  // Draw region outlines first (underneath line boxes)
  regionList.forEach((r, ri) => {
    const [x1, y1, x2, y2] = r.bbox;
    const color = REGION_COLORS[ri % REGION_COLORS.length];
    ctx.strokeStyle = color;
    ctx.lineWidth = 2.5;
    ctx.setLineDash([8, 4]);
    ctx.strokeRect(x1, y1, x2 - x1, y2 - y1);
    ctx.setLineDash([]);
    // Subtle fill: same colour with the alpha component swapped out
    ctx.fillStyle = color.replace('0.55', '0.07');
    ctx.fillRect(x1, y1, x2 - x1, y2 - y1);
    // Region label
    ctx.fillStyle = color.replace('0.55', '0.9');
    ctx.font = 'bold 13px sans-serif';
    ctx.fillText(`R${ri + 1} (${r.num_lines} lines)`, x1 + 4, y1 + 16);
  });

  // Draw line boxes on top
  for (let i = 0; i < lineBoxes.length; i++) {
    const [x1, y1, x2, y2] = lineBoxes[i];
    const isHighlighted = i === highlightIndex;

    ctx.strokeStyle = isHighlighted ? '#e94560' : 'rgba(58, 134, 255, 0.6)';
    ctx.lineWidth = isHighlighted ? 3 : 1.5;
    ctx.strokeRect(x1, y1, x2 - x1, y2 - y1);

    if (isHighlighted) {
      ctx.fillStyle = 'rgba(233, 69, 96, 0.1)';
      ctx.fillRect(x1, y1, x2 - x1, y2 - y1);
    }
  }
}
|
| 289 |
+
|
| 290 |
+
/**
 * Redraw the remembered boxes and regions with line `index` highlighted.
 * Does nothing when no line boxes are currently stored.
 *
 * @param {number} index - Index of the line box to highlight.
 */
function highlightBbox(index) {
  if (currentBboxes.length === 0) return;
  drawBboxes(currentBboxes, index, currentRegions);
}
|
web/static/components/transcription-panel.js
ADDED
|
@@ -0,0 +1,482 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* Transcription Panel — SSE progress, results, export
|
| 3 |
+
*/
|
| 4 |
+
|
| 5 |
+
import { state, emit, on, toast } from '../app.js';
|
| 6 |
+
|
| 7 |
+
// Shorthand for document.getElementById.
const $ = (id) => document.getElementById(id);

// ── Font selector ───────────────────────────────────────────────────────
// localStorage key under which the chosen results font is remembered.
const LS_FONT = 'polyscriptor_results_font';

// Fonts offered in the selector. `value` is the CSS family name ('' keeps the
// stylesheet default); `gf` is the family parameter used when the font has to
// be fetched from Google Fonts.
const FONTS = [
  { label: 'Monospace (default)', value: '' },
  { label: 'Monomakh Unicode ✦', value: 'Monomakh', local: true },
  { label: 'Old Standard TT', value: 'Old Standard TT', gf: 'Old+Standard+TT' },
  { label: 'Noto Serif', value: 'Noto Serif', gf: 'Noto+Serif' },
  { label: 'Crimson Pro', value: 'Crimson Pro', gf: 'Crimson+Pro' },
  { label: 'IM Fell English', value: 'IM Fell English', gf: 'IM+Fell+English' },
];

// Stylesheet URLs that have already been injected into <head>.
const _loadedFonts = new Set();
|
| 22 |
+
|
| 23 |
+
/**
 * Inject the Google Fonts stylesheet for a family, at most once per URL.
 *
 * @param {string} gfParam - Value for the `family` query parameter,
 *   e.g. 'Noto+Serif'.
 */
function _loadGoogleFont(gfParam) {
  const href = `https://fonts.googleapis.com/css2?family=${gfParam}&display=swap`;
  if (!_loadedFonts.has(href)) {
    const sheet = document.createElement('link');
    sheet.rel = 'stylesheet';
    sheet.href = href;
    document.head.appendChild(sheet);
    _loadedFonts.add(href);
  }
}
|
| 32 |
+
|
| 33 |
+
/**
 * Apply one of the FONTS entries to the results view by setting (or clearing)
 * the `--font-results` custom property on the document root. Values that are
 * not listed in FONTS are ignored.
 *
 * @param {string} value - The `value` of the FONTS entry to apply.
 */
function applyFont(value) {
  const font = FONTS.find((entry) => entry.value === value);
  if (!font) return;

  if (font.gf) _loadGoogleFont(font.gf);

  const rootStyle = document.documentElement.style;
  if (!font.value) {
    // Empty value: fall back to whatever the stylesheet defines.
    rootStyle.removeProperty('--font-results');
    return;
  }
  rootStyle.setProperty('--font-results', `"${font.value}", Georgia, serif`);
}
|
| 44 |
+
|
| 45 |
+
/**
 * Wire up the transcription panel: confidence slider, search box, font
 * selector, column-layout toggle, the SSE/transcription event handlers that
 * drive the progress bar and results list, and the copy/export buttons.
 *
 * Call once after the DOM is available; all state is kept in this closure
 * and on the shared `state` object.
 */
export function initTranscriptionPanel() {
  let _transcribeStart = null; // timestamp (ms) of the first progress event
  let _numRegions = 1;
  let _columnMode = false;

  // Confidence threshold slider
  const slider = $('conf-threshold');
  const sliderVal = $('conf-threshold-val');
  slider.addEventListener('input', () => {
    const threshold = parseInt(slider.value, 10);
    sliderVal.textContent = threshold + '%';
    applyConfidenceFilter(threshold);
  });

  // Search / filter
  const searchInput = $('results-search');
  searchInput.addEventListener('input', () => applySearch(searchInput.value));
  // Clear search on new transcription
  function resetSearch() {
    searchInput.value = '';
    $('results-search-row').classList.add('hidden');
    $('results-search-count').textContent = '';
  }

  // Font selector — populate, restore, handle changes
  const fontSel = $('font-select');
  for (const f of FONTS) {
    const o = document.createElement('option');
    o.value = f.value;
    o.textContent = f.label;
    fontSel.appendChild(o);
  }
  const savedFont = (() => { try { return localStorage.getItem(LS_FONT) || ''; } catch { return ''; } })();
  fontSel.value = savedFont;
  if (savedFont) applyFont(savedFont);
  fontSel.addEventListener('change', () => {
    applyFont(fontSel.value);
    try { localStorage.setItem(LS_FONT, fontSel.value); } catch { /* private mode */ }
  });

  // Column layout toggle
  $('btn-col-layout').addEventListener('click', () => {
    _columnMode = !_columnMode;
    $('btn-col-layout').classList.toggle('active', _columnMode);
    if (_columnMode) renderAllColumns();
    else renderAllFlat();
  });

  on('transcription-start', () => {
    state.lines = [];
    _transcribeStart = null;
    _numRegions = 1;
    _columnMode = false;
    $('btn-col-layout').classList.add('hidden');
    $('btn-col-layout').classList.remove('active');
    $('transcription-lines').innerHTML = '';
    $('transcription-lines').classList.remove('col-layout');
    $('progress-container').classList.remove('hidden');
    $('results-footer').classList.add('hidden');
    $('conf-filter-row').classList.add('hidden');
    resetSearch();
    $('progress-fill').style.width = '0%';
    $('progress-fill').style.background = ''; // reset error colour
    $('progress-text').textContent = 'Segmenting...';
  });

  // Highlight line in transcription panel when a bbox is clicked (or line clicked)
  on('highlight-line', ({ index }) => {
    const container = $('transcription-lines');
    container.querySelectorAll('.line-active').forEach(el => el.classList.remove('line-active'));
    const target = container.querySelector(`[data-index="${index}"]`);
    if (target) {
      target.classList.add('line-active');
      target.scrollIntoView({ block: 'nearest', behavior: 'smooth' });
    }
  });

  on('sse-status', data => {
    $('progress-text').textContent = data.message;
  });

  on('sse-segmentation', data => {
    if (data.source === 'page') {
      $('progress-text').textContent = 'Processing full page...';
    } else {
      $('progress-text').textContent = `${data.num_lines} lines found. Transcribing...`;
    }
  });

  on('sse-progress', data => {
    // Guard total === 0 so the bar never receives "NaN%".
    const pct = data.total > 0 ? Math.round((data.current / data.total) * 100) : 0;
    $('progress-fill').style.width = pct + '%';

    // ETA
    const now = Date.now();
    if (_transcribeStart === null) _transcribeStart = now;
    const elapsed = (now - _transcribeStart) / 1000;
    // On the first event elapsed is 0; dividing would give an infinite rate
    // and a bogus "~0s left", so no ETA is shown until time has passed.
    const rate = elapsed > 0 ? data.current / elapsed : 0; // lines/s
    const remaining = rate > 0
      ? Math.max(0, Math.round((data.total - data.current) / rate))
      : null;
    const etaStr = remaining != null
      ? ` · ~${remaining < 60 ? remaining + 's' : Math.round(remaining / 60) + 'min'} left`
      : '';
    let tokenStr = '';
    if (data.token_usage) {
      const tu = data.token_usage;
      const parts = [];
      if (tu.prompt_tokens != null) parts.push(`in:${tu.prompt_tokens}`);
      if (tu.output_tokens != null) parts.push(`out:${tu.output_tokens}`);
      if (tu.thinking_tokens != null && tu.thinking_tokens > 0) parts.push(`think:${tu.thinking_tokens}`);
      if (parts.length) tokenStr = ` | ${parts.join(' ')} tok`;
    }
    $('progress-text').textContent = `${data.current} / ${data.total} lines${etaStr}${tokenStr}`;

    // A progress event without a line payload only updates the bar.
    if (data.line) {
      _numRegions = Math.max(_numRegions, (data.line.region ?? 0) + 1);
      state.lines.push(data.line);
      appendLine(data.line);
    }
  });

  on('sse-complete', data => {
    $('progress-container').classList.add('hidden');
    $('results-footer').classList.remove('hidden');
    $('btn-export-xml').classList.toggle('hidden', !!data.browser_direct);
    let summary = `${data.lines.length} lines in ${data.total_time_s}s (${data.engine})`;
    if (data.token_usage) {
      const tu = data.token_usage;
      const parts = [];
      if (tu.prompt_tokens != null) parts.push(`in: ${tu.prompt_tokens}`);
      if (tu.output_tokens != null) parts.push(`out: ${tu.output_tokens}`);
      if (tu.thinking_tokens != null && tu.thinking_tokens > 0)
        parts.push(`think: ${tu.thinking_tokens}`);
      if (parts.length) summary += ` | tokens: ${parts.join(', ')}`;
    }
    $('results-summary').textContent = summary;
    // Show confidence filter if any line has confidence data
    if (state.lines.some(l => l.confidence != null)) {
      $('conf-filter-row').classList.remove('hidden');
      slider.value = 0;
      sliderVal.textContent = '0%';
    }
    // Show search if there are results
    if (state.lines.length > 0) {
      $('results-search-row').classList.remove('hidden');
    }
    // Show column layout toggle if multiple regions detected
    if (_numRegions > 1) {
      $('btn-col-layout').classList.remove('hidden');
    }
    emit('transcription-complete', data);
  });

  on('sse-cancelled', () => {
    $('progress-text').textContent = 'Cancelled';
    $('progress-fill').style.width = '0%';
    // Show footer if we have partial results
    if (state.lines.length > 0) {
      $('results-footer').classList.remove('hidden');
      $('results-summary').textContent = `Cancelled — ${state.lines.length} lines transcribed`;
    }
    emit('transcription-complete', {});
  });

  on('sse-error', data => {
    $('progress-text').textContent = `Error: ${data.message}`;
    $('progress-fill').style.width = '0%';
    $('progress-fill').style.background = 'var(--danger)';
    emit('transcription-complete', {});
  });

  on('transcription-error', data => {
    $('progress-text').textContent = `Error: ${data.message}`;
    emit('transcription-complete', {});
  });

  // Also hide Export XML when a new transcription starts
  on('transcription-start', () => {
    $('btn-export-xml').classList.add('hidden');
  });

  $('btn-copy-text').addEventListener('click', copyText);
  $('btn-export-txt').addEventListener('click', exportTxt);
  $('btn-export-csv').addEventListener('click', exportCsv);
  $('btn-export-xml').addEventListener('click', exportXml);
}
|
| 228 |
+
|
| 229 |
+
/**
 * Re-render every entry of `state.lines` as a single flat list, leaving
 * column layout.
 */
function renderAllFlat() {
  const host = $('transcription-lines');
  host.innerHTML = '';
  host.classList.remove('col-layout');
  state.lines.forEach((entry) => {
    appendLine(entry);
  });
}
|
| 235 |
+
|
| 236 |
+
/**
 * Re-render `state.lines` grouped by `line.region` (missing region counts as
 * 0): one `.region-column` per region index, each with a header showing the
 * line count and a button that removes the column from the DOM.
 */
function renderAllColumns() {
  const host = $('transcription-lines');
  host.innerHTML = '';
  host.classList.add('col-layout');

  const regionOf = (entry) => entry.region ?? 0;

  // One bucket per region index from 0 up to the highest region seen.
  let lastRegion = 0;
  state.lines.forEach((entry) => {
    lastRegion = Math.max(lastRegion, regionOf(entry));
  });
  const buckets = Array.from({ length: lastRegion + 1 }, () => []);
  state.lines.forEach((entry) => {
    buckets[regionOf(entry)].push(entry);
  });

  buckets.forEach((entries, regionIdx) => {
    const column = document.createElement('div');
    column.className = 'region-column';

    const header = document.createElement('div');
    header.className = 'region-col-header';

    const caption = document.createElement('span');
    caption.textContent = `Column ${regionIdx + 1} (${entries.length})`;
    header.appendChild(caption);

    const hideBtn = document.createElement('button');
    hideBtn.className = 'region-col-close';
    hideBtn.textContent = '×';
    hideBtn.title = 'Hide this column';
    hideBtn.addEventListener('click', (evt) => {
      evt.stopPropagation();
      column.remove();
    });
    header.appendChild(hideBtn);

    column.appendChild(header);
    entries.forEach((entry) => {
      appendLine(entry, column);
    });
    host.appendChild(column);
  });
}
|
| 268 |
+
|
| 269 |
+
/**
 * Build the DOM row for one transcribed line and append it.
 *
 * The row shows the 1-based line number, the text (double-click to edit
 * inline; Enter or blur saves, Escape reverts), an optional confidence badge
 * and an optional collapsible "reasoning" block. A single click emits
 * `highlight-line` for the line's index.
 *
 * @param {{index: number, text: string, confidence?: number|null,
 *          thinking_text?: string}} line - Line to render.
 * @param {Element|null} [container=null] - Parent to append to; defaults to
 *   `#transcription-lines`, which is also the only container auto-scrolled.
 */
function appendLine(line, container = null) {
  const mainContainer = $('transcription-lines');
  const target = container || mainContainer;

  // Entry that inline edits read from and write to. Use the state.lines slot
  // when line.index addresses one, otherwise fall back to `line` itself so an
  // index that does not match an array position cannot throw on save.
  const storedLine = () => state.lines[line.index] ?? line;

  const div = document.createElement('div');
  div.className = 'line-result';
  div.dataset.index = line.index;
  if (line.confidence != null) {
    div.dataset.confidence = Math.round(line.confidence * 100);
  }

  // Line number
  const numSpan = document.createElement('span');
  numSpan.className = 'line-num';
  numSpan.textContent = line.index + 1;

  // Editable text span
  const textSpan = document.createElement('span');
  textSpan.className = 'line-text';
  textSpan.textContent = line.text;

  // Confidence badge
  let confSpan = null;
  if (line.confidence != null) {
    const pct = Math.round(line.confidence * 100);
    const cls = pct >= 90 ? 'conf-high' : pct >= 75 ? 'conf-mid' : 'conf-low';
    confSpan = document.createElement('span');
    confSpan.className = `confidence ${cls}`;
    confSpan.textContent = pct + '%';
  }

  div.appendChild(numSpan);
  div.appendChild(textSpan);
  if (confSpan) div.appendChild(confSpan);

  // Thinking text (Gemini reasoning) — collapsible per line
  if (line.thinking_text) {
    const details = document.createElement('details');
    details.className = 'thinking-block';
    const summary = document.createElement('summary');
    summary.className = 'thinking-toggle';
    summary.textContent = 'reasoning';
    const pre = document.createElement('pre');
    pre.className = 'thinking-text';
    pre.textContent = line.thinking_text;
    details.appendChild(summary);
    details.appendChild(pre);
    div.appendChild(details);
  }

  // Single click → highlight bbox on image
  div.addEventListener('click', () => {
    if (textSpan.contentEditable === 'true') return; // don't interfere while editing
    emit('highlight-line', { index: line.index });
  });

  // Double-click → start inline editing
  textSpan.addEventListener('dblclick', e => {
    e.stopPropagation();
    textSpan.contentEditable = 'true';
    textSpan.focus();
    // Select all text for easy replacement
    const range = document.createRange();
    range.selectNodeContents(textSpan);
    const sel = window.getSelection();
    sel.removeAllRanges();
    sel.addRange(range);
  });

  // Save on blur or Enter
  const saveEdit = () => {
    textSpan.contentEditable = 'false';
    const newText = textSpan.textContent;
    const stored = storedLine();
    // Compare against the same entry that is written, so repeated edits are
    // always detected relative to the currently stored text.
    if (newText !== stored.text) {
      stored.text = newText;
      div.classList.add('line-edited');
    }
  };
  textSpan.addEventListener('blur', saveEdit);
  textSpan.addEventListener('keydown', e => {
    if (e.key === 'Enter') { e.preventDefault(); saveEdit(); }
    if (e.key === 'Escape') {
      textSpan.textContent = storedLine().text; // revert
      textSpan.contentEditable = 'false';
    }
  });

  target.appendChild(div);
  // Auto-scroll only for the main flat container (not column sub-divs)
  if (target === mainContainer) {
    target.scrollTop = target.scrollHeight;
  }
}
|
| 360 |
+
|
| 361 |
+
/**
 * Dim every rendered line whose confidence percentage is below `threshold`.
 * Rows without a `data-confidence` attribute are treated as 100.
 *
 * @param {number} threshold - Minimum confidence in percent.
 */
function applyConfidenceFilter(threshold) {
  const rows = $('transcription-lines').querySelectorAll('.line-result');
  rows.forEach((row) => {
    const percent = parseInt(row.dataset.confidence ?? '100', 10);
    const belowThreshold = percent < threshold;
    row.classList.toggle('line-dimmed', belowThreshold);
  });
}
|
| 367 |
+
|
| 368 |
+
/**
 * Filter the rendered lines by a case-insensitive substring query.
 *
 * Non-matching rows get the `line-hidden` class; in matching rows the first
 * occurrence is wrapped in a <mark>. An empty query restores plain text on
 * every row. The match count is written to `#results-search-count`.
 *
 * @param {string} query - Raw text from the search box.
 */
function applySearch(query) {
  const rows = $('transcription-lines').querySelectorAll('.line-result');
  const needle = query.trim().toLowerCase();
  let hits = 0;

  rows.forEach((row) => {
    const textEl = row.querySelector('.line-text');
    if (!textEl) return;

    // Use state.lines for the canonical text (survives inline edits and search markup)
    const position = parseInt(row.dataset.index ?? '-1', 10);
    let source = textEl.textContent;
    if (position >= 0 && state.lines[position]) {
      source = state.lines[position].text;
    }

    if (!needle) {
      // Clear search: restore plain text, remove hidden
      textEl.textContent = source;
      row.classList.remove('line-hidden');
      return;
    }

    const at = source.toLowerCase().indexOf(needle);
    if (at === -1) {
      row.classList.add('line-hidden');
      return;
    }

    row.classList.remove('line-hidden');
    hits++;

    // Highlight match with <mark> using safe DOM manipulation
    const end = at + needle.length;
    const mark = document.createElement('mark');
    mark.textContent = source.slice(at, end);
    textEl.textContent = '';
    textEl.appendChild(document.createTextNode(source.slice(0, at)));
    textEl.appendChild(mark);
    textEl.appendChild(document.createTextNode(source.slice(end)));
  });

  const suffix = hits !== 1 ? 'es' : '';
  $('results-search-count').textContent = needle ? `${hits} match${suffix}` : '';
}
|
| 412 |
+
|
| 413 |
+
// (escapeHtml no longer needed — we use textContent/DOM directly)
|
| 414 |
+
|
| 415 |
+
/**
 * Copy all transcribed lines (newline-separated) to the clipboard and flash
 * "Copied!" on the button for 1.5 s. Shows an error toast when that fails.
 *
 * @returns {Promise<void>}
 */
async function copyText() {
  if (state.lines.length === 0) return;

  const joined = state.lines.map((entry) => entry.text).join('\n');
  try {
    await navigator.clipboard.writeText(joined);
    const button = $('btn-copy-text');
    const previousLabel = button.textContent;
    button.textContent = 'Copied!';
    setTimeout(() => {
      button.textContent = previousLabel;
    }, 1500);
  } catch {
    toast('Clipboard not available — use Export TXT instead', 'error');
  }
}
|
| 428 |
+
|
| 429 |
+
/**
 * Download all transcribed lines as `transcription.txt`, one line per row.
 * Does nothing when there are no lines.
 */
function exportTxt() {
  if (state.lines.length === 0) return;
  const body = state.lines.map((entry) => entry.text).join('\n');
  downloadFile('transcription.txt', body, 'text/plain');
}
|
| 434 |
+
|
| 435 |
+
/**
 * Download all transcribed lines as `transcription.csv` with the columns
 * Line, Text, Confidence, X1, Y1, X2, Y2. Text is double-quoted with embedded
 * quotes doubled; missing confidence/bbox values are left empty.
 * Does nothing when there are no lines.
 */
function exportCsv() {
  if (state.lines.length === 0) return;

  const toRow = (l) => {
    const confidence = l.confidence != null ? l.confidence.toFixed(4) : '';
    const coords = l.bbox ? l.bbox.join(',') : ',,,';
    const quotedText = `"${l.text.replace(/"/g, '""')}"`;
    return `${l.index + 1},${quotedText},${confidence},${coords}`;
  };

  const body = state.lines.map((l) => toRow(l)).join('\n');
  downloadFile('transcription.csv', `Line,Text,Confidence,X1,Y1,X2,Y2\n${body}`, 'text/csv');
}
|
| 445 |
+
|
| 446 |
+
/**
 * Trigger a browser download of in-memory content.
 *
 * @param {string} filename - Suggested name for the downloaded file.
 * @param {BlobPart} content - Data to download (string, Blob, buffer, ...).
 * @param {string} mime - MIME type for the generated Blob.
 */
function downloadFile(filename, content, mime) {
  const blob = new Blob([content], { type: mime });
  const url = URL.createObjectURL(blob);
  const a = document.createElement('a');
  a.href = url;
  a.download = filename;
  // Attach the anchor while clicking; some browsers ignore click() on a
  // detached element.
  a.style.display = 'none';
  document.body.appendChild(a);
  try {
    a.click();
  } finally {
    a.remove();
    // Revoke after the current task instead of synchronously, so the URL is
    // still valid when the browser starts reading it for the download.
    setTimeout(() => URL.revokeObjectURL(url), 0);
  }
}
|
| 455 |
+
|
| 456 |
+
/**
 * Extract the file name from a Content-Disposition header value.
 * Accepts the RFC 5987 form (`filename*=charset'lang'percent-encoded`), the
 * quoted form (`filename="x"`) and the unquoted form (`filename=x`).
 *
 * @param {string|null} header - Header value, or null when absent.
 * @param {string} fallback - Name returned when none can be extracted.
 * @returns {string}
 */
function _filenameFromContentDisposition(header, fallback) {
  if (!header) return fallback;

  const extended = /(?:^|;)\s*filename\*\s*=\s*[^';]*'[^';]*'([^;]+)/i.exec(header);
  if (extended) {
    try {
      return decodeURIComponent(extended[1].trim());
    } catch {
      // Malformed percent-encoding: fall through to the plain parameter.
    }
  }

  const plain = /(?:^|;)\s*filename\s*=\s*(?:"([^"]+)"|([^;]+))/i.exec(header);
  const name = plain?.[1] ?? plain?.[2]?.trim();
  return name || fallback;
}

/**
 * Request the PAGE XML export for the current image and download it.
 *
 * Does nothing when no image id is set on `state`. The file name is taken
 * from the response's Content-Disposition header when present, otherwise
 * 'transcription.xml'. Failures are reported via toast; nothing is thrown.
 *
 * @returns {Promise<void>}
 */
async function exportXml() {
  if (!state.imageId) return;
  try {
    const resp = await fetch(`/api/image/${state.imageId}/export-xml`, { method: 'POST' });
    if (!resp.ok) {
      const err = await resp.json().catch(() => ({ detail: resp.statusText }));
      toast(`XML export failed: ${err.detail || resp.statusText}`, 'error');
      return;
    }
    const blob = await resp.blob();
    // Use filename from Content-Disposition if provided, else fall back
    const filename = _filenameFromContentDisposition(
      resp.headers.get('Content-Disposition'),
      'transcription.xml',
    );
    // Reuse the shared download helper rather than duplicating its logic.
    downloadFile(filename, blob, blob.type);
  } catch (err) {
    toast(`XML export error: ${err.message}`, 'error');
  }
}
|
web/static/fonts/MonomakhUnicode-Regular.woff2
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a07ebc9c97abc54866b6c8f35d6057f861f84a760127349f28c47c069a9cfea4
|
| 3 |
+
size 86480
|
web/static/index.html
ADDED
|
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>Polyscriptor HTR</title>
|
| 7 |
+
<link rel="stylesheet" href="/static/app.css">
|
| 8 |
+
</head>
|
| 9 |
+
<body>
|
| 10 |
+
<!-- Header -->
|
| 11 |
+
<header id="header">
|
| 12 |
+
<div class="header-left">
|
| 13 |
+
<span class="header-logo">⬡</span>
|
| 14 |
+
<h1>Polyscriptor <span class="header-sub">HTR</span></h1>
|
| 15 |
+
</div>
|
| 16 |
+
<div class="header-right">
|
| 17 |
+
<div id="gpu-status" class="gpu-widget"></div>
|
| 18 |
+
<button id="btn-help" class="btn-icon" title="Help">?</button>
|
| 19 |
+
</div>
|
| 20 |
+
</header>
|
| 21 |
+
|
| 22 |
+
<!-- Main 3-column layout -->
|
| 23 |
+
<main id="app">
|
| 24 |
+
<!-- Left: Engine + Image controls -->
|
| 25 |
+
<aside id="engine-panel" class="panel" data-panel="settings">
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
<section class="panel-section">
|
| 29 |
+
<h2>HTR Engine</h2>
|
| 30 |
+
<label for="engine-select">Engine</label>
|
| 31 |
+
<select id="engine-select" disabled>
|
| 32 |
+
<option>Loading engines…</option>
|
| 33 |
+
</select>
|
| 34 |
+
<p id="engine-description" class="muted"></p>
|
| 35 |
+
<div id="config-form"></div>
|
| 36 |
+
<div id="kraken-preset-row" class="hidden" style="margin-top:8px">
|
| 37 |
+
<label for="kraken-preset-select" style="display:block;font-size:0.78rem;margin-bottom:3px">Kraken Model Preset</label>
|
| 38 |
+
<select id="kraken-preset-select" style="width:100%">
|
| 39 |
+
<option value="">Loading presets…</option>
|
| 40 |
+
</select>
|
| 41 |
+
<span id="kraken-preset-status" class="muted" style="font-size:0.72rem;display:block;margin-top:3px"></span>
|
| 42 |
+
</div>
|
| 43 |
+
<button id="btn-load-model" class="btn btn-primary" disabled>Load Model</button>
|
| 44 |
+
<div id="engine-status" class="status-badge hidden"></div>
|
| 45 |
+
</section>
|
| 46 |
+
|
| 47 |
+
<hr>
|
| 48 |
+
|
| 49 |
+
<section class="panel-section">
|
| 50 |
+
<h2>Image</h2>
|
| 51 |
+
<div id="upload-area" class="upload-area">
|
| 52 |
+
<svg class="upload-icon" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1.5">
|
| 53 |
+
<path stroke-linecap="round" stroke-linejoin="round"
|
| 54 |
+
d="M3 16.5v2.25A2.25 2.25 0 005.25 21h13.5A2.25 2.25 0 0021 18.75V16.5m-13.5-9L12 3m0 0l4.5 4.5M12 3v13.5"/>
|
| 55 |
+
</svg>
|
| 56 |
+
<p>Drop image or PDF, or click to browse</p>
|
| 57 |
+
<input type="file" id="file-input" accept="image/*,.pdf" multiple hidden>
|
| 58 |
+
</div>
|
| 59 |
+
<p id="image-info" class="muted"></p>
|
| 60 |
+
<div id="batch-queue-section" class="hidden">
|
| 61 |
+
<div class="batch-queue-header">
|
| 62 |
+
<span class="section-label">Queue</span>
|
| 63 |
+
<span id="batch-overall-progress" class="batch-overall-progress hidden"></span>
|
| 64 |
+
</div>
|
| 65 |
+
<div id="batch-list"></div>
|
| 66 |
+
<div class="batch-options-row">
|
| 67 |
+
<label class="checkbox-label" title="Use PAGE XML segmentation if a matching .xml file was uploaded for this image">
|
| 68 |
+
<input type="checkbox" id="batch-use-pagexml" checked>
|
| 69 |
+
Use PAGE XML
|
| 70 |
+
</label>
|
| 71 |
+
<label class="checkbox-label" title="Skip images that have already been transcribed in this session">
|
| 72 |
+
<input type="checkbox" id="batch-resume">
|
| 73 |
+
Resume
|
| 74 |
+
</label>
|
| 75 |
+
</div>
|
| 76 |
+
<div class="btn-row" style="margin-top:6px">
|
| 77 |
+
<button id="btn-process-batch" class="btn btn-primary btn-small">Process All</button>
|
| 78 |
+
<button id="btn-clear-batch" class="btn btn-small btn-outline">Clear</button>
|
| 79 |
+
</div>
|
| 80 |
+
<div id="batch-export-row" class="btn-row hidden" style="margin-top:6px">
|
| 81 |
+
<button id="btn-export-batch-txt" class="btn btn-small">All TXT</button>
|
| 82 |
+
<button id="btn-export-batch-csv" class="btn btn-small">All CSV</button>
|
| 83 |
+
<button id="btn-export-batch-txt-zip" class="btn btn-small btn-primary">Download ZIP (TXT)</button>
|
| 84 |
+
<button id="btn-export-batch-thinking-zip" class="btn btn-small btn-primary">Download ZIP (Thinking)</button>
|
| 85 |
+
<button id="btn-export-batch-xml" class="btn btn-small btn-primary">Download ZIP (XML)</button>
|
| 86 |
+
</div>
|
| 87 |
+
</div>
|
| 88 |
+
<div id="xml-upload-row" class="xml-row hidden">
|
| 89 |
+
<span id="xml-status" class="muted">No PAGE XML</span>
|
| 90 |
+
<label class="btn btn-small btn-outline" for="xml-input">
|
| 91 |
+
Upload XML
|
| 92 |
+
<input type="file" id="xml-input" accept=".xml" hidden multiple>
|
| 93 |
+
</label>
|
| 94 |
+
</div>
|
| 95 |
+
</section>
|
| 96 |
+
|
| 97 |
+
<hr>
|
| 98 |
+
|
| 99 |
+
<section class="panel-section" id="seg-controls">
|
| 100 |
+
<h2>Segmentation</h2>
|
| 101 |
+
<label for="seg-method">Method</label>
|
| 102 |
+
<select id="seg-method">
|
| 103 |
+
<option value="kraken" selected>Kraken Classical</option>
|
| 104 |
+
<option value="hpp">HPP / projection profile fallback</option>
|
| 105 |
+
<option value="kraken-blla" disabled>Kraken Neural / blla (server only)</option>
|
| 106 |
+
</select>
|
| 107 |
+
|
| 108 |
+
<label for="seg-device">Device</label>
|
| 109 |
+
<select id="seg-device">
|
| 110 |
+
<option value="cpu">CPU</option>
|
| 111 |
+
<option value="cuda:0">GPU 0</option>
|
| 112 |
+
<option value="cuda:1">GPU 1</option>
|
| 113 |
+
</select>
|
| 114 |
+
|
| 115 |
+
<div id="blla-options" style="display:none">
|
| 116 |
+
<div style="display:flex;gap:12px;align-items:center;flex-wrap:wrap">
|
| 117 |
+
<div style="display:flex;flex-direction:column;gap:3px">
|
| 118 |
+
<label for="seg-max-columns">Max columns</label>
|
| 119 |
+
<input type="number" id="seg-max-columns" min="1" max="12" value="6" style="width:60px">
|
| 120 |
+
</div>
|
| 121 |
+
<div style="display:flex;flex-direction:column;gap:3px">
|
| 122 |
+
<label for="seg-split-width">Split width %</label>
|
| 123 |
+
<input type="number" id="seg-split-width" min="5" max="80" value="40" step="5" style="width:60px" title="Min region width (% of page) to trigger sub-column splitting. Lower = split narrower regions. Double pages: try 20.">
|
| 124 |
+
</div>
|
| 125 |
+
</div>
|
| 126 |
+
<div style="margin-top:6px">
|
| 127 |
+
<label for="seg-text-direction">Reading direction</label>
|
| 128 |
+
<select id="seg-text-direction" title="Controls column reading order. Use horizontal-rl for Arabic, Ottoman, Hebrew manuscripts.">
|
| 129 |
+
<option value="horizontal-lr">LTR (Latin, Cyrillic, …)</option>
|
| 130 |
+
<option value="horizontal-rl">RTL (Arabic, Ottoman, Hebrew, …)</option>
|
| 131 |
+
<option value="vertical-lr">Vertical LTR</option>
|
| 132 |
+
<option value="vertical-rl">Vertical RTL</option>
|
| 133 |
+
</select>
|
| 134 |
+
</div>
|
| 135 |
+
</div>
|
| 136 |
+
</section>
|
| 137 |
+
|
| 138 |
+
<div id="seg-regions-list" class="hidden"></div>
|
| 139 |
+
|
| 140 |
+
<div class="panel-footer">
|
| 141 |
+
<div class="btn-row footer-btn-row">
|
| 142 |
+
<button id="btn-segment" class="btn btn-outline" disabled title="Preview line segmentation without transcribing">Segment</button>
|
| 143 |
+
<button id="btn-transcribe" class="btn btn-accent" disabled>Transcribe</button>
|
| 144 |
+
</div>
|
| 145 |
+
</div>
|
| 146 |
+
</aside>
|
| 147 |
+
<div class="panel-resize-handle" id="resize-left" title="Drag to resize"></div>
|
| 148 |
+
|
| 149 |
+
<!-- Center: Image viewer -->
|
| 150 |
+
<section id="viewer-panel" class="panel" data-panel="image">
|
| 151 |
+
<!-- Zoom toolbar — only visible when image is loaded -->
|
| 152 |
+
<div id="zoom-toolbar" class="zoom-toolbar hidden">
|
| 153 |
+
<button class="zoom-btn" id="btn-zoom-out" title="Zoom out">−</button>
|
| 154 |
+
<span id="zoom-level" class="zoom-level">100%</span>
|
| 155 |
+
<button class="zoom-btn" id="btn-zoom-in" title="Zoom in">+</button>
|
| 156 |
+
<button class="zoom-btn zoom-fit" id="btn-zoom-fit" title="Fit to view">⊡</button>
|
| 157 |
+
<span class="zoom-toolbar-sep"></span>
|
| 158 |
+
<button class="btn btn-small btn-outline nav-btn" id="btn-nav-prev" title="Previous image (←)" disabled>‹ Prev</button>
|
| 159 |
+
<span id="batch-nav-label" class="batch-nav-label-toolbar"></span>
|
| 160 |
+
<button class="btn btn-small btn-outline nav-btn" id="btn-nav-next" title="Next image (→)" disabled>Next ›</button>
|
| 161 |
+
</div>
|
| 162 |
+
<!-- Scroll area fills remaining height -->
|
| 163 |
+
<div id="viewer-scroll">
|
| 164 |
+
<div id="viewer-placeholder" class="viewer-placeholder">
|
| 165 |
+
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1">
|
| 166 |
+
<rect x="3" y="3" width="18" height="18" rx="2"/>
|
| 167 |
+
<circle cx="8.5" cy="8.5" r="1.5"/>
|
| 168 |
+
<path stroke-linecap="round" stroke-linejoin="round" d="M21 15l-5-5L5 21"/>
|
| 169 |
+
</svg>
|
| 170 |
+
<p>Upload an image to begin</p>
|
| 171 |
+
</div>
|
| 172 |
+
<div id="image-container" class="hidden">
|
| 173 |
+
<img id="page-image">
|
| 174 |
+
<canvas id="overlay-canvas"></canvas>
|
| 175 |
+
</div>
|
| 176 |
+
</div>
|
| 177 |
+
</section>
|
| 178 |
+
<div class="panel-resize-handle" id="resize-right" title="Drag to resize"></div>
|
| 179 |
+
|
| 180 |
+
<!-- Right: Transcription results -->
|
| 181 |
+
<section id="results-panel" class="panel" data-panel="results">
|
| 182 |
+
<div class="results-header">
|
| 183 |
+
<div class="results-header-row">
|
| 184 |
+
<h2>Transcription</h2>
|
| 185 |
+
<div class="results-header-controls">
|
| 186 |
+
<select id="font-select" class="font-select" title="Transcription font"></select>
|
| 187 |
+
<button id="btn-col-layout" class="btn-icon hidden" title="Toggle column layout">⊞</button>
|
| 188 |
+
</div>
|
| 189 |
+
</div>
|
| 190 |
+
<div id="results-search-row" class="results-search-row hidden">
|
| 191 |
+
<input type="search" id="results-search" placeholder="Search lines…" autocomplete="off">
|
| 192 |
+
<span id="results-search-count" class="muted"></span>
|
| 193 |
+
</div>
|
| 194 |
+
<div id="conf-filter-row" class="conf-filter-row hidden">
|
| 195 |
+
<label>Min conf: <strong id="conf-threshold-val">0%</strong></label>
|
| 196 |
+
<input type="range" id="conf-threshold" min="0" max="100" value="0" step="5">
|
| 197 |
+
</div>
|
| 198 |
+
<div id="progress-container" class="hidden">
|
| 199 |
+
<div id="progress-bar"><div id="progress-fill"></div></div>
|
| 200 |
+
<div class="progress-row">
|
| 201 |
+
<p id="progress-text" class="muted">0 / 0 lines</p>
|
| 202 |
+
<button id="btn-cancel" class="btn btn-small hidden">Cancel</button>
|
| 203 |
+
</div>
|
| 204 |
+
</div>
|
| 205 |
+
</div>
|
| 206 |
+
<div id="transcription-lines"></div>
|
| 207 |
+
<div id="results-footer" class="hidden">
|
| 208 |
+
<p id="results-summary" class="muted"></p>
|
| 209 |
+
<div class="btn-row">
|
| 210 |
+
<button id="btn-copy-text" class="btn btn-small">Copy Text</button>
|
| 211 |
+
<button id="btn-export-txt" class="btn btn-small">TXT</button>
|
| 212 |
+
<button id="btn-export-csv" class="btn btn-small">CSV</button>
|
| 213 |
+
<button id="btn-export-xml" class="btn btn-small hidden">XML</button>
|
| 214 |
+
</div>
|
| 215 |
+
</div>
|
| 216 |
+
</section>
|
| 217 |
+
</main>
|
| 218 |
+
|
| 219 |
+
<!-- Mobile tab bar (visible < 700px) -->
|
| 220 |
+
<nav id="mobile-tabs">
|
| 221 |
+
<button class="tab-btn active" data-target="settings">
|
| 222 |
+
<svg viewBox="0 0 20 20" fill="currentColor"><path fill-rule="evenodd" d="M11.49 3.17c-.38-1.56-2.6-1.56-2.98 0a1.532 1.532 0 01-2.286.948c-1.372-.836-2.942.734-2.106 2.106.54.886.061 2.042-.947 2.287-1.561.379-1.561 2.6 0 2.978a1.532 1.532 0 01.947 2.287c-.836 1.372.734 2.942 2.106 2.106a1.532 1.532 0 012.287.947c.379 1.561 2.6 1.561 2.978 0a1.533 1.533 0 012.287-.947c1.372.836 2.942-.734 2.106-2.106a1.533 1.533 0 01.947-2.287c1.561-.379 1.561-2.6 0-2.978a1.532 1.532 0 01-.947-2.287c.836-1.372-.734-2.942-2.106-2.106a1.532 1.532 0 01-2.287-.947zM10 13a3 3 0 100-6 3 3 0 000 6z" clip-rule="evenodd"/></svg>
|
| 223 |
+
Settings
|
| 224 |
+
</button>
|
| 225 |
+
<button class="tab-btn" data-target="image">
|
| 226 |
+
<svg viewBox="0 0 20 20" fill="currentColor"><path fill-rule="evenodd" d="M4 3a2 2 0 00-2 2v10a2 2 0 002 2h12a2 2 0 002-2V5a2 2 0 00-2-2H4zm12 12H4l4-8 3 6 2-4 3 6z" clip-rule="evenodd"/></svg>
|
| 227 |
+
Image
|
| 228 |
+
</button>
|
| 229 |
+
<button class="tab-btn" data-target="results">
|
| 230 |
+
<svg viewBox="0 0 20 20" fill="currentColor"><path fill-rule="evenodd" d="M4 4a2 2 0 012-2h4.586A2 2 0 0112 2.586L15.414 6A2 2 0 0116 7.414V16a2 2 0 01-2 2H6a2 2 0 01-2-2V4zm2 6a1 1 0 011-1h6a1 1 0 110 2H7a1 1 0 01-1-1zm1 3a1 1 0 100 2h6a1 1 0 100-2H7z" clip-rule="evenodd"/></svg>
|
| 231 |
+
Results
|
| 232 |
+
</button>
|
| 233 |
+
</nav>
|
| 234 |
+
|
| 235 |
+
<!-- Help modal -->
|
| 236 |
+
<dialog id="help-modal">
|
| 237 |
+
<div class="modal-header">
|
| 238 |
+
<h2>Polyscriptor HTR — Quick Guide</h2>
|
| 239 |
+
<button id="btn-help-close" class="btn-icon">✕</button>
|
| 240 |
+
</div>
|
| 241 |
+
<div class="modal-body">
|
| 242 |
+
<h3>Quick Start</h3>
|
| 243 |
+
<ol>
|
| 244 |
+
<li><strong>Select an engine</strong> from the dropdown and configure it (model path, API key, etc.).</li>
|
| 245 |
+
<li>Click <strong>Load Model</strong> and wait for the green status badge.</li>
|
| 246 |
+
<li><strong>Upload an image</strong> by dragging it onto the upload area or clicking to browse.</li>
|
| 247 |
+
<li>Optionally click <strong>Segment</strong> to preview line detection before transcribing.</li>
|
| 248 |
+
<li>Click <strong>Transcribe</strong>. Lines appear one by one as they are processed.</li>
|
| 249 |
+
<li><strong>Export</strong> the result as TXT, CSV, or PAGE XML.</li>
|
| 250 |
+
</ol>
|
| 251 |
+
|
| 252 |
+
<h3>Source Code</h3>
|
| 253 |
+
<p>
|
| 254 |
+
The public Polyscriptor source code is available on
|
| 255 |
+
<a href="https://github.com/achimrabus/polyscriptor" target="_blank" rel="noopener noreferrer">GitHub</a>.
|
| 256 |
+
This Hugging Face Space runs a curated hosted demo configuration.
|
| 257 |
+
</p>
|
| 258 |
+
|
| 259 |
+
<h3>Engines</h3>
|
| 260 |
+
<table>
|
| 261 |
+
<tr><th>Engine</th><th>Best for</th></tr>
|
| 262 |
+
<tr><td>CRNN-CTC</td><td>Fastest; works well on Church Slavonic, Glagolitic, Ukrainian with trained models</td></tr>
|
| 263 |
+
<tr><td>TrOCR</td><td>HuggingFace Transformer OCR; good general-purpose accuracy</td></tr>
|
| 264 |
+
<tr><td>Qwen3-VL</td><td>Large vision-language model; best quality but slow, needs GPU</td></tr>
|
| 265 |
+
<tr><td>Kraken</td><td>Classical HTR; good for Latin scripts</td></tr>
|
| 266 |
+
<tr><td>Party</td><td>Whole-page transformer; requires PAGE XML with line segmentation</td></tr>
|
| 267 |
+
<tr><td>Commercial APIs</td><td>OpenAI / Gemini / Claude — cloud inference, no local GPU needed</td></tr>
|
| 268 |
+
<tr><td>OpenWebUI</td><td>Locally hosted models via OpenWebUI/Ollama</td></tr>
|
| 269 |
+
</table>
|
| 270 |
+
|
| 271 |
+
<h3>Segmentation</h3>
|
| 272 |
+
<ul>
|
| 273 |
+
<li><strong>Kraken Classical</strong> — default line segmentation in this Hugging Face CPU demo.</li>
|
| 274 |
+
<li><strong>HPP</strong> — horizontal projection profile fallback.</li>
|
| 275 |
+
<li><strong>Kraken Neural / blla</strong> — available on the full server setup, but not enabled in this Space.</li>
|
| 276 |
+
<li><strong>PAGE XML upload</strong> — skip segmentation entirely by uploading an existing PAGE XML annotation (e.g. from Transkribus).</li>
|
| 277 |
+
</ul>
|
| 278 |
+
|
| 279 |
+
<h3>Tips</h3>
|
| 280 |
+
<ul>
|
| 281 |
+
<li>Click a transcription line to highlight the corresponding bounding box in the image.</li>
|
| 282 |
+
<li>Confidence badges: <span class="conf-high demo-badge">high ≥90%</span> <span class="conf-mid demo-badge">mid ≥75%</span> <span class="conf-low demo-badge">low <75%</span></li>
|
| 283 |
+
<li>Line-segmenting engines (CRNN-CTC, TrOCR, Kraken) use the segmentation method above. Page-level engines (Party, Qwen3-VL, Commercial APIs) do their own segmentation.</li>
|
| 284 |
+
<li>API keys can be saved on the server — enter the key once, check <em>Save key on server</em>.</li>
|
| 285 |
+
<li>Uploads are kept for 24 hours, then cleaned up automatically.</li>
|
| 286 |
+
</ul>
|
| 287 |
+
|
| 288 |
+
<h3>Keyboard</h3>
|
| 289 |
+
<ul>
|
| 290 |
+
<li><kbd>Esc</kbd> — close this dialog</li>
|
| 291 |
+
</ul>
|
| 292 |
+
</div>
|
| 293 |
+
</dialog>
|
| 294 |
+
|
| 295 |
+
<!-- Toast notification container -->
|
| 296 |
+
<div id="toast-container"></div>
|
| 297 |
+
|
| 298 |
+
<script type="module" src="/static/app.js"></script>
|
| 299 |
+
<script>
|
| 300 |
+
// Help modal
|
| 301 |
+
const modal = document.getElementById('help-modal');
|
| 302 |
+
document.getElementById('btn-help').addEventListener('click', () => modal.showModal());
|
| 303 |
+
document.getElementById('btn-help-close').addEventListener('click', () => modal.close());
|
| 304 |
+
modal.addEventListener('click', e => { if (e.target === modal) modal.close(); });
|
| 305 |
+
|
| 306 |
+
// Mobile tab bar
|
| 307 |
+
const tabBtns = document.querySelectorAll('.tab-btn');
|
| 308 |
+
const panels = document.querySelectorAll('[data-panel]');
|
| 309 |
+
tabBtns.forEach(btn => {
|
| 310 |
+
btn.addEventListener('click', () => {
|
| 311 |
+
const target = btn.dataset.target;
|
| 312 |
+
tabBtns.forEach(b => b.classList.remove('active'));
|
| 313 |
+
btn.classList.add('active');
|
| 314 |
+
panels.forEach(p => {
|
| 315 |
+
p.classList.toggle('panel-active', p.dataset.panel === target);
|
| 316 |
+
});
|
| 317 |
+
});
|
| 318 |
+
});
|
| 319 |
+
// Default: settings active on mobile
|
| 320 |
+
document.querySelector('[data-panel="settings"]').classList.add('panel-active');
|
| 321 |
+
</script>
|
| 322 |
+
</body>
|
| 323 |
+
</html>
|
web/static/pwa/demo.css
ADDED
|
@@ -0,0 +1,698 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* ── Design tokens (matching main app) ───────────────────────────────── */
|
| 2 |
+
:root {
|
| 3 |
+
--bg: #111827;
|
| 4 |
+
--bg-panel: #1f2937;
|
| 5 |
+
--bg-card: #1a2333;
|
| 6 |
+
--bg-input: #111827;
|
| 7 |
+
--bg-hover: #2a3a52;
|
| 8 |
+
--text: #e2e8f0;
|
| 9 |
+
--text-muted: #64748b;
|
| 10 |
+
--text-dim: #94a3b8;
|
| 11 |
+
--accent: #e94560;
|
| 12 |
+
--primary: #3b82f6;
|
| 13 |
+
--primary-dark: #2563eb;
|
| 14 |
+
--primary-glow: rgba(59,130,246,0.25);
|
| 15 |
+
--success: #22c55e;
|
| 16 |
+
--warning: #f59e0b;
|
| 17 |
+
--danger: #ef4444;
|
| 18 |
+
--border: #2d3f59;
|
| 19 |
+
--border-light: #3a4f6e;
|
| 20 |
+
--radius: 10px;
|
| 21 |
+
--radius-sm: 6px;
|
| 22 |
+
--font: 'Segoe UI', system-ui, -apple-system, sans-serif;
|
| 23 |
+
--font-mono: 'Consolas', 'Fira Code', monospace;
|
| 24 |
+
--header-h: 52px;
|
| 25 |
+
--safe-bottom: env(safe-area-inset-bottom, 0px);
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
/* ── Reset ───────────────────────────────────────────────────────────── */
|
| 29 |
+
*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
|
| 30 |
+
|
| 31 |
+
html, body {
|
| 32 |
+
height: 100%;
|
| 33 |
+
font-family: var(--font);
|
| 34 |
+
background: var(--bg);
|
| 35 |
+
color: var(--text);
|
| 36 |
+
-webkit-text-size-adjust: 100%;
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
body {
|
| 40 |
+
display: flex;
|
| 41 |
+
flex-direction: column;
|
| 42 |
+
min-height: 100dvh;
|
| 43 |
+
overflow-x: hidden;
|
| 44 |
+
}
|
| 45 |
+
|
| 46 |
+
/* ── Header ──────────────────────────────────────────────────────────── */
|
| 47 |
+
#header {
|
| 48 |
+
position: sticky;
|
| 49 |
+
top: 0;
|
| 50 |
+
z-index: 100;
|
| 51 |
+
height: var(--header-h);
|
| 52 |
+
display: flex;
|
| 53 |
+
align-items: center;
|
| 54 |
+
justify-content: space-between;
|
| 55 |
+
padding: 0 16px;
|
| 56 |
+
padding-top: env(safe-area-inset-top, 0);
|
| 57 |
+
background: var(--bg-panel);
|
| 58 |
+
border-bottom: 1px solid var(--border);
|
| 59 |
+
flex-shrink: 0;
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
+
.header-brand {
|
| 63 |
+
display: flex;
|
| 64 |
+
align-items: center;
|
| 65 |
+
gap: 8px;
|
| 66 |
+
user-select: none;
|
| 67 |
+
}
|
| 68 |
+
|
| 69 |
+
.logo-hex {
|
| 70 |
+
font-size: 1.5rem;
|
| 71 |
+
color: var(--primary);
|
| 72 |
+
line-height: 1;
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
.logo-text {
|
| 76 |
+
font-size: 1.05rem;
|
| 77 |
+
font-weight: 700;
|
| 78 |
+
letter-spacing: -0.01em;
|
| 79 |
+
color: var(--text);
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
.logo-sub {
|
| 83 |
+
font-weight: 400;
|
| 84 |
+
color: var(--text-dim);
|
| 85 |
+
font-size: 0.9em;
|
| 86 |
+
}
|
| 87 |
+
|
| 88 |
+
/* Engine status pill */
|
| 89 |
+
.engine-pill {
|
| 90 |
+
display: flex;
|
| 91 |
+
align-items: center;
|
| 92 |
+
gap: 6px;
|
| 93 |
+
padding: 4px 10px;
|
| 94 |
+
border-radius: 20px;
|
| 95 |
+
font-size: 0.75rem;
|
| 96 |
+
font-weight: 600;
|
| 97 |
+
border: 1px solid transparent;
|
| 98 |
+
transition: all 0.2s;
|
| 99 |
+
}
|
| 100 |
+
.engine-pill--unknown { background: var(--bg); border-color: var(--border); color: var(--text-muted); }
|
| 101 |
+
.engine-pill--loaded { background: rgba(34,197,94,0.12); border-color: rgba(34,197,94,0.4); color: var(--success); }
|
| 102 |
+
.engine-pill--unloaded { background: rgba(239,68,68,0.1); border-color: rgba(239,68,68,0.3); color: var(--danger); }
|
| 103 |
+
.engine-pill--loading { background: rgba(245,158,11,0.1); border-color: rgba(245,158,11,0.3); color: var(--warning); }
|
| 104 |
+
|
| 105 |
+
.pill-dot {
|
| 106 |
+
width: 7px;
|
| 107 |
+
height: 7px;
|
| 108 |
+
border-radius: 50%;
|
| 109 |
+
background: currentColor;
|
| 110 |
+
flex-shrink: 0;
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
.engine-pill--loading .pill-dot {
|
| 114 |
+
animation: pulse-dot 1s ease-in-out infinite;
|
| 115 |
+
}
|
| 116 |
+
|
| 117 |
+
@keyframes pulse-dot {
|
| 118 |
+
0%, 100% { opacity: 1; }
|
| 119 |
+
50% { opacity: 0.3; }
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
/* ── Toast ───────────────────────────────────────────────────────────── */
|
| 123 |
+
#toast-container {
|
| 124 |
+
position: fixed;
|
| 125 |
+
top: calc(var(--header-h) + 8px);
|
| 126 |
+
left: 50%;
|
| 127 |
+
transform: translateX(-50%);
|
| 128 |
+
z-index: 200;
|
| 129 |
+
display: flex;
|
| 130 |
+
flex-direction: column;
|
| 131 |
+
gap: 6px;
|
| 132 |
+
width: calc(100% - 32px);
|
| 133 |
+
max-width: 420px;
|
| 134 |
+
pointer-events: none;
|
| 135 |
+
}
|
| 136 |
+
|
| 137 |
+
.toast {
|
| 138 |
+
padding: 10px 14px;
|
| 139 |
+
border-radius: var(--radius-sm);
|
| 140 |
+
font-size: 0.85rem;
|
| 141 |
+
font-weight: 500;
|
| 142 |
+
pointer-events: auto;
|
| 143 |
+
animation: toast-in 0.25s ease;
|
| 144 |
+
}
|
| 145 |
+
.toast--info { background: var(--bg-panel); border: 1px solid var(--border); color: var(--text); }
|
| 146 |
+
.toast--success { background: rgba(34,197,94,0.15); border: 1px solid rgba(34,197,94,0.4); color: var(--success); }
|
| 147 |
+
.toast--error { background: rgba(239,68,68,0.15); border: 1px solid rgba(239,68,68,0.4); color: #fca5a5; }
|
| 148 |
+
.toast--warn { background: rgba(245,158,11,0.12); border: 1px solid rgba(245,158,11,0.35); color: var(--warning); }
|
| 149 |
+
|
| 150 |
+
@keyframes toast-in {
|
| 151 |
+
from { opacity: 0; transform: translateY(-8px); }
|
| 152 |
+
to { opacity: 1; transform: translateY(0); }
|
| 153 |
+
}
|
| 154 |
+
|
| 155 |
+
/* ── Main scroll area ─────────────────────────────────────────────────── */
|
| 156 |
+
#main {
|
| 157 |
+
flex: 1;
|
| 158 |
+
overflow-y: auto;
|
| 159 |
+
padding: 14px;
|
| 160 |
+
padding-bottom: calc(16px + var(--safe-bottom));
|
| 161 |
+
display: flex;
|
| 162 |
+
flex-direction: column;
|
| 163 |
+
gap: 12px;
|
| 164 |
+
}
|
| 165 |
+
|
| 166 |
+
/* ── Cards ───────────────────────────────────────────────────────────── */
|
| 167 |
+
.card {
|
| 168 |
+
background: var(--bg-card);
|
| 169 |
+
border: 1px solid var(--border);
|
| 170 |
+
border-radius: var(--radius);
|
| 171 |
+
padding: 16px;
|
| 172 |
+
}
|
| 173 |
+
|
| 174 |
+
.card-header {
|
| 175 |
+
display: flex;
|
| 176 |
+
align-items: center;
|
| 177 |
+
justify-content: space-between;
|
| 178 |
+
margin-bottom: 14px;
|
| 179 |
+
}
|
| 180 |
+
|
| 181 |
+
.card-header h2 {
|
| 182 |
+
font-size: 0.9rem;
|
| 183 |
+
font-weight: 700;
|
| 184 |
+
text-transform: uppercase;
|
| 185 |
+
letter-spacing: 0.06em;
|
| 186 |
+
color: var(--text-dim);
|
| 187 |
+
}
|
| 188 |
+
|
| 189 |
+
/* ── Buttons ──────────────────────────────────────────────────────────── */
|
| 190 |
+
.btn {
|
| 191 |
+
display: inline-flex;
|
| 192 |
+
align-items: center;
|
| 193 |
+
justify-content: center;
|
| 194 |
+
gap: 8px;
|
| 195 |
+
padding: 12px 18px;
|
| 196 |
+
border: 1px solid transparent;
|
| 197 |
+
border-radius: var(--radius-sm);
|
| 198 |
+
font-family: var(--font);
|
| 199 |
+
font-size: 0.95rem;
|
| 200 |
+
font-weight: 600;
|
| 201 |
+
cursor: pointer;
|
| 202 |
+
transition: all 0.15s;
|
| 203 |
+
min-height: 48px;
|
| 204 |
+
user-select: none;
|
| 205 |
+
-webkit-tap-highlight-color: transparent;
|
| 206 |
+
white-space: nowrap;
|
| 207 |
+
}
|
| 208 |
+
|
| 209 |
+
.btn svg {
|
| 210 |
+
width: 18px;
|
| 211 |
+
height: 18px;
|
| 212 |
+
flex-shrink: 0;
|
| 213 |
+
}
|
| 214 |
+
|
| 215 |
+
.btn:disabled {
|
| 216 |
+
opacity: 0.4;
|
| 217 |
+
cursor: not-allowed;
|
| 218 |
+
}
|
| 219 |
+
|
| 220 |
+
.btn-primary {
|
| 221 |
+
background: var(--primary);
|
| 222 |
+
color: #fff;
|
| 223 |
+
border-color: var(--primary);
|
| 224 |
+
}
|
| 225 |
+
.btn-primary:not(:disabled):hover,
|
| 226 |
+
.btn-primary:not(:disabled):active {
|
| 227 |
+
background: var(--primary-dark);
|
| 228 |
+
border-color: var(--primary-dark);
|
| 229 |
+
}
|
| 230 |
+
|
| 231 |
+
.btn-secondary {
|
| 232 |
+
background: var(--bg-panel);
|
| 233 |
+
color: var(--text);
|
| 234 |
+
border-color: var(--border-light);
|
| 235 |
+
}
|
| 236 |
+
.btn-secondary:not(:disabled):hover,
|
| 237 |
+
.btn-secondary:not(:disabled):active {
|
| 238 |
+
background: var(--bg-hover);
|
| 239 |
+
border-color: var(--primary);
|
| 240 |
+
color: var(--primary);
|
| 241 |
+
}
|
| 242 |
+
|
| 243 |
+
.btn-danger {
|
| 244 |
+
background: rgba(239,68,68,0.15);
|
| 245 |
+
color: var(--danger);
|
| 246 |
+
border-color: rgba(239,68,68,0.4);
|
| 247 |
+
}
|
| 248 |
+
.btn-danger:not(:disabled):hover,
|
| 249 |
+
.btn-danger:not(:disabled):active {
|
| 250 |
+
background: rgba(239,68,68,0.25);
|
| 251 |
+
}
|
| 252 |
+
|
| 253 |
+
.btn-ghost {
|
| 254 |
+
background: transparent;
|
| 255 |
+
color: var(--text-muted);
|
| 256 |
+
border-color: transparent;
|
| 257 |
+
padding: 4px 8px;
|
| 258 |
+
min-height: unset;
|
| 259 |
+
font-size: 0.8rem;
|
| 260 |
+
}
|
| 261 |
+
.btn-ghost:hover { color: var(--text); }
|
| 262 |
+
|
| 263 |
+
.btn-small { font-size: 0.8rem; padding: 6px 10px; min-height: 32px; }
|
| 264 |
+
|
| 265 |
+
/* Capture button — primary-to-indigo gradient CTA; flex: 1 lets it grow to fill its flex row */
|
| 266 |
+
.btn-capture {
|
| 267 |
+
background: linear-gradient(135deg, var(--primary) 0%, #6366f1 100%);
|
| 268 |
+
color: #fff;
|
| 269 |
+
border-color: transparent;
|
| 270 |
+
flex: 1;
|
| 271 |
+
padding: 14px;
|
| 272 |
+
font-size: 1rem;
|
| 273 |
+
}
|
| 274 |
+
.btn-capture:not(:disabled):hover,
|
| 275 |
+
.btn-capture:not(:disabled):active {
|
| 276 |
+
opacity: 0.9;
|
| 277 |
+
transform: translateY(-1px);
|
| 278 |
+
}
|
| 279 |
+
.btn-capture:not(:disabled):active {
|
| 280 |
+
transform: translateY(0);
|
| 281 |
+
}
|
| 282 |
+
|
| 283 |
+
/* ── Upload section ───────────────────────────────────────────────────── */
|
| 284 |
+
.upload-btn-row {
|
| 285 |
+
display: flex;
|
| 286 |
+
gap: 10px;
|
| 287 |
+
}
|
| 288 |
+
|
| 289 |
+
.btn-upload {
|
| 290 |
+
flex: 0 0 auto;
|
| 291 |
+
padding: 14px 16px;
|
| 292 |
+
}
|
| 293 |
+
|
| 294 |
+
/* Image preview */
|
| 295 |
+
#image-preview-wrap {
|
| 296 |
+
margin-top: 14px;
|
| 297 |
+
}
|
| 298 |
+
|
| 299 |
+
#image-container {
|
| 300 |
+
position: relative;
|
| 301 |
+
display: inline-block;
|
| 302 |
+
width: 100%;
|
| 303 |
+
border-radius: var(--radius-sm);
|
| 304 |
+
overflow: hidden;
|
| 305 |
+
background: #000;
|
| 306 |
+
}
|
| 307 |
+
|
| 308 |
+
#preview-img {
|
| 309 |
+
display: block;
|
| 310 |
+
width: 100%;
|
| 311 |
+
height: auto;
|
| 312 |
+
max-height: 55vh;
|
| 313 |
+
object-fit: contain;
|
| 314 |
+
}
|
| 315 |
+
|
| 316 |
+
#bbox-canvas {
|
| 317 |
+
position: absolute;
|
| 318 |
+
top: 0;
|
| 319 |
+
left: 0;
|
| 320 |
+
width: 100%;
|
| 321 |
+
height: 100%;
|
| 322 |
+
pointer-events: none;
|
| 323 |
+
}
|
| 324 |
+
|
| 325 |
+
.preview-meta {
|
| 326 |
+
display: flex;
|
| 327 |
+
align-items: center;
|
| 328 |
+
justify-content: space-between;
|
| 329 |
+
margin-top: 8px;
|
| 330 |
+
}
|
| 331 |
+
|
| 332 |
+
.meta-filename {
|
| 333 |
+
font-size: 0.78rem;
|
| 334 |
+
color: var(--text-muted);
|
| 335 |
+
overflow: hidden;
|
| 336 |
+
text-overflow: ellipsis;
|
| 337 |
+
white-space: nowrap;
|
| 338 |
+
max-width: 70%;
|
| 339 |
+
}
|
| 340 |
+
|
| 341 |
+
/* ── Engine card ──────────────────────────────────────────────────────── */
|
| 342 |
+
.field-row {
|
| 343 |
+
display: flex;
|
| 344 |
+
flex-direction: column;
|
| 345 |
+
gap: 5px;
|
| 346 |
+
margin-bottom: 10px;
|
| 347 |
+
}
|
| 348 |
+
|
| 349 |
+
.field-row label {
|
| 350 |
+
font-size: 0.75rem;
|
| 351 |
+
font-weight: 600;
|
| 352 |
+
color: var(--text-dim);
|
| 353 |
+
text-transform: uppercase;
|
| 354 |
+
letter-spacing: 0.05em;
|
| 355 |
+
}
|
| 356 |
+
|
| 357 |
+
select {
|
| 358 |
+
width: 100%;
|
| 359 |
+
padding: 10px 12px;
|
| 360 |
+
background: var(--bg-input);
|
| 361 |
+
color: var(--text);
|
| 362 |
+
border: 1px solid var(--border);
|
| 363 |
+
border-radius: var(--radius-sm);
|
| 364 |
+
font-family: var(--font);
|
| 365 |
+
font-size: 0.9rem;
|
| 366 |
+
cursor: pointer;
|
| 367 |
+
min-height: 44px;
|
| 368 |
+
-webkit-appearance: none;
|
| 369 |
+
appearance: none;
|
| 370 |
+
background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='12' height='8' viewBox='0 0 12 8'%3E%3Cpath d='M1 1l5 5 5-5' stroke='%2364748b' stroke-width='1.5' fill='none' stroke-linecap='round'/%3E%3C/svg%3E");
|
| 371 |
+
background-repeat: no-repeat;
|
| 372 |
+
background-position: right 12px center;
|
| 373 |
+
padding-right: 36px;
|
| 374 |
+
}
|
| 375 |
+
|
| 376 |
+
select:focus {
|
| 377 |
+
outline: none;
|
| 378 |
+
border-color: var(--primary);
|
| 379 |
+
box-shadow: 0 0 0 3px var(--primary-glow);
|
| 380 |
+
}
|
| 381 |
+
|
| 382 |
+
/* Badges */
|
| 383 |
+
.badge {
|
| 384 |
+
padding: 3px 9px;
|
| 385 |
+
border-radius: 20px;
|
| 386 |
+
font-size: 0.72rem;
|
| 387 |
+
font-weight: 700;
|
| 388 |
+
text-transform: uppercase;
|
| 389 |
+
letter-spacing: 0.05em;
|
| 390 |
+
}
|
| 391 |
+
.badge--loading { background: rgba(100,116,139,0.2); color: var(--text-muted); }
|
| 392 |
+
.badge--loaded { background: rgba(34,197,94,0.15); color: var(--success); }
|
| 393 |
+
.badge--unloaded { background: rgba(239,68,68,0.12); color: var(--danger); }
|
| 394 |
+
.badge--info { background: rgba(59,130,246,0.15); color: var(--primary); }
|
| 395 |
+
|
| 396 |
+
/* Advanced details */
|
| 397 |
+
#advanced-details {
|
| 398 |
+
margin-top: 8px;
|
| 399 |
+
border-top: 1px solid var(--border);
|
| 400 |
+
padding-top: 10px;
|
| 401 |
+
}
|
| 402 |
+
|
| 403 |
+
#advanced-details summary {
|
| 404 |
+
font-size: 0.8rem;
|
| 405 |
+
color: var(--text-muted);
|
| 406 |
+
cursor: pointer;
|
| 407 |
+
user-select: none;
|
| 408 |
+
padding: 2px 0;
|
| 409 |
+
list-style: none;
|
| 410 |
+
display: flex;
|
| 411 |
+
align-items: center;
|
| 412 |
+
gap: 6px;
|
| 413 |
+
}
|
| 414 |
+
|
| 415 |
+
#advanced-details summary::before {
|
| 416 |
+
content: '›';
|
| 417 |
+
font-size: 1.1em;
|
| 418 |
+
transition: transform 0.2s;
|
| 419 |
+
display: inline-block;
|
| 420 |
+
}
|
| 421 |
+
|
| 422 |
+
#advanced-details[open] summary::before {
|
| 423 |
+
transform: rotate(90deg);
|
| 424 |
+
}
|
| 425 |
+
|
| 426 |
+
.advanced-inner {
|
| 427 |
+
margin-top: 10px;
|
| 428 |
+
}
|
| 429 |
+
|
| 430 |
+
/* ── Actions card ─────────────────────────────────────────────────────── */
|
| 431 |
+
.actions-card {
|
| 432 |
+
display: flex;
|
| 433 |
+
flex-wrap: wrap;
|
| 434 |
+
gap: 10px;
|
| 435 |
+
padding: 12px;
|
| 436 |
+
}
|
| 437 |
+
|
| 438 |
+
#btn-cancel {
|
| 439 |
+
flex: 0 0 100%;
|
| 440 |
+
}
|
| 441 |
+
|
| 442 |
+
.btn-action {
|
| 443 |
+
flex: 1;
|
| 444 |
+
padding: 14px 10px;
|
| 445 |
+
}
|
| 446 |
+
|
| 447 |
+
.btn-segment {
|
| 448 |
+
background: var(--bg-panel);
|
| 449 |
+
color: var(--text-dim);
|
| 450 |
+
border-color: var(--border-light);
|
| 451 |
+
}
|
| 452 |
+
|
| 453 |
+
.btn-segment:not(:disabled) {
|
| 454 |
+
color: var(--text);
|
| 455 |
+
}
|
| 456 |
+
|
| 457 |
+
.btn-segment:not(:disabled):hover,
|
| 458 |
+
.btn-segment:not(:disabled):active {
|
| 459 |
+
background: var(--bg-hover);
|
| 460 |
+
border-color: var(--primary);
|
| 461 |
+
color: var(--primary);
|
| 462 |
+
}
|
| 463 |
+
|
| 464 |
+
/* ── Progress card ────────────────────────────────────────────────────── */
|
| 465 |
+
#progress-bar-wrap {
|
| 466 |
+
height: 6px;
|
| 467 |
+
background: var(--bg-panel);
|
| 468 |
+
border-radius: 3px;
|
| 469 |
+
overflow: hidden;
|
| 470 |
+
margin-bottom: 10px;
|
| 471 |
+
}
|
| 472 |
+
|
| 473 |
+
#progress-bar {
|
| 474 |
+
height: 100%;
|
| 475 |
+
background: linear-gradient(90deg, var(--primary), #6366f1);
|
| 476 |
+
border-radius: 3px;
|
| 477 |
+
transition: width 0.3s ease;
|
| 478 |
+
}
|
| 479 |
+
|
| 480 |
+
.status-text {
|
| 481 |
+
font-size: 0.82rem;
|
| 482 |
+
color: var(--text-dim);
|
| 483 |
+
min-height: 1.4em;
|
| 484 |
+
}
|
| 485 |
+
|
| 486 |
+
/* ── Results card ─────────────────────────────────────────────────────── */
|
| 487 |
+
#results-list {
|
| 488 |
+
display: flex;
|
| 489 |
+
flex-direction: column;
|
| 490 |
+
gap: 6px;
|
| 491 |
+
margin-bottom: 14px;
|
| 492 |
+
max-height: 50vh;
|
| 493 |
+
overflow-y: auto;
|
| 494 |
+
}
|
| 495 |
+
|
| 496 |
+
.result-line {
|
| 497 |
+
display: flex;
|
| 498 |
+
gap: 10px;
|
| 499 |
+
align-items: flex-start;
|
| 500 |
+
padding: 8px 10px;
|
| 501 |
+
background: var(--bg-panel);
|
| 502 |
+
border-radius: var(--radius-sm);
|
| 503 |
+
border: 1px solid var(--border);
|
| 504 |
+
animation: line-in 0.2s ease;
|
| 505 |
+
}
|
| 506 |
+
|
| 507 |
+
@keyframes line-in {
|
| 508 |
+
from { opacity: 0; transform: translateY(4px); }
|
| 509 |
+
to { opacity: 1; transform: translateY(0); }
|
| 510 |
+
}
|
| 511 |
+
|
| 512 |
+
.line-num {
|
| 513 |
+
font-size: 0.72rem;
|
| 514 |
+
font-weight: 700;
|
| 515 |
+
color: var(--text-muted);
|
| 516 |
+
min-width: 22px;
|
| 517 |
+
padding-top: 1px;
|
| 518 |
+
flex-shrink: 0;
|
| 519 |
+
font-family: var(--font-mono);
|
| 520 |
+
}
|
| 521 |
+
|
| 522 |
+
.line-text {
|
| 523 |
+
flex: 1;
|
| 524 |
+
font-size: 0.88rem;
|
| 525 |
+
line-height: 1.45;
|
| 526 |
+
color: var(--text);
|
| 527 |
+
word-break: break-word;
|
| 528 |
+
}
|
| 529 |
+
|
| 530 |
+
.line-conf {
|
| 531 |
+
font-size: 0.7rem;
|
| 532 |
+
font-weight: 600;
|
| 533 |
+
padding: 2px 6px;
|
| 534 |
+
border-radius: 4px;
|
| 535 |
+
flex-shrink: 0;
|
| 536 |
+
align-self: flex-start;
|
| 537 |
+
margin-top: 1px;
|
| 538 |
+
}
|
| 539 |
+
.conf-high { background: rgba(34,197,94,0.15); color: var(--success); }
|
| 540 |
+
.conf-mid { background: rgba(245,158,11,0.15); color: var(--warning); }
|
| 541 |
+
.conf-low { background: rgba(239,68,68,0.12); color: var(--danger); }
|
| 542 |
+
|
| 543 |
+
.results-actions {
|
| 544 |
+
display: flex;
|
| 545 |
+
gap: 10px;
|
| 546 |
+
}
|
| 547 |
+
|
| 548 |
+
.results-actions .btn {
|
| 549 |
+
flex: 1;
|
| 550 |
+
font-size: 0.85rem;
|
| 551 |
+
padding: 10px 12px;
|
| 552 |
+
}
|
| 553 |
+
|
| 554 |
+
/* ── Landscape layout ─────────────────────────────────────────────────── */
|
| 555 |
+
@media (orientation: landscape) and (max-height: 600px) {
|
| 556 |
+
#main {
|
| 557 |
+
display: grid;
|
| 558 |
+
grid-template-columns: 1fr 1fr;
|
| 559 |
+
grid-template-rows: auto;
|
| 560 |
+
align-items: start;
|
| 561 |
+
}
|
| 562 |
+
|
| 563 |
+
#upload-card { grid-column: 1; grid-row: 1 / 3; }
|
| 564 |
+
#engine-card { grid-column: 2; grid-row: 1; }
|
| 565 |
+
#actions-card { grid-column: 2; grid-row: 2; }
|
| 566 |
+
#progress-card { grid-column: 1 / 3; }
|
| 567 |
+
#results-card { grid-column: 1 / 3; }
|
| 568 |
+
|
| 569 |
+
#preview-img {
|
| 570 |
+
max-height: 70vh;
|
| 571 |
+
}
|
| 572 |
+
}
|
| 573 |
+
|
| 574 |
+
/* ── Desktop (>= 768px) ───────────────────────────────────────────────── */
|
| 575 |
+
@media (min-width: 768px) {
|
| 576 |
+
#main {
|
| 577 |
+
max-width: 580px;
|
| 578 |
+
margin: 0 auto;
|
| 579 |
+
padding: 20px 0 40px;
|
| 580 |
+
}
|
| 581 |
+
}
|
| 582 |
+
|
| 583 |
+
/* ── Utility ──────────────────────────────────────────────────────────── */
|
| 584 |
+
.hidden { display: none !important; }
|
| 585 |
+
|
| 586 |
+
/* Scrollbar styling */
|
| 587 |
+
#results-list::-webkit-scrollbar { width: 4px; }
|
| 588 |
+
#results-list::-webkit-scrollbar-track { background: transparent; }
|
| 589 |
+
#results-list::-webkit-scrollbar-thumb { background: var(--border-light); border-radius: 2px; }
|
| 590 |
+
|
| 591 |
+
/* Focus visible for accessibility */
|
| 592 |
+
:focus-visible {
|
| 593 |
+
outline: 2px solid var(--primary);
|
| 594 |
+
outline-offset: 2px;
|
| 595 |
+
}
|
| 596 |
+
|
| 597 |
+
/* ── Photo Review Overlay ─────────────────────────────────────────────── */
|
| 598 |
+
#photo-review {
|
| 599 |
+
position: fixed;
|
| 600 |
+
inset: 0;
|
| 601 |
+
background: #0a0a0a;
|
| 602 |
+
z-index: 500;
|
| 603 |
+
/* Use block layout instead of flex to avoid the iOS Safari flex+overflow-y scroll bug */
|
| 604 |
+
display: block;
|
| 605 |
+
overflow-y: auto;
|
| 606 |
+
-webkit-overflow-scrolling: touch;
|
| 607 |
+
}
|
| 608 |
+
|
| 609 |
+
#photo-review[hidden] {
|
| 610 |
+
display: none;
|
| 611 |
+
}
|
| 612 |
+
|
| 613 |
+
#review-inner {
|
| 614 |
+
width: 100%;
|
| 615 |
+
max-width: 600px;
|
| 616 |
+
margin: 0 auto;
|
| 617 |
+
padding: max(14px, env(safe-area-inset-top, 0px)) 14px calc(14px + env(safe-area-inset-bottom, 0px));
|
| 618 |
+
box-sizing: border-box;
|
| 619 |
+
display: flex;
|
| 620 |
+
flex-direction: column;
|
| 621 |
+
gap: 12px;
|
| 622 |
+
}
|
| 623 |
+
|
| 624 |
+
#review-warn {
|
| 625 |
+
background: rgba(245, 158, 11, 0.2);
|
| 626 |
+
color: #fef3c7;
|
| 627 |
+
border: 1px solid rgba(245, 158, 11, 0.45);
|
| 628 |
+
padding: 10px 14px;
|
| 629 |
+
border-radius: 8px;
|
| 630 |
+
font-size: 0.875rem;
|
| 631 |
+
text-align: center;
|
| 632 |
+
line-height: 1.4;
|
| 633 |
+
}
|
| 634 |
+
|
| 635 |
+
#review-warn[hidden] { display: none; }
|
| 636 |
+
|
| 637 |
+
#review-img-outer {
|
| 638 |
+
text-align: center;
|
| 639 |
+
}
|
| 640 |
+
|
| 641 |
+
#review-img-wrap {
|
| 642 |
+
display: inline-block;
|
| 643 |
+
position: relative;
|
| 644 |
+
max-width: 100%;
|
| 645 |
+
border-radius: 8px;
|
| 646 |
+
overflow: hidden;
|
| 647 |
+
background: #111;
|
| 648 |
+
vertical-align: top;
|
| 649 |
+
}
|
| 650 |
+
|
| 651 |
+
#review-img {
|
| 652 |
+
display: block;
|
| 653 |
+
max-width: 100%;
|
| 654 |
+
max-height: 45vh; /* fallback */
|
| 655 |
+
max-height: 45svh; /* small viewport height: excludes browser chrome on iOS/Android */
|
| 656 |
+
width: auto;
|
| 657 |
+
height: auto;
|
| 658 |
+
}
|
| 659 |
+
|
| 660 |
+
#review-crop-canvas {
|
| 661 |
+
position: absolute;
|
| 662 |
+
inset: 0;
|
| 663 |
+
width: 100%;
|
| 664 |
+
height: 100%;
|
| 665 |
+
pointer-events: none;
|
| 666 |
+
touch-action: none;
|
| 667 |
+
cursor: crosshair;
|
| 668 |
+
}
|
| 669 |
+
|
| 670 |
+
#review-toolbar {
|
| 671 |
+
display: flex;
|
| 672 |
+
gap: 8px;
|
| 673 |
+
flex-wrap: wrap;
|
| 674 |
+
align-items: center;
|
| 675 |
+
justify-content: center;
|
| 676 |
+
}
|
| 677 |
+
|
| 678 |
+
.btn-icon {
|
| 679 |
+
font-size: 1.2rem;
|
| 680 |
+
padding: 8px 16px;
|
| 681 |
+
min-width: 48px;
|
| 682 |
+
}
|
| 683 |
+
|
| 684 |
+
#review-actions {
|
| 685 |
+
display: flex;
|
| 686 |
+
gap: 10px;
|
| 687 |
+
}
|
| 688 |
+
|
| 689 |
+
#review-actions .btn {
|
| 690 |
+
flex: 1;
|
| 691 |
+
}
|
| 692 |
+
|
| 693 |
+
@media (orientation: landscape) and (max-height: 500px) {
|
| 694 |
+
#review-img {
|
| 695 |
+
max-height: 30vh;
|
| 696 |
+
max-height: 30svh;
|
| 697 |
+
}
|
| 698 |
+
}
|
web/static/pwa/demo.html
ADDED
|
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0, viewport-fit=cover">
  <meta name="theme-color" content="#3b82f6">
  <meta name="mobile-web-app-capable" content="yes">
  <meta name="apple-mobile-web-app-capable" content="yes">
  <meta name="apple-mobile-web-app-status-bar-style" content="black-translucent">
  <meta name="apple-mobile-web-app-title" content="Polyscriptor">
  <title>Polyscriptor HTR Demo</title>
  <link rel="manifest" href="/manifest.json">
  <link rel="apple-touch-icon" href="/static/pwa/icons/icon-192.png">
  <link rel="stylesheet" href="/static/pwa/demo.css">
</head>
<body>

  <!-- Header: brand plus the engine status pill -->
  <header id="header">
    <div class="header-brand">
      <!-- Decorative glyph; hidden from assistive technology -->
      <span class="logo-hex" aria-hidden="true">⬡</span>
      <span class="logo-text">Polyscriptor <span class="logo-sub">HTR Demo</span></span>
    </div>
    <div class="header-actions">
      <div id="engine-pill" class="engine-pill engine-pill--unknown" title="Engine status">
        <span class="pill-dot"></span>
        <span id="engine-pill-text">…</span>
      </div>
    </div>
  </header>

  <!-- Toast container (announced politely to screen readers) -->
  <div id="toast-container" aria-live="polite"></div>

  <!-- Main content (scrollable) -->
  <main id="main">

    <!-- Card: Upload / Camera -->
    <section id="upload-card" class="card">
      <div id="upload-buttons" class="upload-btn-row">
        <button id="btn-camera" class="btn btn-capture">
          <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" aria-hidden="true">
            <path d="M23 19a2 2 0 0 1-2 2H3a2 2 0 0 1-2-2V8a2 2 0 0 1 2-2h4l2-3h6l2 3h4a2 2 0 0 1 2 2z"/>
            <circle cx="12" cy="13" r="4"/>
          </svg>
          <span>Take Photo</span>
        </button>
        <button id="btn-file" class="btn btn-secondary btn-upload">
          <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" aria-hidden="true">
            <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
            <polyline points="17 8 12 3 7 8"/>
            <line x1="12" y1="3" x2="12" y2="15"/>
          </svg>
          <span>Upload Image</span>
        </button>
      </div>

      <!-- Hidden inputs backing the two buttons above -->
      <input id="file-camera" type="file" accept="image/*" capture="environment" hidden>
      <input id="file-picker" type="file" accept="image/*,application/pdf" hidden>

      <!-- Image preview with bbox canvas overlay -->
      <div id="image-preview-wrap" hidden>
        <div id="image-container">
          <img id="preview-img" alt="Uploaded image">
          <canvas id="bbox-canvas"></canvas>
        </div>
        <div class="preview-meta">
          <span id="preview-filename" class="meta-filename"></span>
          <button id="btn-clear-image" class="btn-ghost btn-small">✕ Remove</button>
        </div>
      </div>
    </section>

    <!-- Card: Engine & Model -->
    <section id="engine-card" class="card">
      <div class="card-header">
        <h2>HTR Engine</h2>
        <span id="model-status-badge" class="badge badge--loading">checking…</span>
      </div>

      <div id="engine-controls">
        <div class="field-row">
          <label for="engine-select">Engine</label>
          <select id="engine-select">
            <option value="">Loading…</option>
          </select>
        </div>

        <div class="field-row" id="model-row" hidden>
          <label for="model-select">Model</label>
          <select id="model-select">
            <option value="">Select engine first</option>
          </select>
        </div>

        <button id="btn-load-model" class="btn btn-secondary" hidden>
          Load Model
        </button>
      </div>

      <!-- Advanced: segmentation -->
      <details id="advanced-details">
        <summary>Advanced options</summary>
        <div class="advanced-inner">
          <div class="field-row">
            <label for="seg-method-select">Line segmentation</label>
            <select id="seg-method-select">
              <option value="kraken" selected>Kraken Classical</option>
              <option value="hpp">Projection Profile fallback</option>
              <option value="kraken-blla" disabled>Kraken Neural / blla (server only)</option>
            </select>
          </div>
        </div>
      </details>
    </section>

    <!-- Card: Actions -->
    <section id="actions-card" class="card actions-card">
      <button id="btn-segment" class="btn btn-action btn-segment" disabled>
        <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" aria-hidden="true">
          <rect x="3" y="3" width="18" height="18" rx="2"/>
          <line x1="3" y1="9" x2="21" y2="9"/>
          <line x1="3" y1="15" x2="21" y2="15"/>
        </svg>
        Detect Lines
      </button>
      <button id="btn-transcribe" class="btn btn-action btn-primary" disabled>
        <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" aria-hidden="true">
          <path d="M12 20h9"/><path d="M16.5 3.5a2.121 2.121 0 0 1 3 3L7 19l-4 1 1-4L16.5 3.5z"/>
        </svg>
        Transcribe
      </button>
      <button id="btn-cancel" class="btn btn-danger" hidden>
        Cancel
      </button>
    </section>

    <!-- Card: Progress -->
    <section id="progress-card" class="card" hidden>
      <div id="progress-bar-wrap">
        <div id="progress-bar" style="width:0%"></div>
      </div>
      <p id="status-text" class="status-text"></p>
    </section>

    <!-- Card: Results -->
    <section id="results-card" class="card" hidden>
      <div class="card-header">
        <h2>Transcription</h2>
        <span id="line-count" class="badge badge--info"></span>
      </div>
      <div id="results-list"></div>
      <div id="results-actions" class="results-actions">
        <button id="btn-copy" class="btn btn-secondary">
          <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" aria-hidden="true">
            <rect x="9" y="9" width="13" height="13" rx="2"/>
            <path d="M5 15H4a2 2 0 0 1-2-2V4a2 2 0 0 1 2-2h9a2 2 0 0 1 2 2v1"/>
          </svg>
          Copy All
        </button>
        <button id="btn-export-txt" class="btn btn-secondary">
          <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" aria-hidden="true">
            <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
            <polyline points="7 10 12 15 17 10"/>
            <line x1="12" y1="15" x2="12" y2="3"/>
          </svg>
          Save TXT
        </button>
      </div>
    </section>

  </main>

  <!-- Photo Review Overlay — shown after camera capture, before upload -->
  <div id="photo-review" hidden>
    <div id="review-inner">
      <div id="review-warn" hidden>
        Landscape photo detected - for line segmentation, please rotate to portrait (↺ or ↻)
      </div>
      <div id="review-img-outer">
        <div id="review-img-wrap">
          <img id="review-img" alt="Photo preview">
          <canvas id="review-crop-canvas"></canvas>
        </div>
      </div>
      <div id="review-toolbar">
        <!-- Glyph-only buttons: aria-label supplies the accessible name, title stays as tooltip -->
        <button id="btn-rotate-ccw" class="btn btn-secondary btn-icon" title="Rotate 90° left" aria-label="Rotate 90° left">↺</button>
        <button id="btn-rotate-cw" class="btn btn-secondary btn-icon" title="Rotate 90° right" aria-label="Rotate 90° right">↻</button>
        <button id="btn-auto-crop" class="btn btn-secondary">Auto crop page</button>
        <button id="btn-crop-start" class="btn btn-secondary">✂ Manual crop</button>
        <button id="btn-crop-apply" class="btn btn-primary" hidden>Apply crop</button>
        <button id="btn-crop-cancel" class="btn btn-ghost btn-small" hidden>Cancel</button>
      </div>
      <div id="review-actions">
        <button id="btn-retake" class="btn btn-secondary">Retake</button>
        <button id="btn-use-photo" class="btn btn-primary">Use photo →</button>
      </div>
    </div>
  </div>

  <script src="/static/pwa/demo.js" type="module"></script>
</body>
</html>
|
web/static/pwa/demo.js
ADDED
|
@@ -0,0 +1,1069 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* Polyscriptor PWA Demo — App Logic
|
| 3 |
+
* Self-contained (no imports from main app.js).
|
| 4 |
+
* Cache-bust: 2026-05-18 (photo review CSS fix)
|
| 5 |
+
*/
|
| 6 |
+
|
| 7 |
+
// ── LocalStorage keys ──────────────────────────────────────────────────
|
| 8 |
+
/** localStorage key under which the last-used engine is stored. */
const LS_ENGINE = 'pwa_last_engine';

/** localStorage key under which the chosen segmentation method is stored. */
const LS_SEG_METHOD = 'pwa_seg_method';

/**
 * Build the localStorage key for a remembered model.
 *
 * @param {string} name - Suffix appended to the key (presumably the engine
 *   name — confirm against callers).
 * @returns {string} Key of the form `pwa_last_model_<name>`.
 */
const LS_MODEL = (name) => `pwa_last_model_${name}`;
|
| 11 |
+
|
| 12 |
+
// ── State ──────────────────────────────────────────────────────────────
|
| 13 |
+
/**
 * Mutable application state shared by the demo's handlers.
 * All fields start empty / idle.
 */
const state = {
  // Identifier of the current image; null until one is set.
  imageId: null,
  // Image metadata: { width, height, filename }.
  imageInfo: null,
  // Line bounding boxes: [[x1, y1, x2, y2], …].
  bboxes: [],
  // Transcribed lines: [{ index, text, confidence, bbox }, …].
  lines: [],
  // Engine list as returned by /api/engines.
  engines: [],
  // Name of the engine currently active in the pool.
  loadedEngine: null,
  // Sequence counter guarding against stale async schema responses.
  engineChangeSeq: 0,
  // Busy flags for the two long-running actions.
  isSegmenting: false,
  isTranscribing: false,
  // AbortController for the active SSE stream, if any.
  sseAbort: null,
};
|
| 25 |
+
|
| 26 |
+
// ── DOM refs ───────────────────────────────────────────────────────────
|
| 27 |
+
/**
 * Shorthand for `document.getElementById`.
 *
 * @param {string} id - Element id to look up.
 * @returns {HTMLElement|null} Whatever `getElementById` returns for that id.
 */
const $ = (id) => document.getElementById(id);
|
| 28 |
+
/**
 * Element references resolved once, by id, when this module is evaluated.
 * A property is null when no element with that id exists at that moment.
 */
const el = {
  // Upload / camera inputs and image preview
  btnCamera: $('btn-camera'),
  btnFile: $('btn-file'),
  fileCamera: $('file-camera'),
  filePicker: $('file-picker'),
  previewWrap: $('image-preview-wrap'),
  previewImg: $('preview-img'),
  bboxCanvas: $('bbox-canvas'),
  previewFilename: $('preview-filename'),
  btnClearImage: $('btn-clear-image'),

  // Engine / model selection and status indicators
  engineSelect: $('engine-select'),
  modelRow: $('model-row'),
  modelSelect: $('model-select'),
  btnLoadModel: $('btn-load-model'),
  modelStatusBadge: $('model-status-badge'),
  enginePill: $('engine-pill'),
  enginePillText: $('engine-pill-text'),

  // Advanced options
  segMethodSelect: $('seg-method-select'),

  // Action buttons
  btnSegment: $('btn-segment'),
  btnTranscribe: $('btn-transcribe'),
  btnCancel: $('btn-cancel'),

  // Progress card
  progressCard: $('progress-card'),
  progressBar: $('progress-bar'),
  statusText: $('status-text'),

  // Results card
  resultsCard: $('results-card'),
  resultsList: $('results-list'),
  lineCount: $('line-count'),
  btnCopy: $('btn-copy'),
  btnExportTxt: $('btn-export-txt'),

  // Photo review overlay
  photoReview: $('photo-review'),
  reviewImg: $('review-img'),
  reviewCropCanvas: $('review-crop-canvas'),
  reviewWarn: $('review-warn'),
  btnRotateCCW: $('btn-rotate-ccw'),
  btnRotateCW: $('btn-rotate-cw'),
  btnAutoCrop: $('btn-auto-crop'),
  btnCropStart: $('btn-crop-start'),
  btnCropApply: $('btn-crop-apply'),
  btnCropCancel: $('btn-crop-cancel'),
  btnRetake: $('btn-retake'),
  btnUsePhoto: $('btn-use-photo'),
};
|
| 77 |
+
|
| 78 |
+
// ── Photo Review State ─────────────────────────────────────────────────
|
| 79 |
+
// State of the photo-review overlay (rotate / crop before upload).
const reviewState = {
  // Off-screen working canvas holding the rotated / cropped photo.
  canvas: null,
  cropMode: false,
  // Pointer-down position, in image coordinates.
  cropStart: null,
  // Current selection as {x, y, w, h}, in image coordinates.
  cropRect: null,
  srcFilename: '',
};
|
| 86 |
+
|
| 87 |
+
// ── Toast ──────────────────────────────────────────────────────────────
|
| 88 |
+
/**
 * Show a transient notification inside the #toast-container element.
 *
 * @param {string} msg - Text shown to the user (assigned as textContent).
 * @param {string} [type='info'] - Suffix of the `toast--<type>` CSS class.
 * @param {number} [ms=4000] - Delay before the notification is removed.
 */
function toast(msg, type = 'info', ms = 4000) {
  const note = Object.assign(document.createElement('div'), {
    className: `toast toast--${type}`,
    textContent: msg,
  });
  $('toast-container').appendChild(note);
  setTimeout(() => note.remove(), ms);
}
|
| 96 |
+
|
| 97 |
+
// ── API helper ─────────────────────────────────────────────────────────
|
| 98 |
+
/**
 * fetch() wrapper that sends a JSON Content-Type header and rejects on any
 * non-2xx response.
 *
 * @param {string} path - Request URL.
 * @param {object} [options] - fetch() options; `headers` are merged over the default.
 * @returns {Promise<Response>} The response when `resp.ok` is true.
 * @throws {Error} With a readable message taken from the error body's
 *   `detail` / `message` field, falling back to the status text or code.
 */
async function api(path, options = {}) {
  const headers = { 'Content-Type': 'application/json', ...(options.headers || {}) };
  const resp = await fetch(path, { ...options, headers });
  if (resp.ok) return resp;

  // The error body may be missing, non-JSON, or literally `null`.
  const err = await resp.json().catch(() => null);
  const raw = err?.detail || err?.message;

  let message;
  if (typeof raw === 'string') {
    message = raw;
  } else if (Array.isArray(raw)) {
    // A list of error objects: prefer each entry's `msg`, never "[object Object]".
    message = raw
      .map((item) => (typeof item === 'string' ? item : item?.msg ?? JSON.stringify(item)))
      .join('; ');
  } else if (raw) {
    message = JSON.stringify(raw);
  }

  throw new Error(message || resp.statusText || `HTTP ${resp.status}`);
}
|
| 107 |
+
|
| 108 |
+
// ── Engine pill ────────────────────────────────────────────────────────
|
| 109 |
+
/**
 * Update the engine pill's modifier class and its label.
 *
 * @param {string} state - Suffix of the `engine-pill--<state>` CSS class
 *   (shadows the module-level `state` object inside this function).
 * @param {string} text - Label shown inside the pill.
 */
function setPill(state, text) {
  const { enginePill, enginePillText } = el;
  enginePill.className = `engine-pill engine-pill--${state}`;
  enginePillText.textContent = text;
}
|
| 113 |
+
|
| 114 |
+
// ── Engine status (check pool) ─────────────────────────────────────────
|
| 115 |
+
/**
 * Ask the server which engine (if any) is loaded and mirror that in the UI:
 * pill, badge, engine dropdown, and visibility of the load controls.
 * On any failure the pill shows "Offline" and the badge "Checking…".
 *
 * @returns {Promise<void>}
 */
async function checkEngineStatus() {
  try {
    const resp = await api('/api/engine/status');
    const data = await resp.json();

    // Response: { loaded: bool, engine_name: str, config: {...} }
    if (data.loaded && data.engine_name) {
      const loadedName = data.engine_name;
      state.loadedEngine = loadedName;
      setPill('loaded', loadedName);
      setBadge('loaded', 'Model loaded');

      // Pre-select the matching engine in the dropdown. Compare option values
      // directly instead of building a CSS selector: a name containing quotes
      // or backslashes would make querySelector throw or silently mismatch.
      const isListed = Array.from(el.engineSelect.options)
        .some((option) => option.value === String(loadedName));
      if (isListed) {
        el.engineSelect.value = loadedName;
      }

      // Hide load controls — engine already active
      el.btnLoadModel.hidden = true;
      el.modelRow.hidden = true;
    } else {
      state.loadedEngine = null;
      setPill('unloaded', 'No model');
      setBadge('unloaded', 'No model loaded');
      el.btnLoadModel.hidden = false;
    }
    updateActionButtons();
  } catch {
    setPill('unknown', 'Offline');
    setBadge('loading', 'Checking…');
  }
}
|
| 144 |
+
|
| 145 |
+
/**
 * Update the model status badge.
 *
 * @param {string} type - Suffix of the `badge--<type>` CSS class.
 * @param {string} text - Badge label.
 */
function setBadge(type, text) {
  const badge = el.modelStatusBadge;
  badge.className = `badge badge--${type}`;
  badge.textContent = text;
}
|
| 149 |
+
|
| 150 |
+
// ── Load engines list ──────────────────────────────────────────────────
|
| 151 |
+
/**
 * Fetch the engine list, fill the engine dropdown with the available ones,
 * restore the last selection from localStorage, then run onEngineChange().
 * Any failure replaces the dropdown content with an error option and toasts.
 *
 * @returns {Promise<void>}
 */
async function loadEngines() {
  try {
    const resp = await api('/api/engines');
    const data = await resp.json();
    // /api/engines returns a plain array; also accept { engines: [...] }
    state.engines = Array.isArray(data) ? data : (data?.engines || []);

    el.engineSelect.innerHTML = '';
    const avail = state.engines.filter((eng) => eng.available);

    if (avail.length === 0) {
      el.engineSelect.innerHTML = '<option value="">No engines available</option>';
      return;
    }

    for (const eng of avail) {
      const opt = document.createElement('option');
      opt.value = eng.name;
      opt.textContent = eng.display_name || eng.name;
      el.engineSelect.appendChild(opt);
    }

    // Restore last selection. The stored value is matched against the engine
    // names that were just turned into options rather than interpolated into
    // a CSS selector, which breaks on quotes/backslashes in the stored value.
    const last = localStorage.getItem(LS_ENGINE);
    if (last && avail.some((eng) => String(eng.name) === last)) {
      el.engineSelect.value = last;
    }

    await onEngineChange();
  } catch {
    el.engineSelect.innerHTML = '<option value="">Failed to load engines</option>';
    toast('Could not reach server', 'error');
  }
}
|
| 185 |
+
|
| 186 |
+
// ── Engine selection changed ───────────────────────────────────────────
|
| 187 |
+
/**
 * React to a change of the engine dropdown: remember the choice, then either
 * hide the load controls (engine already loaded) or fetch the engine's config
 * schema and fill the model dropdown from its first non-dynamic select field.
 * A sequence counter discards responses that belong to an older selection.
 *
 * @returns {Promise<void>}
 */
async function onEngineChange() {
  const name = el.engineSelect.value;
  if (!name) return;
  const requestSeq = ++state.engineChangeSeq;
  localStorage.setItem(LS_ENGINE, name);

  // If this engine is already the loaded one, hide load controls
  if (name === state.loadedEngine) {
    el.modelRow.hidden = true;
    el.btnLoadModel.hidden = true;
    return;
  }

  el.modelRow.hidden = false;
  el.modelSelect.innerHTML = '<option>Loading…</option>';
  el.btnLoadModel.hidden = false;
  el.btnLoadModel.disabled = true;
  state.modelFieldKey = null;

  // True when a newer selection superseded this request while it was in flight.
  const isStale = () => requestSeq !== state.engineChangeSeq || el.engineSelect.value !== name;

  try {
    // Use config-schema (same as main app) — it has the full model option list
    const resp = await api(`/api/engine/${encodeURIComponent(name)}/config-schema`);
    const schema = await resp.json();

    if (isStale()) return;

    // Find first non-dynamic select field → that's the model selector
    const selectField = (schema.fields || []).find(
      (f) => f.type === 'select' && !f.dynamic,
    );
    const options = (selectField?.options || []).filter((opt) => opt != null);

    el.modelSelect.innerHTML = '';

    if (selectField && options.length > 0) {
      state.modelFieldKey = selectField.key;
      const values = [];
      for (const opt of options) {
        const isObject = typeof opt === 'object';
        const o = document.createElement('option');
        o.value = isObject ? opt.value : opt;
        o.textContent = isObject ? opt.label : opt;
        el.modelSelect.appendChild(o);
        values.push(String(o.value));
      }

      // Restore last selection or apply schema default. The stored value is
      // compared against the option values directly: model values may contain
      // backslashes or quotes, which a `option[value="…"]` selector mishandles.
      const lastModel = localStorage.getItem(LS_MODEL(name));
      if (lastModel && values.includes(lastModel)) {
        el.modelSelect.value = lastModel;
      } else if (selectField.default != null) {
        el.modelSelect.value = selectField.default;
      }
    } else {
      // No static options (e.g. API-based engines) — show Default
      state.modelFieldKey = selectField?.key || 'model_path';
      const o = document.createElement('option');
      o.value = '';
      o.textContent = 'Default';
      el.modelSelect.appendChild(o);
    }

    el.btnLoadModel.disabled = false;
  } catch {
    if (isStale()) return;
    // Schema unavailable: fall back to a single "Default" entry.
    el.modelSelect.innerHTML = '<option value="">Default</option>';
    state.modelFieldKey = 'model_path';
    el.btnLoadModel.disabled = false;
  }
}
|
| 256 |
+
|
| 257 |
+
// ── Load model ─────────────────────────────────────────────────────────
|
| 258 |
+
/**
 * Load the selected engine (with the selected model, if any) on the server
 * and reflect the outcome in the pill, badge, load controls and a toast.
 * The chosen model is remembered per engine in localStorage.
 *
 * @returns {Promise<void>}
 */
async function loadModel() {
  const chosenEngine = el.engineSelect.value;
  if (!chosenEngine) return;

  const chosenModel = el.modelSelect.value || '';
  localStorage.setItem(LS_MODEL(chosenEngine), chosenModel);

  const button = el.btnLoadModel;
  button.disabled = true;
  button.textContent = 'Loading…';
  setPill('loading', 'Loading…');
  setBadge('loading', 'Loading…');

  try {
    // The config key comes from the schema read in onEngineChange();
    // 'model_path' is the fallback. An empty model sends an empty config.
    const config = {};
    if (chosenModel) {
      config[state.modelFieldKey || 'model_path'] = chosenModel;
    }
    const payload = JSON.stringify({ engine_name: chosenEngine, config });
    await api('/api/engine/load', { method: 'POST', body: payload });

    state.loadedEngine = chosenEngine;
    setPill('loaded', chosenEngine);
    setBadge('loaded', 'Model loaded');
    button.hidden = true;
    el.modelRow.hidden = true;
    toast(`${chosenEngine} loaded`, 'success');
  } catch (err) {
    setPill('unloaded', 'Load failed');
    setBadge('unloaded', 'Load failed');
    toast(`Load failed: ${err.message}`, 'error');
  } finally {
    // Always restore the button, whatever the outcome.
    button.disabled = false;
    button.textContent = 'Load Model';
    updateActionButtons();
  }
}
|
| 295 |
+
|
| 296 |
+
// ── Update action button states ────────────────────────────────────────
|
| 297 |
+
/**
 * Enable/disable the Segment and Transcribe buttons and show/hide Cancel.
 * Both actions need an uploaded image and a loaded engine and are blocked
 * while a segmentation or transcription is running.
 */
function updateActionButtons() {
  const busy = state.isSegmenting || state.isTranscribing;
  const ready = Boolean(state.imageId) && Boolean(state.loadedEngine);
  const blocked = !ready || busy;

  el.btnSegment.disabled = blocked;
  el.btnTranscribe.disabled = blocked;
  el.btnCancel.hidden = !busy;
}
|
| 306 |
+
|
| 307 |
+
// ── File upload ────────────────────────────────────────────────────────
|
| 308 |
+
/**
 * Upload an image or PDF, store the returned image id/info in `state`, and
 * show the preview. For a PDF only the first page is used.
 *
 * @param {File|Blob} file - File to upload; a falsy value is ignored.
 * @returns {Promise<void>} Failures are reported with a toast, not thrown.
 */
async function uploadFile(file) {
  if (!file) return;

  const fd = new FormData();
  fd.append('file', file);

  setStatus('Uploading…');
  el.progressCard.hidden = false;
  setProgress(0);

  try {
    // Plain fetch (not api()): the browser must set the multipart Content-Type itself.
    const resp = await fetch('/api/image/upload?max_dim=2400', { method: 'POST', body: fd });
    if (!resp.ok) {
      const err = await resp.json().catch(() => ({ detail: resp.statusText }));
      throw new Error(err.detail || 'Upload failed');
    }
    const data = await resp.json();

    if (data.is_pdf) {
      // PDF: use first page. Fail with a readable message when there is none
      // instead of a "cannot read properties of undefined" TypeError.
      const pages = Array.isArray(data.pages) ? data.pages : [];
      if (pages.length === 0) {
        throw new Error('PDF contains no pages');
      }
      const [first] = pages;
      state.imageId = first.image_id;
      state.imageInfo = { width: first.width, height: first.height, filename: first.filename };
      toast(`PDF uploaded — using page 1 of ${pages.length}`, 'info');
    } else {
      state.imageId = data.image_id;
      state.imageInfo = { width: data.width, height: data.height, filename: data.filename };
    }

    // Show preview
    el.previewImg.src = `/api/image/${state.imageId}`;
    el.previewFilename.textContent = state.imageInfo.filename || file.name;
    el.previewWrap.hidden = false;
    clearBboxes();

    // Clear old results
    hideResults();
    setStatus('Image ready');
    setProgress(100);
    setTimeout(() => {
      // Do not hide the card if a job started in the meantime is using it.
      if (!state.isSegmenting && !state.isTranscribing) {
        el.progressCard.hidden = true;
      }
    }, 800);
    updateActionButtons();
  } catch (e) {
    toast(`Upload failed: ${e.message}`, 'error');
    setStatus('');
    el.progressCard.hidden = true;
  }
}
|
| 355 |
+
|
| 356 |
+
// ── Clear image ────────────────────────────────────────────────────────
|
| 357 |
+
/**
 * Forget the current image: reset the image-related state, hide the preview,
 * and clear the overlay, the results and the action-button states.
 */
function clearImage() {
  Object.assign(state, {
    imageId: null,
    imageInfo: null,
    bboxes: [],
    lines: [],
  });

  el.previewWrap.hidden = true;
  el.previewImg.src = '';

  clearBboxes();
  hideResults();
  updateActionButtons();
}
|
| 368 |
+
|
| 369 |
+
// ── BBox canvas ────────────────────────────────────────────────────────
|
| 370 |
+
/** Erase the bounding-box overlay canvas and drop the stored boxes. */
function clearBboxes() {
  const overlay = el.bboxCanvas;
  overlay.getContext('2d').clearRect(0, 0, overlay.width, overlay.height);
  state.bboxes = [];
}
|
| 376 |
+
|
| 377 |
+
/**
 * Draw line bounding boxes on the overlay canvas, scaled from image pixels
 * (state.imageInfo) to the displayed size of the preview image.
 *
 * @param {number[][]} bboxes - Boxes as [x1, y1, x2, y2] in image coordinates.
 * @param {number} [highlightIdx=-1] - Index of the box drawn emphasized; -1 for none.
 */
function drawBboxes(bboxes, highlightIdx = -1) {
  const img = el.previewImg;
  const canvas = el.bboxCanvas;
  const ctx = canvas.getContext('2d');

  // Match canvas to displayed size (assigning width/height also resets the canvas)
  canvas.width = img.offsetWidth;
  canvas.height = img.offsetHeight;
  ctx.clearRect(0, 0, canvas.width, canvas.height);

  if (!bboxes || bboxes.length === 0 || !state.imageInfo) return;

  const { width: srcWidth, height: srcHeight } = state.imageInfo;
  // Without positive source dimensions the scale would be Infinity/NaN.
  if (!(srcWidth > 0) || !(srcHeight > 0)) return;

  const scaleX = img.offsetWidth / srcWidth;
  const scaleY = img.offsetHeight / srcHeight;

  // Color palette for lines — use distinct hues. Each entry is an open
  // `rgba(r,g,b,` prefix that is completed with an alpha value below.
  const COLORS = [
    'rgba(59,130,246,',   // blue
    'rgba(99,102,241,',   // indigo
    'rgba(34,197,94,',    // green
    'rgba(245,158,11,',   // amber
    'rgba(239,68,68,',    // red
    'rgba(168,85,247,',   // purple
    'rgba(20,184,166,',   // teal
    'rgba(249,115,22,',   // orange
  ];
  const LABEL_HEIGHT = 14;
  const LABEL_PAD = 3;

  bboxes.forEach((bbox, i) => {
    const [x1, y1, x2, y2] = bbox;
    const x = x1 * scaleX;
    const y = y1 * scaleY;
    const w = (x2 - x1) * scaleX;
    const h = (y2 - y1) * scaleY;

    const colorBase = COLORS[i % COLORS.length];
    const isHighlighted = i === highlightIdx;
    const fillAlpha = isHighlighted ? 0.25 : 0.10;
    const strokeAlpha = isHighlighted ? 1.0 : 0.7;

    ctx.fillStyle = `${colorBase}${fillAlpha})`;
    ctx.strokeStyle = `${colorBase}${strokeAlpha})`;
    ctx.lineWidth = isHighlighted ? 2 : 1.5;

    ctx.fillRect(x, y, w, h);
    ctx.strokeRect(x, y, w, h);

    // Line number label, placed just above the box but clamped to the canvas
    // so the label of a box near the top edge stays visible.
    ctx.font = 'bold 10px monospace';
    const label = String(i + 1);
    const tw = ctx.measureText(label).width + LABEL_PAD * 2;
    const labelTop = Math.max(0, y - LABEL_HEIGHT);
    ctx.fillStyle = `${colorBase}0.85)`;
    ctx.fillRect(x, labelTop, tw, LABEL_HEIGHT);
    ctx.fillStyle = '#fff';
    ctx.fillText(label, x + LABEL_PAD, labelTop + LABEL_HEIGHT - 3);
  });
}
|
| 436 |
+
|
| 437 |
+
// ── Segment ────────────────────────────────────────────────────────────
|
| 438 |
+
/**
 * Run line segmentation for the current image and draw the resulting boxes.
 * The chosen segmentation method is remembered in localStorage.
 *
 * @returns {Promise<void>} Failures are reported with a toast, not thrown.
 */
async function segmentImage() {
  const imageId = state.imageId;
  if (!imageId) return;

  state.isSegmenting = true;
  updateActionButtons();
  el.progressCard.hidden = false;
  setProgress(0);
  setStatus('Detecting lines…');
  clearBboxes();

  const method = el.segMethodSelect.value || 'kraken';
  localStorage.setItem(LS_SEG_METHOD, method);

  try {
    // The device is fixed to "cuda:0" (URL-encoded) in this request.
    const url = `/api/image/${imageId}/segment?method=${encodeURIComponent(method)}&device=cuda%3A0`;
    const resp = await api(url);
    const data = await resp.json();

    // The image was replaced or cleared while the request was running:
    // these boxes belong to the old image, so drop them.
    if (state.imageId !== imageId) return;

    state.bboxes = data.bboxes || [];
    drawBboxes(state.bboxes);

    const count = state.bboxes.length;
    const summary = `${count} line${count !== 1 ? 's' : ''} detected`;
    setStatus(summary);
    setProgress(100);
    toast(summary, 'success', 2500);
  } catch (e) {
    toast(`Segmentation failed: ${e.message}`, 'error');
    setStatus('Segmentation failed');
  } finally {
    state.isSegmenting = false;
    updateActionButtons();
    // Leave the card visible if a transcription took it over meanwhile.
    setTimeout(() => { if (!state.isTranscribing) el.progressCard.hidden = true; }, 1500);
  }
}
|
| 471 |
+
|
| 472 |
+
// ── Transcribe (SSE) ───────────────────────────────────────────────────
|
| 473 |
+
/**
 * Parse one server-sent-event block (the text between two blank lines).
 *
 * @param {string} block - Raw block; lines may end in "\n" or "\r\n".
 * @returns {{event: string, payload: *}|null} Event name (default 'message')
 *   and JSON-decoded data, or null when the block has no `data:` line.
 * @throws {SyntaxError} When the data is not valid JSON.
 */
function parseSseBlock(block) {
  let event = 'message';
  const dataParts = [];
  for (const rawLine of block.split('\n')) {
    // The space after the colon is optional, so cut right after the colon
    // and trim; trimming also removes a trailing "\r".
    if (rawLine.startsWith('event:')) {
      event = rawLine.slice(6).trim() || 'message';
    } else if (rawLine.startsWith('data:')) {
      dataParts.push(rawLine.slice(5).trim());
    }
  }
  if (dataParts.length === 0) return null;
  return { event, payload: JSON.parse(dataParts.join('\n')) };
}

/**
 * Start a transcription of the current image and stream its server-sent
 * events from POST /api/transcribe into handleSSEEvent().
 * The request can be cancelled through `state.sseAbort`.
 *
 * @returns {Promise<void>} Failures (other than an abort) are toasted, not thrown.
 */
async function startTranscription() {
  if (!state.imageId || !state.loadedEngine) return;

  state.isTranscribing = true;
  state.lines = [];
  updateActionButtons();

  el.progressCard.hidden = false;
  setProgress(0);
  setStatus('Starting transcription…');
  el.resultsCard.hidden = true;
  el.resultsList.innerHTML = '';

  const method = el.segMethodSelect.value || 'kraken';

  const body = JSON.stringify({
    image_id: state.imageId,
    seg_method: method,
    seg_device: 'cuda:0',
  });

  const abort = new AbortController();
  state.sseAbort = abort;

  const dispatch = (block) => {
    const parsed = parseSseBlock(block);
    if (parsed) handleSSEEvent(parsed.event, parsed.payload);
  };

  try {
    const resp = await fetch('/api/transcribe', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body,
      signal: abort.signal,
    });

    if (!resp.ok) {
      const err = await resp.json().catch(() => ({ detail: resp.statusText }));
      throw new Error(err.detail || 'Transcription failed');
    }

    const reader = resp.body.getReader();
    const decoder = new TextDecoder();
    let buffer = '';

    while (true) {
      const { done, value } = await reader.read();
      if (done) break;

      buffer += decoder.decode(value, { stream: true });
      // Events are separated by a blank line, with LF or CRLF line endings.
      const parts = buffer.split(/\r?\n\r?\n/);
      buffer = parts.pop(); // last part may be incomplete

      for (const part of parts) {
        dispatch(part);
      }
    }

    // Flush the decoder and deliver a final event that was not followed by
    // a blank line before the stream closed.
    buffer += decoder.decode();
    if (buffer.trim()) {
      dispatch(buffer);
    }
  } catch (e) {
    if (e.name !== 'AbortError') {
      toast(`Transcription error: ${e.message}`, 'error');
      setStatus('Error');
    }
  } finally {
    state.isTranscribing = false;
    state.sseAbort = null;
    updateActionButtons();
  }
}
|
| 544 |
+
|
| 545 |
+
/**
 * Apply one transcription stream event to the UI and to `state`.
 * Recognised events: status, segmentation, progress, complete, cancelled,
 * error. Any other event name is ignored.
 *
 * @param {string} event - Event name from the stream.
 * @param {object} payload - JSON-decoded event data.
 */
function handleSSEEvent(event, payload) {
  if (event === 'status') {
    setStatus(payload.message || '');
    return;
  }

  if (event === 'segmentation') {
    state.bboxes = payload.bboxes || [];
    drawBboxes(state.bboxes);
    setStatus(`${state.bboxes.length} lines detected — transcribing…`);
    return;
  }

  if (event === 'progress') {
    const { current, total, line } = payload;
    const percent = total > 0 ? (current / total) * 100 : 0;
    setProgress(percent);
    setStatus(`Transcribing line ${current} / ${total}…`);

    if (line) {
      state.lines.push(line);
      appendResultLine(line);
      // Emphasize the box of the line that was just transcribed.
      drawBboxes(state.bboxes, line.index);
    }

    // Reveal the results card when the first line arrives.
    const isFirstResult = el.resultsCard.hidden && state.lines.length === 1;
    if (isFirstResult) {
      el.resultsCard.hidden = false;
      el.resultsCard.scrollIntoView({ behavior: 'smooth', block: 'nearest' });
    }
    return;
  }

  if (event === 'complete') {
    setProgress(100);
    const lineTotal = (payload.lines || []).length;
    const elapsed = payload.total_time_s ? ` in ${payload.total_time_s}s` : '';
    setStatus(`Done — ${lineTotal} lines${elapsed}`);
    el.lineCount.textContent = `${lineTotal} lines`;
    el.lineCount.className = 'badge badge--info';

    // Redraw every box with no emphasis.
    drawBboxes(state.bboxes);
    toast(`Transcription complete (${lineTotal} lines)`, 'success');
    setTimeout(() => { el.progressCard.hidden = true; }, 1200);
    return;
  }

  if (event === 'cancelled') {
    setStatus('Cancelled');
    toast('Transcription cancelled', 'warn', 2500);
    setTimeout(() => { el.progressCard.hidden = true; }, 1000);
    return;
  }

  if (event === 'error') {
    toast(`Error: ${payload.message}`, 'error');
    setStatus('Error');
  }
}
|
| 605 |
+
|
| 606 |
+
// ── Result line DOM ────────────────────────────────────────────────────
|
| 607 |
+
/**
 * Append one transcribed line to the results list and scroll to it.
 *
 * @param {{index: number, text?: string, confidence?: number|null}} line -
 *   `index` is shown 1-based; `confidence` (multiplied by 100 for display)
 *   is rendered only when it is neither null nor undefined.
 */
function appendResultLine(line) {
  const makeSpan = (className, text) => {
    const span = document.createElement('span');
    span.className = className;
    span.textContent = text;
    return span;
  };

  const row = document.createElement('div');
  row.className = 'result-line';
  row.appendChild(makeSpan('line-num', String(line.index + 1)));
  row.appendChild(makeSpan('line-text', line.text || ''));

  if (line.confidence != null) {
    const pct = Math.round(line.confidence * 100);
    let tier = 'conf-low';
    if (pct >= 90) {
      tier = 'conf-high';
    } else if (pct >= 75) {
      tier = 'conf-mid';
    }
    row.appendChild(makeSpan(`line-conf ${tier}`, `${pct}%`));
  }

  const list = el.resultsList;
  list.appendChild(row);
  // Keep the newest line in view.
  list.scrollTop = list.scrollHeight;
}
|
| 634 |
+
|
| 635 |
+
// ── Cancel ─────────────────────────────────────────────────────────────
|
| 636 |
+
/**
 * Cancel a running transcription: abort the local stream (if any) and ask
 * the server to cancel. A failing cancel request is deliberately ignored.
 *
 * @returns {Promise<void>}
 */
async function cancelTranscription() {
  const controller = state.sseAbort;
  if (controller) {
    controller.abort();
  }
  await api('/api/transcribe/cancel', { method: 'POST', body: '{}' })
    .catch(() => { /* ignore */ });
}
|
| 642 |
+
|
| 643 |
+
// ── Progress helpers ───────────────────────────────────────────────────
|
| 644 |
+
/**
 * Set the progress bar width.
 *
 * @param {number} pct - Percentage; clamped to 0–100. A value that is not a
 *   number (NaN, undefined) is shown as 0 instead of the invalid "NaN%".
 */
function setProgress(pct) {
  const value = Number(pct);
  const clamped = Number.isNaN(value) ? 0 : Math.min(100, Math.max(0, value));
  el.progressBar.style.width = `${clamped}%`;
}
|
| 647 |
+
|
| 648 |
+
/**
 * Show a status message in the status text element.
 *
 * @param {string} msg - Text to display.
 */
function setStatus(msg) {
  const { statusText } = el;
  statusText.textContent = msg;
}
|
| 651 |
+
|
| 652 |
+
// ── Hide results ───────────────────────────────────────────────────────
|
| 653 |
+
/** Hide and empty the results card and forget the transcribed lines. */
function hideResults() {
  const { resultsCard, resultsList, lineCount } = el;
  resultsCard.hidden = true;
  resultsList.innerHTML = '';
  state.lines = [];
  lineCount.textContent = '';
}
|
| 659 |
+
|
| 660 |
+
// ── Copy all ───────────────────────────────────────────────────────────
|
| 661 |
+
/**
 * Copy every transcribed line (joined with newlines) to the clipboard and
 * report the outcome with a toast.
 */
function copyAll() {
  const text = state.lines.map((l) => l.text || '').join('\n');
  if (!text) { toast('Nothing to copy', 'warn', 2000); return; }

  // navigator.clipboard is undefined outside secure contexts; report that as
  // a failed copy instead of throwing a TypeError.
  const clipboard = navigator.clipboard;
  if (typeof clipboard?.writeText !== 'function') {
    toast('Copy failed', 'error');
    return;
  }

  clipboard.writeText(text)
    .then(() => toast('Copied to clipboard', 'success', 2000))
    .catch(() => toast('Copy failed', 'error'));
}
|
| 668 |
+
|
| 669 |
+
// ── Export TXT ─────────────────────────────────────────────────────────
|
| 670 |
+
/**
 * Download the transcribed lines as a UTF-8 text file. The file is named
 * after the image (extension replaced by ".txt"), or "transcription.txt"
 * when no image filename is known.
 */
function exportTxt() {
  const text = state.lines.map((entry) => entry.text || '').join('\n');
  if (!text) {
    toast('Nothing to export', 'warn', 2000);
    return;
  }

  const objectUrl = URL.createObjectURL(
    new Blob([text], { type: 'text/plain;charset=utf-8' }),
  );
  const baseName = state.imageInfo?.filename?.replace(/\.[^.]+$/, '') || 'transcription';

  // A temporary, detached link triggers the download.
  const link = document.createElement('a');
  link.href = objectUrl;
  link.download = baseName + '.txt';
  link.click();
  URL.revokeObjectURL(objectUrl);
}
|
| 681 |
+
|
| 682 |
+
// ── Redraw bboxes on image resize ──────────────────────────────────────
|
| 683 |
+
/** Redraw the stored bounding boxes, if there are any. */
function onImageResize() {
  const hasBoxes = state.bboxes.length > 0;
  if (!hasBoxes) return;
  drawBboxes(state.bboxes);
}
|
| 686 |
+
|
| 687 |
+
// ── Photo Review ────────────────────────────────────────────────────────
|
| 688 |
+
|
| 689 |
+
/**
 * Open the photo-review overlay for a file: decode it, copy it onto an
 * off-screen working canvas (reviewState.canvas), and show the overlay.
 * Page scrolling is disabled while the overlay is open.
 *
 * @param {File} file - Image file to review.
 */
function openPhotoReview(file) {
  Object.assign(reviewState, {
    srcFilename: file.name || 'photo.jpg',
    cropMode: false,
    cropStart: null,
    cropRect: null,
  });

  const photo = new Image();
  const objectUrl = URL.createObjectURL(file);

  photo.onload = () => {
    URL.revokeObjectURL(objectUrl);

    const working = document.createElement('canvas');
    working.width = photo.naturalWidth;
    working.height = photo.naturalHeight;
    working.getContext('2d').drawImage(photo, 0, 0);

    reviewState.canvas = working;
    updateReviewDisplay();
    el.photoReview.hidden = false;
    document.body.style.overflow = 'hidden';
  };

  photo.onerror = () => {
    URL.revokeObjectURL(objectUrl);
    toast('Could not load photo', 'error');
  };

  photo.src = objectUrl;
}
|
| 714 |
+
|
| 715 |
+
/**
 * Close the photo-review overlay, restore page scrolling, drop the working
 * canvas, and leave crop mode completely (state, overlay pointer events and
 * buttons), matching what exitCropMode() resets.
 */
function closePhotoReview() {
  el.photoReview.hidden = true;
  document.body.style.overflow = '';
  reviewState.canvas = null;
  reviewState.cropMode = false;
  reviewState.cropStart = null;
  reviewState.cropRect = null;
  // Closing while in crop mode must not leave the crop overlay interactive.
  el.reviewCropCanvas.style.pointerEvents = 'none';
  resetCropUI();
}
|
| 723 |
+
|
| 724 |
+
/**
 * Show the current working canvas in the review image (as a JPEG data URL).
 * Once the image has loaded, the crop overlay is resized and the orientation
 * warning is re-evaluated. Does nothing when there is no working canvas.
 */
function updateReviewDisplay() {
  const working = reviewState.canvas;
  if (!working) return;

  const view = el.reviewImg;
  view.onload = () => {
    syncCropCanvas();
    checkReviewOrientation();
  };
  view.src = working.toDataURL('image/jpeg', 0.9);
}
|
| 732 |
+
|
| 733 |
+
/**
 * Show the review warning when the working canvas is wider than it is tall
 * (landscape), hide it otherwise.
 */
function checkReviewOrientation() {
  const canvas = reviewState.canvas;
  // This runs from an image onload handler; the overlay may have been closed
  // (canvas set to null) before the image finished loading.
  if (!canvas) return;
  el.reviewWarn.hidden = !(canvas.width > canvas.height);
}
|
| 737 |
+
|
| 738 |
+
/**
 * Resize the crop overlay canvas to the on-screen size of the review image
 * and clear it. Does nothing while the image has no rendered width.
 */
function syncCropCanvas() {
  const { width, height } = el.reviewImg.getBoundingClientRect();
  if (!width) return;

  const overlay = el.reviewCropCanvas;
  overlay.width = Math.round(width);
  overlay.height = Math.round(height);
  overlay.getContext('2d').clearRect(0, 0, overlay.width, overlay.height);
}
|
| 746 |
+
|
| 747 |
+
// ── Auto-Crop (adaptive page detection) ────────────────────────────────
|
| 748 |
+
|
| 749 |
+
/**
 * Detect the page in the working canvas and crop to it.
 *
 * Every pixel gets a "page score" (brightness minus a saturation penalty).
 * Rows and columns whose mean score exceeds a threshold derived from the
 * image border are treated as page; the crop is the span between the first
 * and last such row/column, plus padding. If the result would be smaller
 * than 20% of the original in either dimension, nothing is cropped and the
 * user is asked to crop manually.
 */
function autoDetectAndCrop() {
  if (!reviewState.canvas) return;
  exitCropMode();

  const canvas = reviewState.canvas;
  const { width, height } = canvas;
  const data = canvas.getContext('2d').getImageData(0, 0, width, height).data;

  // Single pass: accumulate page-likelihood per row and per column.
  // Heuristic: white paper is typically bright with low saturation.
  const rowSum = new Float32Array(height);
  const colSum = new Float32Array(width);
  let borderSum = 0;
  let borderCount = 0;

  // Outer 8% band (at least 1px) used to estimate the background score.
  const borderBandY = Math.max(1, Math.floor(height * 0.08));
  const borderBandX = Math.max(1, Math.floor(width * 0.08));

  for (let y = 0; y < height; y++) {
    for (let x = 0; x < width; x++) {
      const i = (y * width + x) * 4;
      const r = data[i];
      const g = data[i + 1];
      const b = data[i + 2];

      const v = Math.max(r, g, b);
      const min = Math.min(r, g, b);
      const s = v === 0 ? 0 : (v - min) / v;

      const pageScore = v - (s * 90);
      rowSum[y] += pageScore;
      colSum[x] += pageScore;

      const isBorderPixel = y < borderBandY || y >= (height - borderBandY) || x < borderBandX || x >= (width - borderBandX);
      if (isBorderPixel) {
        borderSum += pageScore;
        borderCount += 1;
      }
    }
  }

  const borderMean = borderCount > 0 ? (borderSum / borderCount) : 40;
  const THRESHOLD = Math.min(230, borderMean + 14);
  const PAD = 12;

  // First/last row and column (inclusive) whose mean score passes the threshold.
  let top = 0, bottom = height - 1, left = 0, right = width - 1;
  for (let y = 0; y < height; y++)      { if (rowSum[y] / width  > THRESHOLD) { top    = y; break; } }
  for (let y = height - 1; y >= 0; y--) { if (rowSum[y] / width  > THRESHOLD) { bottom = y; break; } }
  for (let x = 0; x < width; x++)       { if (colSum[x] / height > THRESHOLD) { left   = x; break; } }
  for (let x = width - 1; x >= 0; x--)  { if (colSum[x] / height > THRESHOLD) { right  = x; break; } }

  // Apply padding and clamp
  top    = Math.max(0, top - PAD);
  bottom = Math.min(height - 1, bottom + PAD);
  left   = Math.max(0, left - PAD);
  right  = Math.min(width - 1, right + PAD);

  // The bounds are inclusive, so the size is (last - first + 1); without the
  // +1 every auto-crop would drop the last pixel row and column.
  const w = right - left + 1;
  const h = bottom - top + 1;

  // Sanity check: don't crop to less than 20% of original
  if (w < width * 0.2 || h < height * 0.2) {
    toast('Page not detected clearly - please crop manually', 'warn');
    return;
  }

  const dst = document.createElement('canvas');
  dst.width = w;
  dst.height = h;
  dst.getContext('2d').drawImage(canvas, left, top, w, h, 0, 0, w, h);
  reviewState.canvas = dst;
  updateReviewDisplay();
}
|
| 822 |
+
|
| 823 |
+
// ── Rotate ─────────────────────────────────────────────────────────────
|
| 824 |
+
|
| 825 |
+
/**
 * Rotate the working canvas about its centre and refresh the review image.
 * The new canvas swaps the source's width and height.
 *
 * @param {number} angle - Rotation in degrees, passed to the 2D context's
 *   rotate() after conversion to radians.
 */
function rotateReview(angle) {
  if (!reviewState.canvas) return;
  exitCropMode();

  const source = reviewState.canvas;
  const rotated = document.createElement('canvas');
  rotated.width = source.height;
  rotated.height = source.width;

  const g = rotated.getContext('2d');
  const radians = angle * Math.PI / 180;
  // Rotate around the centre of the new canvas, then draw the source centred.
  g.translate(rotated.width / 2, rotated.height / 2);
  g.rotate(radians);
  g.drawImage(source, -source.width / 2, -source.height / 2);

  reviewState.canvas = rotated;
  updateReviewDisplay();
}
|
| 839 |
+
|
| 840 |
+
// ── Crop ───────────────────────────────────────────────────────────────
|
| 841 |
+
|
| 842 |
+
/**
 * Switch the review overlay into manual crop mode.
 *
 * Clears any previous selection, hides the "start" and "apply" buttons,
 * shows "cancel", lets the crop canvas receive pointer events, and then
 * calls syncCropCanvas().
 */
function enterCropMode() {
  Object.assign(reviewState, { cropMode: true, cropRect: null, cropStart: null });

  const { btnCropStart, btnCropApply, btnCropCancel, reviewCropCanvas } = el;
  btnCropStart.hidden = true;
  btnCropApply.hidden = true;
  btnCropCancel.hidden = false;
  reviewCropCanvas.style.pointerEvents = 'auto';

  syncCropCanvas();
}
|
| 852 |
+
|
| 853 |
+
/**
 * Leave manual crop mode and discard any selection in progress.
 *
 * The crop canvas stops receiving pointer events, the crop buttons return to
 * their idle state via resetCropUI(), and syncCropCanvas() is called last.
 */
function exitCropMode() {
  Object.assign(reviewState, { cropMode: false, cropStart: null, cropRect: null });
  const overlay = el.reviewCropCanvas;
  overlay.style.pointerEvents = 'none';
  resetCropUI();
  syncCropCanvas();
}
|
| 861 |
+
|
| 862 |
+
/**
 * Put the crop buttons into their idle state:
 * "start" visible, "apply" and "cancel" hidden.
 */
function resetCropUI() {
  const idleState = [
    [el.btnCropStart, false],
    [el.btnCropApply, true],
    [el.btnCropCancel, true],
  ];
  for (const [button, hidden] of idleState) {
    button.hidden = hidden;
  }
}
|
| 867 |
+
|
| 868 |
+
/**
 * Convert a pointer event position into pixel coordinates of the review image.
 *
 * The client position is taken relative to the crop canvas' bounding box,
 * scaled by the ratio between the image size and the on-screen size, and
 * clamped to [0, image width] / [0, image height].
 *
 * @param {{clientX: number, clientY: number}} e - Pointer event.
 * @returns {{x: number, y: number}} Position in image pixels.
 */
function pointerToImageCoords(e) {
  const bounds = el.reviewCropCanvas.getBoundingClientRect();
  const { width: imageWidth, height: imageHeight } = reviewState.canvas;
  const clampTo = (value, max) => Math.max(0, Math.min(max, value));

  const x = clampTo((e.clientX - bounds.left) * (imageWidth / bounds.width), imageWidth);
  const y = clampTo((e.clientY - bounds.top) * (imageHeight / bounds.height), imageHeight);
  return { x, y };
}
|
| 876 |
+
|
| 877 |
+
/**
 * Begin a crop drag: capture the pointer on the crop canvas, remember the
 * start point in image coordinates, and drop any previous selection.
 * Does nothing outside crop mode.
 *
 * @param {PointerEvent} e - The pointerdown event.
 */
function onCropPointerDown(e) {
  if (reviewState.cropMode) {
    e.preventDefault();
    const overlay = el.reviewCropCanvas;
    overlay.setPointerCapture(e.pointerId);
    reviewState.cropStart = pointerToImageCoords(e);
    reviewState.cropRect = null;
    // A fresh drag invalidates the previous selection, so "apply" is hidden again.
    el.btnCropApply.hidden = true;
  }
}
|
| 885 |
+
|
| 886 |
+
/**
 * Update the crop rectangle while a drag is in progress and redraw the overlay.
 * Ignored unless crop mode is active and a drag has been started.
 *
 * @param {PointerEvent} e - The pointermove event.
 */
function onCropPointerMove(e) {
  const dragging = reviewState.cropMode && reviewState.cropStart;
  if (!dragging) return;
  e.preventDefault();

  const current = pointerToImageCoords(e);
  const anchor = reviewState.cropStart;
  // Normalise so the rectangle is valid whichever direction the user drags.
  const x = Math.min(anchor.x, current.x);
  const y = Math.min(anchor.y, current.y);
  const w = Math.abs(current.x - anchor.x);
  const h = Math.abs(current.y - anchor.y);
  reviewState.cropRect = { x, y, w, h };

  drawCropOverlay();
}
|
| 898 |
+
|
| 899 |
+
/**
 * Finish a crop drag. The drag anchor is cleared and, when the selection is
 * large enough, the "apply" button is revealed.
 *
 * The minimum size (20 image pixels per side, inclusive) matches the check in
 * applyReviewCrop(), so every selection that can be applied also gets the button.
 * Does nothing outside crop mode.
 *
 * @param {PointerEvent} e - The pointerup event.
 */
function onCropPointerUp(e) {
  if (!reviewState.cropMode) return;
  e.preventDefault();
  reviewState.cropStart = null;

  const r = reviewState.cropRect;
  if (r && r.w >= 20 && r.h >= 20) {
    el.btnCropApply.hidden = false;
  }
}
|
| 908 |
+
|
| 909 |
+
/**
 * Paint the crop overlay: a dark veil over the whole crop canvas with the
 * selected rectangle cleared and outlined in white.
 *
 * The selection is stored in image pixels and is scaled here to the crop
 * canvas' own pixel size. Nothing is drawn when there is no selection.
 */
function drawCropOverlay() {
  const overlay = el.reviewCropCanvas;
  const ctx = overlay.getContext('2d');
  const selection = reviewState.cropRect;
  if (!selection) return;

  const { width: overlayWidth, height: overlayHeight } = overlay;
  const scaleX = overlayWidth / reviewState.canvas.width;
  const scaleY = overlayHeight / reviewState.canvas.height;
  const box = [
    selection.x * scaleX,
    selection.y * scaleY,
    selection.w * scaleX,
    selection.h * scaleY,
  ];

  // Veil everything, then clear the selected area so the image shows through.
  ctx.clearRect(0, 0, overlayWidth, overlayHeight);
  ctx.fillStyle = 'rgba(0,0,0,0.55)';
  ctx.fillRect(0, 0, overlayWidth, overlayHeight);
  ctx.clearRect(...box);

  ctx.strokeStyle = 'rgba(255,255,255,0.9)';
  ctx.lineWidth = 2;
  ctx.strokeRect(...box);
}
|
| 928 |
+
|
| 929 |
+
/**
 * Crop the review image to the current selection and leave crop mode.
 *
 * Selections smaller than 20 image pixels on either side are ignored. The
 * rectangle's edges (not its origin and size separately) are rounded to whole
 * pixels and clamped to the image, so the source rectangle can never extend
 * past the canvas. Rounding x and w independently could overshoot by one
 * pixel, and the uncovered strip would stay transparent in the result.
 */
function applyReviewCrop() {
  const src = reviewState.canvas;
  const r = reviewState.cropRect;
  if (!src || !r || r.w < 20 || r.h < 20) return;

  const clampTo = (value, max) => Math.max(0, Math.min(max, value));
  const left = clampTo(Math.round(r.x), src.width);
  const top = clampTo(Math.round(r.y), src.height);
  const right = clampTo(Math.round(r.x + r.w), src.width);
  const bottom = clampTo(Math.round(r.y + r.h), src.height);
  const w = right - left;
  const h = bottom - top;
  if (w <= 0 || h <= 0) return;

  const dst = document.createElement('canvas');
  dst.width = w;
  dst.height = h;
  dst.getContext('2d').drawImage(src, left, top, w, h, 0, 0, w, h);

  reviewState.canvas = dst;
  exitCropMode();
  updateReviewDisplay();
}
|
| 944 |
+
|
| 945 |
+
// ── Confirm / Retake ────────────────────────────────────────────────────
|
| 946 |
+
|
| 947 |
+
/**
 * Discard the photo under review and re-open the camera file input.
 * The input's value is cleared before it is clicked.
 */
function retakePhoto() {
  closePhotoReview();
  const cameraInput = el.fileCamera;
  cameraInput.value = '';
  cameraInput.click();
}
|
| 952 |
+
|
| 953 |
+
/**
 * Accept the reviewed photo: encode the review canvas as JPEG (quality 0.92),
 * wrap it in a File named after the source file with a `.jpg` extension,
 * close the review overlay and pass the file to uploadFile().
 *
 * The "use photo" button is disabled while encoding and re-enabled afterwards,
 * including when encoding yields no blob (an error toast is shown in that case).
 */
function confirmPhoto() {
  const photo = reviewState.canvas;
  if (!photo) return;

  const useButton = el.btnUsePhoto;
  useButton.disabled = true;

  const onEncoded = (blob) => {
    if (blob) {
      const stem = reviewState.srcFilename.replace(/\.[^.]+$/, '');
      const file = new File([blob], `${stem}.jpg`, { type: 'image/jpeg' });
      closePhotoReview();
      useButton.disabled = false;
      uploadFile(file);
      return;
    }
    toast('Error while processing photo', 'error');
    useButton.disabled = false;
  };

  photo.toBlob(onEncoded, 'image/jpeg', 0.92);
}
|
| 969 |
+
|
| 970 |
+
// ── Register service worker ─────────────────────────────────────────────
|
| 971 |
+
/**
 * Derive a version string for the PWA from the Last-Modified header of
 * `/static/pwa/demo.js`, fetched with an uncached HEAD request.
 *
 * @returns {Promise<string>} The header's timestamp in milliseconds as a
 *   string, or 'dev' when the request fails or the header is absent/unparseable.
 */
async function detectPwaVersion() {
  try {
    const { headers } = await fetch('/static/pwa/demo.js', {
      method: 'HEAD',
      cache: 'no-store',
    });
    const header = headers.get('last-modified');
    const stamp = header ? Date.parse(header) : Number.NaN;
    if (Number.isFinite(stamp) && stamp > 0) {
      return String(stamp);
    }
  } catch {
    // Request failed: use the fallback version below.
  }
  return 'dev';
}
|
| 987 |
+
|
| 988 |
+
// Register the service worker once the page has loaded. The detected PWA
// version is passed as the `v` query parameter of the worker script URL.
if ('serviceWorker' in navigator) {
  const registerServiceWorker = async () => {
    try {
      const version = await detectPwaVersion();
      const scriptUrl = `/sw.js?v=${encodeURIComponent(version)}`;
      const registration = await navigator.serviceWorker.register(scriptUrl, { scope: '/' });
      // Best-effort update check; failures are deliberately ignored.
      registration.update().catch(() => {});
    } catch (err) {
      console.warn('SW registration failed:', err);
    }
  };
  window.addEventListener('load', registerServiceWorker);
}
|
| 999 |
+
|
| 1000 |
+
// ── Init ───────────────────────────────────────────────────────────────
|
| 1001 |
+
/**
 * Wire up the demo UI: attach every button/input/canvas event handler,
 * restore the persisted segmentation method, observe preview-image resizes,
 * and kick off the initial engine load.
 */
function init() {
  // Camera button — open review overlay instead of uploading directly
  el.btnCamera.addEventListener('click', () => el.fileCamera.click());
  el.fileCamera.addEventListener('change', () => {
    if (el.fileCamera.files[0]) openPhotoReview(el.fileCamera.files[0]);
    // Reset so that picking the same file again still fires `change`.
    el.fileCamera.value = '';
  });

  // Photo review
  el.btnRotateCCW.addEventListener('click', () => rotateReview(-90));
  el.btnRotateCW.addEventListener('click', () => rotateReview(90));
  el.btnAutoCrop.addEventListener('click', autoDetectAndCrop);
  el.btnCropStart.addEventListener('click', enterCropMode);
  el.btnCropApply.addEventListener('click', applyReviewCrop);
  el.btnCropCancel.addEventListener('click', exitCropMode);
  el.btnRetake.addEventListener('click', retakePhoto);
  el.btnUsePhoto.addEventListener('click', confirmPhoto);
  el.reviewCropCanvas.addEventListener('pointerdown', onCropPointerDown);
  el.reviewCropCanvas.addEventListener('pointermove', onCropPointerMove);
  el.reviewCropCanvas.addEventListener('pointerup', onCropPointerUp);
  // A cancelled pointer never delivers `pointerup`; end the drag the same way
  // so the drag anchor is not left set.
  el.reviewCropCanvas.addEventListener('pointercancel', onCropPointerUp);

  // File picker button
  el.btnFile.addEventListener('click', () => el.filePicker.click());
  el.filePicker.addEventListener('change', () => {
    if (el.filePicker.files[0]) uploadFile(el.filePicker.files[0]);
    el.filePicker.value = '';
  });

  // Clear image
  el.btnClearImage.addEventListener('click', clearImage);

  // Engine select
  el.engineSelect.addEventListener('change', onEngineChange);

  // Load model
  el.btnLoadModel.addEventListener('click', loadModel);

  // Segment
  el.btnSegment.addEventListener('click', segmentImage);

  // Transcribe
  el.btnTranscribe.addEventListener('click', startTranscription);

  // Cancel
  el.btnCancel.addEventListener('click', cancelTranscription);

  // Export
  el.btnCopy.addEventListener('click', copyAll);
  el.btnExportTxt.addEventListener('click', exportTxt);

  // Seg method persistence.
  // The stored value is compared against each option instead of being spliced
  // into a CSS selector: a value containing a quote would make querySelector
  // throw and abort the rest of init().
  const savedSeg = localStorage.getItem(LS_SEG_METHOD);
  const savedSegOption = savedSeg
    ? Array.from(el.segMethodSelect.querySelectorAll('option')).find(opt => opt.value === savedSeg)
    : null;
  if (savedSegOption && !savedSegOption.disabled) {
    el.segMethodSelect.value = savedSeg;
  }
  el.segMethodSelect.addEventListener('change', () => {
    localStorage.setItem(LS_SEG_METHOD, el.segMethodSelect.value);
  });

  // Redraw bboxes on layout changes (image resize)
  const ro = new ResizeObserver(onImageResize);
  ro.observe(el.previewImg);

  // Initial data load
  loadEngines().then(checkEngineStatus);
}
|
| 1068 |
+
|
| 1069 |
+
// Run init() when the document fires DOMContentLoaded.
document.addEventListener('DOMContentLoaded', init);
|
web/static/pwa/icons/icon-192.png
ADDED
|
|
Git LFS Details
|
web/static/pwa/icons/icon-512.png
ADDED
|
|
Git LFS Details
|
web/static/pwa/manifest.json
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "Polyscriptor HTR Demo",
|
| 3 |
+
"short_name": "Polyscriptor",
|
| 4 |
+
"description": "Handwritten Text Recognition — capture a photo and transcribe it instantly",
|
| 5 |
+
"start_url": "/demo",
|
| 6 |
+
"scope": "/",
|
| 7 |
+
"display": "standalone",
|
| 8 |
+
"orientation": "portrait-primary",
|
| 9 |
+
"background_color": "#111827",
|
| 10 |
+
"theme_color": "#3b82f6",
|
| 11 |
+
"icons": [
|
| 12 |
+
{
|
| 13 |
+
"src": "/static/pwa/icons/icon-192.png",
|
| 14 |
+
"sizes": "192x192",
|
| 15 |
+
"type": "image/png",
|
| 16 |
+
"purpose": "any maskable"
|
| 17 |
+
},
|
| 18 |
+
{
|
| 19 |
+
"src": "/static/pwa/icons/icon-512.png",
|
| 20 |
+
"sizes": "512x512",
|
| 21 |
+
"type": "image/png",
|
| 22 |
+
"purpose": "any maskable"
|
| 23 |
+
}
|
| 24 |
+
],
|
| 25 |
+
"categories": ["productivity", "utilities"],
|
| 26 |
+
"lang": "en"
|
| 27 |
+
}
|
web/static/pwa/sw.js
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* Polyscriptor PWA — Service Worker
|
| 3 |
+
* Caches static assets for faster startup; API calls always go to network.
|
| 4 |
+
*/
|
| 5 |
+
|
| 6 |
+
// Version comes from the `v` query parameter of this script's own URL
// ('dev' when absent) and is baked into the cache name.
const SW_VERSION = new URLSearchParams(self.location.search).get('v') || 'dev';
const CACHE = `polyscriptor-pwa-${SW_VERSION}`;

// URLs fetched into the cache when the worker is installed.
const STATIC = [
  '/demo',
  ...[
    'demo.html',
    'demo.css',
    'demo.js',
    'manifest.json',
    'icons/icon-192.png',
    'icons/icon-512.png',
  ].map(file => `/static/pwa/${file}`),
];
|
| 17 |
+
|
| 18 |
+
// Install: fetch every STATIC URL bypassing the HTTP cache, store the
// responses in this version's cache, then skip the waiting phase.
self.addEventListener('install', (event) => {
  const precache = async () => {
    const cache = await caches.open(CACHE);
    const requests = STATIC.map(path => new Request(path, { cache: 'reload' }));
    await cache.addAll(requests);
    await self.skipWaiting();
  };
  event.waitUntil(precache());
});
|
| 28 |
+
|
| 29 |
+
// Activate: delete every cache except the current version's, then claim
// the open clients.
self.addEventListener('activate', (event) => {
  const dropStaleCaches = async () => {
    const names = await caches.keys();
    const stale = names.filter(name => name !== CACHE);
    await Promise.all(stale.map(name => caches.delete(name)));
    await self.clients.claim();
  };
  event.waitUntil(dropStaleCaches());
});
|
| 36 |
+
|
| 37 |
+
// Fetch: `/api/` requests are network-only (with a JSON 503 fallback when the
// network fails); everything else is served cache-first, and successful GET
// responses for STATIC paths are written back to the cache.
self.addEventListener('fetch', e => {
  const url = new URL(e.request.url);

  // API calls: always network-only (no caching)
  if (url.pathname.startsWith('/api/')) {
    e.respondWith(fetch(e.request).catch(() =>
      new Response(JSON.stringify({ detail: 'No server connection' }), {
        status: 503,
        headers: { 'Content-Type': 'application/json' },
      })
    ));
    return;
  }

  // Cache.put() rejects non-GET requests, so only GETs are candidates for caching.
  const cacheable = e.request.method === 'GET'
    && STATIC.some(s => url.pathname.startsWith(s));

  // Static assets: cache-first
  e.respondWith(
    caches.match(e.request).then(cached => cached || fetch(e.request).then(resp => {
      if (resp.ok && cacheable) {
        // Clone synchronously, before `resp` is handed to the page: a response
        // body can be read only once, and cloning later (inside the
        // caches.open() callback) can fail because the body is already in use.
        const copy = resp.clone();
        // Keep the worker alive until the write finishes; a failed write must
        // neither break the response nor surface as an unhandled rejection.
        e.waitUntil(
          caches.open(CACHE)
            .then(c => c.put(e.request, copy))
            .catch(err => console.warn('SW cache write failed:', err))
        );
      }
      return resp;
    }))
  );
});
|