"""
HTR Engine Plugin System - Base Classes and Registry

This module defines the plugin architecture for HTR (Handwritten Text Recognition) engines.
All HTR engines (TrOCR, Qwen3, CRNN-CTC, Kraken, etc.) implement the HTREngine interface.

Design principles:
- Abstraction: Each engine is self-contained and interchangeable
- Scalability: New engines can be added without modifying existing code
- Consistency: All engines expose the same interface to the GUI
- Flexibility: Each engine can have custom configuration widgets
"""
import importlib
import os
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Dict, Any, Optional, List

import numpy as np

try:
    from PyQt6.QtWidgets import QWidget, QVBoxLayout, QLabel
    PYQT_AVAILABLE = True
except ImportError:
    PYQT_AVAILABLE = False
    QWidget = object
@dataclass
class TranscriptionResult:
    """Result from HTR engine transcription.

    Attributes:
        text: The transcribed text.
        confidence: Confidence value attached to the transcription
            (defaults to 1.0).
        metadata: Free-form extra information about the transcription.
            ``None`` (the default, or passed explicitly) is replaced by a
            fresh empty dict in ``__post_init__``, so instances never share
            one mutable default.
    """

    text: str
    confidence: float = 1.0
    # Optional because None is the accepted "not supplied" value; it is
    # normalised to {} right after construction.
    metadata: Optional[Dict[str, Any]] = None

    def __post_init__(self):
        """Replace a missing ``metadata`` value with a new empty dict."""
        if self.metadata is None:
            self.metadata = {}
class HTREngine(ABC):
    """Interface shared by every HTR engine plugin.

    A concrete engine subclasses this and implements each abstract method so
    that the GUI and batch processing systems can drive it without knowing
    which engine it is. The non-abstract methods supply defaults that an
    engine may override.
    """

    # ------------------------------------------------------------------
    # Identity and availability
    # ------------------------------------------------------------------

    @abstractmethod
    def get_name(self) -> str:
        """Return the display name of the engine.

        Returns:
            str: Human-readable engine name (e.g., "TrOCR", "Qwen3 VLM")
        """

    @abstractmethod
    def get_description(self) -> str:
        """Return a short description of the engine.

        Returns:
            str: One-line description (e.g., "Transformer-based OCR for manuscripts")
        """

    @abstractmethod
    def is_available(self) -> bool:
        """Report whether the engine's dependencies are installed and functional.

        Returns:
            bool: True if engine can be used, False otherwise
        """

    @abstractmethod
    def get_unavailable_reason(self) -> str:
        """Explain why the engine cannot be used (when is_available() is False).

        Returns:
            str: Explanation and installation instructions
        """

    # ------------------------------------------------------------------
    # Configuration
    # ------------------------------------------------------------------

    @abstractmethod
    def get_config_widget(self) -> QWidget:
        """Build the configuration widget for this engine.

        The widget holds all engine-specific controls (model selection,
        beam search, preprocessing options, etc.) and is embedded by the
        GUI in its configuration panel.

        Returns:
            QWidget: Qt widget with engine configuration controls
        """

    @abstractmethod
    def get_config(self) -> Dict[str, Any]:
        """Read the current configuration out of the config widget.

        The values of the widget controls are collected into a dictionary
        that can be passed to transcribe_line().

        Returns:
            Dict[str, Any]: Configuration parameters
        """

    @abstractmethod
    def set_config(self, config: Dict[str, Any]):
        """Push configuration values into the config widget.

        Used to restore saved settings when switching engines.

        Args:
            config: Configuration parameters
        """

    # ------------------------------------------------------------------
    # Model lifecycle
    # ------------------------------------------------------------------

    @abstractmethod
    def load_model(self, config: Dict[str, Any]) -> bool:
        """Load the HTR model using the given configuration.

        Args:
            config: Configuration parameters (from get_config())

        Returns:
            bool: True if model loaded successfully, False otherwise
        """

    @abstractmethod
    def unload_model(self):
        """Release the model from memory to free resources.

        Called when switching to a different engine or closing the
        application.
        """

    @abstractmethod
    def is_model_loaded(self) -> bool:
        """Report whether a model is currently loaded.

        Returns:
            bool: True if model is ready for inference
        """

    # ------------------------------------------------------------------
    # Inference
    # ------------------------------------------------------------------

    @abstractmethod
    def transcribe_line(self, image: np.ndarray, config: Optional[Dict[str, Any]] = None) -> TranscriptionResult:
        """Transcribe one line image.

        Args:
            image: Line image as numpy array (RGB, shape: H x W x 3)
            config: Optional configuration overrides

        Returns:
            TranscriptionResult: Transcription text and metadata
        """

    def requires_line_segmentation(self) -> bool:
        """Tell whether the engine needs pre-segmented lines or takes full pages.

        Returns:
            bool: True if lines must be segmented first (TrOCR, CRNN-CTC),
                False if engine handles full pages (Qwen3, Commercial APIs)
        """
        # Base-class default; engines that accept whole pages override this.
        return True

    def transcribe_lines(self, images: List[np.ndarray], config: Optional[Dict[str, Any]] = None) -> List[TranscriptionResult]:
        """Transcribe several line images.

        This default simply calls transcribe_line() once per image, in
        order, forwarding the same ``config``. Engines can override it with
        an optimized batch implementation.

        Args:
            images: List of line images
            config: Optional configuration overrides

        Returns:
            List[TranscriptionResult]: Transcriptions for each image
        """
        results: List[TranscriptionResult] = []
        for line_image in images:
            results.append(self.transcribe_line(line_image, config))
        return results

    def supports_batch(self) -> bool:
        """Tell whether transcribe_lines() is an optimized batch implementation.

        Returns:
            bool: True if transcribe_lines() is optimized, False if it just loops
        """
        return False

    def get_aliases(self) -> List[str]:
        """Return alternative names for this engine (e.g., short CLI aliases).

        Returns:
            List[str]: Alternative names accepted by the registry (default: none)
        """
        return []

    def get_capabilities(self) -> Dict[str, bool]:
        """Describe what the engine can do as a set of boolean flags.

        Returns:
            Dict with capability flags:
                - batch_processing: Supports batch inference
                - confidence_scores: Returns confidence scores
                - beam_search: Supports beam search decoding
                - language_model: Uses language model for post-processing
                - preprocessing: Has built-in preprocessing
        """
        # batch_processing mirrors supports_batch(); every other flag is off
        # in the base class.
        capabilities = {"batch_processing": self.supports_batch()}
        for flag in ("confidence_scores", "beam_search", "language_model", "preprocessing"):
            capabilities[flag] = False
        return capabilities
class HTREngineRegistry:
    """Registry of available HTR engines.

    Manages discovery, registration, and lookup of HTR engines. Engines are
    kept in registration order in ``engines`` and are additionally indexed
    by display name and by each alias in ``_engine_cache``.
    """

    # Each entry: (label used in the warning message, module path, class name).
    # Loaded instead of the full set when POLYSCRIPTOR_DEMO_MODE == "hf_space".
    _DEMO_ENGINES = (
        ("CRNN-CTC", "engines.pylaia_engine", "PyLaiaEngine"),
        ("Commercial APIs", "engines.commercial_api_engine", "CommercialAPIEngine"),
        ("OpenWebUI", "engines.openwebui_engine", "OpenWebUIEngine"),
    )

    # Full engine set, in the order the engines are registered.
    _DEFAULT_ENGINES = (
        ("TrOCR", "engines.trocr_engine", "TrOCREngine"),
        ("Qwen3", "engines.qwen3_engine", "Qwen3Engine"),
        ("Churro", "engines.churro_engine", "ChurroEngine"),
        ("CRNN-CTC", "engines.pylaia_engine", "PyLaiaEngine"),
        ("Kraken", "engines.kraken_engine", "KrakenEngine"),
        ("Commercial API", "engines.commercial_api_engine", "CommercialAPIEngine"),
        ("Party", "engines.party_engine", "PartyEngine"),
        ("OpenWebUI", "engines.openwebui_engine", "OpenWebUIEngine"),
        ("DeepSeek-OCR", "engines.deepseek_ocr_engine", "DeepSeekOCREngine"),
        ("LightOnOCR", "engines.lighton_ocr_engine", "LightOnOCREngine"),
        ("PaddleOCR", "engines.paddle_engine", "PaddleOCREngine"),
    )

    def __init__(self):
        """Create an empty registry; no engines are discovered yet."""
        self.engines: List[HTREngine] = []
        self._engine_cache: Dict[str, HTREngine] = {}

    def register(self, engine: HTREngine):
        """Register an HTR engine under its display name and aliases.

        If an engine with the same display name is already registered, it is
        replaced in place (keeping its position) and the lookup keys that
        pointed at the old instance are dropped, so the engine list and the
        name index stay consistent instead of accumulating duplicates.

        Args:
            engine: HTREngine instance to register
        """
        name = engine.get_name()
        for position, existing in enumerate(self.engines):
            if existing.get_name() == name:
                self.engines[position] = engine
                # Remove the name and aliases that still resolve to the
                # replaced instance; keys since claimed by other engines
                # point elsewhere and are left alone.
                stale_keys = [
                    key for key, value in self._engine_cache.items()
                    if value is existing
                ]
                for key in stale_keys:
                    del self._engine_cache[key]
                break
        else:
            self.engines.append(engine)

        self._engine_cache[name] = engine
        for alias in engine.get_aliases():
            self._engine_cache[alias] = engine

    def _load_engine(self, label: str, module_name: str, class_name: str) -> None:
        """Import one engine class, instantiate it, and register it.

        A failed import (of the module, of the class inside it, or an
        ImportError raised while constructing or registering the engine) is
        reported with a warning on stdout and otherwise ignored, so one
        missing engine does not stop the others from loading.

        Args:
            label: Engine label used in the warning message
            module_name: Dotted path of the module that defines the engine
            class_name: Name of the engine class inside that module
        """
        try:
            module = importlib.import_module(module_name)
            try:
                engine_cls = getattr(module, class_name)
            except AttributeError as e:
                # Mirror `from module import name`, which reports a missing
                # name as ImportError rather than AttributeError.
                raise ImportError(
                    f"cannot import name {class_name!r} from {module_name!r}"
                ) from e
            self.register(engine_cls())
        except ImportError as e:
            print(f"Warning: Failed to load {label} engine: {e}")

    def discover_engines(self):
        """Automatically discover and register all available engines.

        Tries to import each known engine module and registers the engine
        if the import succeeds. When the environment variable
        ``POLYSCRIPTOR_DEMO_MODE`` equals ``"hf_space"``, only the demo
        subset of engines is attempted.
        """
        if os.environ.get("POLYSCRIPTOR_DEMO_MODE") == "hf_space":
            engine_specs = self._DEMO_ENGINES
        else:
            engine_specs = self._DEFAULT_ENGINES

        for label, module_name, class_name in engine_specs:
            self._load_engine(label, module_name, class_name)

    def get_available_engines(self) -> List[HTREngine]:
        """Get list of engines with satisfied dependencies.

        Returns:
            List[HTREngine]: Registered engines whose is_available() is true
        """
        return [e for e in self.engines if e.is_available()]

    def get_all_engines(self) -> List[HTREngine]:
        """Get all registered engines (including unavailable ones).

        Returns:
            List[HTREngine]: All registered engines (the registry's own
                list object, not a copy)
        """
        return self.engines

    def get_engine_by_name(self, name: str) -> Optional[HTREngine]:
        """Get engine by display name or alias.

        Args:
            name: Engine display name, or one of the engine's aliases

        Returns:
            Optional[HTREngine]: Engine instance or None if not found
        """
        return self._engine_cache.get(name)

    def get_engine_names(self) -> List[str]:
        """Get list of available engine names.

        Returns:
            List[str]: Display names of the engines that are available
        """
        return [e.get_name() for e in self.get_available_engines()]
# Process-wide registry, created on first use by get_global_registry()
_global_registry: Optional[HTREngineRegistry] = None


def get_global_registry() -> HTREngineRegistry:
    """Return the shared HTR engine registry, creating it on first call.

    The first call builds an HTREngineRegistry and runs engine discovery on
    it; later calls return that same instance. The instance is stored in the
    module-level variable before discovery runs, so a call made while
    discovery is still in progress receives that same instance.

    Returns:
        HTREngineRegistry: Global registry instance
    """
    global _global_registry
    if _global_registry is not None:
        return _global_registry

    _global_registry = HTREngineRegistry()
    _global_registry.discover_engines()
    return _global_registry
# GUI helper: shortcut to the global registry's engine names
def get_available_engine_names() -> List[str]:
    """Return the names of the available engines from the global registry.

    Returns:
        List[str]: Engine display names
    """
    registry = get_global_registry()
    return registry.get_engine_names()
|