File size: 7,605 Bytes
be4a6f1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 |
"""OCR engine — unified wrapper providing 100% accuracy on any platform.
Backend selection (automatic):
1. Windows → native DLL via ctypes (fastest, 100% accuracy)
2. Linux/macOS with Wine → DLL via Wine subprocess (100% accuracy)
3. Fallback → pure Python/ONNX reimplementation (~53% match rate)
Usage:
from ocr.engine_unified import OcrEngineUnified
engine = OcrEngineUnified()
result = engine.recognize_pil(pil_image)
print(result.text)
print(f"Backend: {engine.backend_name}")
"""
from __future__ import annotations
import json
import logging
import platform
import sys
from pathlib import Path
from typing import TYPE_CHECKING
from ocr.models import BoundingRect, OcrLine, OcrResult, OcrWord
if TYPE_CHECKING:
from PIL import Image
# Module-level logger; backend selection outcomes are reported through it.
logger = logging.getLogger(__name__)
class OcrEngineUnified:
    """Unified OCR engine — auto-selects the best available backend.

    Priority order:
        1. Native Windows DLL (100%, fastest)
        2. Wine bridge on Linux (100%, ~2x slower due to subprocess)
        3. ONNX reimplementation (~53%, fully cross-platform)

    Args:
        ocr_data_dir: Path to directory with DLL/model files.
            Defaults to PROJECT_ROOT/ocr_data/.
        force_backend: Force a specific backend: 'dll', 'wine', 'onnx', or None (auto).

    Raises:
        ValueError: If ``force_backend`` is not one of ``BACKENDS``.
    """

    BACKENDS = ("dll", "wine", "onnx")

    def __init__(
        self,
        ocr_data_dir: str | Path | None = None,
        force_backend: str | None = None,
    ) -> None:
        if ocr_data_dir is None:
            # Default: the "ocr_data" directory two levels above this file.
            ocr_data_dir = Path(__file__).resolve().parent.parent / "ocr_data"
        self._ocr_data = Path(ocr_data_dir)
        self._backend_name: str = "none"
        self._engine = None

        if force_backend:
            if force_backend not in self.BACKENDS:
                raise ValueError(f"Unknown backend: {force_backend!r}. Choose from {self.BACKENDS}")
            # A forced backend is not guarded: its initialization errors propagate.
            self._init_backend(force_backend)
        else:
            self._auto_select()

    @property
    def backend_name(self) -> str:
        """Name of the active backend ("none" when no backend was initialized)."""
        return self._backend_name

    def recognize_pil(self, image: "Image.Image") -> OcrResult:
        """Run OCR on a PIL Image.

        The "dll" and "onnx" engines are called directly, so any exception they
        raise propagates. Wine bridge failures are returned as an ``OcrResult``
        carrying an error message. With no active backend an error result is
        returned as well.
        """
        if self._backend_name in ("dll", "onnx"):
            return self._engine.recognize_pil(image)
        if self._backend_name == "wine":
            return self._recognize_wine(image)
        return OcrResult(error="No OCR backend available")

    def recognize_bytes(self, image_bytes: bytes) -> OcrResult:
        """Run OCR on raw image bytes (PNG/JPEG/etc)."""
        # Imported lazily so that importing this module does not require Pillow.
        from io import BytesIO
        from PIL import Image as PILImage

        img = PILImage.open(BytesIO(image_bytes))
        return self.recognize_pil(img)

    # ── Backend initialization ──────────────────────────────────

    def _auto_select(self) -> None:
        """Try backends in priority order and keep the first that initializes."""
        for backend in self.BACKENDS:
            try:
                self._init_backend(backend)
            except Exception as exc:
                # Best effort: an unavailable backend is expected, try the next one.
                logger.debug("Backend %s unavailable: %s", backend, exc)
            else:
                logger.info("OCR backend: %s", self._backend_name)
                return
        logger.warning("No OCR backend available!")
        self._backend_name = "none"

    def _init_backend(self, name: str) -> None:
        """Initialize a specific backend.

        Raises:
            ValueError: If ``name`` is not a known backend. (Previously an
                unknown name was silently ignored, leaving no backend active.)
        """
        initializers = {
            "dll": self._init_dll,
            "wine": self._init_wine,
            "onnx": self._init_onnx,
        }
        try:
            initialize = initializers[name]
        except KeyError:
            raise ValueError(
                f"Unknown backend: {name!r}. Choose from {self.BACKENDS}"
            ) from None
        initialize()

    def _init_dll(self) -> None:
        """Initialize native Windows DLL backend.

        Raises:
            RuntimeError: When not running on Windows.
        """
        if platform.system() != "Windows":
            raise RuntimeError("DLL backend requires Windows")
        from ocr.engine import OcrEngine

        self._engine = OcrEngine(ocr_data_dir=self._ocr_data)
        self._backend_name = "dll"

    def _init_wine(self) -> None:
        """Initialize Wine bridge backend.

        Raises:
            RuntimeError: On Windows, or when a requirement reported by the
                bridge (Wine, DLL, model, MinGW for the loader) is missing.
        """
        if platform.system() == "Windows":
            raise RuntimeError("Wine backend is for Linux/macOS only")

        # The bridge module lives in the sibling "tools" directory. Add that
        # directory to sys.path only once so repeated initialization attempts
        # do not pile up duplicate entries.
        tools_dir = str(Path(__file__).resolve().parent.parent / "tools")
        if tools_dir not in sys.path:
            sys.path.insert(0, tools_dir)
        from wine_bridge import WineBridge

        bridge = WineBridge(ocr_data_dir=self._ocr_data)
        checks = bridge.check_requirements()
        if not checks["wine_found"]:
            raise RuntimeError("Wine not installed")
        if not checks["dll_exists"]:
            raise RuntimeError(f"oneocr.dll not found in {self._ocr_data}")
        if not checks["model_exists"]:
            raise RuntimeError(f"oneocr.onemodel not found in {self._ocr_data}")

        # Compile loader if needed
        if not checks["loader_compiled"]:
            if not checks["mingw_found"]:
                raise RuntimeError(
                    "MinGW cross-compiler needed to build Wine loader. "
                    "Install: sudo apt install mingw-w64"
                )
            bridge.compile_loader()

        self._engine = bridge
        self._backend_name = "wine"

    def _init_onnx(self) -> None:
        """Initialize pure ONNX backend (fallback)."""
        from ocr.engine_onnx import OcrEngineOnnx

        self._engine = OcrEngineOnnx(ocr_data_dir=self._ocr_data)
        self._backend_name = "onnx"

    # ── Wine result conversion ─────────────────────────────────

    def _recognize_wine(self, image: "Image.Image") -> OcrResult:
        """Run OCR via Wine bridge and convert JSON → OcrResult."""
        try:
            raw = self._engine.recognize_pil(image)
        except Exception as exc:
            return OcrResult(error=f"Wine bridge error: {exc}")
        return self._json_to_ocr_result(raw)

    @staticmethod
    def _pad_bbox(bbox) -> list:
        """Return exactly eight coordinates, zero-filling any that are missing.

        ``None`` is treated as an empty box; values beyond the eighth are dropped.
        """
        coords = [] if bbox is None else list(bbox)[:8]
        return coords + [0] * (8 - len(coords))

    @staticmethod
    def _bbox_to_rect(bbox) -> BoundingRect:
        """Build a BoundingRect from a flat [x1, y1, ..., x4, y4] sequence."""
        x1, y1, x2, y2, x3, y3, x4, y4 = OcrEngineUnified._pad_bbox(bbox)
        return BoundingRect(
            x1=x1, y1=y1, x2=x2, y2=y2,
            x3=x3, y3=y3, x4=x4, y4=y4,
        )

    @staticmethod
    def _json_to_ocr_result(data: dict) -> OcrResult:
        """Convert Wine bridge JSON output to OcrResult dataclass.

        A payload containing an "error" key yields an error-only result.
        Missing or null "lines"/"words" are treated as empty, and short or
        missing bounding boxes are zero-padded instead of raising IndexError.
        """
        if "error" in data:
            return OcrResult(error=data["error"])

        to_rect = OcrEngineUnified._bbox_to_rect
        lines = []
        for line_data in data.get("lines") or []:
            words = [
                OcrWord(
                    text=word_data.get("text", ""),
                    confidence=word_data.get("confidence", 0.0),
                    bounding_rect=to_rect(word_data.get("bbox")),
                )
                for word_data in line_data.get("words") or []
            ]
            lines.append(OcrLine(
                text=line_data.get("text", ""),
                words=words,
                bounding_rect=to_rect(line_data.get("bbox")),
            ))

        # Lines with empty text are kept in `lines` but skipped in the joined text.
        full_text = "\n".join(line.text for line in lines if line.text)
        return OcrResult(text=full_text, text_angle=data.get("text_angle"), lines=lines)
|