oneocr / ocr /engine_unified.py

OneOCR Dev

feat: Wine bridge - run DLL on Linux via Wine (100% accuracy)

be4a6f1 1 day ago

7.61 kB

	"""OCR engine — unified wrapper providing 100% accuracy on any platform.

	Backend selection (automatic):
	1. Windows → native DLL via ctypes (fastest, 100% accuracy)
	2. Linux/macOS with Wine → DLL via Wine subprocess (100% accuracy)
	3. Fallback → pure Python/ONNX reimplementation (~53% match rate)

	Usage:
	from ocr.engine_unified import OcrEngineUnified
	engine = OcrEngineUnified()
	result = engine.recognize_pil(pil_image)
	print(result.text)
	print(f"Backend: {engine.backend_name}")
	"""

	from __future__ import annotations

	import json
	import logging
	import platform
	import sys
	from pathlib import Path
	from typing import TYPE_CHECKING

	from ocr.models import BoundingRect, OcrLine, OcrResult, OcrWord

	if TYPE_CHECKING:
	from PIL import Image

	# Module-level logger; backend selection reports its outcome through it.
	logger = logging.getLogger(__name__)


	class OcrEngineUnified:
	    """Unified OCR engine — auto-selects the best available backend.

	    Priority order:
	        1. Native Windows DLL (100%, fastest)
	        2. Wine bridge on Linux (100%, ~2x slower due to subprocess)
	        3. ONNX reimplementation (~53%, fully cross-platform)

	    Args:
	        ocr_data_dir: Path to directory with DLL/model files.
	            Defaults to PROJECT_ROOT/ocr_data/.
	        force_backend: Force a specific backend: 'dll', 'wine', 'onnx',
	            or None (auto).

	    Raises:
	        ValueError: If ``force_backend`` is not one of ``BACKENDS``.
	    """

	    # Backends in the order auto-selection tries them.
	    BACKENDS = ("dll", "wine", "onnx")

	    def __init__(
	        self,
	        ocr_data_dir: str | Path | None = None,
	        force_backend: str | None = None,
	    ) -> None:
	        if ocr_data_dir is None:
	            ocr_data_dir = Path(__file__).resolve().parent.parent / "ocr_data"
	        self._ocr_data = Path(ocr_data_dir)
	        self._backend_name: str = "none"
	        self._engine = None

	        if force_backend:
	            if force_backend not in self.BACKENDS:
	                raise ValueError(f"Unknown backend: {force_backend!r}. Choose from {self.BACKENDS}")
	            # A forced backend is not wrapped in try/except: its
	            # initialization errors propagate to the caller.
	            self._init_backend(force_backend)
	        else:
	            self._auto_select()

	    @property
	    def backend_name(self) -> str:
	        """Name of the active backend ('none' when nothing is initialized)."""
	        return self._backend_name

	    def recognize_pil(self, image: "Image.Image") -> OcrResult:
	        """Run OCR on a PIL Image. Returns OcrResult with text, lines, words.

	        When no backend is active, an OcrResult carrying an error message
	        is returned instead of raising.
	        """
	        if self._backend_name == "wine":
	            return self._recognize_wine(image)
	        if self._backend_name in ("dll", "onnx"):
	            # Both engines are called through the same recognize_pil method
	            # and their result is returned unchanged.
	            return self._engine.recognize_pil(image)
	        return OcrResult(error="No OCR backend available")

	    def recognize_bytes(self, image_bytes: bytes) -> OcrResult:
	        """Run OCR on raw image bytes (PNG/JPEG/etc)."""
	        # Imported lazily so that Pillow is only required when this
	        # entry point is actually used.
	        from io import BytesIO
	        from PIL import Image as PILImage

	        img = PILImage.open(BytesIO(image_bytes))
	        return self.recognize_pil(img)

	    # ── Backend initialization ──────────────────────────────────

	    def _auto_select(self) -> None:
	        """Try backends in priority order and keep the first that works."""
	        for backend in self.BACKENDS:
	            try:
	                self._init_backend(backend)
	            except Exception as e:
	                # Deliberate best-effort boundary: any failure (missing
	                # module, wrong platform, missing files) just means
	                # "try the next backend".
	                logger.debug("Backend %s unavailable: %s", backend, e)
	            else:
	                logger.info("OCR backend: %s", self._backend_name)
	                return

	        logger.warning("No OCR backend available!")
	        self._engine = None
	        self._backend_name = "none"

	    def _init_backend(self, name: str) -> None:
	        """Initialize a specific backend.

	        Raises:
	            ValueError: If ``name`` is not a known backend (previously an
	                unknown name was silently ignored).
	        """
	        initializers = {
	            "dll": self._init_dll,
	            "wine": self._init_wine,
	            "onnx": self._init_onnx,
	        }
	        try:
	            initialize = initializers[name]
	        except KeyError:
	            raise ValueError(
	                f"Unknown backend: {name!r}. Choose from {self.BACKENDS}"
	            ) from None
	        initialize()

	    def _init_dll(self) -> None:
	        """Initialize native Windows DLL backend.

	        Raises:
	            RuntimeError: If not running on Windows.
	        """
	        if platform.system() != "Windows":
	            raise RuntimeError("DLL backend requires Windows")
	        from ocr.engine import OcrEngine

	        self._engine = OcrEngine(ocr_data_dir=self._ocr_data)
	        self._backend_name = "dll"

	    def _init_wine(self) -> None:
	        """Initialize Wine bridge backend.

	        Raises:
	            RuntimeError: On Windows, or when Wine, the DLL, the model file,
	                or (if the loader still has to be compiled) MinGW is missing.
	        """
	        if platform.system() == "Windows":
	            raise RuntimeError("Wine backend is for Linux/macOS only")

	        # wine_bridge lives in the sibling "tools" directory. Add it to
	        # sys.path only once so repeated initialization does not pile up
	        # duplicate entries.
	        tools_dir = str(Path(__file__).resolve().parent.parent / "tools")
	        if tools_dir not in sys.path:
	            sys.path.insert(0, tools_dir)
	        from wine_bridge import WineBridge

	        bridge = WineBridge(ocr_data_dir=self._ocr_data)
	        checks = bridge.check_requirements()

	        if not checks["wine_found"]:
	            raise RuntimeError("Wine not installed")
	        if not checks["dll_exists"]:
	            raise RuntimeError(f"oneocr.dll not found in {self._ocr_data}")
	        if not checks["model_exists"]:
	            raise RuntimeError(f"oneocr.onemodel not found in {self._ocr_data}")

	        # Compile loader if needed
	        if not checks["loader_compiled"]:
	            if not checks["mingw_found"]:
	                raise RuntimeError(
	                    "MinGW cross-compiler needed to build Wine loader. "
	                    "Install: sudo apt install mingw-w64"
	                )
	            bridge.compile_loader()

	        self._engine = bridge
	        self._backend_name = "wine"

	    def _init_onnx(self) -> None:
	        """Initialize pure ONNX backend (fallback)."""
	        from ocr.engine_onnx import OcrEngineOnnx

	        self._engine = OcrEngineOnnx(ocr_data_dir=self._ocr_data)
	        self._backend_name = "onnx"

	    # ── Wine result conversion ─────────────────────────────────

	    def _recognize_wine(self, image: "Image.Image") -> OcrResult:
	        """Run OCR via Wine bridge and convert JSON → OcrResult.

	        Bridge failures are reported through ``OcrResult.error`` rather
	        than raised.
	        """
	        try:
	            raw = self._engine.recognize_pil(image)
	        except Exception as e:
	            return OcrResult(error=f"Wine bridge error: {e}")

	        return self._json_to_ocr_result(raw)

	    @staticmethod
	    def _pad_bbox(bbox: list | tuple | None) -> list:
	        """Return exactly eight coordinates taken from *bbox*.

	        Missing trailing values (or a missing/None bbox) are filled with 0
	        and anything past the eighth value is dropped, so callers can
	        unpack the result without risking an IndexError.
	        """
	        coords = list(bbox) if bbox else []
	        return (coords + [0] * 8)[:8]

	    @staticmethod
	    def _bbox_to_rect(bbox: list | tuple | None) -> BoundingRect:
	        """Build a BoundingRect from a flat [x1, y1, ..., x4, y4] list."""
	        x1, y1, x2, y2, x3, y3, x4, y4 = OcrEngineUnified._pad_bbox(bbox)
	        return BoundingRect(
	            x1=x1, y1=y1, x2=x2, y2=y2,
	            x3=x3, y3=y3, x4=x4, y4=y4,
	        )

	    @staticmethod
	    def _json_to_ocr_result(data: dict) -> OcrResult:
	        """Convert Wine bridge JSON output to OcrResult dataclass.

	        Args:
	            data: Decoded bridge output. Keys read here: "error", "lines",
	                "text_angle"; per line "text", "words", "bbox"; per word
	                "text", "confidence", "bbox".

	        Returns:
	            An OcrResult. It carries only ``error`` when the payload has an
	            "error" key or is not a dict at all.
	        """
	        if not isinstance(data, dict):
	            # Guard against a bridge that hands back a raw string/None:
	            # `"error" in data` on a str would be a substring test.
	            return OcrResult(
	                error=f"Unexpected Wine bridge output: {type(data).__name__}"
	            )
	        if "error" in data:
	            return OcrResult(error=data["error"])

	        to_rect = OcrEngineUnified._bbox_to_rect
	        lines = []
	        # `or []` also covers keys that are present but null.
	        for line_data in data.get("lines") or []:
	            words = [
	                OcrWord(
	                    text=word_data.get("text", ""),
	                    confidence=word_data.get("confidence", 0.0),
	                    bounding_rect=to_rect(word_data.get("bbox")),
	                )
	                for word_data in line_data.get("words") or []
	            ]
	            lines.append(OcrLine(
	                text=line_data.get("text", ""),
	                words=words,
	                bounding_rect=to_rect(line_data.get("bbox")),
	            ))

	        # Lines with empty text are skipped in the joined full text.
	        full_text = "\n".join(line.text for line in lines if line.text)
	        text_angle = data.get("text_angle")

	        return OcrResult(text=full_text, text_angle=text_angle, lines=lines)