cute-tokenizer / cute_tokenizer /_accel_loader.py
HusseinEid's picture
Super-squash branch 'main' using huggingface_hub
68a4c53
"""Rust acceleration loader + Python fallback gate.
Importing this module exposes:
- ``accel``: the loaded Rust extension module, or ``None`` if unavailable.
- ``USE_RUST``: bool. ``True`` iff the Rust path should be used.
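- ``prepare_mapping(mapping)``: returns a cached ``PreparedMapping``
  for the given ``PUAMapping`` instance, building it lazily on first use.
- ``build_tag()``: returns the Rust extension's build tag, or
  ``"python-fallback"`` when the Rust path is not in use.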
The Rust path is the default. To force the pure-Python path (for parity
testing or debugging), set ``CUTE_USE_PYTHON_PRETOKENIZER=1`` before
importing :mod:`cute_tokenizer`.
"""
from __future__ import annotations
import contextlib
import os
import weakref
from typing import Any
# Spellings of CUTE_USE_PYTHON_PRETOKENIZER that do NOT force the Python path.
# Values are compared after stripping whitespace and lower-casing, so
# "FALSE", " 0 " or "Off" behave the same as "0" / "false".
_FALSY_ENV_VALUES = frozenset({"", "0", "false", "no", "off"})


def _env_flag_enabled(raw: str) -> bool:
    """Return ``True`` when ``raw`` is an "on" value for a boolean env var.

    Leading/trailing whitespace and letter case are ignored. An empty value
    or one of ``0`` / ``false`` / ``no`` / ``off`` means "off"; every other
    value means "on".
    """
    return raw.strip().lower() not in _FALSY_ENV_VALUES


# ``True`` when the environment asks for the pure-Python path. An unset
# variable reads as "" and therefore leaves the flag off.
_FORCE_PY = _env_flag_enabled(os.environ.get("CUTE_USE_PYTHON_PRETOKENIZER", ""))
# Probe for the compiled extension. Failing to import it is not an error:
# this module still loads, with ``accel`` set to ``None``.
try:  # pragma: no cover - import-time path differs per environment
    from cute_tokenizer import _accel as _accel_module  # type: ignore[attr-defined]
except ImportError:
    _ACCEL_AVAILABLE = False
    _accel_module = None  # type: ignore[assignment]
else:
    _ACCEL_AVAILABLE = True

# Public handle to the extension module, or ``None`` if the import failed.
accel: Any = _accel_module

# The Rust path is used only when the extension imported successfully and
# the pure-Python path was not forced.
USE_RUST: bool = _ACCEL_AVAILABLE and not _FORCE_PY
# Prepared-mapping cache, keyed by ``id(PUAMapping)``. ``prepare_mapping``
# registers ``_release`` through ``weakref.finalize`` (which frozen
# dataclasses support), so an entry is dropped once its mapping object has
# been collected.
_prepared_cache: dict[int, Any] = {}


def _release(key: int) -> None:
    """Remove the cache entry stored under ``key``; a missing key is ignored."""
    with contextlib.suppress(KeyError):
        del _prepared_cache[key]
# Strong references to mappings that cannot be weakly referenced. No
# finalizer can release their cache entries, so the mapping itself is kept
# alive: an ``id()`` is only unique among objects that exist at the same
# time, and pinning the object guarantees its id is never handed to a
# different mapping while the cache entry is still present.
_pinned_mappings: dict[int, Any] = {}


def prepare_mapping(mapping: Any) -> Any:
    """Return a cached ``PreparedMapping`` for ``mapping``.

    The prepared form is built on the first call for a given mapping object
    and the same handle is returned on later calls with that object. The
    cache key is ``id(mapping)``.

    Cache lifetime:

    - If ``mapping`` supports weak references, a ``weakref.finalize``
      callback removes the entry when the mapping is garbage collected.
    - If it does not (``weakref.finalize`` raises ``TypeError``), the
      mapping is kept alive for the rest of the process together with its
      entry. Without this, a later object that happened to reuse the same
      ``id()`` would be handed a stale ``PreparedMapping``.

    Args:
        mapping: Object exposing ``word_to_pua`` and ``pua_to_word``
            attributes, which are passed to ``accel.PreparedMapping``.

    Returns:
        The ``PreparedMapping`` handle built by the Rust extension.

    Raises:
        RuntimeError: If the Rust extension could not be imported. This is
            checked against extension availability, not ``USE_RUST``.
    """
    if not _ACCEL_AVAILABLE:
        raise RuntimeError("cute_tokenizer._accel is not available")

    key = id(mapping)
    cached = _prepared_cache.get(key)
    if cached is not None:
        return cached

    # Build before touching any cache state so that a failure here leaves
    # neither a finalizer nor a pinned reference behind.
    prepared = accel.PreparedMapping(mapping.word_to_pua, mapping.pua_to_word)

    try:
        weakref.finalize(mapping, _release, key)
    except TypeError:
        # No ``__weakref__`` support (e.g. a slotted class): we can never
        # learn when the object dies, so pin it to keep the id-based key
        # valid for as long as the entry exists.
        _pinned_mappings[key] = mapping

    _prepared_cache[key] = prepared
    return prepared
def build_tag() -> str:
    """Report which pre-tokenizer implementation is active.

    Returns ``"python-fallback"`` when ``USE_RUST`` is false. Otherwise
    returns the extension module's ``__build_tag__`` attribute, falling back
    to ``"rust-unknown"`` if the extension does not define one.
    """
    if not USE_RUST:
        return "python-fallback"
    return getattr(accel, "__build_tag__", "rust-unknown")
# Public API of this module (the names exported by ``import *``).
__all__ = [
    "USE_RUST",
    "accel",
    "build_tag",
    "prepare_mapping",
]