| """Baseline tokenizer adapters for savings-based candidate scoring.
|
|
|
| The selection step asks "how many baseline tokens would this string cost?"
|
| to compute net token savings per PUA candidate. We provide a thin protocol
|
| plus a `tiktoken`-backed cl100k implementation and a fallback.
|
|
|
| `tiktoken` is an *optional* dependency — install via
|
| `pip install 'cute-tokenizer[baseline]'`. The CUTE inference path does not
|
| need it.
|
| """
|
|
|
| from __future__ import annotations
|
|
|
| import functools
|
| import warnings
|
| from typing import Protocol
|
|
|
|
|
class BaselineTokenizer(Protocol):
    """Structural interface for anything that can price a string in baseline tokens.

    A conforming object exposes a `name` attribute and a `count_tokens`
    method. Implementations are expected to be deterministic and free of
    observable side effects: the same input must always yield the same count.
    """

    # Short identifier of the underlying tokenizer.
    name: str

    def count_tokens(self, text: str) -> int:
        """Return how many baseline tokens `text` costs; the result is never negative."""
        ...
|
|
|
|
|
class Cl100kBaseline:
    """Baseline backed by OpenAI's `cl100k_base` encoding, loaded through `tiktoken`.

    Token counts are memoised per input string in a bounded, per-instance LRU
    cache, so a repeated string is only encoded once while it stays cached.
    Intended to be safe for concurrent reads once construction has finished
    (NOTE(review): this relies on `functools.lru_cache` and the tiktoken
    encoder being thread-safe — confirm).
    """

    name: str = "cl100k_base"

    def __init__(self, cache_size: int = 1_000_000) -> None:
        """Load the encoder and set up the per-instance count cache.

        Args:
            cache_size: Passed straight to `functools.lru_cache` as `maxsize`,
                i.e. the number of distinct strings whose counts are retained.

        Raises:
            ImportError: If the optional `tiktoken` dependency is missing.
        """
        # Imported lazily so this module loads without the optional extra.
        try:
            import tiktoken
        except ImportError as exc:
            raise ImportError(
                "Cl100kBaseline requires tiktoken. "
                "Install with: pip install 'cute-tokenizer[baseline]'"
            ) from exc
        else:
            self._enc = tiktoken.get_encoding("cl100k_base")

        # Wrap the bound method so every instance owns an independent cache.
        memoize = functools.lru_cache(maxsize=cache_size)
        self._count = memoize(self._count_uncached)

    def _count_uncached(self, text: str) -> int:
        """Encode `text` and return the number of token ids produced.

        `disallowed_special=()` switches off tiktoken's special-token check,
        so special-token text is encoded as ordinary text instead of raising.
        """
        token_ids = self._enc.encode(text, disallowed_special=())
        return len(token_ids)

    def count_tokens(self, text: str) -> int:
        """Return the cl100k token count for `text`; falsy input costs 0.

        Falsy input short-circuits without touching the encoder or the cache.
        """
        return self._count(text) if text else 0
|
|
|
|
|
class NullBaseline:
    """Degenerate fallback that prices every non-empty string at one token.

    Because the cost is a constant 1, this baseline carries no information
    about how expensive a string really is. According to the original notes,
    `compute_candidate_score` then evaluates to `0` for single-token inputs,
    which effectively turns savings-based ranking off (that function is not
    visible here — verify against its definition). A warning is emitted the
    first time a non-empty string is counted, so callers learn that they are
    getting frequency-based rather than savings-based ranking.
    """

    name: str = "null"
    # Class-wide flag: True once the warning has been issued.
    _warned = False

    def count_tokens(self, text: str) -> int:
        """Return 0 for a falsy `text`, otherwise 1.

        The first non-empty call emits a warning; later calls stay silent.
        """
        if text:
            if not NullBaseline._warned:
                # stacklevel=2 attributes the warning to the caller's line.
                warnings.warn(
                    "NullBaseline in use — savings scores will all be 0. "
                    "Install 'cute-tokenizer[baseline]' for cl100k-aware ranking.",
                    stacklevel=2,
                )
                # Flipped only after warn() returns.
                NullBaseline._warned = True
            return 1
        return 0
|
|
|
|
|
def get_default_baseline() -> BaselineTokenizer:
    """Pick the best baseline that can be constructed in this environment.

    Tries `Cl100kBaseline` first and falls back to `NullBaseline` when its
    construction raises `ImportError` (i.e. tiktoken is not installed).

    Callers that cannot tolerate the degraded fallback (the original notes
    name `cute build` as an example) should instantiate `Cl100kBaseline()`
    themselves, so the ImportError surfaces instead of being swallowed here.

    Returns:
        A `Cl100kBaseline` instance, or a `NullBaseline` instance if the
        former could not be imported.
    """
    try:
        baseline: BaselineTokenizer = Cl100kBaseline()
    except ImportError:
        # Only a missing dependency triggers the fallback; other errors propagate.
        baseline = NullBaseline()
    return baseline
|
|
|
|
|
# Public API of this module: the names exported by `from <module> import *`.
__all__ = [
    "BaselineTokenizer",
    "Cl100kBaseline",
    "NullBaseline",
    "get_default_baseline",
]
|
|
|