HusseinEid
/

cute-tokenizer

private-use-area

lossless-roundtrip

Model card Files Files and versions

cute-tokenizer / cute_tokenizer /__init__.py

HusseinEid's picture

Super-squash branch 'main' using huggingface_hub

68a4c53 11 days ago

history blame contribute delete

3.62 kB

	"""CUTE — Compact Unicode Token Encoding.

	Public API (main entry points; see ``__all__`` for the full list of exports):
	build_cute — train a CUTE tokenizer from a corpus directory.
	CUTEConfig — all knobs for the build pipeline.
	CUTETokenizerFast — HuggingFace-compatible inference wrapper.
	PUAMapping — word ↔ PUA character mapping.
	load_default_tokenizer — load the bundled production-ready tokenizer.
	"""

	from __future__ import annotations

	import os as _os
	from pathlib import Path

	# Silence the "None of PyTorch, TensorFlow, or Flax have been found" warning
	# that `transformers` emits at import. CUTE only needs the tokenizer layer, not
	# the model layer, so this warning is irrelevant. Must be set BEFORE the first
	# `transformers` import — putting it here covers both library and CLI paths.
	# `setdefault` is used (rather than plain assignment) so that a value already
	# present in the environment for either variable is left untouched.
	_os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error")
	_os.environ.setdefault("TRANSFORMERS_NO_ADVISORY_WARNINGS", "1")

	from ._version import __version__
	from .baseline import BaselineTokenizer, Cl100kBaseline, NullBaseline, get_default_baseline
	from .config import CUTEConfig
	from .pua import PUAMapping
	from .selection import compute_candidate_score, select_by_savings
	from .tokenizer import CUTETokenizerFast
	from .trainer import build_cute, load_mapping, save_mapping


	def _bundled_data_dir() -> Path:
	    """Locate the directory holding the bundled tokenizer artifacts.

	    A directory qualifies only if it contains both ``tokenizer.json`` and
	    ``cute_mapping.json`` as regular files. Candidates are probed in order:

	    1. ``data/`` next to this module (installed wheel, or after the prepare
	       step has copied the files into the package tree).
	    2. ``model/`` at the repository root — considered only when the package
	       sits directly inside a directory named ``src`` (editable install from
	       a checkout).

	    Returns:
	        The first candidate directory that contains both files.

	    Raises:
	        RuntimeError: If no candidate contains both files.
	    """
	    required = ("tokenizer.json", "cute_mapping.json")
	    here = Path(__file__).resolve().parent

	    # Build the ordered search list; the repo-level fallback only applies to
	    # a ``src/``-layout checkout.
	    candidates = [here / "data"]
	    if here.parent.name == "src":
	        candidates.append(here.parent.parent / "model")

	    for candidate in candidates:
	        if all((candidate / filename).is_file() for filename in required):
	            return candidate

	    raise RuntimeError(
	        "Bundled tokenizer files not found. Expected tokenizer.json and cute_mapping.json in "
	        "the package data directory, or in ./model at the repository root. "
	        "For releases, ensure model/ is populated before building the wheel."
	    )


	def load_default_tokenizer() -> CUTETokenizerFast:
	    """Return the pre-trained CUTE tokenizer that ships with the package.

	    The bundled tokenizer was trained on a large code corpus (The Stack
	    subset) with 80k vocab and a 90% coverage target, so it can be used
	    right after `pip install cute-tokenizer` with no training step.

	    Returns
	    -------
	    CUTETokenizerFast
	        Tokenizer built from the bundled ``tokenizer.json`` and
	        ``cute_mapping.json`` files.

	    Raises
	    ------
	    RuntimeError
	        Propagated from ``_bundled_data_dir`` when the bundled files
	        cannot be located.

	    Example
	    -------
	    >>> from cute_tokenizer import load_default_tokenizer
	    >>> tok = load_default_tokenizer()
	    >>> ids = tok("def hello(): return 42", add_special_tokens=False).input_ids
	    >>> len(ids)
	    6
	    """
	    bundle = _bundled_data_dir()
	    # The wrapper is given plain string paths, not Path objects.
	    return CUTETokenizerFast(
	        tokenizer_file=str(bundle / "tokenizer.json"),
	        cute_mapping_file=str(bundle / "cute_mapping.json"),
	    )


	# Names exported by ``from cute_tokenizer import *``; every entry is imported
	# or defined above in this module. Entries are kept in ASCII sort order.
	__all__ = [
	"BaselineTokenizer",
	"CUTEConfig",
	"CUTETokenizerFast",
	"Cl100kBaseline",
	"NullBaseline",
	"PUAMapping",
	"__version__",
	"build_cute",
	"compute_candidate_score",
	"get_default_baseline",
	"load_default_tokenizer",
	"load_mapping",
	"save_mapping",
	"select_by_savings",
	]