| """Configuration for the CUTE tokenizer build pipeline."""
|
|
|
| from __future__ import annotations
|
|
|
| from dataclasses import asdict, dataclass
|
| from pathlib import Path
|
| from typing import Any
|
|
|
|
|
def _load_toml(path: Path) -> dict[str, Any]:
    """Parse the TOML file at *path* and return its top-level table.

    Uses the standard-library ``tomllib`` (Python 3.11+) when it can be
    imported and falls back to the ``tomli`` package otherwise.

    Args:
        path: Location of the TOML file. It is opened in binary mode, which
            is what both parsers' ``load`` functions expect.

    Returns:
        The parsed document as a dictionary.

    Raises:
        ImportError: If neither ``tomllib`` nor ``tomli`` can be imported.
    """
    # Guard only the imports: the file read below is shared by both parsers,
    # and errors raised while opening or parsing propagate untouched instead
    # of being evaluated against the "parser missing" handlers.
    try:
        import tomllib as toml_parser
    except ImportError:
        try:
            import tomli as toml_parser
        except ImportError as e:
            raise ImportError(
                "Install 'tomli' for TOML support on Python <3.11: pip install tomli"
            ) from e

    with open(path, "rb") as f:
        return toml_parser.load(f)
|
|
|
|
|
# Special tokens used as the default value of ``CUTEConfig.special_tokens``.
# The tuple is ordered; entries are listed in their original sequence.
DEFAULT_SPECIAL_TOKENS: tuple[str, ...] = (
    "<|endoftext|>",
    "<|fim_prefix|>", "<|fim_middle|>", "<|fim_suffix|>",
    "<|file_sep|>", "<|repo_name|>",
    "<|im_start|>", "<|im_end|>", "<|im_sep|>",
)
|
|
|
# File extensions used as the default value of ``CUTEConfig.extensions``.
# Each entry includes the leading dot; the original ordering is preserved.
DEFAULT_CODE_EXTENSIONS: tuple[str, ...] = (
    ".py", ".js", ".ts", ".tsx", ".jsx", ".java",
    ".c", ".cpp", ".h", ".hpp", ".cs", ".rs", ".go",
    ".rb", ".php", ".swift", ".kt", ".scala", ".sh", ".sql",
    ".html", ".css", ".scss",
    ".json", ".yaml", ".yml", ".toml", ".md",
)
|
|
|
|
|
@dataclass(frozen=True)
class CUTEConfig:
    """All knobs for a CUTE build, in one place.

    Frozen so hashing/comparison is well-defined and the manifest serializer
    can dump a stable representation.

    Range and cross-field constraints are checked in ``__post_init__``, so an
    invalid combination raises ``ValueError`` at construction time.
    """

    # Must be >= 1024 and at least
    # 256 + len(special_tokens) + pua_budget + min_bpe_budget.
    vocab_size: int = 120_000
    # Must be non-negative; counted against vocab_size.
    pua_budget: int = 50_000
    # Must lie strictly between 0 and 1.
    coverage_target: float = 0.90
    # Must be at least 1.
    max_token_len: int = 50
    boost_weight: float = 0.3
    # Must be >= 256 (byte alphabet); counted against vocab_size.
    min_bpe_budget: int = 50_000
    min_frequency: int = 2
    seed: int = 42
    allow_supplementary_pua: bool = False
    use_savings_selection: bool = True
    strict_pua_atomicity: bool = True
    pua_skip_bmp: bool = True
    extensions: tuple[str, ...] = DEFAULT_CODE_EXTENSIONS
    # The number of special tokens is counted against vocab_size.
    special_tokens: tuple[str, ...] = DEFAULT_SPECIAL_TOKENS
    workers: int = 0
    shard_size_bytes: int = 64 * 1024 * 1024  # 64 MiB
    license_allowlist: tuple[str, ...] = (
        "MIT",
        "Apache-2.0",
        "BSD-3-Clause",
        "BSD-2-Clause",
        "ISC",
        "Apache 2.0",
        "Apache License 2.0",
    )
    enable_secret_scrub: bool = True
    enable_license_filter: bool = False

    def __post_init__(self) -> None:
        """Validate individual fields, then the combined vocabulary budget.

        Raises:
            ValueError: If any field is out of range, or if ``vocab_size``
                cannot hold the byte alphabet, the special tokens,
                ``pua_budget`` and ``min_bpe_budget`` together.
        """
        if not 0.0 < self.coverage_target < 1.0:
            raise ValueError(f"coverage_target must be in (0,1), got {self.coverage_target}")
        if self.vocab_size < 1024:
            raise ValueError(f"vocab_size too small: {self.vocab_size}")
        if self.max_token_len < 1:
            raise ValueError(f"max_token_len must be positive: {self.max_token_len}")
        if self.pua_budget < 0:
            raise ValueError(f"pua_budget must be non-negative: {self.pua_budget}")
        if self.min_bpe_budget < 256:
            raise ValueError(
                f"min_bpe_budget must be ≥ 256 (byte alphabet), got {self.min_bpe_budget}"
            )

        # Smallest vocabulary that fits every reserved component at once.
        floor = 256 + len(self.special_tokens) + self.pua_budget + self.min_bpe_budget
        if floor > self.vocab_size:
            raise ValueError(
                f"vocab_size={self.vocab_size} too small for "
                f"pua_budget={self.pua_budget} + min_bpe_budget={self.min_bpe_budget} "
                f"+ specials={len(self.special_tokens)} + bytes=256 (need ≥ {floor})"
            )

    def to_dict(self) -> dict[str, Any]:
        """Return all fields as a plain dictionary via ``dataclasses.asdict``."""
        return asdict(self)

    @classmethod
    def from_toml(cls, path: Path) -> CUTEConfig:
        """Load config from TOML file (e.g., configs/default.toml).

        Falls back to defaults for any missing keys. Top-level keys in the
        file that are not listed below are ignored.

        Args:
            path: Location of the TOML file to read.

        Returns:
            A validated config built from the file's top-level keys.

        Raises:
            ValueError: If a list-valued option (``extensions``,
                ``special_tokens``, ``license_allowlist``) is given as a bare
                string, or if the resulting config fails validation.
        """
        data = _load_toml(path)

        # Options copied through unchanged when present in the file.
        scalar_keys = (
            "vocab_size",
            "pua_budget",
            "coverage_target",
            "max_token_len",
            "boost_weight",
            "min_bpe_budget",
            "min_frequency",
            "seed",
            "workers",
            "shard_size_bytes",
            "enable_secret_scrub",
            "enable_license_filter",
            "allow_supplementary_pua",
            "use_savings_selection",
            "strict_pua_atomicity",
            "pua_skip_bmp",
        )
        kwargs: dict[str, Any] = {key: data[key] for key in scalar_keys if key in data}

        # TOML arrays arrive as lists; the frozen dataclass stores tuples.
        for key in ("extensions", "special_tokens", "license_allowlist"):
            if key not in data:
                continue
            value = data[key]
            # tuple("abc") would split the string into characters, silently
            # producing a nonsensical config, so reject it explicitly.
            if isinstance(value, str):
                raise ValueError(
                    f"{key} must be a TOML array of strings, got a bare string: {value!r}"
                )
            kwargs[key] = tuple(value)

        return cls(**kwargs)
|
|
|
|
|
# Names exported by a star-import of this module.
__all__ = [
    "DEFAULT_CODE_EXTENSIONS",
    "DEFAULT_SPECIAL_TOKENS",
    "CUTEConfig",
]
|
|
|