File size: 3,620 Bytes
68a4c53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""CUTE — Compact Unicode Token Encoding.

Public API:
    build_cute             — train a CUTE tokenizer from a corpus directory.
    CUTEConfig             — all knobs for the build pipeline.
    CUTETokenizerFast      — HuggingFace-compatible inference wrapper.
    PUAMapping             — word ↔ PUA character mapping.
    load_default_tokenizer — load the bundled production-ready tokenizer.
"""

from __future__ import annotations

import os as _os
from pathlib import Path

# Silence the "None of PyTorch, TensorFlow, or Flax have been found" warning
# that `transformers` emits at import. CUTE only needs the tokenizer layer, not
# the model layer, so this warning is irrelevant. Must be set BEFORE the first
# `transformers` import — putting it here covers both library and CLI paths.
# `setdefault` only fills in a value when the variable is not already set, so
# anything the user exported explicitly is left untouched.
_os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error")
_os.environ.setdefault("TRANSFORMERS_NO_ADVISORY_WARNINGS", "1")

from ._version import __version__
from .baseline import BaselineTokenizer, Cl100kBaseline, NullBaseline, get_default_baseline
from .config import CUTEConfig
from .pua import PUAMapping
from .selection import compute_candidate_score, select_by_savings
from .tokenizer import CUTETokenizerFast
from .trainer import build_cute, load_mapping, save_mapping


def _bundled_data_dir() -> Path:
    """Locate the directory holding ``tokenizer.json`` and ``cute_mapping.json``.

    Candidate directories are tried in order and the first one that contains
    both files is returned:

    1. ``data/`` next to this module (wheel installs, or after the prepare
       step has copied the files into the package tree).
    2. ``model/`` at the repository root. This candidate is only considered
       when the package directory sits directly under a directory named
       ``src`` (editable install from a checkout with a ``src/`` layout).

    Returns
    -------
    Path
        The first candidate directory containing both files.

    Raises
    ------
    RuntimeError
        If no candidate directory contains both files.
    """
    required = ("tokenizer.json", "cute_mapping.json")
    here = Path(__file__).resolve().parent

    # Build the ordered search list; the repo-level fallback is conditional.
    candidates = [here / "data"]
    source_root = here.parent
    if source_root.name == "src":
        candidates.append(source_root.parent / "model")

    for folder in candidates:
        # A directory qualifies only when *both* files are present.
        if all((folder / filename).is_file() for filename in required):
            return folder

    raise RuntimeError(
        "Bundled tokenizer files not found. Expected tokenizer.json and cute_mapping.json in "
        "the package data directory, or in ./model at the repository root. "
        "For releases, ensure model/ is populated before building the wheel."
    )


def load_default_tokenizer() -> CUTETokenizerFast:
    """Return the pre-trained CUTE tokenizer that ships with the package.

    This tokenizer was trained on a large code corpus (The Stack subset)
    with 80k vocab and 90% coverage target. It is ready to use immediately
    after ``pip install cute-tokenizer`` — no training required.

    Returns
    -------
    CUTETokenizerFast
        Pre-trained tokenizer instance, built from the bundled
        ``tokenizer.json`` and ``cute_mapping.json`` files.

    Raises
    ------
    RuntimeError
        Propagated from ``_bundled_data_dir`` when the bundled files cannot
        be located.

    Example
    -------
        >>> from cute_tokenizer import load_default_tokenizer
        >>> tok = load_default_tokenizer()
        >>> ids = tok("def hello(): return 42", add_special_tokens=False).input_ids
        >>> len(ids)
        6
    """
    bundle = _bundled_data_dir()
    # The tokenizer wrapper is given string paths, not Path objects.
    init_kwargs = {
        "tokenizer_file": str(bundle / "tokenizer.json"),
        "cute_mapping_file": str(bundle / "cute_mapping.json"),
    }
    return CUTETokenizerFast(**init_kwargs)


# Public API of the package: every name listed here is either imported above
# or defined in this module, and this list is what a star-import exposes.
__all__ = [
    "BaselineTokenizer",
    "CUTEConfig",
    "CUTETokenizerFast",
    "Cl100kBaseline",
    "NullBaseline",
    "PUAMapping",
    "__version__",
    "build_cute",
    "compute_candidate_score",
    "get_default_baseline",
    "load_default_tokenizer",
    "load_mapping",
    "save_mapping",
    "select_by_savings",
]