Transformers
English

BPE tokenizer with byte-fallback: 24k vocab

BPE tokenizer for encoders/MLM objective with byte-pair fallback:

  • Trained on pints-ai/Expository-Prose-V1; this tokenizer is primarily for English and code.
  • This tokenizer is cased: "HELLO WORLD" is tokenized differently from "hello world".
  • model_max_length is set to 1e9 to avoid silent truncation surprises. Set tokenizer.model_max_length to your model's maximum position embeddings when training.

visualize

code

from typing import Any, Callable, Optional, Union

from tokenizers import Tokenizer as RustTokenizer
from tokenizers.tools import EncodingVisualizer
from transformers import AutoTokenizer, PreTrainedTokenizerBase

# Default text for token counting / visualization in
# `tokenizer_report_and_visualize`: a small PyTorch module source snippet,
# chosen because this tokenizer targets English *and code*.
SAMPLE_TEXT = """class DyT(nn.Module):
    def __init__(self, num_features, alpha_init_value=0.5):
        super().__init__()
        self.alpha = nn.Parameter(torch.ones(1) * alpha_init_value)
        self.weight = nn.Parameter(torch.ones(num_features))
        self.bias = nn.Parameter(torch.zeros(num_features))

    def forward(self, x):
        x = torch.tanh(self.alpha * x)
        return x * self.weight + self.bias"""


def tokenizer_report_and_visualize(
    tk: Union[PreTrainedTokenizerBase, RustTokenizer],
    sample_text: str = SAMPLE_TEXT,
    *,
    default_to_notebook: bool = True,
    annotation_converter: Optional[Callable[[Any], Any]] = None,
    add_special_tokens: bool = True,
    n_first_tokens: int = 15,
):
    """
    Print a token-count report for `sample_text` and always render the
    `EncodingVisualizer` for the given tokenizer.

    Parameters
    ----------
    tk:
        Either a HuggingFace *fast* tokenizer (`PreTrainedTokenizerBase` with
        a Rust backend) or a low-level `tokenizers.Tokenizer`.
    sample_text:
        Text to tokenize; defaults to the module-level `SAMPLE_TEXT`.
    default_to_notebook, annotation_converter:
        Forwarded to `EncodingVisualizer`.
    add_special_tokens:
        Only used for the HuggingFace-tokenizer path's `encode` call.
    n_first_tokens:
        How many leading tokens to echo in the report.

    Returns
    -------
    dict with keys: tokenizer_name, num_tokens, input_ids, tokens, text.

    Raises
    ------
    ValueError: HF tokenizer without a Rust backend (slow tokenizer).
    TypeError: unsupported tokenizer type.
    """

    # Dispatch on tokenizer type; in both branches we end up with a
    # Rust-backed `tokenizers.Tokenizer` for the visualizer.
    if isinstance(tk, PreTrainedTokenizerBase):
        label = getattr(tk, "name_or_path", tk.__class__.__name__)
        viz_backend = getattr(tk, "backend_tokenizer", None)
        if viz_backend is None:
            raise ValueError(
                "EncodingVisualizer requires a *fast* tokenizer. "
                "Reload with `use_fast=True` or pass a `tokenizers.Tokenizer`."
            )
        # HF path: encode via the high-level API (no padding/truncation so
        # the count reflects the raw text).
        token_ids = tk.encode(
            sample_text,
            add_special_tokens=add_special_tokens,
            padding=False,
            truncation=False,
        )
        token_strs = tk.convert_ids_to_tokens(token_ids, skip_special_tokens=False)
    elif isinstance(tk, RustTokenizer):
        label = "tokenizers.Tokenizer"
        encoding = tk.encode(sample_text)
        token_ids, token_strs = encoding.ids, encoding.tokens
        viz_backend = tk
    else:
        raise TypeError(
            "`tk` must be a HF *fast* tokenizer or a `tokenizers.Tokenizer`."
        )

    token_count = len(token_ids)
    print(f"tokenizer ({label}): {token_count} tokens")
    preview = token_strs[:n_first_tokens]
    if len(token_strs) > n_first_tokens:
        preview = preview + ["..."]
    print("first tokens:", preview)

    # Visualization is mandatory, not best-effort.
    visualizer = EncodingVisualizer(
        tokenizer=viz_backend,
        default_to_notebook=default_to_notebook,
        annotation_converter=annotation_converter,
    )
    visualizer(sample_text)

    return {
        "tokenizer_name": label,
        "num_tokens": token_count,
        "input_ids": token_ids,
        "tokens": token_strs,
        "text": sample_text,
    }


# --- example usage ---
# Guarded so that importing this module does not trigger a hub download
# and a visualization as a side effect; the demo only runs as a script.
if __name__ == "__main__":
    repo_id = "pszemraj/bytebpe-tokenizer-24k-en_code-mlm"
    tk = AutoTokenizer.from_pretrained(repo_id)
    report = tokenizer_report_and_visualize(tk)
Downloads last month

-

Downloads are not tracked for this model. How to track
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support

Dataset used to train pszemraj/bytebpe-tokenizer-24k-en_code-mlm