BPE tokenizer with byte-fallback: 24k vocab
BPE tokenizer for encoders/MLM objective with byte-pair fallback:
- Trained on pints-ai/Expository-Prose-V1; this tokenizer is primarily for English and code.
- This tokenizer is cased: "HELLO WORLD" is different than "hello world".
`model_max_length` is set to 1e9 to not cause hidden issues. Set `tokenizer.model_max_length` to your model's max position embeddings when training.
Visualization code
from typing import Any, Callable, Optional, Union
from tokenizers import Tokenizer as RustTokenizer
from tokenizers.tools import EncodingVisualizer
from transformers import AutoTokenizer, PreTrainedTokenizerBase
# Default text used for the token-count report and visualization below.
# A small PyTorch module (DyT) is used so the sample exercises both
# code-like identifiers and Python syntax. The string value must stay
# byte-identical: changing it changes the reported token counts.
SAMPLE_TEXT = """class DyT(nn.Module):
def __init__(self, num_features, alpha_init_value=0.5):
super().__init__()
self.alpha = nn.Parameter(torch.ones(1) * alpha_init_value)
self.weight = nn.Parameter(torch.ones(num_features))
self.bias = nn.Parameter(torch.zeros(num_features))
def forward(self, x):
x = torch.tanh(self.alpha * x)
return x * self.weight + self.bias"""
def tokenizer_report_and_visualize(
    tk: Union[PreTrainedTokenizerBase, RustTokenizer],
    sample_text: str = SAMPLE_TEXT,
    *,
    default_to_notebook: bool = True,
    annotation_converter: Optional[Callable[[Any], Any]] = None,
    add_special_tokens: bool = True,
    n_first_tokens: int = 15,
) -> dict[str, Any]:
    """
    Count tokens and ALWAYS launch tokenization visualization for the given tokenizer.

    Parameters
    ----------
    tk:
        Either a HuggingFace *fast* tokenizer (PreTrainedTokenizerFast subclass)
        or a low-level `tokenizers.Tokenizer` (RustTokenizer).
    sample_text:
        Text to encode and visualize.
    default_to_notebook, annotation_converter:
        Forwarded to `EncodingVisualizer`.
    add_special_tokens:
        Whether special tokens are added during encoding (applies to BOTH input kinds).
    n_first_tokens:
        How many leading tokens to print in the report.

    Returns
    -------
    dict with keys: tokenizer_name, num_tokens, input_ids, tokens, text.

    Raises
    ------
    ValueError: if a slow HF tokenizer (no Rust backend) is passed.
    TypeError: if `tk` is neither supported type.
    """
    # Resolve a Rust-backed tokenizer for the visualizer and do the encoding.
    if isinstance(tk, PreTrainedTokenizerBase):
        name = getattr(tk, "name_or_path", tk.__class__.__name__)
        # Only *fast* tokenizers carry a Rust backend usable by EncodingVisualizer.
        backend = getattr(tk, "backend_tokenizer", None)
        if backend is None:
            raise ValueError(
                "EncodingVisualizer requires a *fast* tokenizer. "
                "Reload with `use_fast=True` or pass a `tokenizers.Tokenizer`."
            )
        input_ids = tk.encode(
            sample_text,
            add_special_tokens=add_special_tokens,
            padding=False,
            truncation=False,
        )
        tokens = tk.convert_ids_to_tokens(input_ids, skip_special_tokens=False)
        rust_tok = backend  # this is a tokenizers.Tokenizer
    elif isinstance(tk, RustTokenizer):
        name = "tokenizers.Tokenizer"
        # FIX: previously `add_special_tokens` was silently ignored on this path,
        # so the report could disagree with the HF-tokenizer branch for the same
        # setting. `tokenizers.Tokenizer.encode` accepts this keyword directly.
        enc = tk.encode(sample_text, add_special_tokens=add_special_tokens)
        input_ids = enc.ids
        tokens = enc.tokens
        rust_tok = tk
    else:
        raise TypeError(
            "`tk` must be a HF *fast* tokenizer or a `tokenizers.Tokenizer`."
        )
    num_tokens = len(input_ids)
    print(f"tokenizer ({name}): {num_tokens} tokens")
    print(
        "first tokens:",
        tokens[:n_first_tokens] + (["..."] if len(tokens) > n_first_tokens else []),
    )
    # EncodingVisualizer is REQUIRED (not optional).
    viz = EncodingVisualizer(
        tokenizer=rust_tok,
        default_to_notebook=default_to_notebook,
        annotation_converter=annotation_converter,
    )
    viz(sample_text)
    return {
        "tokenizer_name": name,
        "num_tokens": num_tokens,
        "input_ids": input_ids,
        "tokens": tokens,
        "text": sample_text,
    }
# --- example usage ---
# Downloads the tokenizer from the Hugging Face Hub (network I/O) and runs the
# report + visualization on the default SAMPLE_TEXT.
repo_id = "pszemraj/bytebpe-tokenizer-24k-en_code-mlm"
tk = AutoTokenizer.from_pretrained(repo_id)
# `report` is a dict: tokenizer_name, num_tokens, input_ids, tokens, text.
report = tokenizer_report_and_visualize(tk)
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support