"""
Custom SentencePiece tokenizer for Jeeves model.
This tokenizer wraps SentencePiece directly, bypassing HuggingFace's
broken SentencePieceExtractor conversion (transformers >= 5.x).
It ensures token IDs match training exactly.
Usage:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("Anurich/Jeeves-Small-100M", trust_remote_code=True)
"""
import os
from typing import Dict, List, Optional, Tuple
import sentencepiece as spm
from transformers import PreTrainedTokenizer
class JeevesTokenizer(PreTrainedTokenizer):
    """SentencePiece BPE tokenizer for Jeeves models.

    Wraps the SentencePiece model directly to guarantee exact token ID
    correspondence with the training tokenizer. This avoids the broken
    HF slow->fast converter in transformers >= 5.x.
    """

    vocab_files_names = {"vocab_file": "tokenizer.model"}
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file: str,
        bos_token: str = "",
        eos_token: str = "",
        unk_token: str = "",
        pad_token: str = "",
        chat_template: Optional[str] = None,
        additional_special_tokens: Optional[List[str]] = None,
        **kwargs,
    ):
        """Load the SentencePiece model and register special tokens.

        Args:
            vocab_file: Path to the SentencePiece ``tokenizer.model`` file.
            bos_token / eos_token / unk_token / pad_token: Special token
                strings. NOTE(review): the empty-string defaults look like
                markup-stripped placeholders (e.g. ``<s>``/``</s>``/``<unk>``/
                ``<pad>``) — confirm against the training config. They are
                kept as-is here for backward compatibility.
            chat_template: Optional Jinja chat template string.
            additional_special_tokens: Extra special-token strings; defaults
                to the ChatML / tool-use markers used during training.
            **kwargs: Forwarded to ``PreTrainedTokenizer``.
        """
        self.vocab_file = vocab_file
        # Load the SP model *before* super().__init__(): the base class may
        # query vocab_size / get_vocab while registering special tokens.
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(vocab_file)
        if additional_special_tokens is None:
            additional_special_tokens = [
                "<|im_start|>",
                "<|im_end|>",
                "<|tool_call|>",
                "<|tool_result|>",
                "<|system|>",
                "<|user|>",
                "<|assistant|>",
            ]
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            additional_special_tokens=additional_special_tokens,
            chat_template=chat_template,
            **kwargs,
        )

    def __getstate__(self) -> Dict:
        # SentencePieceProcessor is not picklable; drop it so the tokenizer
        # survives pickling (e.g. multiprocessing DataLoader workers) and
        # reload it in __setstate__.
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, state: Dict) -> None:
        # Restore attributes, then rebuild the SP model from the saved path.
        self.__dict__.update(state)
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(self.vocab_file)

    @property
    def vocab_size(self) -> int:
        """Size of the base SentencePiece vocabulary (excludes added tokens)."""
        return self.sp_model.GetPieceSize()

    def get_vocab(self) -> Dict[str, int]:
        """Return the full piece -> id mapping, including added tokens."""
        vocab = {self.sp_model.IdToPiece(i): i for i in range(self.vocab_size)}
        # Added tokens (e.g. the chat markers) live outside the SP model.
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text: str) -> List[str]:
        """Tokenize text into subword pieces using SentencePiece."""
        return self.sp_model.EncodeAsPieces(text)

    def _convert_token_to_id(self, token: str) -> int:
        """Convert a token string to its ID (SP returns the unk id if unknown)."""
        return self.sp_model.PieceToId(token)

    def _convert_id_to_token(self, index: int) -> str:
        """Convert an ID to its token string; out-of-range ids map to unk."""
        if index < 0 or index >= self.vocab_size:
            return self.unk_token
        return self.sp_model.IdToPiece(index)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Convert a sequence of tokens back to a single string."""
        return self.sp_model.DecodePieces(tokens)

    def save_vocabulary(
        self, save_directory: str, filename_prefix: Optional[str] = None
    ) -> Tuple[str]:
        """Save the SentencePiece model file into *save_directory*.

        If the original ``tokenizer.model`` is no longer on disk (e.g. the
        tokenizer was loaded from a directory that has since been removed),
        the in-memory serialized model proto is written instead of crashing.

        Returns:
            One-tuple with the path of the written vocab file.
        """
        if not os.path.isdir(save_directory):
            os.makedirs(save_directory, exist_ok=True)
        out_vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "tokenizer.model",
        )
        if os.path.abspath(self.vocab_file) != os.path.abspath(
            out_vocab_file
        ) and os.path.isfile(self.vocab_file):
            import shutil

            shutil.copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            # Source file is gone: dump the serialized proto held in memory.
            with open(out_vocab_file, "wb") as fi:
                fi.write(self.sp_model.serialized_model_proto())
        return (out_vocab_file,)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Build model inputs — no BOS/EOS added by default (matching training)."""
        if token_ids_1 is None:
            return token_ids_0
        return token_ids_0 + token_ids_1

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        """Return a 0/1 mask of special tokens; all zeros since none are added."""
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )
        if token_ids_1 is None:
            return [0] * len(token_ids_0)
        return [0] * len(token_ids_0) + [0] * len(token_ids_1)

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Return segment ids: 0 for the first sequence, 1 for the second."""
        if token_ids_1 is None:
            return [0] * len(token_ids_0)
        return [0] * len(token_ids_0) + [1] * len(token_ids_1)