"""Custom SentencePiece tokenizer for Jeeves models.

This tokenizer wraps SentencePiece directly, bypassing HuggingFace's broken
SentencePieceExtractor conversion (transformers >= 5.x). It ensures token IDs
match training exactly.

Usage:
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(
        "Anurich/Jeeves-Small-100M", trust_remote_code=True
    )
"""

import os
import shutil
from typing import Dict, List, Optional, Tuple

import sentencepiece as spm
from transformers import PreTrainedTokenizer


class JeevesTokenizer(PreTrainedTokenizer):
    """SentencePiece BPE tokenizer for Jeeves models.

    Wraps the SentencePiece model directly to guarantee exact token ID
    correspondence with the training tokenizer. This avoids the broken
    HF slow->fast converter in transformers >= 5.x.
    """

    vocab_files_names = {"vocab_file": "tokenizer.model"}
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file: str,
        # NOTE(review): the defaults below were empty strings in the original
        # file, almost certainly an artifact of "<s>"-style tags being stripped
        # as markup. Empty special tokens break unk handling and the base
        # class's special-token bookkeeping, so they are restored to the
        # conventional SentencePiece markers — confirm against the released
        # tokenizer_config.json, which overrides these defaults anyway.
        bos_token: str = "<s>",
        eos_token: str = "</s>",
        unk_token: str = "<unk>",
        pad_token: str = "<pad>",
        chat_template: Optional[str] = None,
        additional_special_tokens: Optional[List[str]] = None,
        **kwargs,
    ):
        """Load the SentencePiece model and register special tokens.

        Args:
            vocab_file: Path to the ``tokenizer.model`` SentencePiece file.
            bos_token / eos_token / unk_token / pad_token: Special token
                strings passed through to ``PreTrainedTokenizer``.
            chat_template: Optional Jinja chat template string.
            additional_special_tokens: Extra special tokens; defaults to the
                ChatML/tool-use markers used during training.
            **kwargs: Forwarded to ``PreTrainedTokenizer.__init__``.
        """
        self.vocab_file = vocab_file
        # Load the SP model *before* super().__init__: the base class may
        # query vocab_size / _tokenize while registering special tokens.
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(vocab_file)

        if additional_special_tokens is None:
            additional_special_tokens = [
                "<|im_start|>",
                "<|im_end|>",
                "<|tool_call|>",
                "<|tool_result|>",
                "<|system|>",
                "<|user|>",
                "<|assistant|>",
            ]

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            additional_special_tokens=additional_special_tokens,
            chat_template=chat_template,
            **kwargs,
        )

    @property
    def vocab_size(self) -> int:
        """Size of the base SentencePiece vocabulary (excludes added tokens)."""
        return self.sp_model.GetPieceSize()

    def get_vocab(self) -> Dict[str, int]:
        """Return the full piece->id mapping, including tokens added post-hoc."""
        vocab = {self.sp_model.IdToPiece(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text: str) -> List[str]:
        """Tokenize text into subword pieces using SentencePiece."""
        return self.sp_model.EncodeAsPieces(text)

    def _convert_token_to_id(self, token: str) -> int:
        """Convert a token string to its ID (unk id for unknown pieces)."""
        return self.sp_model.PieceToId(token)

    def _convert_id_to_token(self, index: int) -> str:
        """Convert an ID to its token string.

        Out-of-range ids map to the unk token rather than raising, so
        decoding never crashes on stray ids.
        """
        if index < 0 or index >= self.vocab_size:
            return self.unk_token
        return self.sp_model.IdToPiece(index)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Convert a sequence of tokens back to a single string."""
        return self.sp_model.DecodePieces(tokens)

    def save_vocabulary(
        self, save_directory: str, filename_prefix: Optional[str] = None
    ) -> Tuple[str]:
        """Copy the SentencePiece model file into *save_directory*.

        Returns:
            A 1-tuple with the path of the saved ``tokenizer.model`` file.
        """
        os.makedirs(save_directory, exist_ok=True)
        out_vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "tokenizer.model",
        )
        # Skip the copy when saving back over the file we loaded from.
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            shutil.copyfile(self.vocab_file, out_vocab_file)
        return (out_vocab_file,)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Build model inputs — no BOS/EOS added by default (matching training)."""
        if token_ids_1 is None:
            return token_ids_0
        return token_ids_0 + token_ids_1

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        """Return a 0/1 mask of special-token positions.

        Since ``build_inputs_with_special_tokens`` adds nothing, the mask is
        all zeros unless the caller says the ids already contain specials (in
        which case the base class scans for them).
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )
        if token_ids_1 is None:
            return [0] * len(token_ids_0)
        return [0] * len(token_ids_0) + [0] * len(token_ids_1)

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Segment ids: 0 for the first sequence, 1 for the optional second."""
        if token_ids_1 is None:
            return [0] * len(token_ids_0)
        return [0] * len(token_ids_0) + [1] * len(token_ids_1)