"""Custom SentencePiece tokenizer for Jeeves model. Wraps SentencePiece directly for exact token ID match with training. Usage: from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained("REPO_ID", trust_remote_code=True) """ import os from typing import Dict, List, Optional, Tuple import sentencepiece as spm from transformers import PreTrainedTokenizer class JeevesTokenizer(PreTrainedTokenizer): """SentencePiece BPE tokenizer for Jeeves models.""" vocab_files_names = {"vocab_file": "tokenizer.model"} model_input_names = ["input_ids", "attention_mask"] def __init__( self, vocab_file: str, bos_token: str = "", eos_token: str = "", unk_token: str = "", pad_token: str = "", chat_template: Optional[str] = None, additional_special_tokens: Optional[List[str]] = None, **kwargs, ): self.vocab_file = vocab_file self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(vocab_file) if additional_special_tokens is None: additional_special_tokens = [ "<|im_start|>", "<|im_end|>", "<|tool_call|>", "<|tool_result|>", "<|system|>", "<|user|>", "<|assistant|>", ] super().__init__( bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, pad_token=pad_token, additional_special_tokens=additional_special_tokens, chat_template=chat_template, **kwargs, ) @property def vocab_size(self) -> int: return self.sp_model.GetPieceSize() def get_vocab(self) -> Dict[str, int]: vocab = {self.sp_model.IdToPiece(i): i for i in range(self.vocab_size)} vocab.update(self.added_tokens_encoder) return vocab def _tokenize(self, text: str) -> List[str]: return self.sp_model.EncodeAsPieces(text) def _convert_token_to_id(self, token: str) -> int: return self.sp_model.PieceToId(token) def _convert_id_to_token(self, index: int) -> str: if index < 0 or index >= self.vocab_size: return self.unk_token return self.sp_model.IdToPiece(index) def convert_tokens_to_string(self, tokens: List[str]) -> str: return self.sp_model.DecodePieces(tokens) def save_vocabulary( self, save_directory: str, filename_prefix: Optional[str] = None ) -> Tuple[str]: if not os.path.isdir(save_directory): os.makedirs(save_directory, exist_ok=True) out_path = os.path.join( save_directory, (filename_prefix + "-" if filename_prefix else "") + "tokenizer.model", ) if os.path.abspath(self.vocab_file) != os.path.abspath(out_path): import shutil shutil.copyfile(self.vocab_file, out_path) return (out_path,) def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): if token_ids_1 is None: return token_ids_0 return token_ids_0 + token_ids_1 def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): if already_has_special_tokens: return super().get_special_tokens_mask( token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True, ) n = len(token_ids_0) + (len(token_ids_1) if token_ids_1 else 0) return [0] * n def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): if token_ids_1 is None: return [0] * len(token_ids_0) return [0] * len(token_ids_0) + [1] * len(token_ids_1)