"""
Custom SentencePiece tokenizer for the Jeeves model.

This tokenizer wraps SentencePiece directly, bypassing HuggingFace's
broken SentencePieceExtractor conversion (transformers >= 5.x).
It ensures token IDs match training exactly.

Usage:
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("Anurich/Jeeves-Small-100M", trust_remote_code=True)
"""

import os
import shutil
from typing import Dict, List, Optional, Tuple

import sentencepiece as spm
from transformers import PreTrainedTokenizer


class JeevesTokenizer(PreTrainedTokenizer):
    """SentencePiece BPE tokenizer for Jeeves models.

    Wraps the SentencePiece model directly to guarantee exact token ID
    correspondence with the training tokenizer. This avoids the broken
    HF slow->fast converter in transformers >= 5.x.
    """

    vocab_files_names = {"vocab_file": "tokenizer.model"}
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file: str,
        bos_token: str = "<s>",
        eos_token: str = "</s>",
        unk_token: str = "<unk>",
        pad_token: str = "<pad>",
        chat_template: Optional[str] = None,
        additional_special_tokens: Optional[List[str]] = None,
        **kwargs,
    ):
        """Load the SentencePiece model and register the special tokens.

        Args:
            vocab_file: Path to the SentencePiece ``tokenizer.model`` file.
            bos_token: Beginning-of-sequence token string.
            eos_token: End-of-sequence token string.
            unk_token: Unknown-token string.
            pad_token: Padding token string.
            chat_template: Optional chat template forwarded to the base class.
            additional_special_tokens: Extra special tokens. When ``None``,
                the chat/tool markers listed below are used.
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.
        """
        # Load the model before calling the base initializer so that the
        # vocabulary-backed methods of this class are usable if the base
        # class invokes them while registering special tokens.
        self.vocab_file = vocab_file
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(vocab_file)

        if additional_special_tokens is None:
            additional_special_tokens = [
                "<|im_start|>",
                "<|im_end|>",
                "<|tool_call|>",
                "<|tool_result|>",
                "<|system|>",
                "<|user|>",
                "<|assistant|>",
            ]

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            additional_special_tokens=additional_special_tokens,
            chat_template=chat_template,
            **kwargs,
        )

    @property
    def vocab_size(self) -> int:
        """Number of pieces in the underlying SentencePiece model."""
        return self.sp_model.GetPieceSize()

    def get_vocab(self) -> Dict[str, int]:
        """Return the piece->ID mapping, overlaid with the added tokens."""
        vocab = {self.sp_model.IdToPiece(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text: str) -> List[str]:
        """Tokenize text into subword pieces using SentencePiece."""
        return self.sp_model.EncodeAsPieces(text)

    def _convert_token_to_id(self, token: str) -> int:
        """Convert a token string to its ID."""
        return self.sp_model.PieceToId(token)

    def _convert_id_to_token(self, index: int) -> str:
        """Convert an ID to its token string.

        IDs outside ``[0, vocab_size)`` map to the unknown token instead of
        being passed to SentencePiece.
        """
        if index < 0 or index >= self.vocab_size:
            return self.unk_token
        return self.sp_model.IdToPiece(index)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Convert a sequence of tokens to a single string."""
        return self.sp_model.DecodePieces(tokens)

    def save_vocabulary(
        self, save_directory: str, filename_prefix: Optional[str] = None
    ) -> Tuple[str]:
        """Save the SentencePiece model file.

        Args:
            save_directory: Directory to write into; created if missing.
            filename_prefix: Optional prefix; the file is written as
                ``"<prefix>-tokenizer.model"`` when given.

        Returns:
            A one-element tuple holding the path of the saved model file.
        """
        os.makedirs(save_directory, exist_ok=True)

        out_vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "tokenizer.model",
        )

        if not os.path.isfile(self.vocab_file):
            # The file the tokenizer was loaded from is gone (e.g. a removed
            # temp/cache directory). Re-serialize the in-memory model rather
            # than failing with FileNotFoundError.
            with open(out_vocab_file, "wb") as model_out:
                model_out.write(self.sp_model.serialized_model_proto())
        elif os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            # Skip the copy when saving onto the file we loaded from.
            shutil.copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Build model inputs — no BOS/EOS added by default (matching training).

        A single sequence is returned as-is; a pair is plainly concatenated.
        """
        if token_ids_1 is None:
            return token_ids_0
        return token_ids_0 + token_ids_1

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        """Return a mask flagging special tokens.

        When ``already_has_special_tokens`` is true the base-class
        implementation is used. Otherwise the mask is all zeros, because
        ``build_inputs_with_special_tokens`` inserts no special tokens.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )
        if token_ids_1 is None:
            return [0] * len(token_ids_0)
        return [0] * len(token_ids_0) + [0] * len(token_ids_1)

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Return token type IDs: 0 for the first sequence, 1 for the second."""
        if token_ids_1 is None:
            return [0] * len(token_ids_0)
        return [0] * len(token_ids_0) + [1] * len(token_ids_1)