Jeeves-Small-95M / tokenization_jeeves.py
"""Custom SentencePiece tokenizer for Jeeves model.
Wraps SentencePiece directly for exact token ID match with training.
Usage:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("REPO_ID", trust_remote_code=True)
"""
import os
import shutil
from typing import Dict, List, Optional, Tuple

import sentencepiece as spm
from transformers import PreTrainedTokenizer

class JeevesTokenizer(PreTrainedTokenizer):
    """SentencePiece BPE tokenizer for Jeeves models."""

    vocab_files_names = {"vocab_file": "tokenizer.model"}
    model_input_names = ["input_ids", "attention_mask"]
    def __init__(
        self,
        vocab_file: str,
        bos_token: str = "<s>",
        eos_token: str = "</s>",
        unk_token: str = "<unk>",
        pad_token: str = "<pad>",
        chat_template: Optional[str] = None,
        additional_special_tokens: Optional[List[str]] = None,
        **kwargs,
    ):
        # Load the SentencePiece model before calling super().__init__(),
        # since the base class may query the vocabulary during init.
        self.vocab_file = vocab_file
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(vocab_file)

        if additional_special_tokens is None:
            additional_special_tokens = [
                "<|im_start|>", "<|im_end|>",
                "<|tool_call|>", "<|tool_result|>",
                "<|system|>", "<|user|>", "<|assistant|>",
            ]

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            additional_special_tokens=additional_special_tokens,
            chat_template=chat_template,
            **kwargs,
        )

    @property
    def vocab_size(self) -> int:
        return self.sp_model.GetPieceSize()

    def get_vocab(self) -> Dict[str, int]:
        vocab = {self.sp_model.IdToPiece(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text: str) -> List[str]:
        return self.sp_model.EncodeAsPieces(text)

    def _convert_token_to_id(self, token: str) -> int:
        return self.sp_model.PieceToId(token)

    def _convert_id_to_token(self, index: int) -> str:
        # Guard against out-of-range ids (e.g. ids assigned to added tokens).
        if index < 0 or index >= self.vocab_size:
            return self.unk_token
        return self.sp_model.IdToPiece(index)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        return self.sp_model.DecodePieces(tokens)

    def save_vocabulary(
        self, save_directory: str, filename_prefix: Optional[str] = None
    ) -> Tuple[str]:
        os.makedirs(save_directory, exist_ok=True)
        out_path = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "tokenizer.model",
        )
        # Copy the SentencePiece model unless it is already at the target path.
        if os.path.isfile(self.vocab_file) and (
            os.path.abspath(self.vocab_file) != os.path.abspath(out_path)
        ):
            shutil.copyfile(self.vocab_file, out_path)
        return (out_path,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        # No BOS/EOS is added automatically; this matches the training setup.
        if token_ids_1 is None:
            return token_ids_0
        return token_ids_0 + token_ids_1

    def get_special_tokens_mask(
        self, token_ids_0, token_ids_1=None, already_has_special_tokens=False
    ):
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )
        # No special tokens are inserted above, so the mask is all zeros.
        n = len(token_ids_0) + (len(token_ids_1) if token_ids_1 else 0)
        return [0] * n

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        if token_ids_1 is None:
            return [0] * len(token_ids_0)
        return [0] * len(token_ids_0) + [1] * len(token_ids_1)
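

if __name__ == "__main__":
    # Minimal local smoke-test sketch, assuming a `tokenizer.model` file sits
    # in the current directory (the path and sample text are illustrative,
    # not part of the published repo). It round-trips a string through
    # encode/decode, which exercises _tokenize, the id conversions, and
    # convert_tokens_to_string above.
    tok = JeevesTokenizer("tokenizer.model")
    ids = tok.encode("Hello, Jeeves!")
    print(ids)
    print(tok.decode(ids))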