File size: 4,848 Bytes
204acd7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
"""
Custom SentencePiece tokenizer for Jeeves model.
This tokenizer wraps SentencePiece directly, bypassing HuggingFace's
broken SentencePieceExtractor conversion (transformers >= 5.x).
It ensures token IDs match training exactly.

Usage:
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("Anurich/Jeeves-Small-100M", trust_remote_code=True)
"""
import os
from typing import Dict, List, Optional, Tuple
import sentencepiece as spm
from transformers import PreTrainedTokenizer
class JeevesTokenizer(PreTrainedTokenizer):
    """SentencePiece BPE tokenizer for Jeeves models.

    Wraps the SentencePiece model directly to guarantee exact token ID
    correspondence with the training tokenizer. This avoids the broken
    HF slow->fast converter in transformers >= 5.x.
    """

    vocab_files_names = {"vocab_file": "tokenizer.model"}
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file: str,
        bos_token: str = "<s>",
        eos_token: str = "</s>",
        unk_token: str = "<unk>",
        pad_token: str = "<pad>",
        chat_template: Optional[str] = None,
        additional_special_tokens: Optional[List[str]] = None,
        **kwargs,
    ):
        """Load the SentencePiece model and register the special tokens.

        Args:
            vocab_file: Path to the SentencePiece ``tokenizer.model`` file.
            bos_token: Beginning-of-sequence token string.
            eos_token: End-of-sequence token string.
            unk_token: Unknown-token string.
            pad_token: Padding token string.
            chat_template: Optional chat template forwarded to the base class.
            additional_special_tokens: Extra special tokens; when ``None`` the
                chat/tool markers listed below are used.
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.
        """
        # The SentencePiece model is loaded BEFORE calling the base
        # constructor so that vocab_size / get_vocab are usable from within it.
        self.vocab_file = vocab_file
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(vocab_file)

        if additional_special_tokens is None:
            additional_special_tokens = [
                "<|im_start|>",
                "<|im_end|>",
                "<|tool_call|>",
                "<|tool_result|>",
                "<|system|>",
                "<|user|>",
                "<|assistant|>",
            ]

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            additional_special_tokens=additional_special_tokens,
            chat_template=chat_template,
            **kwargs,
        )

    @property
    def vocab_size(self) -> int:
        """Number of pieces in the underlying SentencePiece model."""
        return self.sp_model.GetPieceSize()

    def get_vocab(self) -> Dict[str, int]:
        """Return the piece->ID mapping, overlaid with the added tokens."""
        vocab = {self.sp_model.IdToPiece(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text: str) -> List[str]:
        """Tokenize text into subword pieces using SentencePiece."""
        return self.sp_model.EncodeAsPieces(text)

    def _convert_token_to_id(self, token: str) -> int:
        """Convert a token string to its ID."""
        return self.sp_model.PieceToId(token)

    def _convert_id_to_token(self, index: int) -> str:
        """Convert an ID to its token string.

        IDs outside ``[0, vocab_size)`` map to ``unk_token``.
        """
        if index < 0 or index >= self.vocab_size:
            return self.unk_token
        return self.sp_model.IdToPiece(index)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Convert a sequence of tokens to a single string."""
        return self.sp_model.DecodePieces(tokens)

    def save_vocabulary(
        self, save_directory: str, filename_prefix: Optional[str] = None
    ) -> Tuple[str]:
        """Save the SentencePiece model file.

        Args:
            save_directory: Directory to write into; created if missing.
            filename_prefix: Optional prefix; the file is then named
                ``<prefix>-tokenizer.model``.

        Returns:
            A one-element tuple holding the path of the written model file.
        """
        os.makedirs(save_directory, exist_ok=True)
        out_vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "tokenizer.model",
        )

        source_exists = os.path.isfile(self.vocab_file)
        same_path = os.path.abspath(self.vocab_file) == os.path.abspath(out_vocab_file)

        if source_exists:
            if not same_path:
                import shutil

                shutil.copyfile(self.vocab_file, out_vocab_file)
        else:
            # The original model file is gone (e.g. cleaned cache or temp dir);
            # re-serialize the already-loaded model instead of failing.
            with open(out_vocab_file, "wb") as handle:
                handle.write(self.sp_model.serialized_model_proto())

        return (out_vocab_file,)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Build model inputs — no BOS/EOS added by default (matching training)."""
        if token_ids_1 is None:
            return token_ids_0
        return token_ids_0 + token_ids_1

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        """Return a mask flagging special-token positions.

        When ``already_has_special_tokens`` is true the base-class
        implementation is used. Otherwise the mask is all zeros, because
        ``build_inputs_with_special_tokens`` inserts no special tokens.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )
        if token_ids_1 is None:
            return [0] * len(token_ids_0)
        return [0] * len(token_ids_0) + [0] * len(token_ids_1)

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Return token type IDs: 0 for the first sequence, 1 for the second."""
        if token_ids_1 is None:
            return [0] * len(token_ids_0)
        return [0] * len(token_ids_0) + [1] * len(token_ids_1)
|