File size: 4,848 Bytes
204acd7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
"""
Custom SentencePiece tokenizer for Jeeves model.

This tokenizer wraps SentencePiece directly, bypassing HuggingFace's
broken SentencePieceExtractor conversion (transformers >= 5.x).
It ensures token IDs match training exactly.

Usage:
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("Anurich/Jeeves-Small-100M", trust_remote_code=True)
"""

import os
from typing import Dict, List, Optional, Tuple

import sentencepiece as spm
from transformers import PreTrainedTokenizer


class JeevesTokenizer(PreTrainedTokenizer):
    """SentencePiece BPE tokenizer for Jeeves models.

    Wraps the SentencePiece model directly to guarantee exact token ID
    correspondence with the training tokenizer. This avoids the broken
    HF slow->fast converter in transformers >= 5.x.
    """

    # Single source of truth for the on-disk vocab filename; save_vocabulary
    # reads from here so the two can never drift apart.
    vocab_files_names = {"vocab_file": "tokenizer.model"}
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file: str,
        bos_token: str = "<s>",
        eos_token: str = "</s>",
        unk_token: str = "<unk>",
        pad_token: str = "<pad>",
        chat_template: Optional[str] = None,
        additional_special_tokens: Optional[List[str]] = None,
        **kwargs,
    ):
        """Load the SentencePiece model and register special tokens.

        Args:
            vocab_file: Path to the SentencePiece ``tokenizer.model`` file.
            bos_token / eos_token / unk_token / pad_token: Special token
                strings; defaults match the Jeeves training configuration.
            chat_template: Optional Jinja chat template string.
            additional_special_tokens: Extra special tokens; when ``None``,
                the Jeeves chat-format markers are installed by default.
            **kwargs: Forwarded to ``PreTrainedTokenizer``.
        """
        # The SP model must exist before super().__init__, which may call
        # vocab_size / get_vocab during special-token registration.
        self.vocab_file = vocab_file
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(vocab_file)

        if additional_special_tokens is None:
            additional_special_tokens = [
                "<|im_start|>",
                "<|im_end|>",
                "<|tool_call|>",
                "<|tool_result|>",
                "<|system|>",
                "<|user|>",
                "<|assistant|>",
            ]

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            additional_special_tokens=additional_special_tokens,
            chat_template=chat_template,
            **kwargs,
        )

    def __getstate__(self) -> Dict:
        """Make the tokenizer picklable despite the SWIG-backed SP model.

        The ``SentencePieceProcessor`` object cannot be pickled, so we drop
        it and carry the serialized model proto instead. Required for
        ``multiprocessing`` dataloader workers.
        """
        state = self.__dict__.copy()
        state["sp_model"] = None
        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
        return state

    def __setstate__(self, state: Dict) -> None:
        """Rebuild the SentencePiece processor from the serialized proto."""
        self.__dict__.update(state)
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)

    @property
    def vocab_size(self) -> int:
        """Size of the base SentencePiece vocabulary (excludes added tokens)."""
        return self.sp_model.GetPieceSize()

    def get_vocab(self) -> Dict[str, int]:
        """Return the full piece->id mapping, including added tokens."""
        vocab = {self.sp_model.IdToPiece(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text: str) -> List[str]:
        """Tokenize text into subword pieces using SentencePiece."""
        return self.sp_model.EncodeAsPieces(text)

    def _convert_token_to_id(self, token: str) -> int:
        """Convert a token string to its ID (unknown pieces map to unk_id)."""
        return self.sp_model.PieceToId(token)

    def _convert_id_to_token(self, index: int) -> str:
        """Convert an ID to its token string; out-of-range IDs yield unk."""
        if index < 0 or index >= self.vocab_size:
            return self.unk_token
        return self.sp_model.IdToPiece(index)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Convert a sequence of tokens to a single string."""
        return self.sp_model.DecodePieces(tokens)

    def save_vocabulary(
        self, save_directory: str, filename_prefix: Optional[str] = None
    ) -> Tuple[str]:
        """Save the SentencePiece model file into ``save_directory``.

        Copies the original model file when it is still present; otherwise
        (e.g. after unpickling on another machine) dumps the in-memory
        serialized model proto so saving never fails.

        Returns:
            A 1-tuple with the path of the written vocab file.
        """
        os.makedirs(save_directory, exist_ok=True)

        out_vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "")
            + self.vocab_files_names["vocab_file"],
        )

        if (
            os.path.isfile(self.vocab_file)
            and os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file)
        ):
            import shutil
            shutil.copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            # Source file is gone; reconstruct it from the loaded model.
            with open(out_vocab_file, "wb") as f:
                f.write(self.sp_model.serialized_model_proto())

        return (out_vocab_file,)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Build model inputs — no BOS/EOS added by default (matching training)."""
        if token_ids_1 is None:
            return token_ids_0
        return token_ids_0 + token_ids_1

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        """Return a 0/1 mask of special tokens.

        Since ``build_inputs_with_special_tokens`` adds nothing, the mask is
        all zeros unless the caller indicates tokens are already special.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )
        if token_ids_1 is None:
            return [0] * len(token_ids_0)
        return [0] * len(token_ids_0) + [0] * len(token_ids_1)

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Return segment IDs: 0 for the first sequence, 1 for the second."""
        if token_ids_1 is None:
            return [0] * len(token_ids_0)
        return [0] * len(token_ids_0) + [1] * len(token_ids_1)