#!/usr/bin/env python
# coding=utf-8
# Copyright 2023 The Spec-2 Authors
# Licensed under the Apache License, Version 2.0 (the "License")

"""Tokenizer for Spec-2 model"""

import json
import os
import shutil
from typing import Dict, List, Optional, Union

from transformers import PreTrainedTokenizer
from transformers.utils import is_sentencepiece_available, logging

if is_sentencepiece_available():
    import sentencepiece as spm
else:
    raise ImportError(
        "You need to install sentencepiece to use Spec2Tokenizer "
        "(https://github.com/google/sentencepiece): `pip install sentencepiece`."
    )

logger = logging.get_logger(__name__)


class Spec2Tokenizer(PreTrainedTokenizer):
    """
    Construct a Spec-2 tokenizer based on SentencePiece.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file generated by SentencePiece.
        additional_special_tokens (`List[str]`, *optional*):
            Additional special tokens used by the tokenizer.
        bos_token (`str`, *optional*, defaults to `"<bos>"`):
            The beginning of sequence token that was used during pretraining.
        eos_token (`str`, *optional*, defaults to `"<eos>"`):
            The end of sequence token.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        sp_model_kwargs (`dict`, *optional*):
            Arguments to be passed to the SentencePiece model initialization method.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
            Whether or not to clean up the tokenization spaces.
        use_default_system_prompt (`bool`, *optional*, defaults to `False`):
            Whether or not to use the default system prompt.
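
    Example (a minimal sketch; assumes a trained SentencePiece model saved as
    `tokenizer.model`, and the path and sample text are illustrative):

    ```python
    >>> tokenizer = Spec2Tokenizer(vocab_file="tokenizer.model")
    >>> ids = tokenizer("Hello world")["input_ids"]
    >>> text = tokenizer.decode(ids, skip_special_tokens=True)
    ```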
    """
    
    vocab_files_names = {"vocab_file": "tokenizer.model"}
    model_input_names = ["input_ids", "attention_mask"]
    
    def __init__(
        self,
        vocab_file,
        additional_special_tokens=None,
        bos_token="<bos>",
        eos_token="<eos>",
        unk_token="<unk>",
        pad_token="<pad>",
        sp_model_kwargs: Optional[Dict[str, str]] = None,
        clean_up_tokenization_spaces=True,
        use_default_system_prompt=False,
        **kwargs,
    ):
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        self.vocab_file = vocab_file
        
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)
        
        # Special tokens for this tokenizer. Note that `special_tokens_map` is
        # a read-only property on `PreTrainedTokenizer`, so we keep a local
        # dict under a different name rather than shadowing it.
        spec2_special_tokens = {
            "bos_token": bos_token,
            "eos_token": eos_token,
            "unk_token": unk_token,
            "pad_token": pad_token,
        }

        # Add additional special tokens
        self._additional_special_tokens = list(additional_special_tokens) if additional_special_tokens else []

        self.use_default_system_prompt = use_default_system_prompt
        self.clean_up_tokenization_spaces = clean_up_tokenization_spaces

        # Token ids for the special tokens. The base class already exposes
        # `bos_token_id`, `eos_token_id`, etc. as properties, so we only keep
        # a local lookup table here instead of setting those attributes
        # directly (their property setters would run before
        # `super().__init__()` has initialized the required state).
        self.special_token_ids = {
            name: self.sp_model.piece_to_id(token) for name, token in spec2_special_tokens.items()
        }
            
        # Load additional special token mappings if available
        self.vocab_mapping = {}
        vocab_mapping_file = os.path.join(os.path.dirname(vocab_file), "tokenizer_config.json")
        if os.path.exists(vocab_mapping_file):
            with open(vocab_mapping_file, "r", encoding="utf-8") as f:
                config = json.load(f)
                if "vocab_mapping" in config:
                    self.vocab_mapping = config["vocab_mapping"]

        # Initialize PreTrainedTokenizer
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            additional_special_tokens=self._additional_special_tokens,
            **kwargs,
        )
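
    # Pickling support (a common pattern for SentencePiece-based tokenizers,
    # sketched here since `SentencePieceProcessor` objects are not reliably
    # picklable across versions): drop the processor on serialization and
    # reload it from `vocab_file` on deserialization.
    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(self.vocab_file)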
        
    @property
    def vocab_size(self):
        """Return the size of vocabulary."""
        return self.sp_model.get_piece_size()
        
    def get_vocab(self):
        """Return vocab as a dict."""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab
        
    def _tokenize(self, text):
        """Tokenize a string."""
        return self.sp_model.encode(text, out_type=str)
        
    def _convert_token_to_id(self, token):
        """Convert a token to an id using the vocab."""
        return self.sp_model.piece_to_id(token)
        
    def _convert_id_to_token(self, index):
        """Convert an id to a token using the vocab."""
        if index in self.added_tokens_decoder:
            # Recent transformers versions store `AddedToken` objects here;
            # normalize to a plain string.
            return str(self.added_tokens_decoder[index])
        if index >= self.sp_model.get_piece_size():
            # Out-of-range ids may be covered by the extra vocab mapping
            # loaded from tokenizer_config.json (JSON keys are strings).
            info = self.vocab_mapping.get(str(index))
            return info["content"] if info is not None else self.unk_token
        return self.sp_model.id_to_piece(index)
        
    def convert_tokens_to_string(self, tokens):
        """Convert a list of tokens back to a string."""
        # Splice added special tokens (unknown to the SentencePiece model)
        # in verbatim; space cleanup is applied by the base class's `decode`.
        current_sub_tokens, out_string = [], ""
        for token in tokens:
            if token in self.all_special_tokens:
                out_string += self.sp_model.decode(current_sub_tokens) + token
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
        return out_string + self.sp_model.decode(current_sub_tokens)
        
    def save_vocabulary(self, save_directory, filename_prefix=None):
        """Save the vocabulary to a directory."""
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return

        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            shutil.copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            # Fall back to serializing the in-memory SentencePiece model.
            with open(out_vocab_file, "wb") as fi:
                fi.write(self.sp_model.serialized_model_proto())
                
        # Save tokenizer config with vocab mapping
        config_file = os.path.join(save_directory, "tokenizer_config.json")
        tokenizer_config = {
            "vocab_file": self.vocab_files_names["vocab_file"],
            "bos_token": self.bos_token,
            "eos_token": self.eos_token,
            "unk_token": self.unk_token,
            "pad_token": self.pad_token,
            "additional_special_tokens": self._additional_special_tokens,
            "clean_up_tokenization_spaces": self.clean_up_tokenization_spaces,
            "use_default_system_prompt": self.use_default_system_prompt,
            "sp_model_kwargs": self.sp_model_kwargs,
            "tokenizer_class": "Spec2Tokenizer",
            "vocab_mapping": self.vocab_mapping
        }
        
        with open(config_file, "w", encoding="utf-8") as f:
            json.dump(tokenizer_config, f, indent=2)
            
        return (out_vocab_file, config_file)
        
    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Build model inputs from a sequence by appending eos_token_id."""
        if token_ids_1 is None:
            return [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
        return [self.bos_token_id] + token_ids_0 + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]
        
    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences.
        """
        eos = [self.eos_token_id]
        bos = [self.bos_token_id]

        if token_ids_1 is None:
            return len(bos + token_ids_0 + eos) * [0]
        return len(bos + token_ids_0 + eos + token_ids_1 + eos) * [0]

    def prepare_for_model(
        self,
        ids: List[int],
        pair_ids: Optional[List[int]] = None,
        add_special_tokens: bool = True,
        **kwargs
    ):
        """
        Prepare inputs for the model.
        """
        return super().prepare_for_model(
            ids, pair_ids, add_special_tokens=add_special_tokens, **kwargs
        )
        
    def prepare_seq2seq_batch(
        self,
        src_texts: Union[str, List[str]],
        tgt_texts: Optional[Union[str, List[str]]] = None,
        **kwargs
    ):
        """
        Prepare a batch for sequence-to-sequence tasks.
        """
        return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
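

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only; assumes a trained SentencePiece
    # model file named "tokenizer.model" exists in the working directory, and
    # the sample text is arbitrary).
    tokenizer = Spec2Tokenizer(vocab_file="tokenizer.model")
    encoded = tokenizer("Hello, Spec-2!")
    print(encoded["input_ids"])
    print(tokenizer.convert_ids_to_tokens(encoded["input_ids"]))
    print(tokenizer.decode(encoded["input_ids"], skip_special_tokens=True))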