sujitvasanth committed on
Commit
defb439
·
verified ·
1 Parent(s): f67774e

Upload 7 files

Browse files
config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/home/sujit/OpenCUA-7B",
3
+ "architectures": [
4
+ "VisionEncoderExtractor"
5
+ ],
6
+ "depth": 32,
7
+ "fullatt_block_indexes": [
8
+ 7,
9
+ 15,
10
+ 23,
11
+ 31
12
+ ],
13
+ "hidden_act": "silu",
14
+ "hidden_size": 1280,
15
+ "in_channels": 3,
16
+ "in_chans": 3,
17
+ "intermediate_size": 3420,
18
+ "model_type": "qwen2_5_vl",
19
+ "num_heads": 16,
20
+ "out_hidden_size": 3584,
21
+ "patch_size": 14,
22
+ "spatial_merge_size": 2,
23
+ "spatial_patch_size": 14,
24
+ "temporal_patch_size": 2,
25
+ "tokens_per_second": 2,
26
+ "torch_dtype": "float16",
27
+ "transformers_version": "4.49.0",
28
+ "window_size": 112
29
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0fe8d0dbefef249b93e035683ff29fab17c795b416b30580d6eb329222d491d0
3
+ size 1353143976
preprocessor_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "min_pixels": 3136,
3
+ "max_pixels": 12845056,
4
+ "patch_size": 14,
5
+ "temporal_patch_size": 2,
6
+ "merge_size": 2,
7
+ "image_mean": [
8
+ 0.48145466,
9
+ 0.4578275,
10
+ 0.40821073
11
+ ],
12
+ "image_std": [
13
+ 0.26862954,
14
+ 0.26130258,
15
+ 0.27577711
16
+ ],
17
+ "image_processor_type": "Qwen2VLImageProcessor"
18
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_end|>",
4
+ "<|im_user|>",
5
+ "<|im_assistant|>",
6
+ "<|reserved_token_0|>",
7
+ "<|start_header_id|>",
8
+ "<|end_header_id|>",
9
+ "<|reserved_token_1|>",
10
+ "[EOT]",
11
+ "<|im_system|>",
12
+ "<|reserved_token_2|>",
13
+ "<|reserved_token_3|>",
14
+ "<|reserved_token_4|>",
15
+ "<|reserved_token_5|>",
16
+ "<|reserved_token_6|>",
17
+ "<|reserved_token_7|>",
18
+ "<|im_middle|>",
19
+ "<|media_begin|>",
20
+ "<|media_content|>",
21
+ "<|media_end|>",
22
+ "<|media_placeholder|>"
23
+ ],
24
+ "bos_token": {
25
+ "content": "[BOS]",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ },
31
+ "eos_token": {
32
+ "content": "[EOS]",
33
+ "lstrip": false,
34
+ "normalized": false,
35
+ "rstrip": false,
36
+ "single_word": false
37
+ },
38
+ "pad_token": {
39
+ "content": "[PAD]",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false
44
+ },
45
+ "unk_token": {
46
+ "content": "[UNK]",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false
51
+ }
52
+ }
tiktoken.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2b1b8dfb5cc5f024bafc373121c6aba3f66f9a5a0269e243470a1de16a33186
3
+ size 2561218
tokenization_opencua.py ADDED
@@ -0,0 +1,367 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tiktoken
3
+
4
+ from logging import getLogger
5
+ from pathlib import Path
6
+ from typing import (
7
+ cast,
8
+ Tuple,
9
+ Dict,
10
+ Iterator,
11
+ List,
12
+ Union,
13
+ Optional,
14
+ )
15
+ from shutil import copyfile
16
+ from tiktoken.load import load_tiktoken_bpe
17
+ from tokenizers import AddedToken
18
+ from transformers.tokenization_utils import PreTrainedTokenizer
19
+ from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
20
+
21
+
22
+
23
# Module-level logger for load/save diagnostics emitted by the tokenizer below.
logger = getLogger(__name__)
# Maps the HF-conventional "vocab_file" key to the on-disk tiktoken BPE file name;
# consumed by PreTrainedTokenizer via the `vocab_files_names` class attribute.
VOCAB_FILES_NAMES = {"vocab_file": "tiktoken.model"}
25
+
26
class TikTokenTokenizer(PreTrainedTokenizer):
    """
    Tokenizing and encoding/decoding text using the Tiktoken tokenizer. See megatron/tokenizer/tiktoken_tokenizer.py.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    The tokenizer operates in one of two modes, selected by `additional_special_tokens`:

    * "dumping" mode (`additional_special_tokens is None`): the full special-token
      table is synthesized here — BOS, EOS, a fixed list of used special tokens,
      auto-generated ``<|reserved_token_i|>`` fillers, then ``[UNK]`` and ``[PAD]``
      appended at the end. `unk_token`/`pad_token` must NOT be passed in this mode.
    * "loading" mode (`additional_special_tokens` given): special-token contents are
      taken from `added_tokens_decoder` (id -> AddedToken), as produced by a saved
      ``tokenizer_config.json``.

    Args:
        vocab_file (`str`):
            The path to the Tiktoken model file.
        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[BOS]"`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[EOS]"`):
            The end of sequence token.
        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `None`):
            The unknown token. Must be left as `None` in dumping mode, where it is
            set to `"[UNK]"` automatically (second-to-last special token).
        pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `None`):
            The token used for padding. Must be left as `None` in dumping mode,
            where it is set to `"[PAD]"` automatically (last special token).
        additional_special_tokens (list of `str`, *optional*):
            A tuple or a list of additional tokens, which will be marked as `special`, meaning that they will be
            skipped when decoding if `skip_special_tokens` is set to `True`.
        added_tokens_decoder (`dict`, *optional*):
            Mapping of token id -> `AddedToken`; only consulted in loading mode to
            recover the content of each special-token id.
    """

    # Tells PreTrainedTokenizer which files save_vocabulary()/from_pretrained() handle.
    vocab_files_names = VOCAB_FILES_NAMES

    model_input_names = ["input_ids", "attention_mask"]

    # str -> id table for all special tokens; populated in __init__.
    special_tokens: Dict[str, int]

    # Total budget of special ids appended after the base BPE vocab:
    # BOS + EOS + additional_special_tokens + UNK + PAD.
    num_reserved_special_tokens = 256

    # Pre-tokenization split pattern handed to tiktoken.Encoding.
    # NOTE(review): `\p{Han}` and the `&&` character-class intersection are not
    # Python `re` syntax — this relies on tiktoken's Rust regex engine; do not
    # attempt to re.compile() this string.
    pat_str = "|".join(
        [
            r"""[\p{Han}]+""",
            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
            r"""\p{N}{1,3}""",
            r""" ?[^\s\p{L}\p{N}]+[\r\n]*""",
            r"""\s*[\r\n]+""",
            r"""\s+(?!\S)""",
            r"""\s+""",
        ]
    )

    def __init__(
        self,
        vocab_file,
        bos_token: Union[str, AddedToken]="[BOS]",
        eos_token: Union[str, AddedToken]="[EOS]",
        unk_token: Union[str, AddedToken, None]=None,
        pad_token: Union[str, AddedToken, None]=None,
        additional_special_tokens: List[str]=None,
        added_tokens_decoder: Optional[dict] = None,
        **kwargs,
    ):
        # NOTE(review): assert is stripped under `python -O`; a missing file would
        # then surface later inside load_tiktoken_bpe instead.
        assert os.path.isfile(vocab_file), vocab_file

        if additional_special_tokens is None:
            # dumping mode
            # The fixed set of special tokens actually used by the chat template
            # (see tokenizer_config.json); reserved_token_0..7 are interleaved
            # placeholders among them.
            used_special_tokens = [
                "<|im_end|>",
                "<|im_user|>",
                "<|im_assistant|>",
                "<|reserved_token_0|>",
                "<|start_header_id|>",
                "<|end_header_id|>",
                "<|reserved_token_1|>",
                "[EOT]",
                "<|im_system|>",
                "<|reserved_token_2|>",
                "<|reserved_token_3|>",
                "<|reserved_token_4|>",
                "<|reserved_token_5|>",
                "<|reserved_token_6|>",
                "<|reserved_token_7|>",
                "<|im_middle|>",
                "<|media_begin|>",
                "<|media_content|>",
                "<|media_end|>",
                "<|media_placeholder|>",
            ]
            # reserved_token_0..7 already appear in used_special_tokens; the filler
            # list below therefore starts at index 8.
            used_reserved_tokens = 8
            # Highest reserved-token index so that the grand total of
            # additional_special_tokens + BOS/EOS/UNK/PAD equals the reserved budget.
            last_reserved_token_id = self.num_reserved_special_tokens - 4 - len(used_special_tokens) + used_reserved_tokens - 1
            additional_special_tokens = used_special_tokens + [
                f"<|reserved_token_{i}|>"
                for i in range(used_reserved_tokens, last_reserved_token_id + 1)
            ]
            # num_reserved_special_tokens = additional_special_tokens + BOS + EOS + unk_token + pad_token
            assert len(additional_special_tokens) + 4 == self.num_reserved_special_tokens, f"additional_special_tokens num: {len(additional_special_tokens)} is not correct"
            # we assume that the instance is under initialization and unk_token and pad_token should be automatically inferred
            if unk_token is not None:
                raise ValueError("unk_token should not be set in dumping mode when additional_special_tokens is None")
            if pad_token is not None:
                raise ValueError("pad_token should not be set in dumping mode when additional_special_tokens is None")
            # last two reserved tokens
            unk_token = f"[UNK]"
            pad_token = f"[PAD]"

            logger.info(f"adding unk_token: {unk_token} and pad_token: {pad_token}")
            self.additional_special_tokens = additional_special_tokens
            # Order matters: ids are assigned by position, so BOS/EOS come first
            # and UNK/PAD land on the last two reserved ids.
            special_tokens = [str(bos_token), str(eos_token)] + additional_special_tokens + [str(unk_token), str(pad_token)]

            self.vocab_file = vocab_file
            mergeable_ranks = load_tiktoken_bpe(vocab_file)
            num_base_tokens = len(mergeable_ranks)
            # Special ids are appended contiguously after the base BPE vocab.
            self.special_tokens = {
                token: num_base_tokens + i for i, token in enumerate(special_tokens)
            }
        else:
            # loading mode: token contents come from the saved added_tokens_decoder.
            self.additional_special_tokens = additional_special_tokens
            special_tokens_mapping = {
                i: added_tokens_decoder[i].content for i in added_tokens_decoder
            }

            self.vocab_file = vocab_file
            mergeable_ranks = load_tiktoken_bpe(vocab_file)
            num_base_tokens = len(mergeable_ranks)
            # Ids not present in added_tokens_decoder fall back to a synthetic
            # <|reserved_token_{id}|> name.
            # NOTE(review): the `+ 2` widens the id range two slots past the
            # reserved budget — presumably to cover trailing [UNK]/[PAD] ids after
            # a gap in the saved config; confirm against tokenizer_config.json.
            self.special_tokens = {
                special_tokens_mapping.get(i, f"<|reserved_token_{i}|>"): i
                for i in range(
                    num_base_tokens, num_base_tokens + self.num_reserved_special_tokens + 2
                )
            }



        # The underlying tiktoken encoder: base BPE ranks + the special table above.
        self.model = tiktoken.Encoding(
            name=Path(vocab_file).name,
            pat_str=self.pat_str,
            mergeable_ranks=mergeable_ranks,
            special_tokens=self.special_tokens,
        )
        logger.info(f"Reloaded tiktoken model from {vocab_file}")

        self.n_words: int = self.model.n_vocab
        # BOS / EOS token IDs
        self.bos_id: int = self.special_tokens[str(bos_token)]
        self.eos_id: int = self.special_tokens[str(eos_token)]

        logger.info(
            f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
        )

        self.pad_id: int = self.special_tokens[str(pad_token)]
        self.unk_id: int = self.special_tokens[str(unk_token)]
        # GPT-2 style printable-unicode <-> byte tables, used to give every token
        # a lossless string form for the slow-tokenizer API.
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}

        # decoder: id -> printable string form of the token's raw bytes.
        self.decoder = {}
        for i in range(self.n_words):
            # Taken from https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee
            # latin-1 maps each byte 0-255 to the code point of the same value, so
            # ord(char) recovers the raw byte before byte_encoder remaps it.
            decoding = ''.join([
                self.byte_encoder[ord(char)] for char in
                self.model.decode_single_token_bytes(i).decode('latin-1')
            ])
            self.decoder[i] = decoding

        # encoder: inverse of decoder (string form -> id).
        self.encoder = {}
        for i in range(self.n_words):
            if i in self.decoder:
                self.encoder[self.decoder[i]] = i

        # Must run after the vocab tables exist: PreTrainedTokenizer.__init__ may
        # call back into _convert_token_to_id / get_vocab.
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            additional_special_tokens=self.additional_special_tokens,
            **kwargs,
        )
        self.all_special_ids_set = set(self.all_special_ids)

    def encode(
        self,
        text: str,
        allow_special_tokens = True,
        **kwargs
    ) -> List[int]:
        """
        Encodes a string into a list of token IDs.

        Args:
            text (str): The input string to be encoded.
            allow_special_tokens (bool): When True, special-token strings occurring
                in `text` are encoded to their special ids; when False they are
                tokenized as plain text.

        Returns:
            list[int]: A list of token IDs.
        """
        # If there are other args, we should call super().encode because there are a lot of code
        # to handle those args. supper().encode finally will call _tokenize and _convert_token_to_id.
        # NOTE: our encode method is not compatible with the super().encode method,
        # e.g. split_special_tokens' default is True in our encode method.
        if len(kwargs) > 0:
            logger.warning( f"Calling super().encode with {kwargs}" )
            return super().encode(text, **kwargs)

        assert type(text) is str

        # The tiktoken tokenizer can handle <=400k chars without
        # pyo3_runtime.PanicException.
        TIKTOKEN_MAX_ENCODE_CHARS = 400_000

        # https://github.com/openai/tiktoken/issues/195
        # Here we iterate over subsequences and split if we exceed the limit
        # of max consecutive non-whitespace or whitespace characters.
        MAX_NO_WHITESPACES_CHARS = 25_000

        texts = self.pre_tokenizer_process(text)

        all_substrs = []
        # NOTE(review): the loop variable deliberately shadows the `text` parameter;
        # the original value is no longer needed past this point.
        for text in texts:
            substrs = (
                substr
                for i in range(0, len(text), TIKTOKEN_MAX_ENCODE_CHARS)
                for substr in self._split_whitespaces_or_nonwhitespaces(
                    text[i: i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS
                )
            )
            all_substrs.extend(substrs)

        t: List[int] = []
        for substr in all_substrs:
            if allow_special_tokens:
                # Treat every registered special-token string as encodable.
                t.extend(
                    self.model.encode(
                        substr,
                        allowed_special="all",
                    )
                )
            else:
                # Empty disallowed set: special strings are encoded as ordinary text
                # instead of raising.
                t.extend(
                    self.model.encode(
                        substr,
                        disallowed_special=(),
                    )
                )

        return t

    def decode(
        self,
        token_ids: Union[int, List[int]],
        **kwargs
    ) -> str:
        """
        Decodes a list of token IDs into a string.

        Args:
            token_ids (int or List[int]): The token ID(s) to be decoded.

        Returns:
            str: The decoded string.
        """
        # If there are other args, we should call super().decode because there are a lot of code
        # to handle those args. supper().encode finally will call convert_tokens_to_string and _convert_id_to_token.
        if len(kwargs) > 0:
            return super().decode(token_ids, **kwargs)

        # Accept a bare id for convenience.
        if type(token_ids) is int:
            token_ids = [token_ids]

        return self.model.decode(cast(List[int], token_ids))

    @staticmethod
    def _split_whitespaces_or_nonwhitespaces(
        s: str, max_consecutive_slice_len: int
    ) -> Iterator[str]:
        """
        Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len`
        consecutive whitespaces or consecutive non-whitespaces.

        A split is emitted only when a single run (all-space or all-non-space)
        exceeds the limit; transitions between runs reset the counter without
        splitting, so yielded chunks may contain mixed content.
        """
        current_slice_len = 0
        # Empty string: the loop below won't run and the final yield emits "".
        current_slice_is_space = s[0].isspace() if len(s) > 0 else False
        slice_start = 0

        for i in range(len(s)):
            is_now_space = s[i].isspace()

            if current_slice_is_space ^ is_now_space:
                # Run type flipped: restart the counter, no split needed.
                current_slice_len = 1
                current_slice_is_space = is_now_space
            else:
                current_slice_len += 1
                if current_slice_len > max_consecutive_slice_len:
                    # Current run became too long: emit everything before s[i].
                    yield s[slice_start:i]
                    slice_start = i
                    current_slice_len = 1
        yield s[slice_start:]

    def pre_tokenizer_process(self, text: str) -> List[str]:
        """
        pre-tokenizes the input text into a list of tokens.
        This method is used to split the input text into smaller chunks for internal processing.

        Base implementation is the identity; subclasses may override.
        """
        return [text]


    """ ----- Below are the abstract methods required by PreTrainedTokenizer ----- """
    @property
    def vocab_size(self) -> int:
        # Total vocab including special tokens (tiktoken's n_vocab).
        return self.n_words

    def get_vocab(self) -> Dict[str, int]:
        # Printable-string-form -> id mapping built in __init__.
        return self.encoder

    def _tokenize(self, text: str, **kwargs) -> List[str]:
        # Encode to ids, then map each id to its printable string form.
        return [
            self.decoder[t]
            for t in self.encode(text)
        ]

    def _convert_token_to_id(self, token: str) -> int:
        # Unknown token strings map to the UNK id.
        return self.encoder.get(token, self.unk_id)

    def _convert_id_to_token(self, index: int) -> str:
        # NOTE(review): returns None (not an UNK string) for out-of-range ids.
        return self.decoder.get(index)

    @staticmethod
    def clean_up_tokenization(out_string: str) -> str:
        # Intentionally a no-op: the byte-level round trip is already lossless.
        return out_string

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        # Invert the byte_encoder mapping back to raw bytes, then decode as UTF-8
        # (invalid sequences become U+FFFD via 'replace').
        text = ''.join(tokens)
        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', 'replace')
        return text

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """Copy the tiktoken BPE file into `save_directory` and return its path."""
        if not os.path.isdir(save_directory):
            raise ValueError(f"vocabulary path ({save_directory}) should be a directory")
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        # Skip the copy when saving onto the same file.
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)
363
+
364
+
365
class TikTokenV3(TikTokenTokenizer):
    """Tokenizer variant with a larger reserved special-token budget and a
    simpler, intersection-free pre-tokenization pattern.

    This is the class referenced by ``auto_map.AutoTokenizer`` in
    ``tokenizer_config.json``; all behavior comes from ``TikTokenTokenizer``.
    """

    # 293 + 128 = 421 reserved slots (BOS/EOS/UNK/PAD + additional specials).
    num_reserved_special_tokens = 421

    # Split pattern passed to tiktoken.Encoding. Uses \p{...} classes, which are
    # handled by tiktoken's regex engine (not Python's `re`).
    pat_str = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
tokenizer_config.json ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "151643": {
4
+ "content": "[BOS]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "151644": {
12
+ "content": "[EOS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "151645": {
20
+ "content": "<|im_end|>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "151646": {
28
+ "content": "<|im_user|>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "151647": {
36
+ "content": "<|im_assistant|>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "151648": {
44
+ "content": "<|reserved_token_0|>",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "151649": {
52
+ "content": "<|start_header_id|>",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "151650": {
60
+ "content": "<|end_header_id|>",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "151651": {
68
+ "content": "<|reserved_token_1|>",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": true
74
+ },
75
+ "151652": {
76
+ "content": "[EOT]",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": true
82
+ },
83
+ "151653": {
84
+ "content": "<|im_system|>",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": true
90
+ },
91
+ "151654": {
92
+ "content": "<|reserved_token_2|>",
93
+ "lstrip": false,
94
+ "normalized": false,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": true
98
+ },
99
+ "151655": {
100
+ "content": "<|reserved_token_3|>",
101
+ "lstrip": false,
102
+ "normalized": false,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": true
106
+ },
107
+ "151656": {
108
+ "content": "<|reserved_token_4|>",
109
+ "lstrip": false,
110
+ "normalized": false,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": true
114
+ },
115
+ "151657": {
116
+ "content": "<|reserved_token_5|>",
117
+ "lstrip": false,
118
+ "normalized": false,
119
+ "rstrip": false,
120
+ "single_word": false,
121
+ "special": true
122
+ },
123
+ "151658": {
124
+ "content": "<|reserved_token_6|>",
125
+ "lstrip": false,
126
+ "normalized": false,
127
+ "rstrip": false,
128
+ "single_word": false,
129
+ "special": true
130
+ },
131
+ "151659": {
132
+ "content": "<|reserved_token_7|>",
133
+ "lstrip": false,
134
+ "normalized": false,
135
+ "rstrip": false,
136
+ "single_word": false,
137
+ "special": true
138
+ },
139
+ "151660": {
140
+ "content": "<|im_middle|>",
141
+ "lstrip": false,
142
+ "normalized": false,
143
+ "rstrip": false,
144
+ "single_word": false,
145
+ "special": true
146
+ },
147
+ "151661": {
148
+ "content": "<|media_begin|>",
149
+ "lstrip": false,
150
+ "normalized": false,
151
+ "rstrip": false,
152
+ "single_word": false,
153
+ "special": true
154
+ },
155
+ "151662": {
156
+ "content": "<|media_content|>",
157
+ "lstrip": false,
158
+ "normalized": false,
159
+ "rstrip": false,
160
+ "single_word": false,
161
+ "special": true
162
+ },
163
+ "151663": {
164
+ "content": "<|media_end|>",
165
+ "lstrip": false,
166
+ "normalized": false,
167
+ "rstrip": false,
168
+ "single_word": false,
169
+ "special": true
170
+ },
171
+ "151664": {
172
+ "content": "<|media_placeholder|>",
173
+ "lstrip": false,
174
+ "normalized": false,
175
+ "rstrip": false,
176
+ "single_word": false,
177
+ "special": true
178
+ },
179
+ "152062": {
180
+ "content": "[UNK]",
181
+ "lstrip": false,
182
+ "normalized": false,
183
+ "rstrip": false,
184
+ "single_word": false,
185
+ "special": true
186
+ },
187
+ "152063": {
188
+ "content": "[PAD]",
189
+ "lstrip": false,
190
+ "normalized": false,
191
+ "rstrip": false,
192
+ "single_word": false,
193
+ "special": true
194
+ }
195
+ },
196
+ "additional_special_tokens": [
197
+ "<|im_end|>",
198
+ "<|im_user|>",
199
+ "<|im_assistant|>",
200
+ "<|reserved_token_0|>",
201
+ "<|start_header_id|>",
202
+ "<|end_header_id|>",
203
+ "<|reserved_token_1|>",
204
+ "[EOT]",
205
+ "<|im_system|>",
206
+ "<|reserved_token_2|>",
207
+ "<|reserved_token_3|>",
208
+ "<|reserved_token_4|>",
209
+ "<|reserved_token_5|>",
210
+ "<|reserved_token_6|>",
211
+ "<|reserved_token_7|>",
212
+ "<|im_middle|>",
213
+ "<|media_begin|>",
214
+ "<|media_content|>",
215
+ "<|media_end|>",
216
+ "<|media_placeholder|>"
217
+ ],
218
+ "auto_map": {
219
+ "AutoTokenizer": "tokenization_opencua.TikTokenV3"
220
+ },
221
+ "bos_token": "[BOS]",
222
+ "chat_template": "{%- for message in messages -%}{%- if loop.first and messages[0]['role'] != 'system' -%}{{'<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>'}}{%- endif -%}{%- if message['role'] == 'system' -%}{{'<|im_system|>'}}{%- endif -%}{%- if message['role'] == 'user' -%}{{'<|im_user|>'}}{%- endif -%}{%- if message['role'] == 'assistant' -%}{{'<|im_assistant|>'}}{%- endif -%}{{- message['role'] -}}{{'<|im_middle|>'}}{%- if message['content'] is string -%}{{- message['content'] + '<|im_end|>' -}}{%- else -%}{%- for content in message['content'] -%}{%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}{{'<|media_begin|>image<|media_content|><|media_placeholder|><|media_end|>'}}{%- else -%}{{content['text']}}{%- endif -%}{%- endfor -%}{{'<|im_end|>'}}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{'<|im_assistant|>assistant<|im_middle|>'}}{%- endif -%}",
223
+ "clean_up_tokenization_spaces": false,
224
+ "eos_token": "[EOS]",
225
+ "extra_special_tokens": {},
226
+ "model_max_length": 1000000000000000019884624838656,
227
+ "pad_token": "[PAD]",
228
+ "tokenizer_class": "TikTokenV3",
229
+ "unk_token": "[UNK]"
230
+ }