erfanzar committed
Commit edea10e · verified · 1 Parent(s): 65399b9

Upload processor

chat_template.jinja ADDED
@@ -0,0 +1,31 @@
+ {%- for message in messages -%}
+     {%- if loop.first and messages[0]['role'] != 'system' -%}
+         {{'<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>'}}
+     {%- endif -%}
+     {%- if message['role'] == 'system' -%}
+         {{'<|im_system|>'}}
+     {%- endif -%}
+     {%- if message['role'] == 'user' -%}
+         {{'<|im_user|>'}}
+     {%- endif -%}
+     {%- if message['role'] == 'assistant' -%}
+         {{'<|im_assistant|>'}}
+     {%- endif -%}
+     {{- message['role'] -}}
+     {{'<|im_middle|>'}}
+     {%- if message['content'] is string -%}
+         {{- message['content'] + '<|im_end|>' -}}
+     {%- else -%}
+         {%- for content in message['content'] -%}
+             {%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}
+                 {{'<|media_start|>image<|media_content|><|media_pad|><|media_end|>'}}
+             {%- else -%}
+                 {{content['text']}}
+             {%- endif -%}
+         {%- endfor -%}
+         {{'<|im_end|>'}}
+     {%- endif -%}
+ {%- endfor -%}
+ {%- if add_generation_prompt -%}
+     {{'<|im_assistant|>assistant<|im_middle|>'}}
+ {%- endif -%}
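
For orientation, a minimal sketch of how this template renders once the processor is loaded. The local path ./kimi-vl is illustrative, and trust_remote_code=True is needed because the processor classes live in this repo:

    from transformers import AutoProcessor

    # Hypothetical local checkout of this repo; any checkpoint shipping this chat_template.jinja works.
    processor = AutoProcessor.from_pretrained("./kimi-vl", trust_remote_code=True)

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "Describe this image."},
            ],
        },
    ]

    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    print(prompt)
    # One line, shown wrapped: a default system turn is injected first, the image item
    # becomes <|media_start|>image<|media_content|><|media_pad|><|media_end|>, and
    # add_generation_prompt appends <|im_assistant|>assistant<|im_middle|>.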
image_processing_kimi_vl.py ADDED
@@ -0,0 +1,126 @@
+ """Image processor class for KimiVL."""
+ 
+ import math
+ import numpy as np
+ from PIL import Image
+ from typing import Optional, Union
+ 
+ import torch
+ from torchvision.transforms import functional as TF
+ from transformers.image_utils import ImageInput, make_list_of_images, valid_images
+ from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
+ from transformers.utils import TensorType
+ 
+ 
+ OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
+ OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
+ 
+ 
+ class KimiVLImageProcessor(BaseImageProcessor):
+     model_type = "kimi_vl"
+ 
+     def __init__(
+         self,
+         patch_size: int = 14,
+         pad_input: bool = False,
+         image_mean: tuple[float, float, float] = OPENAI_DATASET_MEAN,
+         image_std: tuple[float, float, float] = OPENAI_DATASET_STD,
+         in_token_limit: int = 4096,
+         merge_kernel_size: tuple[int, int] = (2, 2),
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+         self.in_token_limit = in_token_limit
+         self.patch_size = patch_size
+         self.pad_input = pad_input
+         self.image_mean = image_mean
+         self.image_std = image_std
+         self.merge_kernel_size = merge_kernel_size
+ 
+     def rescale(
+         self, image: Image.Image, merge_kernel_size: tuple[int, int] = (2, 2)
+     ) -> Image.Image:
+         w, h = image.size
+         patch_size = self.patch_size
+ 
+         # Downscale so the patch count stays within the token budget.
+         if (w // patch_size) * (h // patch_size) > self.in_token_limit:
+             scale = math.sqrt(self.in_token_limit / ((w // patch_size) * (h // patch_size)))
+             new_w, new_h = int(w * scale), int(h * scale)
+             image = image.resize((new_w, new_h), Image.Resampling.BICUBIC)
+         if self.pad_input:
+             new_w, new_h = image.size
+             pad_size_h = merge_kernel_size[0] * patch_size
+             pad_size_w = merge_kernel_size[1] * patch_size
+ 
+             pad_h = (pad_size_h - new_h % pad_size_h) % pad_size_h
+             pad_w = (pad_size_w - new_w % pad_size_w) % pad_size_w
+ 
+             image = TF.pad(image, (0, 0, pad_w, pad_h))
+         else:
+             new_w, new_h = image.size
+             new_w = new_w - new_w % patch_size
+             new_h = new_h - new_h % patch_size
+             image = TF.center_crop(image, (new_h, new_w))
+ 
+         w, h = image.size
+         if w // patch_size >= 512 or h // patch_size >= 512:
+             raise ValueError("Image is too large: the patch grid exceeds the 512x512 position-embedding limit.")
+ 
+         return image
+ 
+     def to_tensor(self, image: Image.Image) -> torch.Tensor:
+         return TF.to_tensor(image.convert("RGB"))
+ 
+     def normalize(self, image: torch.Tensor) -> torch.Tensor:
+         return TF.normalize(image, self.image_mean, self.image_std)
+ 
+     def patchify(self, image: torch.Tensor) -> tuple[torch.Tensor, tuple[int, int]]:
+         patch_size = self.patch_size
+         C, H, W = image.shape
+         # (C, H, W) -> (num_patches, C, patch_size, patch_size)
+         patches = image.reshape(C, H // patch_size, patch_size, W // patch_size, patch_size)
+         patches = patches.permute(1, 3, 0, 2, 4)
+         patches = patches.contiguous().view(-1, C, patch_size, patch_size)
+         grid_hw = (H // patch_size, W // patch_size)
+         return patches, grid_hw
+ 
+     def _preprocess(self, image: ImageInput) -> tuple[torch.Tensor, tuple[int, int]]:
+         """
+         Preprocess an image and patchify it.
+ 
+         Args:
+             image (`ImageInput`):
+                 Image to preprocess. Expected to be a `PIL.Image.Image` with pixel values in the [0, 255] range.
+ 
+         Returns:
+             patches: torch.Tensor
+             grid_hw: tuple[int, int]
+         """
+         image = self.rescale(image, self.merge_kernel_size)
+         image = self.to_tensor(image)
+         image = self.normalize(image)
+         patches, grid_hw = self.patchify(image)
+         return patches, grid_hw
+ 
+     def preprocess(
+         self,
+         images: ImageInput,
+         return_tensors: Optional[Union[str, TensorType]] = None,
+     ) -> BatchFeature:
+         images = make_list_of_images(images)
+ 
+         if not valid_images(images):
+             raise ValueError(
+                 "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                 "torch.Tensor, tf.Tensor or jax.ndarray."
+             )
+ 
+         pixel_values, image_grid_hws = [], []
+         for image in images:
+             patches, image_grid_hw = self._preprocess(image)
+             pixel_values.append(patches)
+             image_grid_hws.append(image_grid_hw)
+         pixel_values = torch.concat(pixel_values, dim=0)
+         image_grid_hws = np.array(image_grid_hws)
+         data = {"pixel_values": pixel_values, "image_grid_hws": image_grid_hws}
+ 
+         return BatchFeature(data=data, tensor_type=return_tensors)
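
A hedged usage sketch of the image processor above; the image path is a placeholder, and pad_input=True mirrors preprocessor_config.json:

    from PIL import Image
    from image_processing_kimi_vl import KimiVLImageProcessor

    ip = KimiVLImageProcessor(pad_input=True)
    out = ip.preprocess(Image.open("example.png"), return_tensors="pt")

    # pixel_values: (num_patches, 3, 14, 14); image_grid_hws: one (H/14, W/14) row per image.
    print(out["pixel_values"].shape, out["image_grid_hws"])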
preprocessor_config.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "auto_map": {
+     "AutoImageProcessor": "image_processing_kimi_vl.KimiVLImageProcessor",
+     "AutoProcessor": "processing_kimi_vl.KimiVLProcessor"
+   },
+   "image_mean": [
+     0.5,
+     0.5,
+     0.5
+   ],
+   "image_processor_type": "KimiVLImageProcessor",
+   "image_std": [
+     0.5,
+     0.5,
+     0.5
+   ],
+   "in_token_limit": 4096,
+   "merge_kernel_size": [
+     2,
+     2
+   ],
+   "num_pooled_tokens": 1024,
+   "pad_input": true,
+   "patch_size": 14,
+   "processor_class": "KimiVLProcessor"
+ }
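
One practical consequence of these values, sketched below with illustrative numbers: with patch_size 14 and a 2x2 merge kernel, a 448x448 input (already a multiple of 28, so padding is a no-op) produces a 32x32 patch grid that the merge reduces to 256 <|media_pad|> tokens, matching the grid_hw.prod() // merge_length expansion in processing_kimi_vl.py.

    # Illustrative arithmetic only; the constants mirror preprocessor_config.json.
    patch_size = 14
    merge_kernel_size = (2, 2)

    h, w = 448, 448                                    # example padded resolution
    grid_h, grid_w = h // patch_size, w // patch_size  # 32 x 32 patch grid
    merge_length = merge_kernel_size[0] * merge_kernel_size[1]

    num_patches = grid_h * grid_w                      # 1024 ViT patches
    num_media_tokens = num_patches // merge_length     # 256 <|media_pad|> tokens
    print(num_patches, num_media_tokens)               # 1024 256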
processing_kimi_vl.py ADDED
@@ -0,0 +1,167 @@
+ # coding=utf-8
+ # Copyright 2025 The Moonshot Team and HuggingFace Inc. team. All rights reserved.
+ #
+ # The code is based on the Qwen2VL processor (qwen2_vl/processing_qwen2_vl.py), but modified for KimiVL.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """
+ Processor class for KimiVL.
+ """
+ 
+ from typing import List, Union
+ 
+ from transformers.feature_extraction_utils import BatchFeature
+ from transformers.image_utils import ImageInput
+ from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
+ from transformers.utils import logging
+ 
+ 
+ logger = logging.get_logger(__name__)
+ 
+ 
+ class KimiVLProcessorKwargs(ProcessingKwargs, total=False):
+     _defaults = {
+         "text_kwargs": {
+             "padding": False,
+         },
+         "images_kwargs": {},
+     }
+ 
+ 
+ class KimiVLProcessor(ProcessorMixin):
+     r"""
+     Constructs a KimiVL processor which wraps a KimiVL image processor and a tokenizer into a single processor.
+ 
+     [`KimiVLProcessor`] offers all the functionalities of [`KimiVLImageProcessor`] and [`TikTokenTokenizer`]. See the
+     [`~KimiVLProcessor.__call__`] and [`~KimiVLProcessor.decode`] for more information.
+ 
+     Args:
+         image_processor ([`KimiVLImageProcessor`], *optional*):
+             The image processor is a required input.
+         tokenizer ([`TikTokenTokenizer`], *optional*):
+             The tokenizer is a required input.
+         chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
+             in a chat into a tokenizable string.
+     """
+ 
+     attributes = ["image_processor", "tokenizer"]
+     valid_kwargs = ["chat_template"]
+     image_processor_class = "AutoImageProcessor"
+     tokenizer_class = "AutoTokenizer"
+ 
+     def __init__(
+         self,
+         image_processor=None,
+         tokenizer=None,
+         chat_template=None,
+         **kwargs,
+     ):
+         self.image_token = "<|media_pad|>"
+         super().__init__(image_processor, tokenizer, chat_template=chat_template)
+ 
+     def __call__(
+         self,
+         images: ImageInput = None,
+         text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
+         **kwargs: Unpack[KimiVLProcessorKwargs],
+     ) -> BatchFeature:
+         """
+         Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text`
+         and `kwargs` arguments to TikTokenTokenizer's [`~TikTokenTokenizer.__call__`] if `text` is not `None` to encode
+         the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
+         KimiVLImageProcessor's [`~KimiVLImageProcessor.__call__`] if `images` is not `None`. Please refer to the
+         docstring of the above two methods for more information.
+ 
+         Args:
+             images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                 The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                 tensor. Both channels-first and channels-last formats are supported.
+             text (`str`, `List[str]`, `List[List[str]]`):
+                 The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                 (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                 `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+             return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                 If set, will return tensors of a particular framework. Acceptable values are:
+                     - `'tf'`: Return TensorFlow `tf.constant` objects.
+                     - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                     - `'np'`: Return NumPy `np.ndarray` objects.
+                     - `'jax'`: Return JAX `jnp.ndarray` objects.
+ 
+         Returns:
+             [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+ 
+             - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+             - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+               `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+               `None`).
+             - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+         """
+         if images is None and text is None:
+             raise ValueError("You have to specify at least one of `images` or `text`.")
+ 
+         output_kwargs = self._merge_kwargs(
+             KimiVLProcessorKwargs,
+             tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+             **kwargs,
+         )
+         if images is not None:
+             image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
+             image_grid_hws = image_inputs["image_grid_hws"]
+         else:
+             image_inputs = {}
+             image_grid_hws = None
+ 
+         if isinstance(text, str):
+             text = [text]
+         elif not isinstance(text, list) or not isinstance(text[0], str):
+             raise ValueError("Invalid input text. Please provide a string, or a list of strings.")
+ 
+         if image_grid_hws is not None:
+             merge_length = self.image_processor.merge_kernel_size[0] * self.image_processor.merge_kernel_size[1]
+             index = 0
+             for i in range(len(text)):
+                 while self.image_token in text[i]:
+                     # Each image placeholder is expanded to one token per merged patch block.
+                     text[i] = text[i].replace(
+                         self.image_token,
+                         "<|placeholder|>" * (image_grid_hws[index].prod() // merge_length),
+                         1,
+                     )
+                     index += 1
+                 text[i] = text[i].replace("<|placeholder|>", self.image_token)
+ 
+         text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
+         return BatchFeature(data={**text_inputs, **image_inputs})
+ 
+     def batch_decode(self, *args, **kwargs):
+         """
+         This method forwards all its arguments to TikTokenTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
+         refer to the docstring of this method for more information.
+         """
+         return self.tokenizer.batch_decode(*args, **kwargs)
+ 
+     def decode(self, *args, **kwargs):
+         """
+         This method forwards all its arguments to TikTokenTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
+         the docstring of this method for more information.
+         """
+         return self.tokenizer.decode(*args, **kwargs)
+ 
+     @property
+     def model_input_names(self):
+         tokenizer_input_names = self.tokenizer.model_input_names
+         image_processor_input_names = self.image_processor.model_input_names
+         return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+ 
+ 
+ __all__ = ["KimiVLProcessor", "KimiVLProcessorKwargs"]
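
A hedged end-to-end sketch of the processor; the checkpoint path and image file are placeholders:

    from PIL import Image
    from transformers import AutoProcessor

    processor = AutoProcessor.from_pretrained("./kimi-vl", trust_remote_code=True)

    image = Image.open("example.png")
    text = (
        "<|im_user|>user<|im_middle|>"
        "<|media_start|>image<|media_content|><|media_pad|><|media_end|>"
        "What is shown here?<|im_end|>"
        "<|im_assistant|>assistant<|im_middle|>"
    )

    # The single <|media_pad|> is expanded to grid_hw.prod() // 4 copies
    # (one per merged 2x2 patch block) before tokenization.
    inputs = processor(images=image, text=text, return_tensors="pt")
    print(inputs["input_ids"].shape, inputs["pixel_values"].shape)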
processor_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "auto_map": {
+     "AutoProcessor": "processing_kimi_vl.KimiVLProcessor"
+   },
+   "processor_class": "KimiVLProcessor"
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,41 @@
+ {
+   "additional_special_tokens": [
+     "<|im_end|>",
+     "<|im_user|>",
+     "<|im_assistant|>",
+     "<|im_system|>",
+     "<|im_middle|>",
+     "<|media_start|>",
+     "<|media_content|>",
+     "<|media_end|>",
+     "<|media_pad|>"
+   ],
+   "bos_token": {
+     "content": "[BOS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "[EOS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "[UNK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tiktoken.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b6c497a7469b33ced9c38afb1ad6e47f03f5e5dc05f15930799210ec050c5103
+ size 2795286
tokenization_moonshot.py ADDED
@@ -0,0 +1,302 @@
+ import os
+ import tiktoken
+ 
+ from logging import getLogger
+ from pathlib import Path
+ from typing import (
+     cast,
+     Tuple,
+     Dict,
+     Iterator,
+     List,
+     Union,
+     Optional,
+ )
+ from shutil import copyfile
+ from tiktoken.load import load_tiktoken_bpe
+ from tokenizers import AddedToken
+ from transformers.tokenization_utils import PreTrainedTokenizer
+ from transformers.utils import to_py_obj
+ from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
+ 
+ 
+ logger = getLogger(__name__)
+ VOCAB_FILES_NAMES = {"vocab_file": "tiktoken.model"}
+ SPIECE_UNDERLINE = "▁"
+ 
+ 
+ class TikTokenTokenizer(PreTrainedTokenizer):
+     """
+     Tokenizing and encoding/decoding text using the Tiktoken tokenizer. See megatron/tokenizer/tiktoken_tokenizer.py.
+ 
+     This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+     this superclass for more information regarding those methods.
+ 
+     Args:
+         vocab_file (`str`):
+             The path to the Tiktoken model file.
+         bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[BOS]"`):
+             The beginning-of-sequence token that was used during pretraining. Can be used as a sequence classifier token.
+         eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[EOS]"`):
+             The end-of-sequence token.
+         unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[UNK]"`):
+             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+             token instead.
+         pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[PAD]"`):
+             The token used for padding, for example when batching sequences of different lengths.
+         additional_special_tokens (list of `str`, *optional*):
+             A tuple or a list of additional tokens, which will be marked as `special`, meaning that they will be
+             skipped when decoding if `skip_special_tokens` is set to `True`.
+     """
+ 
+     vocab_files_names = VOCAB_FILES_NAMES
+ 
+     model_input_names = ["input_ids", "attention_mask"]
+ 
+     special_tokens: Dict[str, int]
+ 
+     num_reserved_special_tokens = 256
+ 
+     pat_str = "|".join(
+         [
+             r"""[\p{Han}]+""",
+             r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
+             r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
+             r"""\p{N}{1,3}""",
+             r""" ?[^\s\p{L}\p{N}]+[\r\n]*""",
+             r"""\s*[\r\n]+""",
+             r"""\s+(?!\S)""",
+             r"""\s+""",
+         ]
+     )
+ 
+     def __init__(
+         self,
+         vocab_file,
+         bos_token: Union[str, AddedToken] = "[BOS]",
+         eos_token: Union[str, AddedToken] = "[EOS]",
+         unk_token: Union[str, AddedToken] = "[UNK]",
+         pad_token: Union[str, AddedToken] = "[PAD]",
+         additional_special_tokens: Optional[List[str]] = None,
+         added_tokens_decoder: Optional[dict] = None,
+         **kwargs,
+     ):
+         assert os.path.isfile(vocab_file), vocab_file
+         if additional_special_tokens is None:
+             additional_special_tokens = [
+                 "<|im_end|>",
+                 "<|im_middle|>",
+                 "<|im_user|>",
+                 "<|im_assistant|>",
+                 "<|im_system|>",
+             ]
+         if added_tokens_decoder is None:
+             # The id -> content mapping for named special tokens normally comes from
+             # `added_tokens_decoder` in tokenizer_config.json.
+             added_tokens_decoder = {}
+         special_tokens_mapping = {
+             i: added_tokens_decoder[i].content for i in added_tokens_decoder
+         }
+ 
+         self.vocab_file = vocab_file
+         mergeable_ranks = load_tiktoken_bpe(vocab_file)
+         num_base_tokens = len(mergeable_ranks)
+         self.special_tokens = {
+             special_tokens_mapping.get(i, f"<|reserved_token_{i}|>"): i
+             for i in range(
+                 num_base_tokens, num_base_tokens + self.num_reserved_special_tokens + 2
+             )
+         }
+ 
+         self.model = tiktoken.Encoding(
+             name=Path(vocab_file).name,
+             pat_str=self.pat_str,
+             mergeable_ranks=mergeable_ranks,
+             special_tokens=self.special_tokens,
+         )
+ 
+         self.n_words: int = self.model.n_vocab
+         # BOS / EOS token IDs
+         self.bos_id: int = self.special_tokens[str(bos_token)]
+         self.eos_id: int = self.special_tokens[str(eos_token)]
+ 
+         self.pad_id: int = self.special_tokens[str(pad_token)]
+         self.unk_id: int = self.special_tokens[str(unk_token)]
+ 
+         self.byte_encoder = bytes_to_unicode()
+         self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+ 
+         self.decoder = {}
+         for i in range(self.n_words):
+             # Taken from https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee
+             decoding = "".join(
+                 [
+                     self.byte_encoder[ord(char)]
+                     for char in self.model.decode_single_token_bytes(i).decode(
+                         "latin-1"
+                     )
+                 ]
+             )
+             self.decoder[i] = decoding
+ 
+         self.encoder = {}
+         for i in range(self.n_words):
+             if i in self.decoder:
+                 self.encoder[self.decoder[i]] = i
+ 
+         super().__init__(
+             bos_token=bos_token,
+             eos_token=eos_token,
+             unk_token=unk_token,
+             pad_token=pad_token,
+             additional_special_tokens=additional_special_tokens,
+             **kwargs,
+         )
+         self.all_special_ids_set = set(self.all_special_ids)
+ 
+     def encode(
+         self, text: str, allow_special_tokens: bool = True, **kwargs
+     ) -> List[int]:
+         """
+         Encodes a string into a list of token IDs.
+ 
+         Args:
+             text (str): The input string to be encoded.
+             allow_special_tokens (bool): Whether special-token strings in the input are
+                 encoded to their special-token IDs (True) or treated as ordinary text (False).
+ 
+         Returns:
+             list[int]: A list of token IDs.
+         """
+         # If there are other args, we should call super().encode because there is a lot of code
+         # to handle those args. super().encode will ultimately call _tokenize and _convert_token_to_id.
+         if len(kwargs) > 0:
+             return super().encode(text, **kwargs)
+ 
+         assert type(text) is str
+ 
+         # The tiktoken tokenizer can handle <=400k chars without
+         # pyo3_runtime.PanicException.
+         TIKTOKEN_MAX_ENCODE_CHARS = 400_000
+ 
+         # https://github.com/openai/tiktoken/issues/195
+         # Here we iterate over subsequences and split if we exceed the limit
+         # of max consecutive non-whitespace or whitespace characters.
+         MAX_NO_WHITESPACES_CHARS = 25_000
+ 
+         substrs = (
+             substr
+             for i in range(0, len(text), TIKTOKEN_MAX_ENCODE_CHARS)
+             for substr in self._split_whitespaces_or_nonwhitespaces(
+                 text[i : i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS
+             )
+         )
+         t: List[int] = []
+         for substr in substrs:
+             if allow_special_tokens:
+                 t.extend(
+                     # Special-token strings are encoded to their special-token IDs.
+                     self.model.encode(
+                         substr,
+                         allowed_special="all",
+                     )
+                 )
+             else:
+                 t.extend(
+                     # Special-token strings are treated as ordinary text.
+                     self.model.encode(
+                         substr,
+                         disallowed_special=(),
+                     )
+                 )
+         return t
+ 
+     def decode(self, token_ids: Union[int, List[int]], **kwargs) -> str:
+         """
+         Decodes a list of token IDs into a string.
+ 
+         Args:
+             token_ids (Union[int, List[int]]): The token ID or list of token IDs to be decoded.
+ 
+         Returns:
+             str: The decoded string.
+         """
+         # If there are other args, we should call super().decode because there is a lot of code
+         # to handle those args. super().decode will ultimately call convert_tokens_to_string and _convert_id_to_token.
+         if len(kwargs) > 0:
+             return super().decode(token_ids, **kwargs)
+ 
+         token_ids = to_py_obj(token_ids)
+ 
+         if type(token_ids) is int:
+             token_ids = [token_ids]
+ 
+         return self.model.decode(cast(List[int], token_ids))
+ 
+     @staticmethod
+     def _split_whitespaces_or_nonwhitespaces(
+         s: str, max_consecutive_slice_len: int
+     ) -> Iterator[str]:
+         """
+         Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len`
+         consecutive whitespaces or consecutive non-whitespaces.
+         """
+         current_slice_len = 0
+         current_slice_is_space = s[0].isspace() if len(s) > 0 else False
+         slice_start = 0
+ 
+         for i in range(len(s)):
+             is_now_space = s[i].isspace()
+ 
+             if current_slice_is_space ^ is_now_space:
+                 current_slice_len = 1
+                 current_slice_is_space = is_now_space
+             else:
+                 current_slice_len += 1
+                 if current_slice_len > max_consecutive_slice_len:
+                     yield s[slice_start:i]
+                     slice_start = i
+                     current_slice_len = 1
+         yield s[slice_start:]
+ 
+     # ----- Below are the abstract methods required by PreTrainedTokenizer -----
+ 
+     @property
+     def vocab_size(self) -> int:
+         return self.n_words
+ 
+     def get_vocab(self) -> Dict[str, int]:
+         return self.encoder
+ 
+     def _tokenize(self, text: str, **kwargs) -> List[str]:
+         return [self.decoder[t] for t in self.encode(text)]
+ 
+     def _convert_token_to_id(self, token: str) -> int:
+         return self.encoder.get(token, self.unk_id)
+ 
+     def _convert_id_to_token(self, index: int) -> str:
+         return self.decoder.get(index)
+ 
+     @staticmethod
+     def clean_up_tokenization(out_string: str) -> str:
+         return out_string
+ 
+     def convert_tokens_to_string(self, tokens: List[str]) -> str:
+         text = "".join(tokens).replace(SPIECE_UNDERLINE, "")
+         text = bytearray([self.byte_decoder[c] for c in text]).decode(
+             "utf-8", "replace"
+         )
+         return text
+ 
+     def save_vocabulary(
+         self, save_directory: str, filename_prefix: Optional[str] = None
+     ) -> Tuple[str]:
+         if not os.path.isdir(save_directory):
+             logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+             return
+         out_vocab_file = os.path.join(
+             save_directory,
+             (filename_prefix + "-" if filename_prefix else "")
+             + VOCAB_FILES_NAMES["vocab_file"],
+         )
+ 
+         if os.path.abspath(self.vocab_file) != os.path.abspath(
+             out_vocab_file
+         ) and os.path.isfile(self.vocab_file):
+             copyfile(self.vocab_file, out_vocab_file)
+ 
+         return (out_vocab_file,)
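
A hedged round-trip sketch, assuming the files from this commit sit in a local ./kimi-vl directory so that AutoTokenizer resolves to TikTokenTokenizer via the auto_map in the config below:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("./kimi-vl", trust_remote_code=True)

    ids = tok.encode("Hello, Kimi!")   # token IDs depend on the BPE ranks in tiktoken.model
    print(ids)
    print(tok.decode(ids))             # 'Hello, Kimi!'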
tokenizer_config.json ADDED
@@ -0,0 +1,135 @@
+ {
+   "added_tokens_decoder": {
+     "163584": {
+       "content": "[BOS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163585": {
+       "content": "[EOS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163586": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163587": {
+       "content": "<|im_user|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163588": {
+       "content": "<|im_assistant|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163594": {
+       "content": "<|im_system|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163601": {
+       "content": "<|im_middle|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163602": {
+       "content": "<|media_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163603": {
+       "content": "<|media_content|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163604": {
+       "content": "<|media_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163605": {
+       "content": "<|media_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163838": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163839": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_end|>",
+     "<|im_user|>",
+     "<|im_assistant|>",
+     "<|im_system|>",
+     "<|im_middle|>",
+     "<|media_start|>",
+     "<|media_content|>",
+     "<|media_end|>",
+     "<|media_pad|>"
+   ],
+   "auto_map": {
+     "AutoProcessor": "processing_kimi_vl.KimiVLProcessor",
+     "AutoTokenizer": [
+       "tokenization_moonshot.TikTokenTokenizer",
+       null
+     ]
+   },
+   "bos_token": "[BOS]",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "[EOS]",
+   "extra_special_tokens": {},
+   "model_max_length": 1048576,
+   "pad_token": "[PAD]",
+   "processor_class": "KimiVLProcessor",
+   "tokenizer_class": "TikTokenTokenizer",
+   "unk_token": "[UNK]"
+ }