JPShi committed on
Commit
247f1fd
·
verified ·
1 Parent(s): 8776c46

Add files using upload-large-folder tool

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/model.jpg filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ ---
4
+
5
+ # VideoLoom: A Video Large Language Model for Joint Spatial-Temporal Understanding
6
+
7
+ Jiapeng Shi, [Junke Wang](https://wdrink.github.io/), [Zuyao You](https://scholar.google.com/citations?hl=en&user=X8Kh8uoAAAAJ), [Bo He](https://boheumd.github.io/), [Zuxuan Wu<sup>&#9993;</sup>](https://zxwu.azurewebsites.net/)
8
+
9
+ [\[📜 Paper\]](https://arxiv.org/abs/2601.07290) [\[📥 Model\]](https://huggingface.co/collections/JPShi/videoloom)
10
+
11
+ ## 🔎 Overview
12
+
13
+ This paper presents **VideoLoom**, a unified Video Large Language Model (Video LLM) for joint spatial-temporal understanding. To facilitate the development of fine-grained spatial and temporal localization capabilities, we curate **LoomData-8.7k**, a human-centric video dataset with temporally grounded and spatially localized captions. With this, VideoLoom achieves state-of-the-art or highly competitive performance across a variety of spatial and temporal benchmarks (e.g., 63.1 J&F on ReVOS for referring video object segmentation, and 48.3 R1@0.7 on Charades-STA for temporal grounding). In addition, we introduce **LoomBench**, a novel benchmark consisting of temporal, spatial, and compositional video-question pairs, enabling a comprehensive evaluation of Video LLMs from diverse aspects. Collectively, these contributions offer a universal and effective suite for joint spatial-temporal video understanding, setting a new standard in multimodal intelligence.
14
+
15
+ ![Model](assets/model.jpg)
added_tokens.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</box>": 151673,
3
+ "</fast_img>": 151675,
4
+ "</img>": 151666,
5
+ "</p>": 151679,
6
+ "</quad>": 151669,
7
+ "</ref>": 151671,
8
+ "</tool_call>": 151658,
9
+ "</vp>": 151681,
10
+ "<FAST_IMG_CONTEXT>": 151676,
11
+ "<IMG_CONTEXT>": 151667,
12
+ "<box>": 151672,
13
+ "<fast_img>": 151674,
14
+ "<img>": 151665,
15
+ "<p>": 151678,
16
+ "<quad>": 151668,
17
+ "<ref>": 151670,
18
+ "<tool_call>": 151657,
19
+ "<vp>": 151680,
20
+ "<|box_end|>": 151649,
21
+ "<|box_start|>": 151648,
22
+ "<|endoftext|>": 151643,
23
+ "<|file_sep|>": 151664,
24
+ "<|fim_middle|>": 151660,
25
+ "<|fim_pad|>": 151662,
26
+ "<|fim_prefix|>": 151659,
27
+ "<|fim_suffix|>": 151661,
28
+ "<|im_end|>": 151645,
29
+ "<|im_start|>": 151644,
30
+ "<|image_pad|>": 151655,
31
+ "<|object_ref_end|>": 151647,
32
+ "<|object_ref_start|>": 151646,
33
+ "<|quad_end|>": 151651,
34
+ "<|quad_start|>": 151650,
35
+ "<|repo_name|>": 151663,
36
+ "<|video_pad|>": 151656,
37
+ "<|vision_end|>": 151653,
38
+ "<|vision_pad|>": 151654,
39
+ "<|vision_start|>": 151652,
40
+ "[SEG]": 151677
41
+ }
assets/model.jpg ADDED

Git LFS Details

  • SHA256: a98cbe3ddf5b9fb74a88bf3096f98b6ba837667f515098b105a17dfac491402e
  • Pointer size: 132 Bytes
  • Size of remote file: 2.43 MB
config.json ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_commit_hash": null,
3
+ "_name_or_path": "/mnt/petrelfs/wangweiyun/workspace_wwy/open_source/InternVL/internvl_chat/work_dirs/internvl_chat_v3_0/InternVL3_0-8B-MPO-try0-2",
4
+ "architectures": [
5
+ "Sa2VAChatModel"
6
+ ],
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_sa2va_chat.Sa2VAChatConfig",
9
+ "AutoModel": "modeling_sa2va_chat.Sa2VAChatModel",
10
+ "AutoModelForCausalLM": "modeling_sa2va_chat.Sa2VAChatModel"
11
+ },
12
+ "downsample_ratio": 0.5,
13
+ "dynamic_image_size": true,
14
+ "force_image_size": 448,
15
+ "hidden_size": 3584,
16
+ "image_fold": null,
17
+ "llm_config": {
18
+ "_attn_implementation_autoset": true,
19
+ "_name_or_path": "./pretrained/Qwen2.5-32B-Instruct",
20
+ "add_cross_attention": false,
21
+ "architectures": [
22
+ "Qwen2ForCausalLM"
23
+ ],
24
+ "attention_dropout": 0.0,
25
+ "bad_words_ids": null,
26
+ "begin_suppress_tokens": null,
27
+ "bos_token_id": 151643,
28
+ "chunk_size_feed_forward": 0,
29
+ "cross_attention_hidden_size": null,
30
+ "decoder_start_token_id": null,
31
+ "diversity_penalty": 0.0,
32
+ "do_sample": false,
33
+ "early_stopping": false,
34
+ "encoder_no_repeat_ngram_size": 0,
35
+ "eos_token_id": 151643,
36
+ "exponential_decay_length_penalty": null,
37
+ "finetuning_task": null,
38
+ "forced_bos_token_id": null,
39
+ "forced_eos_token_id": null,
40
+ "hidden_act": "silu",
41
+ "hidden_size": 3584,
42
+ "id2label": {
43
+ "0": "LABEL_0",
44
+ "1": "LABEL_1"
45
+ },
46
+ "initializer_range": 0.02,
47
+ "intermediate_size": 18944,
48
+ "is_decoder": false,
49
+ "is_encoder_decoder": false,
50
+ "label2id": {
51
+ "LABEL_0": 0,
52
+ "LABEL_1": 1
53
+ },
54
+ "length_penalty": 1.0,
55
+ "max_length": 20,
56
+ "max_position_embeddings": 32768,
57
+ "max_window_layers": 70,
58
+ "min_length": 0,
59
+ "model_type": "qwen2",
60
+ "moe_config": null,
61
+ "no_repeat_ngram_size": 0,
62
+ "num_attention_heads": 28,
63
+ "num_beam_groups": 1,
64
+ "num_beams": 1,
65
+ "num_hidden_layers": 28,
66
+ "num_key_value_heads": 4,
67
+ "num_return_sequences": 1,
68
+ "output_attentions": false,
69
+ "output_hidden_states": false,
70
+ "output_scores": false,
71
+ "pad_token_id": null,
72
+ "prefix": null,
73
+ "problem_type": null,
74
+ "pruned_heads": {},
75
+ "remove_invalid_values": false,
76
+ "repetition_penalty": 1.0,
77
+ "return_dict": true,
78
+ "return_dict_in_generate": false,
79
+ "rms_norm_eps": 1e-06,
80
+ "rope_scaling": {
81
+ "factor": 2.0,
82
+ "rope_type": "dynamic",
83
+ "type": "dynamic"
84
+ },
85
+ "rope_theta": 1000000.0,
86
+ "sep_token_id": null,
87
+ "sliding_window": null,
88
+ "suppress_tokens": null,
89
+ "task_specific_params": null,
90
+ "temperature": 1.0,
91
+ "tf_legacy_loss": false,
92
+ "tie_encoder_decoder": false,
93
+ "tie_word_embeddings": false,
94
+ "tokenizer_class": null,
95
+ "top_k": 50,
96
+ "top_p": 1.0,
97
+ "torch_dtype": "bfloat16",
98
+ "torchscript": false,
99
+ "transformers_version": "4.44.2",
100
+ "typical_p": 1.0,
101
+ "use_bfloat16": true,
102
+ "use_cache": false,
103
+ "use_sliding_window": false,
104
+ "vocab_size": 151682
105
+ },
106
+ "max_dynamic_patch": 12,
107
+ "min_dynamic_patch": 1,
108
+ "model_type": "sa2va_chat",
109
+ "pad2square": false,
110
+ "ps_version": "v2",
111
+ "select_layer": -1,
112
+ "system_message": null,
113
+ "template": "qwen_chat",
114
+ "tie_word_embeddings": false,
115
+ "torch_dtype": "bfloat16",
116
+ "transformers_version": null,
117
+ "use_backbone_lora": 0,
118
+ "use_llm_lora": 0,
119
+ "use_thumbnail": true,
120
+ "vision_config": {
121
+ "_attn_implementation_autoset": true,
122
+ "_name_or_path": "OpenGVLab/InternViT-6B-448px-V1-5",
123
+ "add_cross_attention": false,
124
+ "architectures": [
125
+ "InternVisionModel"
126
+ ],
127
+ "attention_dropout": 0.0,
128
+ "auto_map": {
129
+ "AutoConfig": "configuration_intern_vit.InternVisionConfig",
130
+ "AutoModel": "modeling_intern_vit.InternVisionModel"
131
+ },
132
+ "bad_words_ids": null,
133
+ "begin_suppress_tokens": null,
134
+ "bos_token_id": null,
135
+ "capacity_factor": 1.2,
136
+ "chunk_size_feed_forward": 0,
137
+ "cross_attention_hidden_size": null,
138
+ "decoder_start_token_id": null,
139
+ "diversity_penalty": 0.0,
140
+ "do_sample": false,
141
+ "drop_path_rate": 0.1,
142
+ "dropout": 0.0,
143
+ "early_stopping": false,
144
+ "encoder_no_repeat_ngram_size": 0,
145
+ "eos_token_id": null,
146
+ "eval_capacity_factor": 1.4,
147
+ "exponential_decay_length_penalty": null,
148
+ "finetuning_task": null,
149
+ "forced_bos_token_id": null,
150
+ "forced_eos_token_id": null,
151
+ "hidden_act": "gelu",
152
+ "hidden_size": 1024,
153
+ "id2label": {
154
+ "0": "LABEL_0",
155
+ "1": "LABEL_1"
156
+ },
157
+ "image_size": 448,
158
+ "initializer_factor": 0.1,
159
+ "initializer_range": 1e-10,
160
+ "intermediate_size": 4096,
161
+ "is_decoder": false,
162
+ "is_encoder_decoder": false,
163
+ "label2id": {
164
+ "LABEL_0": 0,
165
+ "LABEL_1": 1
166
+ },
167
+ "laux_allreduce": "all_nodes",
168
+ "layer_norm_eps": 1e-06,
169
+ "length_penalty": 1.0,
170
+ "max_length": 20,
171
+ "min_length": 0,
172
+ "model_type": "intern_vit_6b",
173
+ "moe_coeff_ratio": 0.5,
174
+ "moe_intermediate_size": 768,
175
+ "moe_output_scale": 4.0,
176
+ "no_repeat_ngram_size": 0,
177
+ "noisy_gate_policy": "RSample_before",
178
+ "norm_type": "layer_norm",
179
+ "num_attention_heads": 16,
180
+ "num_beam_groups": 1,
181
+ "num_beams": 1,
182
+ "num_channels": 3,
183
+ "num_experts": 8,
184
+ "num_hidden_layers": 24,
185
+ "num_return_sequences": 1,
186
+ "num_routed_experts": 4,
187
+ "num_shared_experts": 4,
188
+ "output_attentions": false,
189
+ "output_hidden_states": false,
190
+ "output_scores": false,
191
+ "pad_token_id": null,
192
+ "patch_size": 14,
193
+ "prefix": null,
194
+ "problem_type": null,
195
+ "pruned_heads": {},
196
+ "qk_normalization": false,
197
+ "qkv_bias": true,
198
+ "remove_invalid_values": false,
199
+ "repetition_penalty": 1.0,
200
+ "return_dict": true,
201
+ "return_dict_in_generate": false,
202
+ "sep_token_id": null,
203
+ "shared_expert_intermediate_size": 3072,
204
+ "suppress_tokens": null,
205
+ "task_specific_params": null,
206
+ "temperature": 1.0,
207
+ "tf_legacy_loss": false,
208
+ "tie_encoder_decoder": false,
209
+ "tie_word_embeddings": true,
210
+ "tokenizer_class": null,
211
+ "top_k": 50,
212
+ "top_p": 1.0,
213
+ "torch_dtype": "bfloat16",
214
+ "torchscript": false,
215
+ "transformers_version": "4.44.2",
216
+ "typical_p": 1.0,
217
+ "use_bfloat16": true,
218
+ "use_flash_attn": true,
219
+ "use_moe": false,
220
+ "use_residual": true,
221
+ "use_rts": false,
222
+ "use_weighted_residual": false
223
+ }
224
+ }
configuration_intern_vit.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2024 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+
7
+ import os
8
+ from typing import Union
9
+
10
+ from transformers.configuration_utils import PretrainedConfig
11
+ from transformers.utils import logging
12
+
13
+ logger = logging.get_logger(__name__)
14
+
15
+
16
class InternVisionConfig(PretrainedConfig):
    r"""
    Configuration container for an [`InternVisionModel`] vision encoder.

    Every constructor argument is stored unchanged as an attribute of the same name; any additional keyword
    arguments are forwarded to [`PretrainedConfig`]. See the [`PretrainedConfig`] documentation for the options
    shared by all configuration objects.

    Args:
        num_channels (`int`, *optional*, defaults to 3):
            Number of color channels in the input images (e.g., 3 for RGB).
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        qkv_bias (`bool`, *optional*, defaults to `False`):
            Whether to add a bias to the queries and values in the self-attention layers.
        hidden_size (`int`, *optional*, defaults to 3200):
            Dimensionality of the encoder layers and the pooler layer.
        num_attention_heads (`int`, *optional*, defaults to 25):
            Number of attention heads for each attention layer in the Transformer encoder.
        intermediate_size (`int`, *optional*, defaults to 12800):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        qk_normalization (`bool`, *optional*, defaults to `True`):
            Whether to normalize the queries and keys in the self-attention layers.
        num_hidden_layers (`int`, *optional*, defaults to 48):
            Number of hidden layers in the Transformer encoder.
        use_flash_attn (`bool`, *optional*, defaults to `True`):
            Whether to use flash attention mechanism.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string,
            `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
        norm_type (`str`, *optional*, defaults to `"rms_norm"`):
            Identifier of the normalization layer type. Stored as-is here; how it is interpreted is up to the
            model code (not visible in this file).
        layer_norm_eps (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the layer normalization layers.
        dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        drop_path_rate (`float`, *optional*, defaults to 0.0):
            Dropout rate for stochastic depth.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 0.1):
            A factor for layer scale.
    """

    model_type = 'intern_vit_6b'

    def __init__(
            self,
            num_channels=3,
            patch_size=14,
            image_size=224,
            qkv_bias=False,
            hidden_size=3200,
            num_attention_heads=25,
            intermediate_size=12800,
            qk_normalization=True,
            num_hidden_layers=48,
            use_flash_attn=True,
            hidden_act='gelu',
            norm_type='rms_norm',
            layer_norm_eps=1e-6,
            dropout=0.0,
            drop_path_rate=0.0,
            attention_dropout=0.0,
            initializer_range=0.02,
            initializer_factor=0.1,
            **kwargs,
    ):
        super().__init__(**kwargs)

        # Input geometry.
        self.num_channels = num_channels
        self.image_size = image_size
        self.patch_size = patch_size

        # Transformer dimensions.
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # Attention options.
        self.qkv_bias = qkv_bias
        self.qk_normalization = qk_normalization
        self.use_flash_attn = use_flash_attn

        # Activation and normalization.
        self.hidden_act = hidden_act
        self.norm_type = norm_type
        self.layer_norm_eps = layer_norm_eps

        # Regularization.
        self.dropout = dropout
        self.drop_path_rate = drop_path_rate
        self.attention_dropout = attention_dropout

        # Weight initialization.
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> 'PretrainedConfig':
        """Build the config from a checkpoint, unwrapping a nested `vision_config` entry when one exists.

        Args:
            pretrained_model_name_or_path: Location handed to `get_config_dict`.
            **kwargs: Passed to `get_config_dict`; whatever it hands back is forwarded to `from_dict`.

        Returns:
            The object produced by `cls.from_dict`.
        """
        loaded_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # When the loaded dict carries a 'vision_config' entry, only that sub-dict is used.
        config_dict = loaded_dict['vision_config'] if 'vision_config' in loaded_dict else loaded_dict

        if 'model_type' in config_dict and hasattr(cls, 'model_type'):
            found_type = config_dict['model_type']
            if found_type != cls.model_type:
                # Mismatched types are only reported, loading still proceeds.
                logger.warning(
                    f'You are using a model of type {found_type} to instantiate a model of type '
                    f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
                )

        return cls.from_dict(config_dict, **kwargs)
configuration_phi3.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """ Phi-3 model configuration"""
16
+
17
+
18
+ from transformers.configuration_utils import PretrainedConfig
19
+ from transformers.utils import logging
20
+
21
+ logger = logging.get_logger(__name__)
22
+
23
+ PHI3_PRETRAINED_CONFIG_ARCHIVE_MAP = {
24
+ 'microsoft/Phi-3-mini-4k-instruct': 'https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/resolve/main/config.json',
25
+ 'microsoft/Phi-3-mini-128k-instruct': 'https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/resolve/main/config.json',
26
+ }
27
+
28
+
29
class Phi3Config(PretrainedConfig):
    r"""
    Configuration container for a [`Phi3Model`].

    The arguments below define the model architecture. With the defaults, the resulting configuration is similar
    to that of [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct).
    Options shared by all configurations are described in the [`PretrainedConfig`] documentation.

    Args:
        vocab_size (`int`, *optional*, defaults to 32064):
            Vocabulary size of the Phi-3 model, i.e. the number of different tokens that can be represented by
            the `inputs_ids` passed when calling [`Phi3Model`].
        hidden_size (`int`, *optional*, defaults to 3072):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 8192):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            Number of key/value heads used for Grouped Query Attention. Equal to `num_attention_heads` means
            Multi Head Attention (MHA), `1` means Multi Query Attention (MQA), anything else is GQA. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be
            built by mean-pooling the original heads of that group; see
            [this paper](https://arxiv.org/pdf/2305.13245.pdf). Falls back to `num_attention_heads` when left
            unset.
        resid_pdrop (`float`, *optional*, defaults to 0.0):
            Dropout probability for mlp outputs.
        embd_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the embeddings.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio after computing the attention scores.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used with.
        original_max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model was trained with. Used to determine the size of the
            original RoPE embeddings when using long scaling.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon value used for the RMSNorm.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
            Only relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`dict`, *optional*):
            The scaling strategy for the RoPE embeddings. `None` disables scaling. Otherwise it must be a
            dictionary with exactly the keys `type`, `short_factor` and `long_factor`: `type` is `su` or
            `yarn`, and both factors are lists of numbers whose length is the hidden size divided by the
            number of attention heads divided by 2.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 32000):
            The id of the "end-of-sequence" token.
        pad_token_id (`int`, *optional*, defaults to 32000):
            The id of the padding token.
        sliding_window (`int`, *optional*):
            Sliding window attention window size. If `None`, no sliding window is applied.

    Example:

    ```python
    >>> from transformers import Phi3Model, Phi3Config

    >>> # Initializing a Phi-3 style configuration
    >>> configuration = Phi3Config.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

    >>> # Initializing a model from the configuration
    >>> model = Phi3Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = 'phi3'
    keys_to_ignore_at_inference = ['past_key_values']

    def __init__(
            self,
            vocab_size=32064,
            hidden_size=3072,
            intermediate_size=8192,
            num_hidden_layers=32,
            num_attention_heads=32,
            num_key_value_heads=None,
            resid_pdrop=0.0,
            embd_pdrop=0.0,
            attention_dropout=0.0,
            hidden_act='silu',
            max_position_embeddings=4096,
            original_max_position_embeddings=4096,
            initializer_range=0.02,
            rms_norm_eps=1e-5,
            use_cache=True,
            tie_word_embeddings=False,
            rope_theta=10000.0,
            rope_scaling=None,
            bos_token_id=1,
            eos_token_id=32000,
            pad_token_id=32000,
            sliding_window=None,
            **kwargs,
    ):
        # Core dimensions.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        # An unset KV-head count means one KV head per attention head.
        self.num_key_value_heads = num_attention_heads if num_key_value_heads is None else num_key_value_heads

        # Dropout, activation, normalization and initialization.
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.attention_dropout = attention_dropout
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache

        # Positional settings; the validator reads hidden_size / num_attention_heads set above.
        self.max_position_embeddings = max_position_embeddings
        self.original_max_position_embeddings = original_max_position_embeddings
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self._rope_scaling_validation()
        self.sliding_window = sliding_window

        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """
        Check `self.rope_scaling` and raise `ValueError` on the first violated rule.

        `None` is accepted without further checks. Otherwise the value must be a three-entry dict whose `type`
        is `su` or `yarn` and whose `short_factor` / `long_factor` are lists of numbers of length
        `hidden_size // num_attention_heads // 2`.
        """
        scaling = self.rope_scaling
        if scaling is None:
            return

        if not (isinstance(scaling, dict) and len(scaling) == 3):
            raise ValueError(
                '`rope_scaling` must be a dictionary with three fields, `type`, `short_factor` and `long_factor`, '
                f'got {scaling}'
            )

        scaling_type = scaling.get('type', None)
        if scaling_type not in ('su', 'yarn'):
            raise ValueError(f"`rope_scaling`'s type field must be one of ['su', 'yarn'], got {scaling_type}")

        # Both factor lists follow the same rules; short_factor is checked first.
        for field_name in ('short_factor', 'long_factor'):
            factor = scaling.get(field_name, None)
            is_number_list = isinstance(factor, list) and all(isinstance(x, (int, float)) for x in factor)
            if not is_number_list:
                raise ValueError(
                    f"`rope_scaling`'s {field_name} field must be a list of numbers, got {factor}"
                )
            expected_len = self.hidden_size // self.num_attention_heads // 2
            if len(factor) != expected_len:
                raise ValueError(
                    f"`rope_scaling`'s {field_name} field must have length {expected_len}, got {len(factor)}"
                )
configuration_sa2va_chat.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2024 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+
7
+ import copy
8
+
9
+ from .configuration_internlm2 import InternLM2Config
10
+ from .configuration_phi3 import Phi3Config
11
+ from transformers import AutoConfig, LlamaConfig, Qwen2Config
12
+ from transformers.configuration_utils import PretrainedConfig
13
+ from transformers.utils import logging
14
+
15
+ from .configuration_intern_vit import InternVisionConfig
16
+
17
+ logger = logging.get_logger(__name__)
18
+
19
+
20
class Sa2VAChatConfig(PretrainedConfig):
    """Composite configuration that bundles a vision-encoder config and a language-model config.

    `vision_config` is always materialized as an `InternVisionConfig`. `llm_config` is materialized with the
    config class that matches its first `architectures` entry (Llama, InternLM2, Phi3 or Qwen2). The remaining
    arguments are stored unchanged as attributes of the same name.
    """

    model_type = 'sa2va_chat'
    is_composition = True

    def __init__(
            self,
            vision_config=None,
            llm_config=None,
            use_backbone_lora=0,
            use_llm_lora=0,
            pad2square=False,
            select_layer=-1,
            force_image_size=None,
            downsample_ratio=0.5,
            template=None,
            dynamic_image_size=False,
            use_thumbnail=False,
            ps_version='v1',
            min_dynamic_patch=1,
            max_dynamic_patch=6,
            **kwargs):
        """
        Args:
            vision_config (`dict`, *optional*): Keyword arguments for `InternVisionConfig`; `None` means defaults.
            llm_config (`dict`, *optional*): Keyword arguments for the language-model config. The config class is
                picked from `llm_config['architectures'][0]`; when that entry is missing or empty,
                `LlamaForCausalLM` is assumed.
            use_backbone_lora, use_llm_lora, pad2square, select_layer, force_image_size, downsample_ratio,
            template, dynamic_image_size, use_thumbnail, ps_version, min_dynamic_patch, max_dynamic_patch:
                Stored as-is on the instance.
            **kwargs: Forwarded to `PretrainedConfig.__init__`.

        Raises:
            ValueError: If the selected architecture is not one of the supported language models.
        """
        super().__init__(**kwargs)
        if vision_config is None:
            vision_config = {}
            logger.info('vision_config is None. Initializing the InternVisionConfig with default values.')

        if llm_config is None:
            llm_config = {}
            logger.info('llm_config is None. Initializing the LlamaConfig config with default values (`LlamaConfig`).')

        self.vision_config = InternVisionConfig(**vision_config)

        llm_config_classes = {
            'LlamaForCausalLM': LlamaConfig,
            'InternLM2ForCausalLM': InternLM2Config,
            'Phi3ForCausalLM': Phi3Config,
            'Qwen2ForCausalLM': Qwen2Config,
        }
        # Previously an empty llm_config crashed with KeyError on 'architectures' even though the log
        # message above announces LlamaConfig defaults; fall back to Llama so the no-argument path works.
        architectures = llm_config.get('architectures') or ['LlamaForCausalLM']
        llm_architecture = architectures[0]
        if llm_architecture not in llm_config_classes:
            raise ValueError('Unsupported architecture: {}'.format(llm_architecture))
        self.llm_config = llm_config_classes[llm_architecture](**llm_config)

        self.use_backbone_lora = use_backbone_lora
        self.use_llm_lora = use_llm_lora
        self.pad2square = pad2square
        self.select_layer = select_layer
        self.force_image_size = force_image_size
        self.downsample_ratio = downsample_ratio
        self.template = template
        self.dynamic_image_size = dynamic_image_size
        self.use_thumbnail = use_thumbnail
        self.ps_version = ps_version  # pixel shuffle version
        self.min_dynamic_patch = min_dynamic_patch
        self.max_dynamic_patch = max_dynamic_patch

        # Mirror the language model's width at the top level; word embeddings are never tied here.
        self.hidden_size = self.llm_config.hidden_size
        self.tie_word_embeddings = False

        logger.info(f'vision_select_layer: {self.select_layer}')
        logger.info(f'ps_version: {self.ps_version}')
        logger.info(f'min_dynamic_patch: {self.min_dynamic_patch}')
        logger.info(f'max_dynamic_patch: {self.max_dynamic_patch}')

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].

        Returns:
            `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, with
            the nested vision and language-model configs converted to plain dictionaries as well.
        """
        output = copy.deepcopy(self.__dict__)
        # Replace the nested config objects with their dictionary form.
        output['vision_config'] = self.vision_config.to_dict()
        output['llm_config'] = self.llm_config.to_dict()
        # model_type is a class attribute, so it is not part of __dict__ and must be added explicitly.
        output['model_type'] = self.__class__.model_type
        output['use_backbone_lora'] = self.use_backbone_lora
        output['use_llm_lora'] = self.use_llm_lora
        output['pad2square'] = self.pad2square
        output['select_layer'] = self.select_layer
        output['force_image_size'] = self.force_image_size
        output['downsample_ratio'] = self.downsample_ratio
        output['template'] = self.template
        output['dynamic_image_size'] = self.dynamic_image_size
        output['use_thumbnail'] = self.use_thumbnail
        output['ps_version'] = self.ps_version
        output['min_dynamic_patch'] = self.min_dynamic_patch
        output['max_dynamic_patch'] = self.max_dynamic_patch

        return output
flash_attention.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://github.com/Dao-AILab/flash-attention/blob/v0.2.8/flash_attn/flash_attention.py
2
+ import torch
3
+ import torch.nn as nn
4
+ from einops import rearrange
5
+
6
+ try: # v1
7
+ from flash_attn.flash_attn_interface import \
8
+ flash_attn_unpadded_qkvpacked_func
9
+ except: # v2
10
+ from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func
11
+
12
+ from flash_attn.bert_padding import pad_input, unpad_input
13
+
14
+
15
class FlashAttention(nn.Module):
    """Implement the scaled dot product attention with softmax.
    Arguments
    ---------
        softmax_scale: The temperature to use for the softmax attention.
                      (default: 1/sqrt(d_keys) where d_keys is computed at
                      runtime)
        attention_dropout: The dropout rate to apply to the attention
                           (default: 0.0)
        device, dtype: Accepted for signature compatibility; not used by this module.
    """

    def __init__(self, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None):
        super().__init__()
        self.softmax_scale = softmax_scale
        self.dropout_p = attention_dropout

    def forward(self, qkv, key_padding_mask=None, causal=False, cu_seqlens=None,
                max_s=None, need_weights=False):
        """Implements the multihead softmax attention.
        Arguments
        ---------
            qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None
                if unpadded: (nnz, 3, h, d)
            key_padding_mask: a bool tensor of shape (B, S)
            causal: forwarded to the flash-attention kernel.
            cu_seqlens: cumulative sequence lengths for already-unpadded input; when given, `max_s` is required.
            max_s: maximum sequence length, used together with `cu_seqlens`.
            need_weights: must be False; attention weights are never returned.
        Returns
        -------
            (output, None): the attention output and a placeholder for the attention weights.
        """
        assert not need_weights
        assert qkv.dtype in [torch.float16, torch.bfloat16]
        assert qkv.is_cuda

        # Dropout is only active in training mode.
        dropout_p = self.dropout_p if self.training else 0.0

        if cu_seqlens is None:
            batch_size = qkv.shape[0]
            seqlen = qkv.shape[1]
            if key_padding_mask is None:
                # No padding: flatten batch and sequence, every sequence has the same length.
                qkv = rearrange(qkv, 'b s ... -> (b s) ...')
                max_s = seqlen
                cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32,
                                          device=qkv.device)
                output = flash_attn_unpadded_qkvpacked_func(
                    qkv, cu_seqlens, max_s, dropout_p,
                    softmax_scale=self.softmax_scale, causal=causal
                )
                output = rearrange(output, '(b s) ... -> b s ...', b=batch_size)
            else:
                nheads = qkv.shape[-2]
                x = rearrange(qkv, 'b s three h d -> b s (three h d)')
                # Some flash-attn releases return an extra trailing value from unpad_input;
                # keep only the first four so both return shapes unpack cleanly.
                x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask)[:4]
                x_unpad = rearrange(x_unpad, 'nnz (three h d) -> nnz three h d', three=3, h=nheads)
                output_unpad = flash_attn_unpadded_qkvpacked_func(
                    x_unpad, cu_seqlens, max_s, dropout_p,
                    softmax_scale=self.softmax_scale, causal=causal
                )
                # Scatter the unpadded result back into the padded (b, s, h, d) layout.
                output = rearrange(pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'),
                                             indices, batch_size, seqlen),
                                   'b s (h d) -> b s h d', h=nheads)
        else:
            # Caller supplied pre-unpadded input along with its cumulative lengths.
            assert max_s is not None
            output = flash_attn_unpadded_qkvpacked_func(
                qkv, cu_seqlens, max_s, dropout_p,
                softmax_scale=self.softmax_scale, causal=causal
            )

        return output, None
generation_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "transformers_version": "4.44.2"
4
+ }
model-00001-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e64f6c9596c5bcccb1c6ff117418d2395ae676bd9b114458dff43557c8b089fd
3
+ size 4764503072
model-00002-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89488078b7de14c05cae30542b8a146d6d832c64c4c320b4e5ea1da085ae50e2
3
+ size 4932744848
model-00003-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ce063607895d2f8a32978ecac71c4e7968468e3d875608d7f13f3ca93a9400d
3
+ size 4932744864
model-00004-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68eb39c7bec07fd1315ff63d457c32ae3c85881e14e3b40c9d96ddc145ce1dd5
3
+ size 4998853328
model-00005-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e07ec5bf73aa13f83a260ec68ba430ea6a98cae57e216f239d25b1ff7fed267e
3
+ size 4984125264
model-00006-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fab03c31a74ee0cafbffcf4cffc646c3b5051060be0b78d9784414f73b0bbe53
3
+ size 4272136880
model-00007-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e7723f244c111781314146103cc76b9b4b1dfdb687e5f5fbdb22b41b728aed5
3
+ size 3237595280
templates.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
# Per-model chat prompt formats, keyed by template name.
#
# Each entry may define:
#   SYSTEM         - system-prompt format string (uses the `{system}` placeholder)
#   INSTRUCTION    - user-turn format string (uses `{input}`; chatglm2 also `{round}`)
#   SUFFIX         - text appended after a response
#   SUFFIX_AS_EOS  - boolean flag stored next to SUFFIX
#   SEP            - separator string
#   STOP_WORDS     - list of stop strings
# How these fields are consumed is decided by the caller, not by this module.
PROMPT_TEMPLATE = dict(
    default=dict(
        SYSTEM='<|System|>:{system}\n',
        INSTRUCTION='<|User|>:{input}\n<|Bot|>:',
        SEP='\n'),
    zephyr=dict(
        SYSTEM='<|system|>\n{system}\n',
        INSTRUCTION='<|user|>\n{input}\n<|assistant|>\n',
        SEP='\n'),
    internlm_chat=dict(
        SYSTEM='<|System|>:{system}\n',
        INSTRUCTION='<|User|>:{input}<eoh>\n<|Bot|>:',
        SUFFIX='<eoa>',
        SUFFIX_AS_EOS=True,
        SEP='\n',
        STOP_WORDS=['<eoa>']),
    internlm2_chat=dict(
        SYSTEM='<|im_start|>system\n{system}<|im_end|>\n',
        INSTRUCTION=('<|im_start|>user\n{input}<|im_end|>\n'
                     '<|im_start|>assistant\n'),
        SUFFIX='<|im_end|>',
        SUFFIX_AS_EOS=True,
        SEP='\n',
        STOP_WORDS=['<|im_end|>']),
    moss_sft=dict(
        SYSTEM='{system}\n',
        INSTRUCTION='<|Human|>: {input}<eoh>\n',
        SEP='\n',
        STOP_WORDS=['<eoc>', '<eom>']),
    llama2_chat=dict(
        SYSTEM=(
            '[INST] <<SYS>>\n You are a helpful, respectful and honest '
            'assistant. Always answer as helpfully as possible, while being '
            'safe. Your answers should not include any harmful, unethical, '
            'racist, sexist, toxic, dangerous, or illegal content. Please '
            'ensure that your responses are socially unbiased and positive in '
            'nature.\n{system}\n<</SYS>>\n [/INST] '),
        INSTRUCTION='[INST] {input} [/INST]',
        SEP='\n'),
    code_llama_chat=dict(
        SYSTEM='{system}\n', INSTRUCTION='[INST] {input} [/INST]'),
    chatglm2=dict(
        SYSTEM='{system}\n',
        INSTRUCTION='[Round {round}]\n\n问:{input}\n\n答:',
        SEP='\n\n'),
    chatglm3=dict(
        SYSTEM='<|system|>\n{system}',
        INSTRUCTION='<|user|>\n{input}<|assistant|>\n',
        SEP='\n'),
    qwen_chat=dict(
        SYSTEM=('<|im_start|>system\n{system}<|im_end|>\n'),
        INSTRUCTION=('<|im_start|>user\n{input}<|im_end|>\n'
                     '<|im_start|>assistant\n'),
        SUFFIX='<|im_end|>',
        SUFFIX_AS_EOS=True,
        SEP='\n',
        STOP_WORDS=['<|im_end|>', '<|endoftext|>']),
    baichuan_chat=dict(
        SYSTEM='{system}\n',
        INSTRUCTION='<reserved_102>{input}<reserved_103>',
        SEP='\n'),
    baichuan2_chat=dict(
        SYSTEM='{system}\n',
        INSTRUCTION='<reserved_106>{input}<reserved_107>',
        SEP='\n'),
    wizardlm=dict(
        SYSTEM=('A chat between a curious user and an artificial '
                'intelligence assistant. The assistant gives '
                'helpful, detailed, and polite answers to the '
                'user\'s questions. {system}\n '),
        INSTRUCTION=('USER: {input} ASSISTANT:'),
        SEP='\n'),
    wizardcoder=dict(
        SYSTEM=(
            'Below is an instruction that describes a task. '
            'Write a response that appropriately completes the request.\n\n'
            '{system}\n '),
        INSTRUCTION=('### Instruction:\n{input}\n\n### Response:'),
        SEP='\n\n'),
    vicuna=dict(
        SYSTEM=('A chat between a curious user and an artificial '
                'intelligence assistant. The assistant gives '
                'helpful, detailed, and polite answers to the '
                'user\'s questions. {system}\n '),
        INSTRUCTION=('USER: {input} ASSISTANT:'),
        SEP='\n'),
    deepseek_coder=dict(
        # The fragment ending in "DeepSeek " keeps its trailing space so the
        # implicit concatenation reads "DeepSeek Company", not "DeepSeekCompany".
        SYSTEM=('You are an AI programming assistant, utilizing '
                'the DeepSeek Coder model, developed by DeepSeek '
                'Company, and you only answer questions related '
                'to computer science. For politically sensitive '
                'questions, security and privacy issues, and '
                'other non-computer science questions, you will '
                'refuse to answer. {system}\n'),
        INSTRUCTION=('### Instruction:\n{input}\n### Response:\n'),
        SEP='\n'),
    # TODO: deprecation, v0.2.0
    deepseekcoder=dict(
        # Kept identical to `deepseek_coder` (same spacing fix applied).
        SYSTEM=('You are an AI programming assistant, utilizing '
                'the DeepSeek Coder model, developed by DeepSeek '
                'Company, and you only answer questions related '
                'to computer science. For politically sensitive '
                'questions, security and privacy issues, and '
                'other non-computer science questions, you will '
                'refuse to answer. {system}\n'),
        INSTRUCTION=('### Instruction:\n{input}\n### Response:\n'),
        SEP='\n'),
    deepseek_moe=dict(
        SYSTEM=('[INST] {system} [/INST]\n'),
        INSTRUCTION=('[INST] {input} [/INST]'),
        SEP='\n'),
    deepseek_v2=dict(
        SYSTEM='{system}\n\n',
        INSTRUCTION='User: {input}\n\nAssistant: ',
        SUFFIX='<|end▁of▁sentence|>',
        SUFFIX_AS_EOS=True,
        STOP_WORDS=['<|end▁of▁sentence|>']),
    mistral=dict(
        SYSTEM=('[INST] {system} [/INST]\n'),
        INSTRUCTION=('[INST] {input} [/INST]'),
        SEP='\n'),
    mixtral=dict(
        SYSTEM=('[INST] {system} [/INST]\n'),
        INSTRUCTION=('[INST] {input} [/INST]'),
        SEP='\n'),
    minicpm=dict(INSTRUCTION=('<用户> {input} <AI>'), SEP='\n'),
    minicpm3=dict(
        SYSTEM=('<|im_start|>system\n{system}<|im_end|>\n'),
        INSTRUCTION=('<|im_start|>user\n{input}<|im_end|>\n'
                     '<|im_start|>assistant\n'),
        SUFFIX='<|im_end|>',
        SUFFIX_AS_EOS=True,
        SEP='\n',
        STOP_WORDS=['<|im_end|>', '<|endoftext|>']),
    gemma=dict(
        # `system` field is extended by xtuner
        SYSTEM=('<start_of_turn>system\n{system}<end_of_turn>\n'),
        INSTRUCTION=('<start_of_turn>user\n{input}<end_of_turn>\n'
                     '<start_of_turn>model\n'),
        SUFFIX='<end_of_turn>',
        SUFFIX_AS_EOS=False,
        SEP='\n',
        STOP_WORDS=['<end_of_turn>']),
    cohere_chat=dict(
        SYSTEM=('<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{system}'
                '<|END_OF_TURN_TOKEN|>'),
        INSTRUCTION=(
            '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{input}<|END_OF_TURN_TOKEN|>'
            '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>'),
        SUFFIX='<|END_OF_TURN_TOKEN|>',
        SUFFIX_AS_EOS=True,
        STOP_WORDS=['<|END_OF_TURN_TOKEN|>']),
    llama3_chat=dict(
        SYSTEM=('<|start_header_id|>system<|end_header_id|>\n\n'
                '{system}<|eot_id|>'),
        INSTRUCTION=(
            '<|start_header_id|>user<|end_header_id|>\n\n{input}<|eot_id|>'
            '<|start_header_id|>assistant<|end_header_id|>\n\n'),
        SUFFIX='<|eot_id|>',
        SUFFIX_AS_EOS=True,
        STOP_WORDS=['<|eot_id|>']),
    phi3_chat=dict(
        SYSTEM='<|system|>\n{system}<|end|>\n',
        INSTRUCTION='<|user|>\n{input}<|end|>\n<|assistant|>\n',
        SUFFIX='<|end|>',
        SUFFIX_AS_EOS=True,
        SEP='\n',
        STOP_WORDS=['<|end|>']),
)
tokenization_internlm2.py ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # This code is based on transformers/src/transformers/models/llama/tokenization_llama.py
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ """Tokenization classes for InternLM."""
18
+ import os
19
+ from shutil import copyfile
20
+ from typing import Any, Dict, List, Optional, Tuple
21
+
22
+ import sentencepiece as spm
23
+ from transformers.tokenization_utils import PreTrainedTokenizer
24
+ from transformers.utils import logging
25
+
26
logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {'vocab_file': './tokenizer.model'}

PRETRAINED_VOCAB_FILES_MAP = {}


# Modified from transformers.model.llama.tokenization_llama.LlamaTokenizer
class InternLM2Tokenizer(PreTrainedTokenizer):
    """
    Construct a InternLM2 tokenizer backed by a SentencePiece model.

    Args:
        vocab_file (`str`):
            Path to the SentencePiece model file that holds the vocabulary.
        unk_token, bos_token, eos_token, pad_token (`str`):
            Special-token strings forwarded to `PreTrainedTokenizer.__init__`.
        sp_model_kwargs (`dict`, *optional*):
            Keyword arguments passed to `sentencepiece.SentencePieceProcessor`.
        add_bos_token (`bool`):
            Whether `build_inputs_with_special_tokens` prepends the BOS id.
        add_eos_token (`bool`):
            Whether `build_inputs_with_special_tokens` appends the EOS id.
        decode_with_prefix_space (`bool`):
            Stored on the instance; not read anywhere else in this class.
        clean_up_tokenization_spaces (`bool`):
            Forwarded to `PreTrainedTokenizer.__init__`.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    model_input_names = ['input_ids', 'attention_mask']
    _auto_class = 'AutoTokenizer'

    def __init__(
        self,
        vocab_file,
        unk_token='<unk>',
        bos_token='<s>',
        eos_token='</s>',
        pad_token='</s>',
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        decode_with_prefix_space=False,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ):
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.decode_with_prefix_space = decode_with_prefix_space
        # The SentencePiece model is loaded before the base-class constructor runs.
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)
        self._no_prefix_space_tokens = None
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

    @property
    def no_prefix_space_tokens(self):
        """Lazily built set of vocabulary *ids* whose piece does not start with '▁'."""
        if self._no_prefix_space_tokens is None:
            vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
            self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith('▁')}
        return self._no_prefix_space_tokens

    @property
    def vocab_size(self):
        """Returns vocab size"""
        return self.sp_model.get_piece_size()

    @property
    def bos_token_id(self) -> Optional[int]:
        """BOS id as reported by the SentencePiece model."""
        return self.sp_model.bos_id()

    @property
    def eos_token_id(self) -> Optional[int]:
        """EOS id as reported by the SentencePiece model."""
        return self.sp_model.eos_id()

    def get_vocab(self):
        """Returns vocab as a dict"""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text):
        """Returns a tokenized string."""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        return token

    def _maybe_add_prefix_space(self, tokens, decoded):
        """Prepend a space to `decoded` unless the first token is in `no_prefix_space_tokens`."""
        # NOTE(review): `no_prefix_space_tokens` holds integer ids, whereas
        # `convert_tokens_to_string` passes token strings here, so the membership
        # test looks like it can never match and the space is always added. The
        # caller strips one leading character again, so today this is neutral.
        # Confirm the intent before changing either side.
        if tokens and tokens[0] not in self.no_prefix_space_tokens:
            return ' ' + decoded
        else:
            return decoded

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        current_sub_tokens = []
        out_string = ''
        prev_is_special = False
        for token in tokens:
            # make sure that special tokens are not decoded using sentencepiece model
            if token in self.all_special_tokens:
                if not prev_is_special:
                    out_string += ' '
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        out_string = self.clean_up_tokenization(out_string)
        out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string)
        # Drops the first character of the result (see `_maybe_add_prefix_space`).
        return out_string[1:]

    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Optional[Tuple[str]]:
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                When given, the output file is named `<filename_prefix>-<vocab file name>`.

        Returns:
            `Tuple(str)`: Paths to the files saved, or `None` (after logging an error) when
            `save_directory` is not an existing directory.
        """
        if not os.path.isdir(save_directory):
            logger.error(f'Vocabulary path ({save_directory}) should be a directory')
            return
        # VOCAB_FILES_NAMES stores './tokenizer.model'. Use only the base name so a
        # prefix yields '<prefix>-tokenizer.model' instead of '<prefix>-./tokenizer.model',
        # which would point into a non-existent '<prefix>-.' directory.
        vocab_basename = os.path.basename(VOCAB_FILES_NAMES['vocab_file'])
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + '-' if filename_prefix else '') + vocab_basename
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            # The original file is gone: re-serialize the in-memory model instead.
            with open(out_vocab_file, 'wb') as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Concatenate the sequence(s) and add special tokens.

        Layout: `[bos] + token_ids_0 + token_ids_1 + [eos]`, where the BOS id is present
        only if `add_bos_token` is set, the EOS id only if `add_eos_token` is set, and
        `token_ids_1` only if it is not `None`.
        """
        if self.add_bos_token:
            bos_token_ids = [self.bos_token_id]
        else:
            bos_token_ids = []

        output = bos_token_ids + token_ids_0

        if token_ids_1 is not None:
            output = output + token_ids_1

        if self.add_eos_token:
            output = output + [self.eos_token_id]

        return output

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        The mask follows the layout produced by `build_inputs_with_special_tokens`, so it has the same length as
        that method's output and honours `add_bos_token` / `add_eos_token`.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        bos_mask = [1] if self.add_bos_token else []
        eos_mask = [1] if self.add_eos_token else []
        # No special token is inserted between the two sequences of a pair.
        pair_mask = [0] * len(token_ids_1) if token_ids_1 is not None else []
        return bos_mask + [0] * len(token_ids_0) + pair_mask + eos_mask

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create the token type ids for the given sequence(s). This tokenizer does not distinguish segments, so a
        list of zeros is returned, one per position of the `build_inputs_with_special_tokens` output.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """
        # Derive the length from the real layout so the two can never disagree.
        return [0] * len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1))
tokenizer_config.json ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": false,
5
+ "added_tokens_decoder": {
6
+ "151643": {
7
+ "content": "<|endoftext|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "151644": {
15
+ "content": "<|im_start|>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "151645": {
23
+ "content": "<|im_end|>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "151646": {
31
+ "content": "<|object_ref_start|>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ },
38
+ "151647": {
39
+ "content": "<|object_ref_end|>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": true
45
+ },
46
+ "151648": {
47
+ "content": "<|box_start|>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": true
53
+ },
54
+ "151649": {
55
+ "content": "<|box_end|>",
56
+ "lstrip": false,
57
+ "normalized": false,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": true
61
+ },
62
+ "151650": {
63
+ "content": "<|quad_start|>",
64
+ "lstrip": false,
65
+ "normalized": false,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": true
69
+ },
70
+ "151651": {
71
+ "content": "<|quad_end|>",
72
+ "lstrip": false,
73
+ "normalized": false,
74
+ "rstrip": false,
75
+ "single_word": false,
76
+ "special": true
77
+ },
78
+ "151652": {
79
+ "content": "<|vision_start|>",
80
+ "lstrip": false,
81
+ "normalized": false,
82
+ "rstrip": false,
83
+ "single_word": false,
84
+ "special": true
85
+ },
86
+ "151653": {
87
+ "content": "<|vision_end|>",
88
+ "lstrip": false,
89
+ "normalized": false,
90
+ "rstrip": false,
91
+ "single_word": false,
92
+ "special": true
93
+ },
94
+ "151654": {
95
+ "content": "<|vision_pad|>",
96
+ "lstrip": false,
97
+ "normalized": false,
98
+ "rstrip": false,
99
+ "single_word": false,
100
+ "special": true
101
+ },
102
+ "151655": {
103
+ "content": "<|image_pad|>",
104
+ "lstrip": false,
105
+ "normalized": false,
106
+ "rstrip": false,
107
+ "single_word": false,
108
+ "special": true
109
+ },
110
+ "151656": {
111
+ "content": "<|video_pad|>",
112
+ "lstrip": false,
113
+ "normalized": false,
114
+ "rstrip": false,
115
+ "single_word": false,
116
+ "special": true
117
+ },
118
+ "151657": {
119
+ "content": "<tool_call>",
120
+ "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false,
124
+ "special": false
125
+ },
126
+ "151658": {
127
+ "content": "</tool_call>",
128
+ "lstrip": false,
129
+ "normalized": false,
130
+ "rstrip": false,
131
+ "single_word": false,
132
+ "special": false
133
+ },
134
+ "151659": {
135
+ "content": "<|fim_prefix|>",
136
+ "lstrip": false,
137
+ "normalized": false,
138
+ "rstrip": false,
139
+ "single_word": false,
140
+ "special": false
141
+ },
142
+ "151660": {
143
+ "content": "<|fim_middle|>",
144
+ "lstrip": false,
145
+ "normalized": false,
146
+ "rstrip": false,
147
+ "single_word": false,
148
+ "special": false
149
+ },
150
+ "151661": {
151
+ "content": "<|fim_suffix|>",
152
+ "lstrip": false,
153
+ "normalized": false,
154
+ "rstrip": false,
155
+ "single_word": false,
156
+ "special": false
157
+ },
158
+ "151662": {
159
+ "content": "<|fim_pad|>",
160
+ "lstrip": false,
161
+ "normalized": false,
162
+ "rstrip": false,
163
+ "single_word": false,
164
+ "special": false
165
+ },
166
+ "151663": {
167
+ "content": "<|repo_name|>",
168
+ "lstrip": false,
169
+ "normalized": false,
170
+ "rstrip": false,
171
+ "single_word": false,
172
+ "special": false
173
+ },
174
+ "151664": {
175
+ "content": "<|file_sep|>",
176
+ "lstrip": false,
177
+ "normalized": false,
178
+ "rstrip": false,
179
+ "single_word": false,
180
+ "special": false
181
+ },
182
+ "151665": {
183
+ "content": "<img>",
184
+ "lstrip": false,
185
+ "normalized": false,
186
+ "rstrip": false,
187
+ "single_word": false,
188
+ "special": true
189
+ },
190
+ "151666": {
191
+ "content": "</img>",
192
+ "lstrip": false,
193
+ "normalized": false,
194
+ "rstrip": false,
195
+ "single_word": false,
196
+ "special": true
197
+ },
198
+ "151667": {
199
+ "content": "<IMG_CONTEXT>",
200
+ "lstrip": false,
201
+ "normalized": false,
202
+ "rstrip": false,
203
+ "single_word": false,
204
+ "special": true
205
+ },
206
+ "151668": {
207
+ "content": "<quad>",
208
+ "lstrip": false,
209
+ "normalized": false,
210
+ "rstrip": false,
211
+ "single_word": false,
212
+ "special": true
213
+ },
214
+ "151669": {
215
+ "content": "</quad>",
216
+ "lstrip": false,
217
+ "normalized": false,
218
+ "rstrip": false,
219
+ "single_word": false,
220
+ "special": true
221
+ },
222
+ "151670": {
223
+ "content": "<ref>",
224
+ "lstrip": false,
225
+ "normalized": false,
226
+ "rstrip": false,
227
+ "single_word": false,
228
+ "special": true
229
+ },
230
+ "151671": {
231
+ "content": "</ref>",
232
+ "lstrip": false,
233
+ "normalized": false,
234
+ "rstrip": false,
235
+ "single_word": false,
236
+ "special": true
237
+ },
238
+ "151672": {
239
+ "content": "<box>",
240
+ "lstrip": false,
241
+ "normalized": false,
242
+ "rstrip": false,
243
+ "single_word": false,
244
+ "special": true
245
+ },
246
+ "151673": {
247
+ "content": "</box>",
248
+ "lstrip": false,
249
+ "normalized": false,
250
+ "rstrip": false,
251
+ "single_word": false,
252
+ "special": true
253
+ },
254
+ "151674": {
255
+ "content": "<fast_img>",
256
+ "lstrip": false,
257
+ "normalized": false,
258
+ "rstrip": false,
259
+ "single_word": false,
260
+ "special": true
261
+ },
262
+ "151675": {
263
+ "content": "</fast_img>",
264
+ "lstrip": false,
265
+ "normalized": false,
266
+ "rstrip": false,
267
+ "single_word": false,
268
+ "special": true
269
+ },
270
+ "151676": {
271
+ "content": "<FAST_IMG_CONTEXT>",
272
+ "lstrip": false,
273
+ "normalized": false,
274
+ "rstrip": false,
275
+ "single_word": false,
276
+ "special": true
277
+ },
278
+ "151677": {
279
+ "content": "[SEG]",
280
+ "lstrip": false,
281
+ "normalized": false,
282
+ "rstrip": false,
283
+ "single_word": false,
284
+ "special": true
285
+ },
286
+ "151678": {
287
+ "content": "<p>",
288
+ "lstrip": false,
289
+ "normalized": false,
290
+ "rstrip": false,
291
+ "single_word": false,
292
+ "special": true
293
+ },
294
+ "151679": {
295
+ "content": "</p>",
296
+ "lstrip": false,
297
+ "normalized": false,
298
+ "rstrip": false,
299
+ "single_word": false,
300
+ "special": true
301
+ },
302
+ "151680": {
303
+ "content": "<vp>",
304
+ "lstrip": false,
305
+ "normalized": false,
306
+ "rstrip": false,
307
+ "single_word": false,
308
+ "special": true
309
+ },
310
+ "151681": {
311
+ "content": "</vp>",
312
+ "lstrip": false,
313
+ "normalized": false,
314
+ "rstrip": false,
315
+ "single_word": false,
316
+ "special": true
317
+ }
318
+ },
319
+ "additional_special_tokens": [
320
+ "<|im_start|>",
321
+ "<|im_end|>",
322
+ "<|object_ref_start|>",
323
+ "<|object_ref_end|>",
324
+ "<|box_start|>",
325
+ "<|box_end|>",
326
+ "<|quad_start|>",
327
+ "<|quad_end|>",
328
+ "<|vision_start|>",
329
+ "<|vision_end|>",
330
+ "<|vision_pad|>",
331
+ "<|image_pad|>",
332
+ "<|video_pad|>"
333
+ ],
334
+ "bos_token": null,
335
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
336
+ "clean_up_tokenization_spaces": false,
337
+ "eos_token": "<|im_end|>",
338
+ "errors": "replace",
339
+ "extra_special_tokens": {},
340
+ "model_max_length": 12288,
341
+ "pad_token": "<|endoftext|>",
342
+ "padding_side": "right",
343
+ "split_special_tokens": false,
344
+ "tokenizer_class": "Qwen2Tokenizer",
345
+ "unk_token": null
346
+ }