hithink-ai commited on
Commit
bc59815
·
verified ·
1 Parent(s): 2039032

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,8 +1,3 @@
1
- ---
2
- license: apache-2.0
3
- language:
4
- - en
5
- base_model:
6
- - Qwen/Qwen2.5-VL-7B-Instruct
7
- - Qwen/Qwen2-Audio-7B-Instruct
8
- ---
 
1
+ ---
2
+ license: apache-2.0
3
+ ---
 
 
 
 
 
config.json ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "HithinkOmniForConditionalGeneration"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "audio_config": {
7
+ "activation_dropout": 0.0,
8
+ "activation_function": "gelu",
9
+ "attention_dropout": 0.0,
10
+ "d_model": 1280,
11
+ "dropout": 0.0,
12
+ "encoder_attention_heads": 20,
13
+ "encoder_ffn_dim": 5120,
14
+ "encoder_layerdrop": 0.0,
15
+ "encoder_layers": 32,
16
+ "init_std": 0.02,
17
+ "max_source_positions": 1500,
18
+ "model_type": "hithink_audio_encoder",
19
+ "num_hidden_layers": 32,
20
+ "num_mel_bins": 128,
21
+ "scale_embedding": false
22
+ },
23
+ "audio_decoder_config": {
24
+ "_attn_implementation_autoset": false,
25
+ "_name_or_path": "",
26
+ "add_cross_attention": false,
27
+ "architectures": null,
28
+ "bad_words_ids": null,
29
+ "begin_suppress_tokens": null,
30
+ "bos_token_id": null,
31
+ "chunk_size_feed_forward": 0,
32
+ "codebook_size": 1024,
33
+ "cross_attention_hidden_size": null,
34
+ "decoder_start_token_id": null,
35
+ "diversity_penalty": 0.0,
36
+ "do_sample": false,
37
+ "early_stopping": false,
38
+ "encoder_no_repeat_ngram_size": 0,
39
+ "eos_token_id": null,
40
+ "exponential_decay_length_penalty": null,
41
+ "finetuning_task": null,
42
+ "forced_bos_token_id": null,
43
+ "forced_eos_token_id": null,
44
+ "id2label": {
45
+ "0": "LABEL_0",
46
+ "1": "LABEL_1"
47
+ },
48
+ "is_decoder": false,
49
+ "is_encoder_decoder": false,
50
+ "label2id": {
51
+ "LABEL_0": 0,
52
+ "LABEL_1": 1
53
+ },
54
+ "length_penalty": 1.0,
55
+ "max_length": 20,
56
+ "min_length": 0,
57
+ "model_type": "hithink_omni_audio_decoder",
58
+ "no_repeat_ngram_size": 0,
59
+ "num_beam_groups": 1,
60
+ "num_beams": 1,
61
+ "num_codebooks": 8,
62
+ "num_hidden_layers": 6,
63
+ "num_return_sequences": 1,
64
+ "output_attentions": false,
65
+ "output_hidden_states": false,
66
+ "output_scores": false,
67
+ "pad_token_id": null,
68
+ "prefix": null,
69
+ "problem_type": null,
70
+ "pruned_heads": {},
71
+ "remove_invalid_values": false,
72
+ "repetition_penalty": 1.0,
73
+ "return_dict": true,
74
+ "return_dict_in_generate": false,
75
+ "sep_token_id": null,
76
+ "suppress_tokens": null,
77
+ "task_specific_params": null,
78
+ "temperature": 1.0,
79
+ "tf_legacy_loss": false,
80
+ "tie_encoder_decoder": false,
81
+ "tie_word_embeddings": true,
82
+ "tokenizer_class": null,
83
+ "top_k": 50,
84
+ "top_p": 1.0,
85
+ "torch_dtype": null,
86
+ "torchscript": false,
87
+ "typical_p": 1.0,
88
+ "use_bfloat16": false
89
+ },
90
+ "audio_token_index": 151665,
91
+ "auto_map": {
92
+ "AutoConfig": "configuration_hithinkomni.HithinkOmniConfig",
93
+ "AutoModel": "modeling_hithinkomni.HithinkOmniForConditionalGeneration"
94
+ },
95
+ "bos_token_id": 151643,
96
+ "eos_token_id": 151645,
97
+ "hidden_act": "silu",
98
+ "hidden_size": 3584,
99
+ "ignore_index": -100,
100
+ "image_token_id": 151655,
101
+ "initializer_range": 0.02,
102
+ "intermediate_size": 18944,
103
+ "max_position_embeddings": 128000,
104
+ "max_window_layers": 28,
105
+ "model_type": "hithink_omni",
106
+ "num_attention_heads": 28,
107
+ "num_hidden_layers": 28,
108
+ "num_key_value_heads": 4,
109
+ "pad_token_id": 151643,
110
+ "rms_norm_eps": 1e-06,
111
+ "rope_scaling": {
112
+ "mrope_section": [
113
+ 16,
114
+ 24,
115
+ 24
116
+ ],
117
+ "rope_type": "default",
118
+ "type": "default"
119
+ },
120
+ "rope_theta": 1000000.0,
121
+ "sliding_window": 32768,
122
+ "tie_word_embeddings": false,
123
+ "torch_dtype": "bfloat16",
124
+ "transformers_version": "4.50.3",
125
+ "use_cache": false,
126
+ "use_sliding_window": false,
127
+ "video_token_id": 151656,
128
+ "vision_config": {
129
+ "depth": 32,
130
+ "fullatt_block_indexes": [
131
+ 7,
132
+ 15,
133
+ 23,
134
+ 31
135
+ ],
136
+ "hidden_act": "silu",
137
+ "hidden_size": 1280,
138
+ "in_channels": 3,
139
+ "in_chans": 3,
140
+ "intermediate_size": 3420,
141
+ "model_type": "hithink_omni",
142
+ "num_heads": 16,
143
+ "out_hidden_size": 3584,
144
+ "patch_size": 14,
145
+ "spatial_merge_size": 2,
146
+ "spatial_patch_size": 14,
147
+ "temporal_patch_size": 2,
148
+ "tokens_per_second": 2,
149
+ "window_size": 112
150
+ },
151
+ "vision_end_token_id": 151653,
152
+ "vision_start_token_id": 151652,
153
+ "vision_token_id": 151654,
154
+ "vocab_size": 151665,
155
+ "vocab_size_ext": 3
156
+ }
configuration_hithinkomni.py ADDED
@@ -0,0 +1,370 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Union
3
+
4
+ from transformers.configuration_utils import PretrainedConfig
5
+ from transformers.models.auto import CONFIG_MAPPING
6
+ from transformers.modeling_rope_utils import rope_config_validation
7
+ from transformers.utils import logging
8
+
9
+
10
+ logger = logging.get_logger(__name__)
11
+
12
+
13
class HithinkOmniVisionConfig(PretrainedConfig):
    """Configuration for the HithinkOmni vision tower.

    Holds the hyper-parameters of the windowed-attention ViT backbone:
    depth, head count, patching scheme, and which blocks use full
    (non-windowed) attention. Defaults correspond to the released checkpoint.
    """

    model_type = "hithink_omni"
    base_config_key = "vision_config"

    def __init__(
        self,
        depth=32,
        hidden_size=3584,
        hidden_act="silu",
        intermediate_size=3420,
        num_heads=16,
        in_channels=3,
        patch_size=14,
        spatial_merge_size=2,
        temporal_patch_size=2,
        tokens_per_second=4,
        window_size=112,
        out_hidden_size=3584,
        fullatt_block_indexes=[7, 15, 23, 31],
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Transformer backbone shape.
        self.depth = depth
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.num_heads = num_heads

        # Patch embedding of images / video frames.
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        self.tokens_per_second = tokens_per_second

        # Windowed attention layout: blocks listed in `fullatt_block_indexes`
        # use full attention; the rest attend within `window_size`.
        # NOTE(review): semantics inferred from the parameter names — the
        # attention implementation lives in the modeling file; confirm there.
        self.window_size = window_size
        self.fullatt_block_indexes = fullatt_block_indexes

        # Width of the visual features handed to the language model.
        self.out_hidden_size = out_hidden_size
49
+
50
+
51
class HithinkAudioEncoderConfig(PretrainedConfig):
    r"""
    Configuration class for a [`HithinkAudioEncoder`], the Whisper-style audio
    encoder used by HithinkOmni. Instantiating this configuration with the
    defaults yields a configuration similar to the audio encoder of the
    HithinkAudio architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to
    control the model outputs. Read the documentation from [`PretrainedConfig`]
    for more information.

    Args:
        num_mel_bins (`int`, *optional*, defaults to 128):
            Number of mel features used per input frame. Should correspond to
            the value used in the `HithinkOmniProcessor` class.
        encoder_layers (`int`, *optional*, defaults to 32):
            Number of encoder layers.
        encoder_attention_heads (`int`, *optional*, defaults to 20):
            Number of attention heads for each attention layer in the encoder.
        encoder_ffn_dim (`int`, *optional*, defaults to 5120):
            Dimensionality of the "intermediate" (feed-forward) layer in the encoder.
        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
            The LayerDrop probability for the encoder. See the
            [LayerDrop paper](https://arxiv.org/abs/1909.11556) for details.
        d_model (`int`, *optional*, defaults to 1280):
            Dimensionality of the layers.
        dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability for all fully connected layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            Dropout ratio for the attention probabilities.
        activation_function (`str`, *optional*, defaults to `"gelu"`):
            Non-linear activation function in the encoder and pooler. If a
            string, `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
        activation_dropout (`float`, *optional*, defaults to 0.0):
            Dropout ratio for activations inside the fully connected layer.
        scale_embedding (`bool`, *optional*, defaults to `False`):
            Scale embeddings by dividing by sqrt(d_model).
        init_std (`float`, *optional*, defaults to 0.02):
            Standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        max_source_positions (`int`, *optional*, defaults to 1500):
            The maximum sequence length of log-mel filter-bank features that
            this model might ever be used with.

    Example:

    ```python
    >>> from transformers import HithinkAudioEncoderConfig, HithinkAudioEncoder

    >>> # Initializing a HithinkAudioEncoderConfig
    >>> configuration = HithinkAudioEncoderConfig()

    >>> # Initializing a HithinkAudioEncoder (with random weights)
    >>> model = HithinkAudioEncoder(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "hithink_audio_encoder"

    def __init__(
        self,
        num_mel_bins=128,
        encoder_layers=32,
        encoder_attention_heads=20,
        encoder_ffn_dim=5120,
        encoder_layerdrop=0.0,
        d_model=1280,
        dropout=0.0,
        attention_dropout=0.0,
        activation_function="gelu",
        activation_dropout=0.0,
        scale_embedding=False,
        init_std=0.02,
        max_source_positions=1500,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Model width / depth.
        self.d_model = d_model
        self.encoder_layers = encoder_layers
        self.encoder_attention_heads = encoder_attention_heads
        self.encoder_ffn_dim = encoder_ffn_dim
        # Mirror `encoder_layers` under the attribute name generic HF
        # utilities look for.
        self.num_hidden_layers = encoder_layers

        # Input feature geometry.
        self.num_mel_bins = num_mel_bins
        self.max_source_positions = max_source_positions

        # Regularization.
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.encoder_layerdrop = encoder_layerdrop

        # Initialization / activations.
        self.activation_function = activation_function
        self.init_std = init_std
        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
142
+
143
+
144
class HithinkAudioDecoderConfig(PretrainedConfig):
    """Configuration for the HithinkOmni audio (speech-token) decoder.

    Args:
        num_hidden_layers (`int`, *optional*, defaults to 6):
            Number of hidden layers in the decoder.
        codebook_size (`int`, *optional*, defaults to 1024):
            Number of entries per acoustic codebook.
        num_codebooks (`int`, *optional*, defaults to 8):
            Number of parallel codebooks predicted by the decoder.
    """

    model_type = "hithink_omni_audio_decoder"

    def __init__(self, num_hidden_layers=6, codebook_size=1024, num_codebooks=8, **kwargs):
        super().__init__(**kwargs)
        self.num_hidden_layers = num_hidden_layers
        self.codebook_size = codebook_size
        self.num_codebooks = num_codebooks
159
+
160
+
161
class HithinkOmniConfig(PretrainedConfig):
    r"""
    Configuration class for [`HithinkOmniModel`] /
    [`HithinkOmniForConditionalGeneration`]. It combines the language-model
    hyper-parameters with sub-configurations for the vision tower
    (`vision_config`), the Whisper-style audio encoder (`audio_config`) and the
    optional speech-token decoder (`audio_decoder_config`).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to
    control the model outputs. Read the documentation from [`PretrainedConfig`]
    for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 152064):
            Vocabulary size of the model; the number of different tokens
            representable by the `inputs_ids` passed to [`HithinkOmniModel`].
        vocab_size_ext (`int`, *optional*):
            Extension vocabulary size. Stored as-is; its exact semantics are
            defined in the modeling file (presumably extra non-text tokens
            appended after `vocab_size` — confirm there).
        hidden_size (`int`, *optional*, defaults to 8192):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 29568):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 80):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 64):
            Number of attention heads for each attention layer.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            Number of key/value heads for Grouped Query Attention. Equal to
            `num_attention_heads` gives MHA, `1` gives MQA, otherwise GQA
            (https://arxiv.org/pdf/2305.13245.pdf). `None` falls back to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation of the truncated_normal_initializer.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether the model should return the last key/values attentions.
            Only relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 1000000.0):
            The base period of the RoPE embeddings.
        use_sliding_window (`bool`, *optional*, defaults to `False`):
            Whether to use sliding window attention.
        sliding_window (`int`, *optional*, defaults to 4096):
            Sliding window attention (SWA) window size.
        max_window_layers (`int`, *optional*, defaults to 80):
            Number of bottom layers that use SWA; the top layers use full
            attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        vision_config (`Dict` or `HithinkOmniVisionConfig`, *optional*):
            Config for the visual encoder; a dict is converted, `None` builds
            defaults, and an already-built config is used as-is.
        rope_scaling (`Dict`, *optional*):
            Scaling configuration for the RoPE embeddings. `rope_type` may be
            one of `['default', 'linear', 'dynamic', 'yarn', 'longrope',
            'llama3']`; the remaining keys (`factor`,
            `original_max_position_embeddings`, `attention_factor`,
            `beta_fast`, `beta_slow`, `short_factor`, `long_factor`,
            `low_freq_factor`, `high_freq_factor`) follow the standard
            Transformers RoPE-scaling contract and are checked by
            `rope_config_validation`. A legacy `type` key is migrated to
            `rope_type` for backward compatibility, with `'mrope'` mapped to
            `'default'` (mrope does default RoPE calculations); the
            `mrope_section` entry is passed through unvalidated.
        audio_config (`Dict` or `HithinkAudioEncoderConfig`, *optional*):
            Config for the audio encoder; a dict is converted, `None` builds
            the checkpoint defaults, and an already-built config is used as-is.
        audio_token_index (`int`, *optional*, defaults to 151665):
            Token id used as the audio placeholder in the input ids.
        audio_decoder_config (`Dict` or `HithinkAudioDecoderConfig`, *optional*):
            Config for the speech-token decoder; `None` disables it.

    ```python
    >>> from transformers import HithinkOmniForConditionalGeneration, HithinkOmniConfig

    >>> # Initializing a HithinkOmni style configuration
    >>> configuration = HithinkOmniConfig()

    >>> # Initializing a model from the HithinkOmni-7B style configuration
    >>> model = HithinkOmniForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "hithink_omni"
    sub_configs = {"vision_config": HithinkOmniVisionConfig}
    keys_to_ignore_at_inference = ["past_key_values"]
    # Default tensor parallel plan for base model `HithinkOmni`
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }

    def __init__(
        self,
        vocab_size=152064,
        vocab_size_ext=None,
        hidden_size=8192,
        intermediate_size=29568,
        num_hidden_layers=80,
        num_attention_heads=64,
        num_key_value_heads=8,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-05,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=1000000.0,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=80,
        attention_dropout=0.0,
        vision_config=None,
        rope_scaling=None,
        audio_config=None,
        audio_token_index=151665,
        audio_decoder_config=None,
        **kwargs,
    ):
        # Vision tower: accept a dict, an already-built config, or None.
        if isinstance(vision_config, dict):
            self.vision_config = self.sub_configs["vision_config"](**vision_config)
        elif vision_config is None:
            self.vision_config = self.sub_configs["vision_config"]()
        else:
            # Fix: previously a pre-built HithinkOmniVisionConfig left
            # `self.vision_config` unset, causing AttributeError on later access.
            self.vision_config = vision_config

        self.vocab_size = vocab_size
        self.vocab_size_ext = vocab_size_ext
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.use_sliding_window = use_sliding_window
        self.sliding_window = sliding_window
        self.max_window_layers = max_window_layers

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout
        self.rope_scaling = rope_scaling

        # define audio config
        self.audio_token_index = audio_token_index
        self.ignore_index = -100
        if isinstance(audio_config, dict):
            audio_config = HithinkAudioEncoderConfig(**audio_config)
        elif audio_config is None:
            # Defaults matching the shipped checkpoint (see config.json).
            audio_config = HithinkAudioEncoderConfig(
                d_model=1280,
                encoder_attention_heads=20,
                encoder_ffn_dim=5120,
                encoder_layerdrop=0.0,
                encoder_layers=32,
                num_mel_bins=128,
                max_source_positions=1500,
                scale_embedding=False,
                activation_function="gelu",
            )
        self.audio_config = audio_config

        # Speech-token decoder: optional. Accept a dict or an already-built
        # config; anything else (i.e. None) disables the decoder.
        if isinstance(audio_decoder_config, dict):
            self.audio_decoder_config = HithinkAudioDecoderConfig(**audio_decoder_config)
        elif isinstance(audio_decoder_config, HithinkAudioDecoderConfig):
            # Fix: previously a pre-built config object was silently dropped
            # (replaced with None), inconsistent with `audio_config` handling.
            self.audio_decoder_config = audio_decoder_config
        else:
            self.audio_decoder_config = None

        # Validate the correctness of rotary position embeddings parameters
        # BC: if there is a 'type' field, move it to 'rope_type'.
        # and change type from 'mrope' to 'default' because `mrope` does defeault RoPE calculations
        # one can set it to "linear"/"dynamic" etc. to have scaled RoPE
        # TODO: @raushan update config in the hub
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            if self.rope_scaling["type"] == "mrope":
                self.rope_scaling["type"] = "default"
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        rope_config_validation(self, ignore_keys={"mrope_section"})

        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "eos_token_id": [
4
+ 151645,
5
+ 151643
6
+ ],
7
+ "pad_token_id": 151643,
8
+ "transformers_version": "4.50.3"
9
+ }
model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a962f71e35583dcdec95dc44f51b033c7a35a65e4ae238e4796c988e793f8be
3
+ size 9977516952
model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f817ff1bea9de1b7faa07b69dc21ae055479fb4d768b3dc7d4e08ec64cff6919
3
+ size 9943393624
model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3736c5b8f17c19392d11b217f212660a135c844aa44a26d9bf1697c45a99592
3
+ size 855522352
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_hithinkomni.py ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_hithinkomni.HithinkOmniProcessor"
4
+ },
5
+ "chunk_length": 30,
6
+ "dither": 0.0,
7
+ "feature_extractor_type": "WhisperFeatureExtractor",
8
+ "feature_size": 128,
9
+ "hop_length": 160,
10
+ "image_mean": [
11
+ 0.48145466,
12
+ 0.4578275,
13
+ 0.40821073
14
+ ],
15
+ "image_processor_type": "Qwen2VLImageProcessor",
16
+ "image_std": [
17
+ 0.26862954,
18
+ 0.26130258,
19
+ 0.27577711
20
+ ],
21
+ "max_pixels": 12845056,
22
+ "merge_size": 2,
23
+ "min_pixels": 3136,
24
+ "n_fft": 400,
25
+ "n_samples": 480000,
26
+ "nb_max_frames": 3000,
27
+ "padding_side": "right",
28
+ "padding_value": 0.0,
29
+ "patch_size": 14,
30
+ "processor_class": "HithinkOmniProcessor",
31
+ "return_attention_mask": false,
32
+ "sampling_rate": 16000,
33
+ "temporal_patch_size": 2
34
+ }
processing_hithinkomni.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional, Union
2
+
3
+ import numpy as np
4
+
5
+ from transformers import BatchFeature
6
+ from transformers.tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput
7
+ from transformers.image_utils import ImageInput, VideoInput
8
+ from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs
9
+
10
+
11
class HithinkOmniVideosProcessorKwargs(VideosKwargs, total=False):
    """Extra keyword arguments accepted for video inputs.

    `fps` is the sampling rate used to derive `second_per_grid_ts`: either a
    single value applied to every video, or one value per video.
    """

    fps: Union[List[float], float]
13
+
14
+
15
class HithinkOmniProcessorKwargs(ProcessingKwargs, total=False):
    """Merged keyword-argument schema for `HithinkOmniProcessor.__call__`.

    Defaults: text is not padded, and videos are assumed to be sampled at
    2.0 fps unless overridden by the caller.
    """

    videos_kwargs: HithinkOmniVideosProcessorKwargs
    _defaults = {
        "text_kwargs": {"padding": False},
        "videos_kwargs": {"fps": 2.0},
    }
23
+
24
+
25
+ class HithinkOmniProcessor(ProcessorMixin):
26
+ r"""
27
+ Constructs a HithinkOmni processor which wraps a Qwen2.5-VL image processor and a HithinkOmni tokenizer into a single processor.
28
+ [`HithinkOmniProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`PreTrainedTokenizerFast`]. See the
29
+ [`~HithinkOmniProcessor.__call__`] and [`~HithinkOmniProcessor.decode`] for more information.
30
+ Args:
31
+ image_processor ([`Qwen2VLImageProcessor`], *optional*):
32
+ The image processor is a required input.
33
+ feature_extractor ([`WhisperFeatureExtractor`], *optional*):
34
+ The feature extractor is a required input.
35
+ tokenizer ([`PreTrainedTokenizerFast`], *optional*):
36
+ The tokenizer is a required input.
37
+ chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
38
+ in a chat into a tokenizable string.
39
+ """
40
+
41
+ attributes = ["image_processor", "feature_extractor", "tokenizer"]
42
+ valid_kwargs = ["chat_template"]
43
+
44
+ image_processor_class = "Qwen2VLImageProcessor"
45
+ feature_extractor_class = "WhisperFeatureExtractor"
46
+ tokenizer_class = "PreTrainedTokenizerFast"
47
+
48
+ def __init__(self, image_processor=None, feature_extractor=None, tokenizer=None, chat_template=None, **kwargs):
49
+ tokenizer.model_input_names = ["input_ids", "attention_mask"] # do not include token_type_ids
50
+ super().__init__(image_processor, feature_extractor, tokenizer, chat_template=chat_template)
51
+ self.image_token = getattr(tokenizer, 'image_token', '<|image_pad|>')
52
+ self.video_token = getattr(tokenizer, 'video_token', '<|video_pad|>')
53
+ self.chat_template = tokenizer.chat_template if chat_template is None else chat_template
54
+
55
+ def __call__(
56
+ self,
57
+ images: ImageInput = None,
58
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
59
+ videos: VideoInput = None,
60
+ audios: Union[np.ndarray, List[np.ndarray]] = None,
61
+ sampling_rate: Optional[int] = None,
62
+ **kwargs: Unpack[HithinkOmniProcessorKwargs],
63
+ ) -> BatchFeature:
64
+ """
65
+ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
66
+ and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] if `text` is not `None` to encode
67
+ the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to
68
+ Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.
69
+
70
+ Args:
71
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
72
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
73
+ tensor. Both channels-first and channels-last formats are supported.
74
+ text (`str`, `List[str]`, `List[List[str]]`):
75
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
76
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
77
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
78
+ videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
79
+ The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
80
+ tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
81
+ audios (`np.ndarray`, `List[np.ndarray]`):
82
+ The audio or batch of audios to be prepared. Each audio can be a NumPy array.
83
+ sampling_rate (`int`, defaults to 16000):
84
+ The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
85
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
86
+ If set, will return tensors of a particular framework. Acceptable values are:
87
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
88
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
89
+ - `'np'`: Return NumPy `np.ndarray` objects.
90
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
91
+
92
+ Returns:
93
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
94
+
95
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
96
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
97
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
98
+ `None`).
99
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
100
+ - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
101
+ - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
102
+ - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
103
+ - **second_per_grid_ts** -- List of video seconds per time grid. Returned when `videos` is not `None`.
104
+ """
105
+ output_kwargs = self._merge_kwargs(
106
+ HithinkOmniProcessorKwargs,
107
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
108
+ **kwargs,
109
+ )
110
+ if images is not None:
111
+ image_inputs = self.image_processor(images=images, videos=None, **output_kwargs["images_kwargs"])
112
+ image_grid_thw = image_inputs["image_grid_thw"]
113
+ else:
114
+ image_inputs = {}
115
+ image_grid_thw = None
116
+
117
+ if videos is not None:
118
+ videos_inputs = self.image_processor(images=None, videos=videos, **output_kwargs["images_kwargs"])
119
+ video_grid_thw = videos_inputs["video_grid_thw"]
120
+
121
+ fps = output_kwargs["videos_kwargs"].pop("fps", 2.0)
122
+ if isinstance(fps, (int, float)):
123
+ second_per_grid_ts = [self.image_processor.temporal_patch_size / fps] * len(video_grid_thw)
124
+ elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw):
125
+ second_per_grid_ts = [self.image_processor.temporal_patch_size / tmp for tmp in fps]
126
+ else:
127
+ raise ValueError(
128
+ f"The length of fps ({len(fps) if hasattr(fps, '__len__') else fps}) must be equal to the length of video_grid_thw ({len(video_grid_thw)}) or fps should be a single number."
129
+ )
130
+ videos_inputs.update({"second_per_grid_ts": second_per_grid_ts})
131
+
132
+ else:
133
+ videos_inputs = {}
134
+ video_grid_thw = None
135
+
136
+ if not isinstance(text, list):
137
+ text = [text]
138
+
139
+ if image_grid_thw is not None:
140
+ merge_length = self.image_processor.merge_size**2
141
+ index = 0
142
+ for i in range(len(text)):
143
+ while self.image_token in text[i]:
144
+ text[i] = text[i].replace(
145
+ self.image_token,
146
+ "<|placeholder|>" * (image_grid_thw[index].prod() // merge_length),
147
+ 1,
148
+ )
149
+ index += 1
150
+ text[i] = text[i].replace("<|placeholder|>", self.image_token)
151
+
152
+ if video_grid_thw is not None:
153
+ merge_length = self.image_processor.merge_size**2
154
+ index = 0
155
+ for i in range(len(text)):
156
+ while self.video_token in text[i]:
157
+ text[i] = text[i].replace(
158
+ self.video_token,
159
+ "<|placeholder|>" * (video_grid_thw[index].prod() // merge_length),
160
+ 1,
161
+ )
162
+ index += 1
163
+ text[i] = text[i].replace("<|placeholder|>", self.video_token)
164
+
165
+ if audios is not None:
166
+ audio_inputs = self.feature_extractor(
167
+ audios, sampling_rate=sampling_rate, return_attention_mask=True, padding="max_length", **kwargs
168
+ )
169
+ audio_inputs["feature_attention_mask"] = audio_inputs.pop(
170
+ "attention_mask"
171
+ ) # rename attention_mask to prevent conflicts later on
172
+ audio_output_lengths = self.get_feat_extract_output_lengths(
173
+ audio_inputs['feature_attention_mask'].sum(-1)
174
+ )
175
+ index = 0
176
+ for i in range(len(text)):
177
+ while "<|AUDIO|>" in text[i]:
178
+ text[i] = text[i].replace(
179
+ "<|AUDIO|>", "<|placeholder|>" * audio_output_lengths[index], 1
180
+ )
181
+ index += 1
182
+ text[i] = text[i].replace("<|placeholder|>", "<|AUDIO|>")
183
+ else:
184
+ audio_inputs = {}
185
+
186
+ text_inputs =self.tokenizer(text, **output_kwargs["text_kwargs"])
187
+
188
+ return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs, **audio_inputs})
189
+
190
@staticmethod
def get_feat_extract_input_length(audio_length):
    """
    Number of feature frames the feature extractor produces for ``audio_length`` samples.

    The first frame consumes 200 samples and every subsequent frame consumes 160,
    hence the ``- 40`` offset before dividing by the 160-sample hop. For example,
    30 seconds of 16 kHz audio (480,000 samples) yields 3,000 frames.
    """
    # Remove the extra 40 samples the first frame needs beyond one hop,
    # then count hops, rounding any partial trailing hop up to a full frame.
    usable_samples = audio_length - 40
    return int(np.ceil(usable_samples / 160))
197
+
198
@staticmethod
def get_feat_extract_output_lengths(input_lengths):
    """
    Length of the audio-encoder output for given feature-extractor lengths.

    Models two successive stride-2 downsampling stages: the first uses an
    effective window of 1, the second an effective window of 2.
    """
    after_first_stage = (input_lengths - 1) // 2 + 1
    after_second_stage = (after_first_stage - 2) // 2 + 1
    return after_second_stage
206
+
207
def featurize_audio_chunk(self, audio: np.ndarray, is_last: bool, n_extracted_frames: int = 0, **kwargs):
    """
    Extract the features from the audio chunk during streaming inference.

    Incrementally featurizes ``audio`` (all samples received so far), returning only
    the frames that were not produced in previous calls.

    Args:
        audio (`np.ndarray`):
            1-D waveform containing every sample received so far (not just the newest chunk).
        is_last (`bool`):
            Whether this is the final chunk; if so, a trailing partial frame is kept
            (ceil) and the window is zero-padded, otherwise it is dropped (floor).
        n_extracted_frames (`int`, *optional*, defaults to 0):
            Number of frames already returned by earlier calls; only frames past this
            count are computed and returned.

    Returns:
        The new feature frames as ``features[:, :, -n_new_frames:]`` (sliced on the
        last axis of the extractor's ``input_features``), or `None` when there are no
        new frames yet or fewer than 2 total frames are available.
    """
    n_frames = (len(audio) - 40) / 160  # first frame needs 200 samples, each later frame 160 more
    n_frames = int(np.ceil(n_frames) if is_last else np.floor(n_frames))
    n_new_frames = n_frames - n_extracted_frames
    i_end = n_frames * 160 + 40
    i_start = max(0, (n_extracted_frames + 1 - 3) * 160)  # the sliding window spans 400 samples, i.e. at least 3 frames of left context
    if n_new_frames <= 0 or n_frames < 2:
        return
    a = audio[i_start: i_end]  # slice out only the samples needed to compute the new frames
    if is_last and (n_pad := int(np.ceil(len(a) / 160)) * 160 - len(a)):  # pad to multiple of 160
        a = np.pad(a, [0, n_pad])
    features = self.feature_extractor(
        a, sampling_rate=self.feature_extractor.sampling_rate, padding='do_not_pad', **kwargs
    )['input_features']
    # NOTE(review): assumes input_features is (batch, mel_bins, frames) so the frame
    # axis is last — consistent with Whisper-style extractors; confirm against
    # the feature_extractor implementation.
    return features[:, :, -n_new_frames:]
225
+
226
def batch_decode(self, *args, **kwargs):
    """
    Forward all arguments to the tokenizer's [`~PreTrainedTokenizer.batch_decode`].
    Refer to that method's docstring for details.
    """
    tokenizer = self.tokenizer
    return tokenizer.batch_decode(*args, **kwargs)
232
+
233
def decode(self, *args, **kwargs):
    """
    Forward all arguments to the tokenizer's [`~PreTrainedTokenizer.decode`].
    Refer to that method's docstring for details.
    """
    tokenizer = self.tokenizer
    return tokenizer.decode(*args, **kwargs)
239
+
240
def post_process_image_text_to_text(self, generated_outputs):
    """
    Decode model generations to plain text.

    Args:
        generated_outputs (`torch.Tensor` or `np.ndarray`):
            Output of the model's `generate`, shaped `(batch_size, sequence_length)`
            or `(sequence_length,)`.

    Returns:
        `List[str]`: The decoded text, with special tokens stripped and tokenization
        spaces left untouched.
    """
    decode_kwargs = {
        "skip_special_tokens": True,
        "clean_up_tokenization_spaces": False,
    }
    return self.tokenizer.batch_decode(generated_outputs, **decode_kwargs)
255
+
256
@property
def model_input_names(self):
    """
    Combined model input names of the tokenizer, image processor and feature
    extractor, plus ``feature_attention_mask`` (the renamed audio mask), with
    duplicates removed while preserving first-seen order.
    """
    combined = (
        self.tokenizer.model_input_names
        + self.image_processor.model_input_names
        + self.feature_extractor.model_input_names
        + ["feature_attention_mask"]  # audio mask key produced by this processor
    )
    # dict.fromkeys keeps insertion order, giving an order-stable dedup.
    return list(dict.fromkeys(combined))
processor_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_hithinkomni.HithinkOmniProcessor"
4
+ },
5
+ "processor_class": "HithinkOmniProcessor"
6
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8382c71e5571bd71a614f60271a48b21e4c21b02f0140ee7ab2f708ce510949f
3
+ size 11422462
tokenizer_config.json ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<|AUDIO|>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": true
188
+ },
189
+ "151666": {
190
+ "content": "<|audio_bos|>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": true
196
+ },
197
+ "151667": {
198
+ "content": "<|audio_eos|>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": true
204
+ }
205
+ },
206
+ "additional_special_tokens": [
207
+ "<|im_start|>",
208
+ "<|im_end|>",
209
+ "<|object_ref_start|>",
210
+ "<|object_ref_end|>",
211
+ "<|box_start|>",
212
+ "<|box_end|>",
213
+ "<|quad_start|>",
214
+ "<|quad_end|>",
215
+ "<|vision_start|>",
216
+ "<|vision_end|>",
217
+ "<|vision_pad|>",
218
+ "<|image_pad|>",
219
+ "<|video_pad|>",
220
+ "<|AUDIO|>",
221
+ "<|audio_bos|>",
222
+ "<|audio_eos|>"
223
+ ],
224
+ "auto_map": {
225
+ "AutoProcessor": "processing_hithinkomni.HithinkOmniProcessor"
226
+ },
227
+ "bos_token": null,
228
+ "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% set audio_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'audio' in content or 'audio_url' in content %}{% set audio_count.value = audio_count.value + 1 %}Audio {{ audio_count.value }}: <|audio_bos|><|AUDIO|><|audio_eos|>\n{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
229
+ "clean_up_tokenization_spaces": false,
230
+ "eos_token": "<|im_end|>",
231
+ "errors": "replace",
232
+ "extra_special_tokens": {},
233
+ "model_max_length": 131072,
234
+ "pad_token": "<|endoftext|>",
235
+ "processor_class": "HithinkOmniProcessor",
236
+ "split_special_tokens": false,
237
+ "tokenizer_class": "PreTrainedTokenizer",
238
+ "unk_token": null
239
+ }