susanping commited on
Commit
49ae32a
·
verified ·
1 Parent(s): afd9f24

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
chat_template.jinja ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {%- if messages[0].content is string %}
5
+ {{- messages[0].content }}
6
+ {%- else %}
7
+ {%- for content in messages[0].content %}
8
+ {%- if 'text' in content %}
9
+ {{- content.text }}
10
+ {%- endif %}
11
+ {%- endfor %}
12
+ {%- endif %}
13
+ {{- '\n\n' }}
14
+ {%- endif %}
15
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
16
+ {%- for tool in tools %}
17
+ {{- "\n" }}
18
+ {{- tool | tojson }}
19
+ {%- endfor %}
20
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
21
+ {%- else %}
22
+ {%- if messages[0].role == 'system' %}
23
+ {{- '<|im_start|>system\n' }}
24
+ {%- if messages[0].content is string %}
25
+ {{- messages[0].content }}
26
+ {%- else %}
27
+ {%- for content in messages[0].content %}
28
+ {%- if 'text' in content %}
29
+ {{- content.text }}
30
+ {%- endif %}
31
+ {%- endfor %}
32
+ {%- endif %}
33
+ {{- '<|im_end|>\n' }}
34
+ {%- endif %}
35
+ {%- endif %}
36
+ {%- set image_count = namespace(value=0) %}
37
+ {%- set video_count = namespace(value=0) %}
38
+ {%- for message in messages %}
39
+ {%- if message.role == "user" %}
40
+ {{- '<|im_start|>' + message.role + '\n' }}
41
+ {%- if message.content is string %}
42
+ {{- message.content }}
43
+ {%- else %}
44
+ {%- for content in message.content %}
45
+ {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}
46
+ {%- set image_count.value = image_count.value + 1 %}
47
+ {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}
48
+ <|vision_start|><|image_pad|><|vision_end|>
49
+ {%- elif content.type == 'video' or 'video' in content %}
50
+ {%- set video_count.value = video_count.value + 1 %}
51
+ {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}
52
+ <|vision_start|><|video_pad|><|vision_end|>
53
+ {%- elif 'text' in content %}
54
+ {{- content.text }}
55
+ {%- endif %}
56
+ {%- endfor %}
57
+ {%- endif %}
58
+ {{- '<|im_end|>\n' }}
59
+ {%- elif message.role == "assistant" %}
60
+ {{- '<|im_start|>' + message.role + '\n' }}
61
+ {%- if message.content is string %}
62
+ {{- message.content }}
63
+ {%- else %}
64
+ {%- for content_item in message.content %}
65
+ {%- if 'text' in content_item %}
66
+ {{- content_item.text }}
67
+ {%- endif %}
68
+ {%- endfor %}
69
+ {%- endif %}
70
+ {%- if message.tool_calls %}
71
+ {%- for tool_call in message.tool_calls %}
72
+ {%- if (loop.first and message.content) or (not loop.first) %}
73
+ {{- '\n' }}
74
+ {%- endif %}
75
+ {%- if tool_call.function %}
76
+ {%- set tool_call = tool_call.function %}
77
+ {%- endif %}
78
+ {{- '<tool_call>\n{"name": "' }}
79
+ {{- tool_call.name }}
80
+ {{- '", "arguments": ' }}
81
+ {%- if tool_call.arguments is string %}
82
+ {{- tool_call.arguments }}
83
+ {%- else %}
84
+ {{- tool_call.arguments | tojson }}
85
+ {%- endif %}
86
+ {{- '}\n</tool_call>' }}
87
+ {%- endfor %}
88
+ {%- endif %}
89
+ {{- '<|im_end|>\n' }}
90
+ {%- elif message.role == "tool" %}
91
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
92
+ {{- '<|im_start|>user' }}
93
+ {%- endif %}
94
+ {{- '\n<tool_response>\n' }}
95
+ {%- if message.content is string %}
96
+ {{- message.content }}
97
+ {%- else %}
98
+ {%- for content in message.content %}
99
+ {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}
100
+ {%- set image_count.value = image_count.value + 1 %}
101
+ {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}
102
+ <|vision_start|><|image_pad|><|vision_end|>
103
+ {%- elif content.type == 'video' or 'video' in content %}
104
+ {%- set video_count.value = video_count.value + 1 %}
105
+ {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}
106
+ <|vision_start|><|video_pad|><|vision_end|>
107
+ {%- elif 'text' in content %}
108
+ {{- content.text }}
109
+ {%- endif %}
110
+ {%- endfor %}
111
+ {%- endif %}
112
+ {{- '\n</tool_response>' }}
113
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
114
+ {{- '<|im_end|>\n' }}
115
+ {%- endif %}
116
+ {%- endif %}
117
+ {%- endfor %}
118
+ {%- if add_generation_prompt %}
119
+ {{- '<|im_start|>assistant\n' }}
120
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_dim": 32,
3
+ "action_length": 30,
4
+ "architectures": [
5
+ "MiBoTForActionGeneration"
6
+ ],
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_mibot.MiBoTConfig",
9
+ "AutoModel": "modeling_mibot.MiBoTForActionGeneration"
10
+ },
11
+ "dit_config": {
12
+ "dtype": "float32",
13
+ "head_dim": 128,
14
+ "hidden_size": 1024,
15
+ "model_type": "dit",
16
+ "num_hidden_layers": 16,
17
+ "num_key_value_heads": 8
18
+ },
19
+ "dtype": "bfloat16",
20
+ "model_type": "mibot",
21
+ "state_dim": 32,
22
+ "state_length": 1,
23
+ "transformers_version": "4.57.1",
24
+ "vlm_config": {
25
+ "_name_or_path": "Qwen/Qwen3-VL-4B-Instruct",
26
+ "architectures": [
27
+ "Qwen3VLForConditionalGeneration"
28
+ ],
29
+ "dtype": "float32",
30
+ "image_token_id": 151655,
31
+ "model_type": "qwen3_vl",
32
+ "text_config": {
33
+ "_name_or_path": "",
34
+ "add_cross_attention": false,
35
+ "architectures": null,
36
+ "attention_bias": false,
37
+ "attention_dropout": 0.0,
38
+ "bad_words_ids": null,
39
+ "begin_suppress_tokens": null,
40
+ "bos_token_id": 151643,
41
+ "chunk_size_feed_forward": 0,
42
+ "cross_attention_hidden_size": null,
43
+ "decoder_start_token_id": null,
44
+ "diversity_penalty": 0.0,
45
+ "do_sample": false,
46
+ "dtype": "bfloat16",
47
+ "early_stopping": false,
48
+ "encoder_no_repeat_ngram_size": 0,
49
+ "eos_token_id": 151645,
50
+ "exponential_decay_length_penalty": null,
51
+ "finetuning_task": null,
52
+ "forced_bos_token_id": null,
53
+ "forced_eos_token_id": null,
54
+ "head_dim": 128,
55
+ "hidden_act": "silu",
56
+ "hidden_size": 2560,
57
+ "id2label": {
58
+ "0": "LABEL_0",
59
+ "1": "LABEL_1"
60
+ },
61
+ "initializer_range": 0.02,
62
+ "intermediate_size": 9728,
63
+ "is_decoder": false,
64
+ "is_encoder_decoder": false,
65
+ "label2id": {
66
+ "LABEL_0": 0,
67
+ "LABEL_1": 1
68
+ },
69
+ "length_penalty": 1.0,
70
+ "max_length": 20,
71
+ "max_position_embeddings": 262144,
72
+ "min_length": 0,
73
+ "model_type": "qwen3_vl_text",
74
+ "no_repeat_ngram_size": 0,
75
+ "num_attention_heads": 32,
76
+ "num_beam_groups": 1,
77
+ "num_beams": 1,
78
+ "num_hidden_layers": 36,
79
+ "num_key_value_heads": 8,
80
+ "num_return_sequences": 1,
81
+ "output_attentions": false,
82
+ "output_hidden_states": false,
83
+ "output_scores": false,
84
+ "pad_token_id": null,
85
+ "prefix": null,
86
+ "problem_type": null,
87
+ "pruned_heads": {},
88
+ "remove_invalid_values": false,
89
+ "repetition_penalty": 1.0,
90
+ "return_dict": true,
91
+ "return_dict_in_generate": false,
92
+ "rms_norm_eps": 1e-06,
93
+ "rope_scaling": {
94
+ "mrope_interleaved": true,
95
+ "mrope_section": [
96
+ 24,
97
+ 20,
98
+ 20
99
+ ],
100
+ "rope_type": "default"
101
+ },
102
+ "rope_theta": 5000000,
103
+ "sep_token_id": null,
104
+ "suppress_tokens": null,
105
+ "task_specific_params": null,
106
+ "temperature": 1.0,
107
+ "tf_legacy_loss": false,
108
+ "tie_encoder_decoder": false,
109
+ "tie_word_embeddings": true,
110
+ "tokenizer_class": null,
111
+ "top_k": 50,
112
+ "top_p": 1.0,
113
+ "torchscript": false,
114
+ "typical_p": 1.0,
115
+ "use_bfloat16": false,
116
+ "use_cache": true,
117
+ "vocab_size": 151936
118
+ },
119
+ "tie_word_embeddings": true,
120
+ "video_token_id": 151656,
121
+ "vision_config": {
122
+ "_name_or_path": "",
123
+ "add_cross_attention": false,
124
+ "architectures": null,
125
+ "bad_words_ids": null,
126
+ "begin_suppress_tokens": null,
127
+ "bos_token_id": null,
128
+ "chunk_size_feed_forward": 0,
129
+ "cross_attention_hidden_size": null,
130
+ "decoder_start_token_id": null,
131
+ "deepstack_visual_indexes": [
132
+ 5,
133
+ 11,
134
+ 17
135
+ ],
136
+ "depth": 24,
137
+ "diversity_penalty": 0.0,
138
+ "do_sample": false,
139
+ "dtype": null,
140
+ "early_stopping": false,
141
+ "encoder_no_repeat_ngram_size": 0,
142
+ "eos_token_id": null,
143
+ "exponential_decay_length_penalty": null,
144
+ "finetuning_task": null,
145
+ "forced_bos_token_id": null,
146
+ "forced_eos_token_id": null,
147
+ "hidden_act": "gelu_pytorch_tanh",
148
+ "hidden_size": 1024,
149
+ "id2label": {
150
+ "0": "LABEL_0",
151
+ "1": "LABEL_1"
152
+ },
153
+ "in_channels": 3,
154
+ "initializer_range": 0.02,
155
+ "intermediate_size": 4096,
156
+ "is_decoder": false,
157
+ "is_encoder_decoder": false,
158
+ "label2id": {
159
+ "LABEL_0": 0,
160
+ "LABEL_1": 1
161
+ },
162
+ "length_penalty": 1.0,
163
+ "max_length": 20,
164
+ "min_length": 0,
165
+ "model_type": "qwen3_vl",
166
+ "no_repeat_ngram_size": 0,
167
+ "num_beam_groups": 1,
168
+ "num_beams": 1,
169
+ "num_heads": 16,
170
+ "num_position_embeddings": 2304,
171
+ "num_return_sequences": 1,
172
+ "out_hidden_size": 2560,
173
+ "output_attentions": false,
174
+ "output_hidden_states": false,
175
+ "output_scores": false,
176
+ "pad_token_id": null,
177
+ "patch_size": 16,
178
+ "prefix": null,
179
+ "problem_type": null,
180
+ "pruned_heads": {},
181
+ "remove_invalid_values": false,
182
+ "repetition_penalty": 1.0,
183
+ "return_dict": true,
184
+ "return_dict_in_generate": false,
185
+ "sep_token_id": null,
186
+ "spatial_merge_size": 2,
187
+ "suppress_tokens": null,
188
+ "task_specific_params": null,
189
+ "temperature": 1.0,
190
+ "temporal_patch_size": 2,
191
+ "tf_legacy_loss": false,
192
+ "tie_encoder_decoder": false,
193
+ "tie_word_embeddings": true,
194
+ "tokenizer_class": null,
195
+ "top_k": 50,
196
+ "top_p": 1.0,
197
+ "torchscript": false,
198
+ "typical_p": 1.0,
199
+ "use_bfloat16": false
200
+ },
201
+ "vision_end_token_id": 151653,
202
+ "vision_start_token_id": 151652
203
+ }
204
+ }
configuration_mibot.py ADDED
@@ -0,0 +1,378 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright (C) 2026 Xiaomi Corporation.
3
+ # Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ from typing import Optional
17
+
18
+ from transformers.configuration_utils import PretrainedConfig
19
+ from transformers.modeling_rope_utils import rope_config_validation
20
+
21
+
22
+ class Qwen3VLVisionConfig(PretrainedConfig):
23
+ model_type = "qwen3_vl"
24
+ base_config_key = "vision_config"
25
+
26
+ def __init__(
27
+ self,
28
+ depth=27,
29
+ hidden_size=1152,
30
+ hidden_act="gelu_pytorch_tanh",
31
+ intermediate_size=4304,
32
+ num_heads=16,
33
+ in_channels=3,
34
+ patch_size=16,
35
+ spatial_merge_size=2,
36
+ temporal_patch_size=2,
37
+ out_hidden_size=3584,
38
+ num_position_embeddings=2304,
39
+ deepstack_visual_indexes=[8, 16, 24],
40
+ initializer_range=0.02,
41
+ **kwargs,
42
+ ):
43
+ super().__init__(**kwargs)
44
+
45
+ self.depth = depth
46
+ self.hidden_size = hidden_size
47
+ self.hidden_act = hidden_act
48
+ self.intermediate_size = intermediate_size
49
+ self.num_heads = num_heads
50
+ self.in_channels = in_channels
51
+ self.patch_size = patch_size
52
+ self.spatial_merge_size = spatial_merge_size
53
+ self.temporal_patch_size = temporal_patch_size
54
+ self.out_hidden_size = out_hidden_size
55
+ self.num_position_embeddings = num_position_embeddings
56
+ self.initializer_range = initializer_range
57
+ self.deepstack_visual_indexes = deepstack_visual_indexes
58
+
59
+
60
+ class Qwen3VLTextConfig(PretrainedConfig):
61
+ r"""
62
+ This is the configuration class to store the configuration of a [`Qwen3VLTextModel`]. It is used to instantiate a
63
+ Qwen3-VL model according to the specified arguments, defining the model architecture. Instantiating a configuration
64
+ with the defaults will yield a similar configuration to that of
65
+ Qwen3-VL-4B-Instruct [Qwen/Qwen3-VL-4B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct).
66
+
67
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
68
+ documentation from [`PretrainedConfig`] for more information.
69
+
70
+ Args:
71
+ vocab_size (`int`, *optional*, defaults to 151936):
72
+ Vocabulary size of the Qwen3VL model. Defines the number of different tokens that can be represented by the
73
+ `inputs_ids` passed when calling [`Qwen3VLModel`]
74
+ hidden_size (`int`, *optional*, defaults to 4096):
75
+ Dimension of the hidden representations.
76
+ intermediate_size (`int`, *optional*, defaults to 22016):
77
+ Dimension of the MLP representations.
78
+ num_hidden_layers (`int`, *optional*, defaults to 32):
79
+ Number of hidden layers in the Transformer encoder.
80
+ num_attention_heads (`int`, *optional*, defaults to 32):
81
+ Number of attention heads for each attention layer in the Transformer encoder.
82
+ num_key_value_heads (`int`, *optional*, defaults to 32):
83
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
84
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
85
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
86
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
87
+ by meanpooling all the original heads within that group. For more details, check out [this
88
+ paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `32`.
89
+ head_dim (`int`, *optional*, defaults to 128):
90
+ The dimension of the head. If not specified, will default to `hidden_size // num_attention_heads`.
91
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
92
+ The non-linear activation function (function or string) in the decoder.
93
+ max_position_embeddings (`int`, *optional*, defaults to 128000):
94
+ The maximum sequence length that this model might ever be used with.
95
+ initializer_range (`float`, *optional*, defaults to 0.02):
96
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
97
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
98
+ The epsilon used by the rms normalization layers.
99
+ use_cache (`bool`, *optional*, defaults to `True`):
100
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
101
+ relevant if `config.is_decoder=True`.
102
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
103
+ Whether the model's input and output word embeddings should be tied.
104
+ rope_theta (`float`, *optional*, defaults to 5000000.0):
105
+ The base period of the RoPE embeddings.
106
+ rope_scaling (`Dict`, *optional*):
107
+ Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
108
+ and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
109
+ accordingly.
110
+ Expected contents:
111
+ `rope_type` (`str`):
112
+ The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
113
+ 'llama3'], with 'default' being the original RoPE implementation.
114
+ `factor` (`float`, *optional*):
115
+ Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
116
+ most scaling types, a `factor` of x will enable the model to handle sequences of length x *
117
+ original maximum pre-trained length.
118
+ `original_max_position_embeddings` (`int`, *optional*):
119
+ Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
120
+ pretraining.
121
+ `attention_factor` (`float`, *optional*):
122
+ Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
123
+ computation. If unspecified, it defaults to value recommended by the implementation, using the
124
+ `factor` field to infer the suggested value.
125
+ `beta_fast` (`float`, *optional*):
126
+ Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
127
+ ramp function. If unspecified, it defaults to 32.
128
+ `beta_slow` (`float`, *optional*):
129
+ Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
130
+ ramp function. If unspecified, it defaults to 1.
131
+ `short_factor` (`list[float]`, *optional*):
132
+ Only used with 'longrope'. The scaling factor to be applied to short contexts (<
133
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
134
+ size divided by the number of attention heads divided by 2
135
+ `long_factor` (`list[float]`, *optional*):
136
+ Only used with 'longrope'. The scaling factor to be applied to long contexts (<
137
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
138
+ size divided by the number of attention heads divided by 2
139
+ `low_freq_factor` (`float`, *optional*):
140
+ Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
141
+ `high_freq_factor` (`float`, *optional*):
142
+ Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
143
+ attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
144
+ Whether to use a bias in the query, key, value and output projection layers during self-attention.
145
+ attention_dropout (`float`, *optional*, defaults to 0.0):
146
+ The dropout ratio for the attention probabilities.
147
+
148
+ ```python
149
+ >>> from transformers import Qwen3VLTextModel, Qwen3VLTextConfig
150
+
151
+ >>> # Initializing a Qwen3VL style configuration
152
+ >>> configuration = Qwen3VLTextConfig()
153
+
154
+ >>> # Initializing a model from the Qwen3-VL-7B style configuration
155
+ >>> model = Qwen3VLTextModel(configuration)
156
+
157
+ >>> # Accessing the model configuration
158
+ >>> configuration = model.config
159
+ ```"""
160
+
161
+ model_type = "qwen3_vl_text"
162
+ base_config_key = "text_config"
163
+
164
+ def __init__(
165
+ self,
166
+ vocab_size=151936,
167
+ hidden_size=4096,
168
+ intermediate_size=22016,
169
+ num_hidden_layers=32,
170
+ num_attention_heads=32,
171
+ num_key_value_heads=32,
172
+ head_dim=128,
173
+ hidden_act="silu",
174
+ max_position_embeddings=128000,
175
+ initializer_range=0.02,
176
+ rms_norm_eps=1e-6,
177
+ use_cache=True,
178
+ tie_word_embeddings=False,
179
+ rope_theta=5000000.0,
180
+ rope_scaling=None,
181
+ attention_bias=False,
182
+ attention_dropout=0.0,
183
+ **kwargs,
184
+ ):
185
+ self.vocab_size = vocab_size
186
+ self.max_position_embeddings = max_position_embeddings
187
+ self.hidden_size = hidden_size
188
+ self.intermediate_size = intermediate_size
189
+ self.num_hidden_layers = num_hidden_layers
190
+ self.num_attention_heads = num_attention_heads
191
+
192
+ # for backward compatibility
193
+ if num_key_value_heads is None:
194
+ num_key_value_heads = num_attention_heads
195
+
196
+ self.num_key_value_heads = num_key_value_heads
197
+ self.head_dim = head_dim
198
+ self.hidden_act = hidden_act
199
+ self.initializer_range = initializer_range
200
+ self.rms_norm_eps = rms_norm_eps
201
+ self.use_cache = use_cache
202
+ self.rope_theta = rope_theta
203
+ self.rope_scaling = rope_scaling
204
+ self.attention_bias = attention_bias
205
+ self.attention_dropout = attention_dropout
206
+
207
+ rope_config_validation(self, ignore_keys={"mrope_section", "mrope_interleaved"})
208
+
209
+ super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
210
+
211
+
212
+ class Qwen3VLConfig(PretrainedConfig):
213
+ r"""
214
+ This is the configuration class to store the configuration of a [`Qwen3VLModel`]. It is used to instantiate a
215
+ Qwen3-VL model according to the specified arguments, defining the model architecture. Instantiating a configuration
216
+ with the defaults will yield a similar configuration to that of
217
+ Qwen3-VL-4B-Instruct [Qwen/Qwen3-VL-4B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct).
218
+
219
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
220
+ documentation from [`PretrainedConfig`] for more information.
221
+
222
+
223
+ Args:
224
+ text_config (`Union[PretrainedConfig, dict]`, *optional*, defaults to `Qwen3VLTextConfig`):
225
+ The config object or dictionary of the text backbone.
226
+ vision_config (`Union[PretrainedConfig, dict]`, *optional*, defaults to `Qwen3VLVisionConfig`):
227
+ The config object or dictionary of the vision backbone.
228
+ image_token_id (`int`, *optional*, defaults to 151655):
229
+ The image token index to encode the image prompt.
230
+ video_token_id (`int`, *optional*, defaults to 151656):
231
+ The video token index to encode the image prompt.
232
+ vision_start_token_id (`int`, *optional*, defaults to 151652):
233
+ The start token index to encode the image prompt.
234
+ vision_end_token_id (`int`, *optional*, defaults to 151653):
235
+ The end token index to encode the image prompt.
236
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
237
+ Whether to tie the word embeddings.
238
+
239
+ ```python
240
+ >>> from transformers import Qwen3VLForConditionalGeneration, Qwen3VLConfig
241
+
242
+ >>> # Initializing a Qwen3-VL style configuration
243
+ >>> configuration = Qwen3VLConfig()
244
+
245
+ >>> # Initializing a model from the Qwen3-VL-4B style configuration
246
+ >>> model = Qwen3VLForConditionalGeneration(configuration)
247
+
248
+ >>> # Accessing the model configuration
249
+ >>> configuration = model.config
250
+ ```"""
251
+
252
+ model_type = "qwen3_vl"
253
+ sub_configs = {"vision_config": Qwen3VLVisionConfig, "text_config": Qwen3VLTextConfig}
254
+ keys_to_ignore_at_inference = ["past_key_values"]
255
+
256
+ def __init__(
257
+ self,
258
+ text_config=None,
259
+ vision_config=None,
260
+ image_token_id=151655,
261
+ video_token_id=151656,
262
+ vision_start_token_id=151652,
263
+ vision_end_token_id=151653,
264
+ tie_word_embeddings=False,
265
+ **kwargs,
266
+ ):
267
+ if isinstance(vision_config, dict):
268
+ self.vision_config = self.sub_configs["vision_config"](**vision_config)
269
+ elif vision_config is None:
270
+ self.vision_config = self.sub_configs["vision_config"]()
271
+
272
+ if isinstance(text_config, dict):
273
+ self.text_config = self.sub_configs["text_config"](**text_config)
274
+ elif text_config is None:
275
+ self.text_config = self.sub_configs["text_config"]()
276
+
277
+ self.image_token_id = image_token_id
278
+ self.video_token_id = video_token_id
279
+ self.vision_start_token_id = vision_start_token_id
280
+ self.vision_end_token_id = vision_end_token_id
281
+ super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings)
282
+
283
+
284
+ class DiTConfig(PretrainedConfig):
285
+ r"""
286
+ This is the configuration class to store the configuration of a [`DiT`] model. It is used to instantiate a DiT
287
+ model according to the specified arguments, defining the model architecture.
288
+
289
+ Args:
290
+ hidden_size (`int`, *optional*, defaults to 768):
291
+ Dimension of the hidden representations.
292
+ num_hidden_layers (`int`, *optional*, defaults to 8):
293
+ Number of decoder layers in the DiT.
294
+ head_dim (`int`, *optional*, defaults to 128):
295
+ Dimension of each attention head.
296
+ num_key_value_heads (`int`, *optional*, defaults to 2):
297
+ Number of key-value heads for grouped query attention.
298
+ is_causal (`bool`, *optional*, defaults to `True`):
299
+ Whether the model uses causal (autoregressive) attention.
300
+ attention_dropout (`float`, *optional*, defaults to 0.0):
301
+ Dropout ratio for the attention probabilities.
302
+ initializer_range (`float`, *optional*, defaults to 0.02):
303
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
304
+ """
305
+
306
+ model_type = "dit"
307
+
308
+ def __init__(
309
+ self,
310
+ hidden_size: int = 1024,
311
+ num_hidden_layers: int = 16,
312
+ head_dim: int = 128,
313
+ num_key_value_heads: int = 8,
314
+ **kwargs,
315
+ ):
316
+ super().__init__(**kwargs)
317
+ self.hidden_size = hidden_size
318
+ self.num_hidden_layers = num_hidden_layers
319
+ self.head_dim = head_dim
320
+ self.num_key_value_heads = num_key_value_heads
321
+
322
+ class MiBoTConfig(PretrainedConfig):
323
+ r"""
324
+ This is the configuration class to store the configuration of a [`MiBoTModel`]. It is used to instantiate a MiBoT
325
+ model according to the specified arguments, defining the model architecture. The MiBoT model consists of a
326
+ vision-language model (Qwen3-VL) and a policy head (DiT).
327
+
328
+ Args:
329
+ vlm_config (`Union[PretrainedConfig, dict]`, *optional*):
330
+ The config object or dictionary for the vision-language model (Qwen3-VL).
331
+ dit_config (`Union[PretrainedConfig, dict]`, *optional*):
332
+ The config object or dictionary for the DiT policy head.
333
+ state_length (`int`, *optional*, defaults to 1):
334
+ Number of historical states to include as input.
335
+ state_dim (`int`, *optional*, defaults to 16):
336
+ Dimension of each state vector.
337
+ action_length (`int`, *optional*, defaults to 30):
338
+ Length of the output action sequence.
339
+ action_dim (`int`, *optional*, defaults to 32):
340
+ Dimension of each action vector.
341
+ **kwargs: Additional keyword arguments passed to PretrainedConfig.
342
+ """
343
+
344
+ model_type = "mibot"
345
+ sub_configs = {
346
+ "vlm_config": Qwen3VLConfig,
347
+ "dit_config": DiTConfig,
348
+ }
349
+ keys_to_ignore_at_inference = ["past_key_values"]
350
+
351
+ def __init__(
352
+ self,
353
+ vlm_config: Optional[dict] = None,
354
+ dit_config: Optional[dict] = None,
355
+ state_length: int = 1,
356
+ state_dim: int = 32,
357
+ action_length: int = 30,
358
+ action_dim: int = 32,
359
+ **kwargs,
360
+ ):
361
+ if isinstance(vlm_config, dict):
362
+ self.vlm_config = self.sub_configs["vlm_config"](**vlm_config)
363
+ else:
364
+ self.vlm_config = self.sub_configs["vlm_config"]()
365
+
366
+ if isinstance(dit_config, dict):
367
+ self.dit_config = self.sub_configs["dit_config"](**dit_config)
368
+ else:
369
+ self.dit_config = self.sub_configs["dit_config"]()
370
+
371
+ self.state_length = state_length
372
+ self.state_dim = state_dim
373
+ self.action_length = action_length
374
+ self.action_dim = action_dim
375
+
376
+ super().__init__(**kwargs)
377
+
378
+ __all__ = ["MiBoTConfig", "DiTConfig"]
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76c81ad1e97d8003f660639167083108be81b0765cdb6c4ec51b7e49353548bb
3
+ size 4990499880
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e94f2ce9bedafa690d234ab92a8640861fda0ac460b520361dba3bb2ca761d7
3
+ size 4442162272
model.safetensors.index.json ADDED
@@ -0,0 +1,940 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 5105230336,
4
+ "total_size": 9432548352
5
+ },
6
+ "weight_map": {
7
+ "action_output_layer.layers.0.weight": "model-00002-of-00002.safetensors",
8
+ "action_output_layer.layers.2.weight": "model-00002-of-00002.safetensors",
9
+ "action_projector.layers.0.weight": "model-00002-of-00002.safetensors",
10
+ "action_projector.layers.2.weight": "model-00002-of-00002.safetensors",
11
+ "dit.layers.0.adaln_table": "model-00002-of-00002.safetensors",
12
+ "dit.layers.0.attn.k_norm.weight": "model-00002-of-00002.safetensors",
13
+ "dit.layers.0.attn.o_proj.weight": "model-00002-of-00002.safetensors",
14
+ "dit.layers.0.attn.q_norm.weight": "model-00002-of-00002.safetensors",
15
+ "dit.layers.0.attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
16
+ "dit.layers.0.attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
17
+ "dit.layers.0.final_layernorm.weight": "model-00002-of-00002.safetensors",
18
+ "dit.layers.0.input_layernorm.weight": "model-00002-of-00002.safetensors",
19
+ "dit.layers.0.middle_layernorm.weight": "model-00002-of-00002.safetensors",
20
+ "dit.layers.0.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
21
+ "dit.layers.0.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
22
+ "dit.layers.0.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
23
+ "dit.layers.0.post_layernorm.weight": "model-00002-of-00002.safetensors",
24
+ "dit.layers.1.adaln_table": "model-00002-of-00002.safetensors",
25
+ "dit.layers.1.attn.k_norm.weight": "model-00002-of-00002.safetensors",
26
+ "dit.layers.1.attn.o_proj.weight": "model-00002-of-00002.safetensors",
27
+ "dit.layers.1.attn.q_norm.weight": "model-00002-of-00002.safetensors",
28
+ "dit.layers.1.attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
29
+ "dit.layers.1.attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
30
+ "dit.layers.1.final_layernorm.weight": "model-00002-of-00002.safetensors",
31
+ "dit.layers.1.input_layernorm.weight": "model-00002-of-00002.safetensors",
32
+ "dit.layers.1.middle_layernorm.weight": "model-00002-of-00002.safetensors",
33
+ "dit.layers.1.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
34
+ "dit.layers.1.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
35
+ "dit.layers.1.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
36
+ "dit.layers.1.post_layernorm.weight": "model-00002-of-00002.safetensors",
37
+ "dit.layers.10.adaln_table": "model-00002-of-00002.safetensors",
38
+ "dit.layers.10.attn.k_norm.weight": "model-00002-of-00002.safetensors",
39
+ "dit.layers.10.attn.o_proj.weight": "model-00002-of-00002.safetensors",
40
+ "dit.layers.10.attn.q_norm.weight": "model-00002-of-00002.safetensors",
41
+ "dit.layers.10.attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
42
+ "dit.layers.10.attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
43
+ "dit.layers.10.final_layernorm.weight": "model-00002-of-00002.safetensors",
44
+ "dit.layers.10.input_layernorm.weight": "model-00002-of-00002.safetensors",
45
+ "dit.layers.10.middle_layernorm.weight": "model-00002-of-00002.safetensors",
46
+ "dit.layers.10.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
47
+ "dit.layers.10.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
48
+ "dit.layers.10.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
49
+ "dit.layers.10.post_layernorm.weight": "model-00002-of-00002.safetensors",
50
+ "dit.layers.11.adaln_table": "model-00002-of-00002.safetensors",
51
+ "dit.layers.11.attn.k_norm.weight": "model-00002-of-00002.safetensors",
52
+ "dit.layers.11.attn.o_proj.weight": "model-00002-of-00002.safetensors",
53
+ "dit.layers.11.attn.q_norm.weight": "model-00002-of-00002.safetensors",
54
+ "dit.layers.11.attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
55
+ "dit.layers.11.attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
56
+ "dit.layers.11.final_layernorm.weight": "model-00002-of-00002.safetensors",
57
+ "dit.layers.11.input_layernorm.weight": "model-00002-of-00002.safetensors",
58
+ "dit.layers.11.middle_layernorm.weight": "model-00002-of-00002.safetensors",
59
+ "dit.layers.11.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
60
+ "dit.layers.11.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
61
+ "dit.layers.11.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
62
+ "dit.layers.11.post_layernorm.weight": "model-00002-of-00002.safetensors",
63
+ "dit.layers.12.adaln_table": "model-00002-of-00002.safetensors",
64
+ "dit.layers.12.attn.k_norm.weight": "model-00002-of-00002.safetensors",
65
+ "dit.layers.12.attn.o_proj.weight": "model-00002-of-00002.safetensors",
66
+ "dit.layers.12.attn.q_norm.weight": "model-00002-of-00002.safetensors",
67
+ "dit.layers.12.attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
68
+ "dit.layers.12.attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
69
+ "dit.layers.12.final_layernorm.weight": "model-00002-of-00002.safetensors",
70
+ "dit.layers.12.input_layernorm.weight": "model-00002-of-00002.safetensors",
71
+ "dit.layers.12.middle_layernorm.weight": "model-00002-of-00002.safetensors",
72
+ "dit.layers.12.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
73
+ "dit.layers.12.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
74
+ "dit.layers.12.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
75
+ "dit.layers.12.post_layernorm.weight": "model-00002-of-00002.safetensors",
76
+ "dit.layers.13.adaln_table": "model-00002-of-00002.safetensors",
77
+ "dit.layers.13.attn.k_norm.weight": "model-00002-of-00002.safetensors",
78
+ "dit.layers.13.attn.o_proj.weight": "model-00002-of-00002.safetensors",
79
+ "dit.layers.13.attn.q_norm.weight": "model-00002-of-00002.safetensors",
80
+ "dit.layers.13.attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
81
+ "dit.layers.13.attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
82
+ "dit.layers.13.final_layernorm.weight": "model-00002-of-00002.safetensors",
83
+ "dit.layers.13.input_layernorm.weight": "model-00002-of-00002.safetensors",
84
+ "dit.layers.13.middle_layernorm.weight": "model-00002-of-00002.safetensors",
85
+ "dit.layers.13.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
86
+ "dit.layers.13.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
87
+ "dit.layers.13.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
88
+ "dit.layers.13.post_layernorm.weight": "model-00002-of-00002.safetensors",
89
+ "dit.layers.14.adaln_table": "model-00002-of-00002.safetensors",
90
+ "dit.layers.14.attn.k_norm.weight": "model-00002-of-00002.safetensors",
91
+ "dit.layers.14.attn.o_proj.weight": "model-00002-of-00002.safetensors",
92
+ "dit.layers.14.attn.q_norm.weight": "model-00002-of-00002.safetensors",
93
+ "dit.layers.14.attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
94
+ "dit.layers.14.attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
95
+ "dit.layers.14.final_layernorm.weight": "model-00002-of-00002.safetensors",
96
+ "dit.layers.14.input_layernorm.weight": "model-00002-of-00002.safetensors",
97
+ "dit.layers.14.middle_layernorm.weight": "model-00002-of-00002.safetensors",
98
+ "dit.layers.14.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
99
+ "dit.layers.14.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
100
+ "dit.layers.14.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
101
+ "dit.layers.14.post_layernorm.weight": "model-00002-of-00002.safetensors",
102
+ "dit.layers.15.adaln_table": "model-00002-of-00002.safetensors",
103
+ "dit.layers.15.attn.k_norm.weight": "model-00002-of-00002.safetensors",
104
+ "dit.layers.15.attn.o_proj.weight": "model-00002-of-00002.safetensors",
105
+ "dit.layers.15.attn.q_norm.weight": "model-00002-of-00002.safetensors",
106
+ "dit.layers.15.attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
107
+ "dit.layers.15.attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
108
+ "dit.layers.15.final_layernorm.weight": "model-00002-of-00002.safetensors",
109
+ "dit.layers.15.input_layernorm.weight": "model-00002-of-00002.safetensors",
110
+ "dit.layers.15.middle_layernorm.weight": "model-00002-of-00002.safetensors",
111
+ "dit.layers.15.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
112
+ "dit.layers.15.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
113
+ "dit.layers.15.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
114
+ "dit.layers.15.post_layernorm.weight": "model-00002-of-00002.safetensors",
115
+ "dit.layers.2.adaln_table": "model-00002-of-00002.safetensors",
116
+ "dit.layers.2.attn.k_norm.weight": "model-00002-of-00002.safetensors",
117
+ "dit.layers.2.attn.o_proj.weight": "model-00002-of-00002.safetensors",
118
+ "dit.layers.2.attn.q_norm.weight": "model-00002-of-00002.safetensors",
119
+ "dit.layers.2.attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
120
+ "dit.layers.2.attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
121
+ "dit.layers.2.final_layernorm.weight": "model-00002-of-00002.safetensors",
122
+ "dit.layers.2.input_layernorm.weight": "model-00002-of-00002.safetensors",
123
+ "dit.layers.2.middle_layernorm.weight": "model-00002-of-00002.safetensors",
124
+ "dit.layers.2.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
125
+ "dit.layers.2.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
126
+ "dit.layers.2.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
127
+ "dit.layers.2.post_layernorm.weight": "model-00002-of-00002.safetensors",
128
+ "dit.layers.3.adaln_table": "model-00002-of-00002.safetensors",
129
+ "dit.layers.3.attn.k_norm.weight": "model-00002-of-00002.safetensors",
130
+ "dit.layers.3.attn.o_proj.weight": "model-00002-of-00002.safetensors",
131
+ "dit.layers.3.attn.q_norm.weight": "model-00002-of-00002.safetensors",
132
+ "dit.layers.3.attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
133
+ "dit.layers.3.attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
134
+ "dit.layers.3.final_layernorm.weight": "model-00002-of-00002.safetensors",
135
+ "dit.layers.3.input_layernorm.weight": "model-00002-of-00002.safetensors",
136
+ "dit.layers.3.middle_layernorm.weight": "model-00002-of-00002.safetensors",
137
+ "dit.layers.3.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
138
+ "dit.layers.3.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
139
+ "dit.layers.3.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
140
+ "dit.layers.3.post_layernorm.weight": "model-00002-of-00002.safetensors",
141
+ "dit.layers.4.adaln_table": "model-00002-of-00002.safetensors",
142
+ "dit.layers.4.attn.k_norm.weight": "model-00002-of-00002.safetensors",
143
+ "dit.layers.4.attn.o_proj.weight": "model-00002-of-00002.safetensors",
144
+ "dit.layers.4.attn.q_norm.weight": "model-00002-of-00002.safetensors",
145
+ "dit.layers.4.attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
146
+ "dit.layers.4.attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
147
+ "dit.layers.4.final_layernorm.weight": "model-00002-of-00002.safetensors",
148
+ "dit.layers.4.input_layernorm.weight": "model-00002-of-00002.safetensors",
149
+ "dit.layers.4.middle_layernorm.weight": "model-00002-of-00002.safetensors",
150
+ "dit.layers.4.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
151
+ "dit.layers.4.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
152
+ "dit.layers.4.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
153
+ "dit.layers.4.post_layernorm.weight": "model-00002-of-00002.safetensors",
154
+ "dit.layers.5.adaln_table": "model-00002-of-00002.safetensors",
155
+ "dit.layers.5.attn.k_norm.weight": "model-00002-of-00002.safetensors",
156
+ "dit.layers.5.attn.o_proj.weight": "model-00002-of-00002.safetensors",
157
+ "dit.layers.5.attn.q_norm.weight": "model-00002-of-00002.safetensors",
158
+ "dit.layers.5.attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
159
+ "dit.layers.5.attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
160
+ "dit.layers.5.final_layernorm.weight": "model-00002-of-00002.safetensors",
161
+ "dit.layers.5.input_layernorm.weight": "model-00002-of-00002.safetensors",
162
+ "dit.layers.5.middle_layernorm.weight": "model-00002-of-00002.safetensors",
163
+ "dit.layers.5.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
164
+ "dit.layers.5.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
165
+ "dit.layers.5.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
166
+ "dit.layers.5.post_layernorm.weight": "model-00002-of-00002.safetensors",
167
+ "dit.layers.6.adaln_table": "model-00002-of-00002.safetensors",
168
+ "dit.layers.6.attn.k_norm.weight": "model-00002-of-00002.safetensors",
169
+ "dit.layers.6.attn.o_proj.weight": "model-00002-of-00002.safetensors",
170
+ "dit.layers.6.attn.q_norm.weight": "model-00002-of-00002.safetensors",
171
+ "dit.layers.6.attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
172
+ "dit.layers.6.attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
173
+ "dit.layers.6.final_layernorm.weight": "model-00002-of-00002.safetensors",
174
+ "dit.layers.6.input_layernorm.weight": "model-00002-of-00002.safetensors",
175
+ "dit.layers.6.middle_layernorm.weight": "model-00002-of-00002.safetensors",
176
+ "dit.layers.6.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
177
+ "dit.layers.6.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
178
+ "dit.layers.6.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
179
+ "dit.layers.6.post_layernorm.weight": "model-00002-of-00002.safetensors",
180
+ "dit.layers.7.adaln_table": "model-00002-of-00002.safetensors",
181
+ "dit.layers.7.attn.k_norm.weight": "model-00002-of-00002.safetensors",
182
+ "dit.layers.7.attn.o_proj.weight": "model-00002-of-00002.safetensors",
183
+ "dit.layers.7.attn.q_norm.weight": "model-00002-of-00002.safetensors",
184
+ "dit.layers.7.attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
185
+ "dit.layers.7.attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
186
+ "dit.layers.7.final_layernorm.weight": "model-00002-of-00002.safetensors",
187
+ "dit.layers.7.input_layernorm.weight": "model-00002-of-00002.safetensors",
188
+ "dit.layers.7.middle_layernorm.weight": "model-00002-of-00002.safetensors",
189
+ "dit.layers.7.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
190
+ "dit.layers.7.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
191
+ "dit.layers.7.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
192
+ "dit.layers.7.post_layernorm.weight": "model-00002-of-00002.safetensors",
193
+ "dit.layers.8.adaln_table": "model-00002-of-00002.safetensors",
194
+ "dit.layers.8.attn.k_norm.weight": "model-00002-of-00002.safetensors",
195
+ "dit.layers.8.attn.o_proj.weight": "model-00002-of-00002.safetensors",
196
+ "dit.layers.8.attn.q_norm.weight": "model-00002-of-00002.safetensors",
197
+ "dit.layers.8.attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
198
+ "dit.layers.8.attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
199
+ "dit.layers.8.final_layernorm.weight": "model-00002-of-00002.safetensors",
200
+ "dit.layers.8.input_layernorm.weight": "model-00002-of-00002.safetensors",
201
+ "dit.layers.8.middle_layernorm.weight": "model-00002-of-00002.safetensors",
202
+ "dit.layers.8.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
203
+ "dit.layers.8.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
204
+ "dit.layers.8.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
205
+ "dit.layers.8.post_layernorm.weight": "model-00002-of-00002.safetensors",
206
+ "dit.layers.9.adaln_table": "model-00002-of-00002.safetensors",
207
+ "dit.layers.9.attn.k_norm.weight": "model-00002-of-00002.safetensors",
208
+ "dit.layers.9.attn.o_proj.weight": "model-00002-of-00002.safetensors",
209
+ "dit.layers.9.attn.q_norm.weight": "model-00002-of-00002.safetensors",
210
+ "dit.layers.9.attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
211
+ "dit.layers.9.attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
212
+ "dit.layers.9.final_layernorm.weight": "model-00002-of-00002.safetensors",
213
+ "dit.layers.9.input_layernorm.weight": "model-00002-of-00002.safetensors",
214
+ "dit.layers.9.middle_layernorm.weight": "model-00002-of-00002.safetensors",
215
+ "dit.layers.9.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
216
+ "dit.layers.9.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
217
+ "dit.layers.9.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
218
+ "dit.layers.9.post_layernorm.weight": "model-00002-of-00002.safetensors",
219
+ "sink.weight": "model-00002-of-00002.safetensors",
220
+ "state_projector.layers.0.weight": "model-00002-of-00002.safetensors",
221
+ "state_projector.layers.2.weight": "model-00002-of-00002.safetensors",
222
+ "t_embedder.mlp.0.weight": "model-00002-of-00002.safetensors",
223
+ "t_embedder.mlp.2.weight": "model-00002-of-00002.safetensors",
224
+ "t_projector.layers.0.bias": "model-00002-of-00002.safetensors",
225
+ "t_projector.layers.0.weight": "model-00002-of-00002.safetensors",
226
+ "vlm.model.language_model.embed_tokens.weight": "model-00001-of-00002.safetensors",
227
+ "vlm.model.language_model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
228
+ "vlm.model.language_model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
229
+ "vlm.model.language_model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
230
+ "vlm.model.language_model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
231
+ "vlm.model.language_model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
232
+ "vlm.model.language_model.layers.0.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
233
+ "vlm.model.language_model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
234
+ "vlm.model.language_model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
235
+ "vlm.model.language_model.layers.0.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
236
+ "vlm.model.language_model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
237
+ "vlm.model.language_model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
238
+ "vlm.model.language_model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
239
+ "vlm.model.language_model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
240
+ "vlm.model.language_model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
241
+ "vlm.model.language_model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
242
+ "vlm.model.language_model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
243
+ "vlm.model.language_model.layers.1.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
244
+ "vlm.model.language_model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
245
+ "vlm.model.language_model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
246
+ "vlm.model.language_model.layers.1.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
247
+ "vlm.model.language_model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
248
+ "vlm.model.language_model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
249
+ "vlm.model.language_model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
250
+ "vlm.model.language_model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
251
+ "vlm.model.language_model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
252
+ "vlm.model.language_model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
253
+ "vlm.model.language_model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
254
+ "vlm.model.language_model.layers.10.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
255
+ "vlm.model.language_model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
256
+ "vlm.model.language_model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
257
+ "vlm.model.language_model.layers.10.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
258
+ "vlm.model.language_model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
259
+ "vlm.model.language_model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
260
+ "vlm.model.language_model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
261
+ "vlm.model.language_model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
262
+ "vlm.model.language_model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
263
+ "vlm.model.language_model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
264
+ "vlm.model.language_model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
265
+ "vlm.model.language_model.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
266
+ "vlm.model.language_model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
267
+ "vlm.model.language_model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
268
+ "vlm.model.language_model.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
269
+ "vlm.model.language_model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
270
+ "vlm.model.language_model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
271
+ "vlm.model.language_model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
272
+ "vlm.model.language_model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
273
+ "vlm.model.language_model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
274
+ "vlm.model.language_model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
275
+ "vlm.model.language_model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
276
+ "vlm.model.language_model.layers.12.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
277
+ "vlm.model.language_model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
278
+ "vlm.model.language_model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
279
+ "vlm.model.language_model.layers.12.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
280
+ "vlm.model.language_model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
281
+ "vlm.model.language_model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
282
+ "vlm.model.language_model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
283
+ "vlm.model.language_model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
284
+ "vlm.model.language_model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
285
+ "vlm.model.language_model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
286
+ "vlm.model.language_model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
287
+ "vlm.model.language_model.layers.13.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
288
+ "vlm.model.language_model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
289
+ "vlm.model.language_model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
290
+ "vlm.model.language_model.layers.13.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
291
+ "vlm.model.language_model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
292
+ "vlm.model.language_model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
293
+ "vlm.model.language_model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
294
+ "vlm.model.language_model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
295
+ "vlm.model.language_model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
296
+ "vlm.model.language_model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
297
+ "vlm.model.language_model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
298
+ "vlm.model.language_model.layers.14.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
299
+ "vlm.model.language_model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
300
+ "vlm.model.language_model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
301
+ "vlm.model.language_model.layers.14.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
302
+ "vlm.model.language_model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
303
+ "vlm.model.language_model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
304
+ "vlm.model.language_model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
305
+ "vlm.model.language_model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
306
+ "vlm.model.language_model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
307
+ "vlm.model.language_model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
308
+ "vlm.model.language_model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
309
+ "vlm.model.language_model.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
310
+ "vlm.model.language_model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
311
+ "vlm.model.language_model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
312
+ "vlm.model.language_model.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
313
+ "vlm.model.language_model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
314
+ "vlm.model.language_model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
315
+ "vlm.model.language_model.layers.16.input_layernorm.weight": "model-00002-of-00002.safetensors",
316
+ "vlm.model.language_model.layers.16.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
317
+ "vlm.model.language_model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
318
+ "vlm.model.language_model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
319
+ "vlm.model.language_model.layers.16.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
320
+ "vlm.model.language_model.layers.16.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
321
+ "vlm.model.language_model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
322
+ "vlm.model.language_model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
323
+ "vlm.model.language_model.layers.16.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
324
+ "vlm.model.language_model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
325
+ "vlm.model.language_model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
326
+ "vlm.model.language_model.layers.17.input_layernorm.weight": "model-00002-of-00002.safetensors",
327
+ "vlm.model.language_model.layers.17.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
328
+ "vlm.model.language_model.layers.17.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
329
+ "vlm.model.language_model.layers.17.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
330
+ "vlm.model.language_model.layers.17.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
331
+ "vlm.model.language_model.layers.17.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
332
+ "vlm.model.language_model.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
333
+ "vlm.model.language_model.layers.17.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
334
+ "vlm.model.language_model.layers.17.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
335
+ "vlm.model.language_model.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
336
+ "vlm.model.language_model.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
337
+ "vlm.model.language_model.layers.18.input_layernorm.weight": "model-00002-of-00002.safetensors",
338
+ "vlm.model.language_model.layers.18.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
339
+ "vlm.model.language_model.layers.18.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
340
+ "vlm.model.language_model.layers.18.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
341
+ "vlm.model.language_model.layers.18.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
342
+ "vlm.model.language_model.layers.18.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
343
+ "vlm.model.language_model.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
344
+ "vlm.model.language_model.layers.18.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
345
+ "vlm.model.language_model.layers.18.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
346
+ "vlm.model.language_model.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
347
+ "vlm.model.language_model.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
348
+ "vlm.model.language_model.layers.19.input_layernorm.weight": "model-00002-of-00002.safetensors",
349
+ "vlm.model.language_model.layers.19.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
350
+ "vlm.model.language_model.layers.19.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
351
+ "vlm.model.language_model.layers.19.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
352
+ "vlm.model.language_model.layers.19.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
353
+ "vlm.model.language_model.layers.19.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
354
+ "vlm.model.language_model.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
355
+ "vlm.model.language_model.layers.19.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
356
+ "vlm.model.language_model.layers.19.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
357
+ "vlm.model.language_model.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
358
+ "vlm.model.language_model.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
359
+ "vlm.model.language_model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
360
+ "vlm.model.language_model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
361
+ "vlm.model.language_model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
362
+ "vlm.model.language_model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
363
+ "vlm.model.language_model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
364
+ "vlm.model.language_model.layers.2.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
365
+ "vlm.model.language_model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
366
+ "vlm.model.language_model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
367
+ "vlm.model.language_model.layers.2.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
368
+ "vlm.model.language_model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
369
+ "vlm.model.language_model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
370
+ "vlm.model.language_model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
371
+ "vlm.model.language_model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
372
+ "vlm.model.language_model.layers.20.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
373
+ "vlm.model.language_model.layers.20.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
374
+ "vlm.model.language_model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
375
+ "vlm.model.language_model.layers.20.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
376
+ "vlm.model.language_model.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
377
+ "vlm.model.language_model.layers.20.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
378
+ "vlm.model.language_model.layers.20.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
379
+ "vlm.model.language_model.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
380
+ "vlm.model.language_model.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
381
+ "vlm.model.language_model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
382
+ "vlm.model.language_model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
383
+ "vlm.model.language_model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
384
+ "vlm.model.language_model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
385
+ "vlm.model.language_model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
386
+ "vlm.model.language_model.layers.21.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
387
+ "vlm.model.language_model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
388
+ "vlm.model.language_model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
389
+ "vlm.model.language_model.layers.21.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
390
+ "vlm.model.language_model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
391
+ "vlm.model.language_model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
392
+ "vlm.model.language_model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
393
+ "vlm.model.language_model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
394
+ "vlm.model.language_model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
395
+ "vlm.model.language_model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
396
+ "vlm.model.language_model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
397
+ "vlm.model.language_model.layers.22.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
398
+ "vlm.model.language_model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
399
+ "vlm.model.language_model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
400
+ "vlm.model.language_model.layers.22.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
401
+ "vlm.model.language_model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
402
+ "vlm.model.language_model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
403
+ "vlm.model.language_model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
404
+ "vlm.model.language_model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
405
+ "vlm.model.language_model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
406
+ "vlm.model.language_model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
407
+ "vlm.model.language_model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
408
+ "vlm.model.language_model.layers.23.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
409
+ "vlm.model.language_model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
410
+ "vlm.model.language_model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
411
+ "vlm.model.language_model.layers.23.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
412
+ "vlm.model.language_model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
413
+ "vlm.model.language_model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
414
+ "vlm.model.language_model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
415
+ "vlm.model.language_model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
416
+ "vlm.model.language_model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
417
+ "vlm.model.language_model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
418
+ "vlm.model.language_model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
419
+ "vlm.model.language_model.layers.24.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
420
+ "vlm.model.language_model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
421
+ "vlm.model.language_model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
422
+ "vlm.model.language_model.layers.24.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
423
+ "vlm.model.language_model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
424
+ "vlm.model.language_model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
425
+ "vlm.model.language_model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
426
+ "vlm.model.language_model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
427
+ "vlm.model.language_model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
428
+ "vlm.model.language_model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
429
+ "vlm.model.language_model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
430
+ "vlm.model.language_model.layers.25.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
431
+ "vlm.model.language_model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
432
+ "vlm.model.language_model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
433
+ "vlm.model.language_model.layers.25.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
434
+ "vlm.model.language_model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
435
+ "vlm.model.language_model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
436
+ "vlm.model.language_model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
437
+ "vlm.model.language_model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
438
+ "vlm.model.language_model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
439
+ "vlm.model.language_model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
440
+ "vlm.model.language_model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
441
+ "vlm.model.language_model.layers.26.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
442
+ "vlm.model.language_model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
443
+ "vlm.model.language_model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
444
+ "vlm.model.language_model.layers.26.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
445
+ "vlm.model.language_model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
446
+ "vlm.model.language_model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
447
+ "vlm.model.language_model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
448
+ "vlm.model.language_model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
449
+ "vlm.model.language_model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
450
+ "vlm.model.language_model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
451
+ "vlm.model.language_model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
452
+ "vlm.model.language_model.layers.27.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
453
+ "vlm.model.language_model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
454
+ "vlm.model.language_model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
455
+ "vlm.model.language_model.layers.27.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
456
+ "vlm.model.language_model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
457
+ "vlm.model.language_model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
458
+ "vlm.model.language_model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
459
+ "vlm.model.language_model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
460
+ "vlm.model.language_model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
461
+ "vlm.model.language_model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
462
+ "vlm.model.language_model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
463
+ "vlm.model.language_model.layers.28.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
464
+ "vlm.model.language_model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
465
+ "vlm.model.language_model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
466
+ "vlm.model.language_model.layers.28.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
467
+ "vlm.model.language_model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
468
+ "vlm.model.language_model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
469
+ "vlm.model.language_model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
470
+ "vlm.model.language_model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
471
+ "vlm.model.language_model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
472
+ "vlm.model.language_model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
473
+ "vlm.model.language_model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
474
+ "vlm.model.language_model.layers.29.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
475
+ "vlm.model.language_model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
476
+ "vlm.model.language_model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
477
+ "vlm.model.language_model.layers.29.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
478
+ "vlm.model.language_model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
479
+ "vlm.model.language_model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
480
+ "vlm.model.language_model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
481
+ "vlm.model.language_model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
482
+ "vlm.model.language_model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
483
+ "vlm.model.language_model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
484
+ "vlm.model.language_model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
485
+ "vlm.model.language_model.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
486
+ "vlm.model.language_model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
487
+ "vlm.model.language_model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
488
+ "vlm.model.language_model.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
489
+ "vlm.model.language_model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
490
+ "vlm.model.language_model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
491
+ "vlm.model.language_model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
492
+ "vlm.model.language_model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
493
+ "vlm.model.language_model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
494
+ "vlm.model.language_model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
495
+ "vlm.model.language_model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
496
+ "vlm.model.language_model.layers.30.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
497
+ "vlm.model.language_model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
498
+ "vlm.model.language_model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
499
+ "vlm.model.language_model.layers.30.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
500
+ "vlm.model.language_model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
501
+ "vlm.model.language_model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
502
+ "vlm.model.language_model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
503
+ "vlm.model.language_model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
504
+ "vlm.model.language_model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
505
+ "vlm.model.language_model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
506
+ "vlm.model.language_model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
507
+ "vlm.model.language_model.layers.31.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
508
+ "vlm.model.language_model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
509
+ "vlm.model.language_model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
510
+ "vlm.model.language_model.layers.31.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
511
+ "vlm.model.language_model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
512
+ "vlm.model.language_model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
513
+ "vlm.model.language_model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors",
514
+ "vlm.model.language_model.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
515
+ "vlm.model.language_model.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
516
+ "vlm.model.language_model.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
517
+ "vlm.model.language_model.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
518
+ "vlm.model.language_model.layers.32.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
519
+ "vlm.model.language_model.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
520
+ "vlm.model.language_model.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
521
+ "vlm.model.language_model.layers.32.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
522
+ "vlm.model.language_model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
523
+ "vlm.model.language_model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
524
+ "vlm.model.language_model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors",
525
+ "vlm.model.language_model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
526
+ "vlm.model.language_model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
527
+ "vlm.model.language_model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
528
+ "vlm.model.language_model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
529
+ "vlm.model.language_model.layers.33.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
530
+ "vlm.model.language_model.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
531
+ "vlm.model.language_model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
532
+ "vlm.model.language_model.layers.33.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
533
+ "vlm.model.language_model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
534
+ "vlm.model.language_model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
535
+ "vlm.model.language_model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors",
536
+ "vlm.model.language_model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
537
+ "vlm.model.language_model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
538
+ "vlm.model.language_model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
539
+ "vlm.model.language_model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
540
+ "vlm.model.language_model.layers.34.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
541
+ "vlm.model.language_model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
542
+ "vlm.model.language_model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
543
+ "vlm.model.language_model.layers.34.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
544
+ "vlm.model.language_model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
545
+ "vlm.model.language_model.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
546
+ "vlm.model.language_model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
547
+ "vlm.model.language_model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
548
+ "vlm.model.language_model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
549
+ "vlm.model.language_model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
550
+ "vlm.model.language_model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
551
+ "vlm.model.language_model.layers.35.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
552
+ "vlm.model.language_model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
553
+ "vlm.model.language_model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
554
+ "vlm.model.language_model.layers.35.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
555
+ "vlm.model.language_model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
556
+ "vlm.model.language_model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
557
+ "vlm.model.language_model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
558
+ "vlm.model.language_model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
559
+ "vlm.model.language_model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
560
+ "vlm.model.language_model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
561
+ "vlm.model.language_model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
562
+ "vlm.model.language_model.layers.4.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
563
+ "vlm.model.language_model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
564
+ "vlm.model.language_model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
565
+ "vlm.model.language_model.layers.4.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
566
+ "vlm.model.language_model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
567
+ "vlm.model.language_model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
568
+ "vlm.model.language_model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
569
+ "vlm.model.language_model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
570
+ "vlm.model.language_model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
571
+ "vlm.model.language_model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
572
+ "vlm.model.language_model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
573
+ "vlm.model.language_model.layers.5.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
574
+ "vlm.model.language_model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
575
+ "vlm.model.language_model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
576
+ "vlm.model.language_model.layers.5.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
577
+ "vlm.model.language_model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
578
+ "vlm.model.language_model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
579
+ "vlm.model.language_model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
580
+ "vlm.model.language_model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
581
+ "vlm.model.language_model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
582
+ "vlm.model.language_model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
583
+ "vlm.model.language_model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
584
+ "vlm.model.language_model.layers.6.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
585
+ "vlm.model.language_model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
586
+ "vlm.model.language_model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
587
+ "vlm.model.language_model.layers.6.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
588
+ "vlm.model.language_model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
589
+ "vlm.model.language_model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
590
+ "vlm.model.language_model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
591
+ "vlm.model.language_model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
592
+ "vlm.model.language_model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
593
+ "vlm.model.language_model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
594
+ "vlm.model.language_model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
595
+ "vlm.model.language_model.layers.7.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
596
+ "vlm.model.language_model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
597
+ "vlm.model.language_model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
598
+ "vlm.model.language_model.layers.7.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
599
+ "vlm.model.language_model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
600
+ "vlm.model.language_model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
601
+ "vlm.model.language_model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
602
+ "vlm.model.language_model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
603
+ "vlm.model.language_model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
604
+ "vlm.model.language_model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
605
+ "vlm.model.language_model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
606
+ "vlm.model.language_model.layers.8.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
607
+ "vlm.model.language_model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
608
+ "vlm.model.language_model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
609
+ "vlm.model.language_model.layers.8.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
610
+ "vlm.model.language_model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
611
+ "vlm.model.language_model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
612
+ "vlm.model.language_model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
613
+ "vlm.model.language_model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
614
+ "vlm.model.language_model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
615
+ "vlm.model.language_model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
616
+ "vlm.model.language_model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
617
+ "vlm.model.language_model.layers.9.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
618
+ "vlm.model.language_model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
619
+ "vlm.model.language_model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
620
+ "vlm.model.language_model.layers.9.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
621
+ "vlm.model.language_model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
622
+ "vlm.model.language_model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
623
+ "vlm.model.language_model.norm.weight": "model-00002-of-00002.safetensors",
624
+ "vlm.model.visual.blocks.0.attn.proj.bias": "model-00001-of-00002.safetensors",
625
+ "vlm.model.visual.blocks.0.attn.proj.weight": "model-00001-of-00002.safetensors",
626
+ "vlm.model.visual.blocks.0.attn.qkv.bias": "model-00001-of-00002.safetensors",
627
+ "vlm.model.visual.blocks.0.attn.qkv.weight": "model-00001-of-00002.safetensors",
628
+ "vlm.model.visual.blocks.0.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
629
+ "vlm.model.visual.blocks.0.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
630
+ "vlm.model.visual.blocks.0.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
631
+ "vlm.model.visual.blocks.0.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
632
+ "vlm.model.visual.blocks.0.norm1.bias": "model-00001-of-00002.safetensors",
633
+ "vlm.model.visual.blocks.0.norm1.weight": "model-00001-of-00002.safetensors",
634
+ "vlm.model.visual.blocks.0.norm2.bias": "model-00001-of-00002.safetensors",
635
+ "vlm.model.visual.blocks.0.norm2.weight": "model-00001-of-00002.safetensors",
636
+ "vlm.model.visual.blocks.1.attn.proj.bias": "model-00001-of-00002.safetensors",
637
+ "vlm.model.visual.blocks.1.attn.proj.weight": "model-00001-of-00002.safetensors",
638
+ "vlm.model.visual.blocks.1.attn.qkv.bias": "model-00001-of-00002.safetensors",
639
+ "vlm.model.visual.blocks.1.attn.qkv.weight": "model-00001-of-00002.safetensors",
640
+ "vlm.model.visual.blocks.1.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
641
+ "vlm.model.visual.blocks.1.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
642
+ "vlm.model.visual.blocks.1.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
643
+ "vlm.model.visual.blocks.1.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
644
+ "vlm.model.visual.blocks.1.norm1.bias": "model-00001-of-00002.safetensors",
645
+ "vlm.model.visual.blocks.1.norm1.weight": "model-00001-of-00002.safetensors",
646
+ "vlm.model.visual.blocks.1.norm2.bias": "model-00001-of-00002.safetensors",
647
+ "vlm.model.visual.blocks.1.norm2.weight": "model-00001-of-00002.safetensors",
648
+ "vlm.model.visual.blocks.10.attn.proj.bias": "model-00001-of-00002.safetensors",
649
+ "vlm.model.visual.blocks.10.attn.proj.weight": "model-00001-of-00002.safetensors",
650
+ "vlm.model.visual.blocks.10.attn.qkv.bias": "model-00001-of-00002.safetensors",
651
+ "vlm.model.visual.blocks.10.attn.qkv.weight": "model-00001-of-00002.safetensors",
652
+ "vlm.model.visual.blocks.10.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
653
+ "vlm.model.visual.blocks.10.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
654
+ "vlm.model.visual.blocks.10.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
655
+ "vlm.model.visual.blocks.10.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
656
+ "vlm.model.visual.blocks.10.norm1.bias": "model-00001-of-00002.safetensors",
657
+ "vlm.model.visual.blocks.10.norm1.weight": "model-00001-of-00002.safetensors",
658
+ "vlm.model.visual.blocks.10.norm2.bias": "model-00001-of-00002.safetensors",
659
+ "vlm.model.visual.blocks.10.norm2.weight": "model-00001-of-00002.safetensors",
660
+ "vlm.model.visual.blocks.11.attn.proj.bias": "model-00001-of-00002.safetensors",
661
+ "vlm.model.visual.blocks.11.attn.proj.weight": "model-00001-of-00002.safetensors",
662
+ "vlm.model.visual.blocks.11.attn.qkv.bias": "model-00001-of-00002.safetensors",
663
+ "vlm.model.visual.blocks.11.attn.qkv.weight": "model-00001-of-00002.safetensors",
664
+ "vlm.model.visual.blocks.11.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
665
+ "vlm.model.visual.blocks.11.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
666
+ "vlm.model.visual.blocks.11.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
667
+ "vlm.model.visual.blocks.11.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
668
+ "vlm.model.visual.blocks.11.norm1.bias": "model-00001-of-00002.safetensors",
669
+ "vlm.model.visual.blocks.11.norm1.weight": "model-00001-of-00002.safetensors",
670
+ "vlm.model.visual.blocks.11.norm2.bias": "model-00001-of-00002.safetensors",
671
+ "vlm.model.visual.blocks.11.norm2.weight": "model-00001-of-00002.safetensors",
672
+ "vlm.model.visual.blocks.12.attn.proj.bias": "model-00001-of-00002.safetensors",
673
+ "vlm.model.visual.blocks.12.attn.proj.weight": "model-00001-of-00002.safetensors",
674
+ "vlm.model.visual.blocks.12.attn.qkv.bias": "model-00001-of-00002.safetensors",
675
+ "vlm.model.visual.blocks.12.attn.qkv.weight": "model-00001-of-00002.safetensors",
676
+ "vlm.model.visual.blocks.12.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
677
+ "vlm.model.visual.blocks.12.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
678
+ "vlm.model.visual.blocks.12.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
679
+ "vlm.model.visual.blocks.12.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
680
+ "vlm.model.visual.blocks.12.norm1.bias": "model-00001-of-00002.safetensors",
681
+ "vlm.model.visual.blocks.12.norm1.weight": "model-00001-of-00002.safetensors",
682
+ "vlm.model.visual.blocks.12.norm2.bias": "model-00001-of-00002.safetensors",
683
+ "vlm.model.visual.blocks.12.norm2.weight": "model-00001-of-00002.safetensors",
684
+ "vlm.model.visual.blocks.13.attn.proj.bias": "model-00001-of-00002.safetensors",
685
+ "vlm.model.visual.blocks.13.attn.proj.weight": "model-00001-of-00002.safetensors",
686
+ "vlm.model.visual.blocks.13.attn.qkv.bias": "model-00001-of-00002.safetensors",
687
+ "vlm.model.visual.blocks.13.attn.qkv.weight": "model-00001-of-00002.safetensors",
688
+ "vlm.model.visual.blocks.13.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
689
+ "vlm.model.visual.blocks.13.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
690
+ "vlm.model.visual.blocks.13.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
691
+ "vlm.model.visual.blocks.13.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
692
+ "vlm.model.visual.blocks.13.norm1.bias": "model-00001-of-00002.safetensors",
693
+ "vlm.model.visual.blocks.13.norm1.weight": "model-00001-of-00002.safetensors",
694
+ "vlm.model.visual.blocks.13.norm2.bias": "model-00001-of-00002.safetensors",
695
+ "vlm.model.visual.blocks.13.norm2.weight": "model-00001-of-00002.safetensors",
696
+ "vlm.model.visual.blocks.14.attn.proj.bias": "model-00001-of-00002.safetensors",
697
+ "vlm.model.visual.blocks.14.attn.proj.weight": "model-00001-of-00002.safetensors",
698
+ "vlm.model.visual.blocks.14.attn.qkv.bias": "model-00001-of-00002.safetensors",
699
+ "vlm.model.visual.blocks.14.attn.qkv.weight": "model-00001-of-00002.safetensors",
700
+ "vlm.model.visual.blocks.14.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
701
+ "vlm.model.visual.blocks.14.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
702
+ "vlm.model.visual.blocks.14.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
703
+ "vlm.model.visual.blocks.14.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
704
+ "vlm.model.visual.blocks.14.norm1.bias": "model-00001-of-00002.safetensors",
705
+ "vlm.model.visual.blocks.14.norm1.weight": "model-00001-of-00002.safetensors",
706
+ "vlm.model.visual.blocks.14.norm2.bias": "model-00001-of-00002.safetensors",
707
+ "vlm.model.visual.blocks.14.norm2.weight": "model-00001-of-00002.safetensors",
708
+ "vlm.model.visual.blocks.15.attn.proj.bias": "model-00001-of-00002.safetensors",
709
+ "vlm.model.visual.blocks.15.attn.proj.weight": "model-00001-of-00002.safetensors",
710
+ "vlm.model.visual.blocks.15.attn.qkv.bias": "model-00001-of-00002.safetensors",
711
+ "vlm.model.visual.blocks.15.attn.qkv.weight": "model-00001-of-00002.safetensors",
712
+ "vlm.model.visual.blocks.15.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
713
+ "vlm.model.visual.blocks.15.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
714
+ "vlm.model.visual.blocks.15.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
715
+ "vlm.model.visual.blocks.15.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
716
+ "vlm.model.visual.blocks.15.norm1.bias": "model-00001-of-00002.safetensors",
717
+ "vlm.model.visual.blocks.15.norm1.weight": "model-00001-of-00002.safetensors",
718
+ "vlm.model.visual.blocks.15.norm2.bias": "model-00001-of-00002.safetensors",
719
+ "vlm.model.visual.blocks.15.norm2.weight": "model-00001-of-00002.safetensors",
720
+ "vlm.model.visual.blocks.16.attn.proj.bias": "model-00001-of-00002.safetensors",
721
+ "vlm.model.visual.blocks.16.attn.proj.weight": "model-00001-of-00002.safetensors",
722
+ "vlm.model.visual.blocks.16.attn.qkv.bias": "model-00001-of-00002.safetensors",
723
+ "vlm.model.visual.blocks.16.attn.qkv.weight": "model-00001-of-00002.safetensors",
724
+ "vlm.model.visual.blocks.16.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
725
+ "vlm.model.visual.blocks.16.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
726
+ "vlm.model.visual.blocks.16.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
727
+ "vlm.model.visual.blocks.16.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
728
+ "vlm.model.visual.blocks.16.norm1.bias": "model-00001-of-00002.safetensors",
729
+ "vlm.model.visual.blocks.16.norm1.weight": "model-00001-of-00002.safetensors",
730
+ "vlm.model.visual.blocks.16.norm2.bias": "model-00001-of-00002.safetensors",
731
+ "vlm.model.visual.blocks.16.norm2.weight": "model-00001-of-00002.safetensors",
732
+ "vlm.model.visual.blocks.17.attn.proj.bias": "model-00001-of-00002.safetensors",
733
+ "vlm.model.visual.blocks.17.attn.proj.weight": "model-00001-of-00002.safetensors",
734
+ "vlm.model.visual.blocks.17.attn.qkv.bias": "model-00001-of-00002.safetensors",
735
+ "vlm.model.visual.blocks.17.attn.qkv.weight": "model-00001-of-00002.safetensors",
736
+ "vlm.model.visual.blocks.17.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
737
+ "vlm.model.visual.blocks.17.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
738
+ "vlm.model.visual.blocks.17.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
739
+ "vlm.model.visual.blocks.17.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
740
+ "vlm.model.visual.blocks.17.norm1.bias": "model-00001-of-00002.safetensors",
741
+ "vlm.model.visual.blocks.17.norm1.weight": "model-00001-of-00002.safetensors",
742
+ "vlm.model.visual.blocks.17.norm2.bias": "model-00001-of-00002.safetensors",
743
+ "vlm.model.visual.blocks.17.norm2.weight": "model-00001-of-00002.safetensors",
744
+ "vlm.model.visual.blocks.18.attn.proj.bias": "model-00001-of-00002.safetensors",
745
+ "vlm.model.visual.blocks.18.attn.proj.weight": "model-00001-of-00002.safetensors",
746
+ "vlm.model.visual.blocks.18.attn.qkv.bias": "model-00001-of-00002.safetensors",
747
+ "vlm.model.visual.blocks.18.attn.qkv.weight": "model-00001-of-00002.safetensors",
748
+ "vlm.model.visual.blocks.18.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
749
+ "vlm.model.visual.blocks.18.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
750
+ "vlm.model.visual.blocks.18.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
751
+ "vlm.model.visual.blocks.18.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
752
+ "vlm.model.visual.blocks.18.norm1.bias": "model-00001-of-00002.safetensors",
753
+ "vlm.model.visual.blocks.18.norm1.weight": "model-00001-of-00002.safetensors",
754
+ "vlm.model.visual.blocks.18.norm2.bias": "model-00001-of-00002.safetensors",
755
+ "vlm.model.visual.blocks.18.norm2.weight": "model-00001-of-00002.safetensors",
756
+ "vlm.model.visual.blocks.19.attn.proj.bias": "model-00001-of-00002.safetensors",
757
+ "vlm.model.visual.blocks.19.attn.proj.weight": "model-00001-of-00002.safetensors",
758
+ "vlm.model.visual.blocks.19.attn.qkv.bias": "model-00001-of-00002.safetensors",
759
+ "vlm.model.visual.blocks.19.attn.qkv.weight": "model-00001-of-00002.safetensors",
760
+ "vlm.model.visual.blocks.19.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
761
+ "vlm.model.visual.blocks.19.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
762
+ "vlm.model.visual.blocks.19.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
763
+ "vlm.model.visual.blocks.19.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
764
+ "vlm.model.visual.blocks.19.norm1.bias": "model-00001-of-00002.safetensors",
765
+ "vlm.model.visual.blocks.19.norm1.weight": "model-00001-of-00002.safetensors",
766
+ "vlm.model.visual.blocks.19.norm2.bias": "model-00001-of-00002.safetensors",
767
+ "vlm.model.visual.blocks.19.norm2.weight": "model-00001-of-00002.safetensors",
768
+ "vlm.model.visual.blocks.2.attn.proj.bias": "model-00001-of-00002.safetensors",
769
+ "vlm.model.visual.blocks.2.attn.proj.weight": "model-00001-of-00002.safetensors",
770
+ "vlm.model.visual.blocks.2.attn.qkv.bias": "model-00001-of-00002.safetensors",
771
+ "vlm.model.visual.blocks.2.attn.qkv.weight": "model-00001-of-00002.safetensors",
772
+ "vlm.model.visual.blocks.2.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
773
+ "vlm.model.visual.blocks.2.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
774
+ "vlm.model.visual.blocks.2.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
775
+ "vlm.model.visual.blocks.2.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
776
+ "vlm.model.visual.blocks.2.norm1.bias": "model-00001-of-00002.safetensors",
777
+ "vlm.model.visual.blocks.2.norm1.weight": "model-00001-of-00002.safetensors",
778
+ "vlm.model.visual.blocks.2.norm2.bias": "model-00001-of-00002.safetensors",
779
+ "vlm.model.visual.blocks.2.norm2.weight": "model-00001-of-00002.safetensors",
780
+ "vlm.model.visual.blocks.20.attn.proj.bias": "model-00001-of-00002.safetensors",
781
+ "vlm.model.visual.blocks.20.attn.proj.weight": "model-00001-of-00002.safetensors",
782
+ "vlm.model.visual.blocks.20.attn.qkv.bias": "model-00001-of-00002.safetensors",
783
+ "vlm.model.visual.blocks.20.attn.qkv.weight": "model-00001-of-00002.safetensors",
784
+ "vlm.model.visual.blocks.20.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
785
+ "vlm.model.visual.blocks.20.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
786
+ "vlm.model.visual.blocks.20.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
787
+ "vlm.model.visual.blocks.20.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
788
+ "vlm.model.visual.blocks.20.norm1.bias": "model-00001-of-00002.safetensors",
789
+ "vlm.model.visual.blocks.20.norm1.weight": "model-00001-of-00002.safetensors",
790
+ "vlm.model.visual.blocks.20.norm2.bias": "model-00001-of-00002.safetensors",
791
+ "vlm.model.visual.blocks.20.norm2.weight": "model-00001-of-00002.safetensors",
792
+ "vlm.model.visual.blocks.21.attn.proj.bias": "model-00001-of-00002.safetensors",
793
+ "vlm.model.visual.blocks.21.attn.proj.weight": "model-00001-of-00002.safetensors",
794
+ "vlm.model.visual.blocks.21.attn.qkv.bias": "model-00001-of-00002.safetensors",
795
+ "vlm.model.visual.blocks.21.attn.qkv.weight": "model-00001-of-00002.safetensors",
796
+ "vlm.model.visual.blocks.21.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
797
+ "vlm.model.visual.blocks.21.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
798
+ "vlm.model.visual.blocks.21.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
799
+ "vlm.model.visual.blocks.21.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
800
+ "vlm.model.visual.blocks.21.norm1.bias": "model-00001-of-00002.safetensors",
801
+ "vlm.model.visual.blocks.21.norm1.weight": "model-00001-of-00002.safetensors",
802
+ "vlm.model.visual.blocks.21.norm2.bias": "model-00001-of-00002.safetensors",
803
+ "vlm.model.visual.blocks.21.norm2.weight": "model-00001-of-00002.safetensors",
804
+ "vlm.model.visual.blocks.22.attn.proj.bias": "model-00001-of-00002.safetensors",
805
+ "vlm.model.visual.blocks.22.attn.proj.weight": "model-00001-of-00002.safetensors",
806
+ "vlm.model.visual.blocks.22.attn.qkv.bias": "model-00001-of-00002.safetensors",
807
+ "vlm.model.visual.blocks.22.attn.qkv.weight": "model-00001-of-00002.safetensors",
808
+ "vlm.model.visual.blocks.22.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
809
+ "vlm.model.visual.blocks.22.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
810
+ "vlm.model.visual.blocks.22.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
811
+ "vlm.model.visual.blocks.22.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
812
+ "vlm.model.visual.blocks.22.norm1.bias": "model-00001-of-00002.safetensors",
813
+ "vlm.model.visual.blocks.22.norm1.weight": "model-00001-of-00002.safetensors",
814
+ "vlm.model.visual.blocks.22.norm2.bias": "model-00001-of-00002.safetensors",
815
+ "vlm.model.visual.blocks.22.norm2.weight": "model-00001-of-00002.safetensors",
816
+ "vlm.model.visual.blocks.23.attn.proj.bias": "model-00001-of-00002.safetensors",
817
+ "vlm.model.visual.blocks.23.attn.proj.weight": "model-00001-of-00002.safetensors",
818
+ "vlm.model.visual.blocks.23.attn.qkv.bias": "model-00001-of-00002.safetensors",
819
+ "vlm.model.visual.blocks.23.attn.qkv.weight": "model-00001-of-00002.safetensors",
820
+ "vlm.model.visual.blocks.23.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
821
+ "vlm.model.visual.blocks.23.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
822
+ "vlm.model.visual.blocks.23.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
823
+ "vlm.model.visual.blocks.23.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
824
+ "vlm.model.visual.blocks.23.norm1.bias": "model-00001-of-00002.safetensors",
825
+ "vlm.model.visual.blocks.23.norm1.weight": "model-00001-of-00002.safetensors",
826
+ "vlm.model.visual.blocks.23.norm2.bias": "model-00001-of-00002.safetensors",
827
+ "vlm.model.visual.blocks.23.norm2.weight": "model-00001-of-00002.safetensors",
828
+ "vlm.model.visual.blocks.3.attn.proj.bias": "model-00001-of-00002.safetensors",
829
+ "vlm.model.visual.blocks.3.attn.proj.weight": "model-00001-of-00002.safetensors",
830
+ "vlm.model.visual.blocks.3.attn.qkv.bias": "model-00001-of-00002.safetensors",
831
+ "vlm.model.visual.blocks.3.attn.qkv.weight": "model-00001-of-00002.safetensors",
832
+ "vlm.model.visual.blocks.3.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
833
+ "vlm.model.visual.blocks.3.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
834
+ "vlm.model.visual.blocks.3.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
835
+ "vlm.model.visual.blocks.3.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
836
+ "vlm.model.visual.blocks.3.norm1.bias": "model-00001-of-00002.safetensors",
837
+ "vlm.model.visual.blocks.3.norm1.weight": "model-00001-of-00002.safetensors",
838
+ "vlm.model.visual.blocks.3.norm2.bias": "model-00001-of-00002.safetensors",
839
+ "vlm.model.visual.blocks.3.norm2.weight": "model-00001-of-00002.safetensors",
840
+ "vlm.model.visual.blocks.4.attn.proj.bias": "model-00001-of-00002.safetensors",
841
+ "vlm.model.visual.blocks.4.attn.proj.weight": "model-00001-of-00002.safetensors",
842
+ "vlm.model.visual.blocks.4.attn.qkv.bias": "model-00001-of-00002.safetensors",
843
+ "vlm.model.visual.blocks.4.attn.qkv.weight": "model-00001-of-00002.safetensors",
844
+ "vlm.model.visual.blocks.4.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
845
+ "vlm.model.visual.blocks.4.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
846
+ "vlm.model.visual.blocks.4.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
847
+ "vlm.model.visual.blocks.4.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
848
+ "vlm.model.visual.blocks.4.norm1.bias": "model-00001-of-00002.safetensors",
849
+ "vlm.model.visual.blocks.4.norm1.weight": "model-00001-of-00002.safetensors",
850
+ "vlm.model.visual.blocks.4.norm2.bias": "model-00001-of-00002.safetensors",
851
+ "vlm.model.visual.blocks.4.norm2.weight": "model-00001-of-00002.safetensors",
852
+ "vlm.model.visual.blocks.5.attn.proj.bias": "model-00001-of-00002.safetensors",
853
+ "vlm.model.visual.blocks.5.attn.proj.weight": "model-00001-of-00002.safetensors",
854
+ "vlm.model.visual.blocks.5.attn.qkv.bias": "model-00001-of-00002.safetensors",
855
+ "vlm.model.visual.blocks.5.attn.qkv.weight": "model-00001-of-00002.safetensors",
856
+ "vlm.model.visual.blocks.5.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
857
+ "vlm.model.visual.blocks.5.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
858
+ "vlm.model.visual.blocks.5.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
859
+ "vlm.model.visual.blocks.5.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
860
+ "vlm.model.visual.blocks.5.norm1.bias": "model-00001-of-00002.safetensors",
861
+ "vlm.model.visual.blocks.5.norm1.weight": "model-00001-of-00002.safetensors",
862
+ "vlm.model.visual.blocks.5.norm2.bias": "model-00001-of-00002.safetensors",
863
+ "vlm.model.visual.blocks.5.norm2.weight": "model-00001-of-00002.safetensors",
864
+ "vlm.model.visual.blocks.6.attn.proj.bias": "model-00001-of-00002.safetensors",
865
+ "vlm.model.visual.blocks.6.attn.proj.weight": "model-00001-of-00002.safetensors",
866
+ "vlm.model.visual.blocks.6.attn.qkv.bias": "model-00001-of-00002.safetensors",
867
+ "vlm.model.visual.blocks.6.attn.qkv.weight": "model-00001-of-00002.safetensors",
868
+ "vlm.model.visual.blocks.6.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
869
+ "vlm.model.visual.blocks.6.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
870
+ "vlm.model.visual.blocks.6.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
871
+ "vlm.model.visual.blocks.6.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
872
+ "vlm.model.visual.blocks.6.norm1.bias": "model-00001-of-00002.safetensors",
873
+ "vlm.model.visual.blocks.6.norm1.weight": "model-00001-of-00002.safetensors",
874
+ "vlm.model.visual.blocks.6.norm2.bias": "model-00001-of-00002.safetensors",
875
+ "vlm.model.visual.blocks.6.norm2.weight": "model-00001-of-00002.safetensors",
876
+ "vlm.model.visual.blocks.7.attn.proj.bias": "model-00001-of-00002.safetensors",
877
+ "vlm.model.visual.blocks.7.attn.proj.weight": "model-00001-of-00002.safetensors",
878
+ "vlm.model.visual.blocks.7.attn.qkv.bias": "model-00001-of-00002.safetensors",
879
+ "vlm.model.visual.blocks.7.attn.qkv.weight": "model-00001-of-00002.safetensors",
880
+ "vlm.model.visual.blocks.7.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
881
+ "vlm.model.visual.blocks.7.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
882
+ "vlm.model.visual.blocks.7.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
883
+ "vlm.model.visual.blocks.7.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
884
+ "vlm.model.visual.blocks.7.norm1.bias": "model-00001-of-00002.safetensors",
885
+ "vlm.model.visual.blocks.7.norm1.weight": "model-00001-of-00002.safetensors",
886
+ "vlm.model.visual.blocks.7.norm2.bias": "model-00001-of-00002.safetensors",
887
+ "vlm.model.visual.blocks.7.norm2.weight": "model-00001-of-00002.safetensors",
888
+ "vlm.model.visual.blocks.8.attn.proj.bias": "model-00001-of-00002.safetensors",
889
+ "vlm.model.visual.blocks.8.attn.proj.weight": "model-00001-of-00002.safetensors",
890
+ "vlm.model.visual.blocks.8.attn.qkv.bias": "model-00001-of-00002.safetensors",
891
+ "vlm.model.visual.blocks.8.attn.qkv.weight": "model-00001-of-00002.safetensors",
892
+ "vlm.model.visual.blocks.8.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
893
+ "vlm.model.visual.blocks.8.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
894
+ "vlm.model.visual.blocks.8.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
895
+ "vlm.model.visual.blocks.8.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
896
+ "vlm.model.visual.blocks.8.norm1.bias": "model-00001-of-00002.safetensors",
897
+ "vlm.model.visual.blocks.8.norm1.weight": "model-00001-of-00002.safetensors",
898
+ "vlm.model.visual.blocks.8.norm2.bias": "model-00001-of-00002.safetensors",
899
+ "vlm.model.visual.blocks.8.norm2.weight": "model-00001-of-00002.safetensors",
900
+ "vlm.model.visual.blocks.9.attn.proj.bias": "model-00001-of-00002.safetensors",
901
+ "vlm.model.visual.blocks.9.attn.proj.weight": "model-00001-of-00002.safetensors",
902
+ "vlm.model.visual.blocks.9.attn.qkv.bias": "model-00001-of-00002.safetensors",
903
+ "vlm.model.visual.blocks.9.attn.qkv.weight": "model-00001-of-00002.safetensors",
904
+ "vlm.model.visual.blocks.9.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
905
+ "vlm.model.visual.blocks.9.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
906
+ "vlm.model.visual.blocks.9.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
907
+ "vlm.model.visual.blocks.9.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
908
+ "vlm.model.visual.blocks.9.norm1.bias": "model-00001-of-00002.safetensors",
909
+ "vlm.model.visual.blocks.9.norm1.weight": "model-00001-of-00002.safetensors",
910
+ "vlm.model.visual.blocks.9.norm2.bias": "model-00001-of-00002.safetensors",
911
+ "vlm.model.visual.blocks.9.norm2.weight": "model-00001-of-00002.safetensors",
912
+ "vlm.model.visual.deepstack_merger_list.0.linear_fc1.bias": "model-00001-of-00002.safetensors",
913
+ "vlm.model.visual.deepstack_merger_list.0.linear_fc1.weight": "model-00001-of-00002.safetensors",
914
+ "vlm.model.visual.deepstack_merger_list.0.linear_fc2.bias": "model-00001-of-00002.safetensors",
915
+ "vlm.model.visual.deepstack_merger_list.0.linear_fc2.weight": "model-00001-of-00002.safetensors",
916
+ "vlm.model.visual.deepstack_merger_list.0.norm.bias": "model-00001-of-00002.safetensors",
917
+ "vlm.model.visual.deepstack_merger_list.0.norm.weight": "model-00001-of-00002.safetensors",
918
+ "vlm.model.visual.deepstack_merger_list.1.linear_fc1.bias": "model-00001-of-00002.safetensors",
919
+ "vlm.model.visual.deepstack_merger_list.1.linear_fc1.weight": "model-00001-of-00002.safetensors",
920
+ "vlm.model.visual.deepstack_merger_list.1.linear_fc2.bias": "model-00001-of-00002.safetensors",
921
+ "vlm.model.visual.deepstack_merger_list.1.linear_fc2.weight": "model-00001-of-00002.safetensors",
922
+ "vlm.model.visual.deepstack_merger_list.1.norm.bias": "model-00001-of-00002.safetensors",
923
+ "vlm.model.visual.deepstack_merger_list.1.norm.weight": "model-00001-of-00002.safetensors",
924
+ "vlm.model.visual.deepstack_merger_list.2.linear_fc1.bias": "model-00001-of-00002.safetensors",
925
+ "vlm.model.visual.deepstack_merger_list.2.linear_fc1.weight": "model-00001-of-00002.safetensors",
926
+ "vlm.model.visual.deepstack_merger_list.2.linear_fc2.bias": "model-00001-of-00002.safetensors",
927
+ "vlm.model.visual.deepstack_merger_list.2.linear_fc2.weight": "model-00001-of-00002.safetensors",
928
+ "vlm.model.visual.deepstack_merger_list.2.norm.bias": "model-00001-of-00002.safetensors",
929
+ "vlm.model.visual.deepstack_merger_list.2.norm.weight": "model-00001-of-00002.safetensors",
930
+ "vlm.model.visual.merger.linear_fc1.bias": "model-00001-of-00002.safetensors",
931
+ "vlm.model.visual.merger.linear_fc1.weight": "model-00001-of-00002.safetensors",
932
+ "vlm.model.visual.merger.linear_fc2.bias": "model-00001-of-00002.safetensors",
933
+ "vlm.model.visual.merger.linear_fc2.weight": "model-00001-of-00002.safetensors",
934
+ "vlm.model.visual.merger.norm.bias": "model-00001-of-00002.safetensors",
935
+ "vlm.model.visual.merger.norm.weight": "model-00001-of-00002.safetensors",
936
+ "vlm.model.visual.patch_embed.proj.bias": "model-00001-of-00002.safetensors",
937
+ "vlm.model.visual.patch_embed.proj.weight": "model-00001-of-00002.safetensors",
938
+ "vlm.model.visual.pos_embed.weight": "model-00001-of-00002.safetensors"
939
+ }
940
+ }
modeling_mibot.py ADDED
@@ -0,0 +1,1877 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright (C) 2026 Xiaomi Corporation.
3
+ # Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ from collections.abc import Callable
18
+ from dataclasses import dataclass
19
+ from typing import Any, Optional, Union
20
+ import math
21
+ import hashlib
22
+
23
+ import torch
24
+ import torch.nn as nn
25
+ import torch.nn.functional as F
26
+
27
+ from transformers.activations import ACT2FN
28
+ from transformers.cache_utils import Cache, DynamicCache
29
+ from transformers.generation import GenerationMixin
30
+ from transformers.integrations import use_kernel_forward_from_hub
31
+ from transformers.masking_utils import create_causal_mask
32
+ from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
33
+ from transformers.modeling_layers import GradientCheckpointingLayer
34
+ from transformers.modeling_outputs import BaseModelOutputWithPast, ModelOutput
35
+ from transformers.modeling_rope_utils import dynamic_rope_update, ROPE_INIT_FUNCTIONS
36
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
37
+ from transformers.processing_utils import Unpack
38
+ from transformers.utils import is_torchdynamo_compiling, TransformersKwargs
39
+
40
+ from .configuration_mibot import MiBoTConfig, DiTConfig, Qwen3VLConfig, Qwen3VLTextConfig, Qwen3VLVisionConfig
41
+
42
+
43
+ class Qwen3VLVisionMLP(nn.Module):
44
+ def __init__(self, config):
45
+ super().__init__()
46
+ self.hidden_size = config.hidden_size
47
+ self.intermediate_size = config.intermediate_size
48
+ self.linear_fc1 = nn.Linear(self.hidden_size, self.intermediate_size, bias=True)
49
+ self.linear_fc2 = nn.Linear(self.intermediate_size, self.hidden_size, bias=True)
50
+ self.act_fn = ACT2FN[config.hidden_act]
51
+
52
+ def forward(self, hidden_state):
53
+ return self.linear_fc2(self.act_fn(self.linear_fc1(hidden_state)))
54
+
55
+
56
+ class Qwen3VLVisionPatchEmbed(nn.Module):
57
+ def __init__(self, config) -> None:
58
+ super().__init__()
59
+ self.patch_size = config.patch_size
60
+ self.temporal_patch_size = config.temporal_patch_size
61
+ self.in_channels = config.in_channels
62
+ self.embed_dim = config.hidden_size
63
+
64
+ kernel_size = [self.temporal_patch_size, self.patch_size, self.patch_size]
65
+ self.proj = nn.Conv3d(self.in_channels, self.embed_dim, kernel_size=kernel_size, stride=kernel_size, bias=True)
66
+
67
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
68
+ target_dtype = self.proj.weight.dtype
69
+ hidden_states = hidden_states.view(
70
+ -1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size
71
+ )
72
+ hidden_states = self.proj(hidden_states.to(dtype=target_dtype)).view(-1, self.embed_dim)
73
+ return hidden_states
74
+
75
+
76
+ class Qwen3VLVisionRotaryEmbedding(nn.Module):
77
+ inv_freq: torch.Tensor # fix linting for `register_buffer`
78
+
79
+ def __init__(self, dim: int, theta: float = 10000.0) -> None:
80
+ super().__init__()
81
+ inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
82
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
83
+
84
+ def forward(self, seqlen: int) -> torch.Tensor:
85
+ seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
86
+ freqs = torch.outer(seq, self.inv_freq)
87
+ return freqs
88
+
89
+
90
+ class Qwen3VLVisionPatchMerger(nn.Module):
91
+ def __init__(self, config: Qwen3VLVisionConfig, use_postshuffle_norm=False) -> None:
92
+ super().__init__()
93
+ self.hidden_size = config.hidden_size * (config.spatial_merge_size**2)
94
+ self.use_postshuffle_norm = use_postshuffle_norm
95
+ self.norm = nn.LayerNorm(self.hidden_size if use_postshuffle_norm else config.hidden_size, eps=1e-6)
96
+ self.linear_fc1 = nn.Linear(self.hidden_size, self.hidden_size)
97
+ self.act_fn = nn.GELU()
98
+ self.linear_fc2 = nn.Linear(self.hidden_size, config.out_hidden_size)
99
+
100
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
101
+ x = self.norm(x.view(-1, self.hidden_size) if self.use_postshuffle_norm else x).view(-1, self.hidden_size)
102
+ x = self.linear_fc2(self.act_fn(self.linear_fc1(x)))
103
+ return x
104
+
105
+
106
+ def rotate_half(x):
107
+ """Rotates half the hidden dims of the input."""
108
+ x1 = x[..., : x.shape[-1] // 2]
109
+ x2 = x[..., x.shape[-1] // 2 :]
110
+ return torch.cat((-x2, x1), dim=-1)
111
+
112
+
113
def apply_rotary_pos_emb_vision(
    q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
) -> tuple[torch.Tensor, torch.Tensor]:
    """Apply rotary position embeddings to vision query/key tensors.

    The rotation is computed in float32 for numerical stability and the
    results are cast back to the inputs' original dtypes.
    """
    q_dtype, k_dtype = q.dtype, k.dtype
    # Insert a head axis on cos/sin so they broadcast over attention heads.
    cos_f = cos.unsqueeze(-2).float()
    sin_f = sin.unsqueeze(-2).float()
    q_f, k_f = q.float(), k.float()
    rotated_q = q_f * cos_f + rotate_half(q_f) * sin_f
    rotated_k = k_f * cos_f + rotate_half(k_f) * sin_f
    return rotated_q.to(q_dtype), rotated_k.to(k_dtype)
125
+
126
+
127
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    if n_rep == 1:
        # Nothing to repeat — return the input unchanged.
        return hidden_states
    batch, num_kv_heads, seq_len, head_dim = hidden_states.shape
    # expand() is a view (no copy); reshape() materializes the repeats.
    expanded = hidden_states[:, :, None, :, :].expand(batch, num_kv_heads, n_rep, seq_len, head_dim)
    return expanded.reshape(batch, num_kv_heads * n_rep, seq_len, head_dim)
137
+
138
+
139
def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs: Unpack[TransformersKwargs],
):
    """Reference (non-fused) scaled dot-product attention with GQA support.

    Returns the attended output in (batch, seq, heads, head_dim) layout plus
    the post-dropout attention probabilities.
    """
    # Expand grouped KV heads so they line up with the query heads.
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    scores = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        # Mask may cover more key positions than present (e.g. static cache);
        # slice it down to the actual key length before adding.
        scores = scores + attention_mask[:, :, :, : key_states.shape[-2]]

    # Softmax in float32, then cast back to the query dtype.
    probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
    probs = nn.functional.dropout(probs, p=dropout, training=module.training)
    attn_output = torch.matmul(probs, value_states).transpose(1, 2).contiguous()

    return attn_output, probs
163
+
164
+
165
class Qwen3VLVisionAttention(nn.Module):
    """Self-attention over the packed (unbatched) vision patch sequence.

    `hidden_states` is (seq_len, hidden_size): patches of all images/frames
    are packed back-to-back, with `cu_seqlens` marking the boundaries.
    """

    def __init__(self, config: Qwen3VLVisionConfig) -> None:
        super().__init__()
        self.dim = config.hidden_size
        self.num_heads = config.num_heads
        self.head_dim = self.dim // self.num_heads
        self.num_key_value_groups = 1  # needed for eager attention
        # Single fused projection producing q, k and v in one matmul.
        self.qkv = nn.Linear(self.dim, self.dim * 3, bias=True)
        self.proj = nn.Linear(self.dim, self.dim)
        self.scaling = self.head_dim**-0.5
        self.config = config
        self.attention_dropout = 0.0
        self.is_causal = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        cu_seqlens: torch.Tensor,
        rotary_pos_emb: Optional[torch.Tensor] = None,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs,
    ) -> torch.Tensor:
        """Attend within each image/frame segment delimited by `cu_seqlens`.

        Args:
            hidden_states: packed patch features, shape (seq_len, hidden_size).
            cu_seqlens: cumulative segment lengths, first element 0.
            rotary_pos_emb: accepted but unused here; the rotation is applied
                via the precomputed `position_embeddings` (cos, sin) instead.
            position_embeddings: (cos, sin) tables for rotary embedding.
        """
        seq_length = hidden_states.shape[0]
        # (seq, 3, heads, head_dim) -> three (seq, heads, head_dim) tensors.
        query_states, key_states, value_states = (
            self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
        )
        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb_vision(query_states, key_states, cos, sin)

        # Add a batch dim and put heads first: (1, heads, seq, head_dim).
        query_states = query_states.transpose(0, 1).unsqueeze(0)
        key_states = key_states.transpose(0, 1).unsqueeze(0)
        value_states = value_states.transpose(0, 1).unsqueeze(0)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        if self.config._attn_implementation == "flash_attention_2":
            # Flash Attention 2: Use cu_seqlens for variable length attention
            max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
            attn_output, _ = attention_interface(
                self,
                query_states,
                key_states,
                value_states,
                attention_mask=None,
                scaling=self.scaling,
                dropout=0.0 if not self.training else self.attention_dropout,
                cu_seq_lens_q=cu_seqlens,
                cu_seq_lens_k=cu_seqlens,
                max_length_q=max_seqlen,
                max_length_k=max_seqlen,
                is_causal=False,
                **kwargs,
            )
        else:
            # Other implementations: Process each chunk separately
            lengths = cu_seqlens[1:] - cu_seqlens[:-1]
            splits = [
                torch.split(tensor, lengths.tolist(), dim=2) for tensor in (query_states, key_states, value_states)
            ]

            # Run attention per segment so patches never attend across images.
            attn_outputs = [
                attention_interface(
                    self,
                    q,
                    k,
                    v,
                    attention_mask=None,
                    scaling=self.scaling,
                    dropout=0.0 if not self.training else self.attention_dropout,
                    is_causal=False,
                    **kwargs,
                )[0]
                for q, k, v in zip(*splits)
            ]
            attn_output = torch.cat(attn_outputs, dim=1)

        # Collapse heads back to (seq_len, hidden_size) and project out.
        attn_output = attn_output.reshape(seq_length, -1).contiguous()
        attn_output = self.proj(attn_output)
        return attn_output
246
+
247
+
248
class Qwen3VLVisionBlock(GradientCheckpointingLayer):
    """Standard pre-norm transformer block of the vision encoder:
    residual self-attention followed by a residual MLP."""

    def __init__(self, config, attn_implementation: str = "sdpa") -> None:
        super().__init__()
        self.norm1 = nn.LayerNorm(config.hidden_size, eps=1e-6)
        self.norm2 = nn.LayerNorm(config.hidden_size, eps=1e-6)
        self.attn = Qwen3VLVisionAttention(config=config)
        self.mlp = Qwen3VLVisionMLP(config=config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        cu_seqlens: torch.Tensor,
        rotary_pos_emb: Optional[torch.Tensor] = None,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs,
    ) -> torch.Tensor:
        """Apply pre-normed attention then pre-normed MLP, each with a residual."""
        attn_out = self.attn(
            self.norm1(hidden_states),
            cu_seqlens=cu_seqlens,
            rotary_pos_emb=rotary_pos_emb,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = hidden_states + attn_out
        return hidden_states + self.mlp(self.norm2(hidden_states))
273
+
274
+
275
class Qwen3VLTextRotaryEmbedding(nn.Module):
    """Rotary embedding with interleaved multimodal RoPE (MRoPE).

    Position ids carry three channels (temporal, height, width); their
    frequencies are interleaved along the head dimension according to
    ``mrope_section`` before the cos/sin tables are built.
    """

    inv_freq: torch.Tensor  # fix linting for `register_buffer`

    def __init__(self, config: Qwen3VLTextConfig, device=None):
        super().__init__()
        # `rope_scaling` may be missing or None; both mean "default" rope.
        rope_scaling = getattr(config, "rope_scaling", None)
        if rope_scaling is not None:
            self.rope_type = rope_scaling.get("rope_type", "default")
        else:
            self.rope_type = "default"
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

        # BUGFIX: this previously called `config.rope_scaling.get(...)`
        # unconditionally and raised AttributeError when `rope_scaling` was
        # None, even though the branch above explicitly supports that case.
        # Fall back to the default mrope split instead.
        self.mrope_section = (rope_scaling or {}).get("mrope_section", [24, 20, 20])

    def apply_interleaved_mrope(self, freqs, mrope_section):
        """Apply interleaved MRoPE to 3D rotary embeddings.
        Reorganizes frequency layout from chunked [TTT...HHH...WWW] to
        interleaved [THTHWHTHW...TT], preserving frequency continuity.
        args:
            freqs: (3, bs, seq_len, head_dim // 2)
            mrope_section: (3,)
        returns:
            freqs_t: (bs, seq_len, head_dim // 2)
        """
        freqs_t = freqs[0]  # start from the temporal channel (T)
        # Overwrite every 3rd frequency slot (offset 1 -> H, offset 2 -> W)
        # within each channel's allotted section.
        for dim, offset in enumerate((1, 2), start=1):  # H, W
            length = mrope_section[dim] * 3
            idx = slice(offset, length, 3)
            freqs_t[..., idx] = freqs[dim, ..., idx]
        return freqs_t

    @torch.no_grad()
    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
    def forward(self, x, position_ids):
        """Return (cos, sin) tables for the given 2D or 3D position ids.

        `x` is only used for its device and dtype.
        """
        # In contrast to other models, Qwen3VL has different position ids for the grids
        # So we expand the inv_freq to shape (3, ...)
        if position_ids.ndim == 2:
            position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)
        inv_freq_expanded = self.inv_freq[None, None, :, None].float().expand(3, position_ids.shape[1], -1, 1)
        position_ids_expanded = position_ids[:, :, None, :].float()  # shape (3, bs, 1, positions)

        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
            freqs = self.apply_interleaved_mrope(freqs, self.mrope_section)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
332
+
333
+
334
@use_kernel_forward_from_hub("RMSNorm")
class Qwen3VLTextRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps: float = 1e-6) -> None:
        """
        Qwen3VLTextRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # RMS statistics are computed in float32 for numerical stability,
        # then the result is cast back to the input dtype.
        original_dtype = hidden_states.dtype
        states = hidden_states.to(torch.float32)
        mean_square = states.pow(2).mean(-1, keepdim=True)
        normalized = states * torch.rsqrt(mean_square + self.variance_epsilon)
        return self.weight * normalized.to(original_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
353
+
354
+
355
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            Dimension along which `cos` and `sin` are unsqueezed so that they
            broadcast against q and k. Use 1 when q/k are laid out as
            [batch, heads, seq, head_dim]; use 2 for [batch, seq, heads, head_dim].
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos_b = cos.unsqueeze(unsqueeze_dim)
    sin_b = sin.unsqueeze(unsqueeze_dim)
    rotated_q = q * cos_b + rotate_half(q) * sin_b
    rotated_k = k * cos_b + rotate_half(k) * sin_b
    return rotated_q, rotated_k
380
+
381
+
382
class Qwen3VLTextAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: Qwen3VLTextConfig, layer_idx: int):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        # GQA: number of query heads that share each key/value head.
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.scaling = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        self.is_causal = True

        self.q_proj = nn.Linear(
            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
        )
        self.k_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.v_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.o_proj = nn.Linear(
            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
        )
        self.q_norm = Qwen3VLTextRMSNorm(self.head_dim, eps=config.rms_norm_eps)  # unlike olmo, only on the head dim!
        self.k_norm = Qwen3VLTextRMSNorm(
            self.head_dim, eps=config.rms_norm_eps
        )  # thus post q_norm does not need reshape

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_values: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Project, QK-normalize, apply RoPE, update the KV cache if present,
        then attend with the configured backend.

        Returns:
            (attn_output, attn_weights) — weights may be None for fused backends.
        """
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        # Per-head RMSNorm on q/k before RoPE; layout becomes (batch, heads, seq, head_dim).
        query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
        key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # Default to the eager reference; swap in the configured fused backend.
        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            **kwargs,
        )

        # Merge heads back and project to the model dimension.
        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights
454
+
455
+
456
class Qwen3VLTextMLP(nn.Module):
    """Gated (SwiGLU-style) feed-forward block: down(act(gate(x)) * up(x))."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        # Element-wise gate in the intermediate space, then project back down.
        gated = self.act_fn(self.gate_proj(x)) * self.up_proj(x)
        return self.down_proj(gated)
470
+
471
+
472
class Qwen3VLTextDecoderLayer(GradientCheckpointingLayer):
    """One decoder layer: pre-norm self-attention followed by a pre-norm
    gated MLP, each wrapped in a residual connection."""

    def __init__(self, config: Qwen3VLTextConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = Qwen3VLTextAttention(config=config, layer_idx=layer_idx)

        self.mlp = Qwen3VLTextMLP(config)
        self.input_layernorm = Qwen3VLTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Qwen3VLTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        """Run the attention sub-block and then the MLP sub-block."""
        # Self-attention with residual around the pre-normed input.
        attn_out, _ = self.self_attn(
            hidden_states=self.input_layernorm(hidden_states),
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = hidden_states + attn_out

        # Feed-forward with residual around the pre-normed input.
        return hidden_states + self.mlp(self.post_attention_layernorm(hidden_states))
515
+
516
+
517
@dataclass
class Qwen3VLModelOutputWithPast(ModelOutput):
    r"""
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
        The rope index difference between sequence length and multimodal rope.
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Cache] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
    # Per-batch offset between sequence length and multimodal rope (see docstring).
    rope_deltas: Optional[torch.LongTensor] = None
    # NOTE(review): presumably the expanded multimodal (t/h/w) position ids
    # computed during the forward pass — confirm against the model's forward.
    position_ids: Optional[torch.LongTensor] = None
535
+
536
+
537
class Qwen3VLPreTrainedModel(PreTrainedModel):
    """Base class for all Qwen3VL models; declares framework capabilities
    (attention backends, gradient checkpointing, sharding constraints)."""

    config: Qwen3VLConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    # Modules that must never be split across devices when sharding.
    _no_split_modules = ["Qwen3VLTextDecoderLayer", "Qwen3VLVisionBlock"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn = True
    _supports_sdpa = True

    _can_compile_fullgraph = True
    _supports_attention_backend = True
    # Module types whose outputs populate `hidden_states` / `attentions`.
    _can_record_outputs = {
        "hidden_states": Qwen3VLTextDecoderLayer,
        "attentions": Qwen3VLTextAttention,
    }
552
+
553
+
554
class Qwen3VLVisionModel(Qwen3VLPreTrainedModel):
    """Vision tower: patch embedding + interpolated absolute position
    embedding + rotary-attention blocks, with "deepstack" features tapped
    from intermediate layers for later injection into the language model.
    """

    config: Qwen3VLVisionConfig
    _no_split_modules = ["Qwen3VLVisionBlock"]

    def __init__(self, config, *inputs, **kwargs) -> None:
        super().__init__(config, *inputs, **kwargs)
        self.spatial_merge_size = config.spatial_merge_size
        self.patch_size = config.patch_size
        # Number of patches merged into one output token (merge_size^2).
        self.spatial_merge_unit = self.spatial_merge_size * self.spatial_merge_size

        self.patch_embed = Qwen3VLVisionPatchEmbed(
            config=config,
        )

        # Learned absolute position embeddings on a square grid of side
        # `num_grid_per_side`; bilinearly resampled per input resolution.
        self.pos_embed = nn.Embedding(config.num_position_embeddings, config.hidden_size)
        self.num_grid_per_side = int(config.num_position_embeddings**0.5)

        head_dim = config.hidden_size // config.num_heads
        self.rotary_pos_emb = Qwen3VLVisionRotaryEmbedding(head_dim // 2)

        self.blocks = nn.ModuleList([Qwen3VLVisionBlock(config) for _ in range(config.depth)])
        self.merger = Qwen3VLVisionPatchMerger(
            config=config,
            use_postshuffle_norm=False,
        )

        # One extra merger per tapped intermediate layer (deepstack features).
        self.deepstack_visual_indexes = config.deepstack_visual_indexes
        self.deepstack_merger_list = nn.ModuleList(
            [
                Qwen3VLVisionPatchMerger(
                    config=config,
                    use_postshuffle_norm=True,
                )
                for _ in range(len(config.deepstack_visual_indexes))
            ]
        )

        self.gradient_checkpointing = False

    def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor:
        """Build per-patch rotary angles from each item's (t, h, w) grid.

        (row, col) coordinates are generated in merged-block order (all
        patches of one merge block are consecutive), repeated across frames,
        and looked up in a shared frequency table; the 2D lookup is then
        flattened to (total_tokens, head_dim // 2).
        """
        merge_size = self.spatial_merge_size

        # One shared frequency table covering the largest spatial extent.
        max_hw = int(grid_thw[:, 1:].max().item())
        freq_table = self.rotary_pos_emb(max_hw)  # (max_hw, dim // 2)
        device = freq_table.device

        total_tokens = int(torch.prod(grid_thw, dim=1).sum().item())
        pos_ids = torch.empty((total_tokens, 2), dtype=torch.long, device=device)

        offset = 0
        for num_frames, height, width in grid_thw:
            merged_h, merged_w = height // merge_size, width // merge_size

            block_rows = torch.arange(merged_h, device=device)  # block row indices
            block_cols = torch.arange(merged_w, device=device)  # block col indices
            intra_row = torch.arange(merge_size, device=device)  # intra-block row offsets
            intra_col = torch.arange(merge_size, device=device)  # intra-block col offsets

            # Compute full-resolution positions
            row_idx = block_rows[:, None, None, None] * merge_size + intra_row[None, None, :, None]
            col_idx = block_cols[None, :, None, None] * merge_size + intra_col[None, None, None, :]

            row_idx = row_idx.expand(merged_h, merged_w, merge_size, merge_size).reshape(-1)
            col_idx = col_idx.expand(merged_h, merged_w, merge_size, merge_size).reshape(-1)

            coords = torch.stack((row_idx, col_idx), dim=-1)

            # All frames of a video share the same spatial coordinates.
            if num_frames > 1:
                coords = coords.repeat(num_frames, 1)

            num_tokens = coords.shape[0]
            pos_ids[offset : offset + num_tokens] = coords
            offset += num_tokens

        embeddings = freq_table[pos_ids]  # lookup rotary embeddings
        embeddings = embeddings.flatten(1)
        return embeddings

    def fast_pos_embed_interpolate(self, grid_thw):
        """Bilinearly resample the learned square position-embedding grid to
        each item's (h, w) resolution, then reorder to merged-block order and
        repeat across frames.
        """
        grid_ts, grid_hs, grid_ws = grid_thw[:, 0], grid_thw[:, 1], grid_thw[:, 2]

        # Four bilinear corners per target location:
        # (floor,floor), (floor,ceil), (ceil,floor), (ceil,ceil).
        idx_list = [[] for _ in range(4)]
        weight_list = [[] for _ in range(4)]

        for t, h, w in zip(grid_ts, grid_hs, grid_ws):
            h_idxs = torch.linspace(0, self.num_grid_per_side - 1, h)
            w_idxs = torch.linspace(0, self.num_grid_per_side - 1, w)

            h_idxs_floor = h_idxs.int()
            w_idxs_floor = w_idxs.int()
            h_idxs_ceil = (h_idxs.int() + 1).clip(max=self.num_grid_per_side - 1)
            w_idxs_ceil = (w_idxs.int() + 1).clip(max=self.num_grid_per_side - 1)

            # Fractional parts = interpolation weights along each axis.
            dh = h_idxs - h_idxs_floor
            dw = w_idxs - w_idxs_floor

            # Row index -> flat index into the (side x side) embedding table.
            base_h = h_idxs_floor * self.num_grid_per_side
            base_h_ceil = h_idxs_ceil * self.num_grid_per_side

            indices = [
                (base_h[None].T + w_idxs_floor[None]).flatten(),
                (base_h[None].T + w_idxs_ceil[None]).flatten(),
                (base_h_ceil[None].T + w_idxs_floor[None]).flatten(),
                (base_h_ceil[None].T + w_idxs_ceil[None]).flatten(),
            ]

            weights = [
                ((1 - dh)[None].T * (1 - dw)[None]).flatten(),
                ((1 - dh)[None].T * dw[None]).flatten(),
                (dh[None].T * (1 - dw)[None]).flatten(),
                (dh[None].T * dw[None]).flatten(),
            ]

            for i in range(4):
                idx_list[i].extend(indices[i].tolist())
                weight_list[i].extend(weights[i].tolist())

        idx_tensor = torch.tensor(idx_list, dtype=torch.long, device=self.pos_embed.weight.device)
        weight_tensor = torch.tensor(
            weight_list, dtype=self.pos_embed.weight.dtype, device=self.pos_embed.weight.device
        )
        # Weighted sum of the four corner embeddings = bilinear interpolation.
        pos_embeds = self.pos_embed(idx_tensor) * weight_tensor[:, :, None]
        patch_pos_embeds = pos_embeds[0] + pos_embeds[1] + pos_embeds[2] + pos_embeds[3]

        patch_pos_embeds = patch_pos_embeds.split([h * w for h, w in zip(grid_hs, grid_ws)])

        # Reorder from raster order to merged-block order, repeating per frame.
        patch_pos_embeds_permute = []
        merge_size = self.config.spatial_merge_size
        for pos_embed, t, h, w in zip(patch_pos_embeds, grid_ts, grid_hs, grid_ws):
            pos_embed = pos_embed.repeat(t, 1)
            pos_embed = (
                pos_embed.view(t, h // merge_size, merge_size, w // merge_size, merge_size, -1)
                .permute(0, 1, 3, 2, 4, 5)
                .flatten(0, 4)
            )
            patch_pos_embeds_permute.append(pos_embed)
        patch_pos_embeds = torch.cat(patch_pos_embeds_permute)
        return patch_pos_embeds

    def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs) -> torch.Tensor:
        """
        Args:
            hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
                Flattened image/video patches to be embedded (they are passed
                through `patch_embed` first), packed across all inputs.
            grid_thw (`torch.Tensor` of shape `(num_images_or_videos, 3)`):
                The temporal, height and width of feature shape of each image in LLM.

        Returns:
            `tuple`: the merged final hidden states and the list of deepstack
            features tapped from intermediate blocks.
        """
        hidden_states = self.patch_embed(hidden_states)

        # Absolute position embeddings, resampled to each input resolution.
        pos_embeds = self.fast_pos_embed_interpolate(grid_thw)
        hidden_states = hidden_states + pos_embeds

        rotary_pos_emb = self.rot_pos_emb(grid_thw)

        seq_len, _ = hidden_states.size()
        hidden_states = hidden_states.reshape(seq_len, -1)
        rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1)
        # Duplicate the angles so cos/sin cover the full head dimension.
        emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
        position_embeddings = (emb.cos(), emb.sin())

        cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
            dim=0,
            # Select dtype based on the following factors:
            #  - FA2 requires that cu_seqlens_q must have dtype int32
            #  - torch.onnx.export requires that cu_seqlens_q must have same dtype as grid_thw
            # See https://github.com/huggingface/transformers/pull/34852 for more information
            dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
        )
        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)

        deepstack_feature_lists = []
        for layer_num, blk in enumerate(self.blocks):
            hidden_states = blk(
                hidden_states,
                cu_seqlens=cu_seqlens,
                position_embeddings=position_embeddings,
                **kwargs,
            )
            # Tap and merge intermediate features at the configured layers.
            if layer_num in self.deepstack_visual_indexes:
                deepstack_feature = self.deepstack_merger_list[self.deepstack_visual_indexes.index(layer_num)](
                    hidden_states
                )
                deepstack_feature_lists.append(deepstack_feature)

        hidden_states = self.merger(hidden_states)

        return hidden_states, deepstack_feature_lists
744
+
745
+
746
class Qwen3VLTextModel(Qwen3VLPreTrainedModel):
    """Decoder-only text backbone of Qwen3VL.

    Besides standard causal decoding, it accepts "deepstack" visual features
    that are added onto the hidden states of the first few decoder layers at
    the visual token positions.
    """

    config: Qwen3VLTextConfig
    _no_split_modules = ["Qwen3VLTextDecoderLayer"]

    def __init__(self, config: Qwen3VLTextConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [Qwen3VLTextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = Qwen3VLTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = Qwen3VLTextRotaryEmbedding(config=config)
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        # args for deepstack
        visual_pos_masks: Optional[torch.Tensor] = None,
        deepstack_visual_embeds: Optional[list[torch.Tensor]] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[tuple, BaseModelOutputWithPast]:
        r"""
        visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, *optional*):
            The mask of the visual positions.
        deepstack_visual_embeds (`list[torch.Tensor]`, *optional*):
            The deepstack visual embeddings. The shape is (num_layers, visual_seqlen, embed_dim).
            The feature is extracted from the different visual encoder layers, and fed to the decoder
            hidden states. It's from the paper DeepStack(https://arxiv.org/abs/2406.04334).
        """
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        # torch.jit.trace() doesn't support cache objects in the output
        if use_cache and past_key_values is None and not torch.jit.is_tracing():
            past_key_values = DynamicCache(config=self.config)

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        # the hard coded `3` is for temporal, height and width.
        if position_ids is None:
            position_ids = cache_position.view(1, 1, -1).expand(3, inputs_embeds.shape[0], -1)
        elif position_ids.ndim == 2:
            position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)

        # With 4 rows, row 0 carries the plain text positions (used for the
        # causal mask and per-layer position ids) and rows 1:4 the t/h/w
        # channels used for mrope; with 3 rows, row 0 serves both purposes.
        if position_ids.ndim == 3 and position_ids.shape[0] == 4:
            text_position_ids = position_ids[0]
            position_ids = position_ids[1:]
        else:
            text_position_ids = position_ids[0]

        attention_mask = create_causal_mask(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
            position_ids=text_position_ids,
        )

        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        # decoder layers
        for layer_idx, decoder_layer in enumerate(self.layers):
            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                position_ids=text_position_ids,
                past_key_values=past_key_values,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **kwargs,
            )
            hidden_states = layer_outputs

            # add visual features to the hidden states of first several layers
            if deepstack_visual_embeds is not None and layer_idx in range(len(deepstack_visual_embeds)):
                hidden_states = self._deepstack_process(
                    hidden_states,
                    visual_pos_masks,
                    deepstack_visual_embeds[layer_idx],
                )

        hidden_states = self.norm(hidden_states)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
        )

    def _deepstack_process(
        self, hidden_states: torch.Tensor, visual_pos_masks: torch.Tensor, visual_embeds: torch.Tensor
    ):
        # Add the layer's visual features onto the hidden states at the
        # masked (visual token) positions; other positions are untouched.
        visual_pos_masks = visual_pos_masks.to(hidden_states.device)
        visual_embeds = visual_embeds.to(hidden_states.device, hidden_states.dtype)
        local_this = hidden_states[visual_pos_masks, :].clone() + visual_embeds
        hidden_states[visual_pos_masks, :] = local_this
        return hidden_states
866
+
867
+
868
class Qwen3VLModel(Qwen3VLPreTrainedModel):
    """Bare Qwen3-VL model: vision tower + text decoder, without an LM head.

    Projected image/video patch embeddings are scattered into the text embedding
    sequence at placeholder-token positions, and per-layer "deepstack" visual
    features are forwarded to the language model alongside the merged sequence.
    """

    base_model_prefix = ""
    _checkpoint_conversion_mapping = {}
    # Reference: fix gemma3 grad acc #37208
    accepts_loss_kwargs = False
    config: Qwen3VLConfig
    _no_split_modules = ["Qwen3VLTextDecoderLayer", "Qwen3VLVisionBlock"]

    def __init__(self, config):
        super().__init__(config)
        self.visual = Qwen3VLVisionModel._from_config(config.vision_config)
        self.language_model = Qwen3VLTextModel._from_config(config.text_config)
        self.rope_deltas = None  # cache rope_deltas here

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def set_decoder(self, decoder):
        self.language_model = decoder

    def get_decoder(self):
        return self.language_model

    def get_rope_index(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
        video_grid_thw: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Different from the original implementation, Qwen3VL use timestamps rather than absolute time position ids.

        Returns 3D (temporal/height/width) M-RoPE position ids of shape
        ``(3, batch, seq)`` plus per-sample ``mrope_position_deltas`` of shape
        ``(batch, 1)`` used to continue positions during decoding.
        """

        # Since we use timestamps to separate videos, like <t1> <vision_start> <frame1> <vision_end> <t2> <vision_start> <frame2> <vision_end>, the video_grid_thw should also be split
        if video_grid_thw is not None:
            # Expand each video into per-frame entries with t == 1.
            video_grid_thw = torch.repeat_interleave(video_grid_thw, video_grid_thw[:, 0], dim=0)
            video_grid_thw[:, 0] = 1

        spatial_merge_size = self.config.vision_config.spatial_merge_size
        image_token_id = self.config.image_token_id
        video_token_id = self.config.video_token_id
        vision_start_token_id = self.config.vision_start_token_id
        mrope_position_deltas = []
        if input_ids is not None and (image_grid_thw is not None or video_grid_thw is not None):
            total_input_ids = input_ids
            if attention_mask is None:
                attention_mask = torch.ones_like(total_input_ids)
            position_ids = torch.ones(
                3,
                input_ids.shape[0],
                input_ids.shape[1],
                dtype=input_ids.dtype,
                device=input_ids.device,
            )
            image_index, video_index = 0, 0
            attention_mask = attention_mask.to(total_input_ids.device)
            # Per-sample scan: walk the sequence, alternating between runs of
            # text tokens (1D positions replicated over the 3 axes) and vision
            # blocks (t/h/w grid positions).
            for i, input_ids in enumerate(total_input_ids):
                input_ids = input_ids[attention_mask[i] == 1]
                image_nums, video_nums = 0, 0
                vision_start_indices = torch.argwhere(input_ids == vision_start_token_id).squeeze(1)
                # The token right after <vision_start> tells image vs video.
                vision_tokens = input_ids[vision_start_indices + 1]
                image_nums = (vision_tokens == image_token_id).sum()
                video_nums = (vision_tokens == video_token_id).sum()
                input_tokens = input_ids.tolist()
                llm_pos_ids_list: list = []
                st = 0
                remain_images, remain_videos = image_nums, video_nums
                for _ in range(image_nums + video_nums):
                    # Find the next image/video placeholder; sentinel past-the-end
                    # when none remain of that modality.
                    if image_token_id in input_tokens and remain_images > 0:
                        ed_image = input_tokens.index(image_token_id, st)
                    else:
                        ed_image = len(input_tokens) + 1
                    if video_token_id in input_tokens and remain_videos > 0:
                        ed_video = input_tokens.index(video_token_id, st)
                    else:
                        ed_video = len(input_tokens) + 1
                    if ed_image < ed_video:
                        t, h, w = (
                            image_grid_thw[image_index][0],
                            image_grid_thw[image_index][1],
                            image_grid_thw[image_index][2],
                        )
                        image_index += 1
                        remain_images -= 1
                        ed = ed_image

                    else:
                        t, h, w = (
                            video_grid_thw[video_index][0],
                            video_grid_thw[video_index][1],
                            video_grid_thw[video_index][2],
                        )
                        video_index += 1
                        remain_videos -= 1
                        ed = ed_video
                    # Spatial dims shrink by the merge factor; t stays as-is.
                    llm_grid_t, llm_grid_h, llm_grid_w = (
                        t.item(),
                        h.item() // spatial_merge_size,
                        w.item() // spatial_merge_size,
                    )
                    text_len = ed - st

                    st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
                    llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)

                    # t_index is always 0 because llm_grid_t is always 1 (we use timestamps to encode the temporal information for videos)
                    t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()
                    h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
                    w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
                    llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
                    st = ed + llm_grid_t * llm_grid_h * llm_grid_w

                # Trailing text after the last vision block.
                if st < len(input_tokens):
                    st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
                    text_len = len(input_tokens) - st
                    llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)

                llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
                position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device)
                mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i]))
            mrope_position_deltas = torch.tensor(mrope_position_deltas, device=input_ids.device).unsqueeze(1)
            return position_ids, mrope_position_deltas
        else:
            # Text-only path: plain incremental positions replicated over the
            # three M-RoPE axes.
            if attention_mask is not None:
                position_ids = attention_mask.long().cumsum(-1) - 1
                position_ids.masked_fill_(attention_mask == 0, 1)
                position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device)
                max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0]
                mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
            else:
                position_ids = (
                    torch.arange(input_ids.shape[1], device=input_ids.device)
                    .view(1, 1, -1)
                    .expand(3, input_ids.shape[0], -1)
                )
                mrope_position_deltas = torch.zeros(
                    [input_ids.shape[0], 1],
                    device=input_ids.device,
                    dtype=input_ids.dtype,
                )

            return position_ids, mrope_position_deltas

    def get_video_features(
        self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None
    ):
        """
        Encodes videos into continuous embeddings that can be forwarded to the language model. The deepstack visual features are also returned.

        Args:
            pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input videos.
            video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
                The temporal, height and width of feature shape of each video in LLM.
        """
        # Same implementation as for images
        return self.get_image_features(pixel_values_videos, video_grid_thw)

    def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None):
        """
        Encodes images into continuous embeddings that can be forwarded to the language model. The deepstack visual features are also returned.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
            image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
                The temporal, height and width of feature shape of each image in LLM.
        """
        pixel_values = pixel_values.type(self.visual.dtype)
        image_embeds, deepstack_image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
        # One chunk per image: thw product divided by merge_size^2 merged patches.
        split_sizes = (image_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist()
        image_embeds = torch.split(image_embeds, split_sizes)
        return image_embeds, deepstack_image_embeds

    def get_placeholder_mask(
        self,
        input_ids: torch.LongTensor,
        inputs_embeds: torch.FloatTensor,
        image_features: Optional[torch.FloatTensor] = None,
        video_features: Optional[torch.FloatTensor] = None,
    ):
        """
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        """
        if input_ids is None:
            # No token ids: detect placeholders by comparing each embedding row
            # against the embedding of the special image/video token.
            special_image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)
            special_video_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_video_mask = special_video_mask.all(-1)
        else:
            special_image_mask = input_ids == self.config.image_token_id
            special_video_mask = input_ids == self.config.video_token_id

        n_image_tokens = special_image_mask.sum()
        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        if image_features is not None and inputs_embeds[special_image_mask].numel() != image_features.numel():
            raise ValueError(
                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {image_features.shape[0]}"
            )

        n_video_tokens = special_video_mask.sum()
        special_video_mask = special_video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        if video_features is not None and inputs_embeds[special_video_mask].numel() != video_features.numel():
            raise ValueError(
                f"Videos features and video tokens do not match: tokens: {n_video_tokens}, features {video_features.shape[0]}"
            )

        return special_image_mask, special_video_mask

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        pixel_values: Optional[torch.Tensor] = None,
        pixel_values_videos: Optional[torch.FloatTensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
        video_grid_thw: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, Qwen3VLModelOutputWithPast]:
        r"""
        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
            The temporal, height and width of feature shape of each image in LLM.
        video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
            The temporal, height and width of feature shape of each video in LLM.
        """
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        image_mask = None
        video_mask = None

        if pixel_values is not None:
            # Encode images and scatter their embeddings into the placeholder slots.
            image_embeds, deepstack_image_embeds = self.get_image_features(pixel_values, image_grid_thw)
            image_embeds = torch.cat(image_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
            image_mask, _ = self.get_placeholder_mask(
                input_ids, inputs_embeds=inputs_embeds, image_features=image_embeds
            )
            inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)

        if pixel_values_videos is not None:
            # Same for videos.
            video_embeds, deepstack_video_embeds = self.get_video_features(pixel_values_videos, video_grid_thw)
            video_embeds = torch.cat(video_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
            _, video_mask = self.get_placeholder_mask(
                input_ids, inputs_embeds=inputs_embeds, video_features=video_embeds
            )
            inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)

        visual_pos_masks = None
        deepstack_visual_embeds = None
        if image_mask is not None and video_mask is not None:
            # aggregate visual_pos_masks and deepstack_visual_embeds
            image_mask = image_mask[..., 0]
            video_mask = video_mask[..., 0]
            visual_pos_masks = image_mask | video_mask
            deepstack_visual_embeds = []
            image_mask_joint = image_mask[visual_pos_masks]
            video_mask_joint = video_mask[visual_pos_masks]
            # Interleave per-layer image/video deepstack features into one tensor
            # ordered like the joint visual positions.
            for img_embed, vid_embed in zip(deepstack_image_embeds, deepstack_video_embeds):
                embed_joint = img_embed.new_zeros(visual_pos_masks.sum(), img_embed.shape[-1]).to(img_embed.device)
                embed_joint[image_mask_joint, :] = img_embed
                embed_joint[video_mask_joint, :] = vid_embed
                deepstack_visual_embeds.append(embed_joint)
        elif image_mask is not None:
            image_mask = image_mask[..., 0]
            visual_pos_masks = image_mask
            deepstack_visual_embeds = deepstack_image_embeds
        elif video_mask is not None:
            video_mask = video_mask[..., 0]
            visual_pos_masks = video_mask
            deepstack_visual_embeds = deepstack_video_embeds

        if position_ids is None:
            attention_mask_tensor = (
                attention_mask if not isinstance(attention_mask, dict) else attention_mask["full_attention"]
            )
            if attention_mask_tensor is not None and attention_mask_tensor.ndim == 4:
                # Recover a 2D mask from a 4D additive mask via its diagonal.
                attention_mask_tensor = torch.diagonal(attention_mask_tensor[:, 0], dim1=1, dim2=2)
                # Only apply conversion for floating point tensors (inverted masks)
                if attention_mask_tensor.dtype.is_floating_point:
                    attention_mask_tensor = attention_mask_tensor / torch.finfo(attention_mask_tensor.dtype).min
                    attention_mask_tensor = (1.0 - attention_mask_tensor).int()

            # Calculate RoPE index once per generation in the pre-fill stage only.
            # When compiling, we can't check tensor values thus we check only input length
            # It is safe to assume that `length!=1` means we're in pre-fill because compiled
            # models currently cannot do assisted decoding
            prefill_compiled_stage = is_torchdynamo_compiling() and (
                (input_ids is not None and input_ids.shape[1] != 1)
                or (inputs_embeds is not None and inputs_embeds.shape[1] != 1)
            )
            prefill_noncompiled_stage = not is_torchdynamo_compiling() and (
                (cache_position is not None and cache_position[0] == 0)
                or (past_key_values is None or past_key_values.get_seq_length() == 0)
            )
            if (prefill_compiled_stage or prefill_noncompiled_stage) or self.rope_deltas is None:
                position_ids, rope_deltas = self.get_rope_index(
                    input_ids,
                    image_grid_thw,
                    video_grid_thw,
                    attention_mask=attention_mask_tensor,
                )
                self.rope_deltas = rope_deltas
            # then use the prev pre-calculated rope-deltas to get the correct position ids
            else:
                batch_size, seq_length, _ = inputs_embeds.shape
                delta = (
                    (cache_position[0] + self.rope_deltas).to(inputs_embeds.device) if cache_position is not None else 0
                )
                position_ids = torch.arange(seq_length, device=inputs_embeds.device)
                position_ids = position_ids.view(1, -1).expand(batch_size, -1)
                if cache_position is not None:  # otherwise `deltas` is an int `0`
                    delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
                position_ids = position_ids.add(delta)
                position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)

        outputs = self.language_model(
            input_ids=None,
            position_ids=position_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            cache_position=cache_position,
            visual_pos_masks=visual_pos_masks,
            deepstack_visual_embeds=deepstack_visual_embeds,
            **kwargs,
        )

        return Qwen3VLModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            rope_deltas=self.rope_deltas,
            position_ids=position_ids,
        )
+
1219
+
1220
@dataclass
class Qwen3VLCausalLMOutputWithPast(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
        The rope index difference between sequence length and multimodal rope.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Cache] = None
    # Per-layer hidden states / attention maps when requested by the caller.
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
    rope_deltas: Optional[torch.LongTensor] = None
    # M-RoPE position ids computed in forward; echoed back for downstream reuse.
    position_ids: Optional[torch.LongTensor] = None
    # Attention mask passed through unchanged from the forward call.
    attention_mask: Optional[torch.Tensor] = None
+
1245
+
1246
class Qwen3VLForConditionalGeneration(Qwen3VLPreTrainedModel, GenerationMixin):
    """Qwen3-VL with a causal language-modeling head for conditional generation."""

    _checkpoint_conversion_mapping = {}
    _tied_weights_keys = ["lm_head.weight"]
    # Reference: fix gemma3 grad acc #37208
    accepts_loss_kwargs = False
    config: Qwen3VLConfig

    def __init__(self, config):
        super().__init__(config)
        self.model = Qwen3VLModel(config)
        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)

        self.post_init()

    def get_input_embeddings(self):
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.set_input_embeddings(value)

    def set_decoder(self, decoder):
        self.model.set_decoder(decoder)

    def get_decoder(self):
        return self.model.get_decoder()

    def get_video_features(
        self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None
    ):
        # Thin delegation to the inner Qwen3VLModel.
        return self.model.get_video_features(pixel_values_videos, video_grid_thw)

    def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None):
        # Thin delegation to the inner Qwen3VLModel.
        return self.model.get_image_features(pixel_values, image_grid_thw)

    # Make modules available through conditional class for BC
    @property
    def language_model(self):
        return self.model.language_model

    @property
    def visual(self):
        return self.model.visual

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.Tensor] = None,
        pixel_values_videos: Optional[torch.FloatTensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
        video_grid_thw: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, Qwen3VLCausalLMOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
            The temporal, height and width of feature shape of each image in LLM.
        video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
            The temporal, height and width of feature shape of each video in LLM.

        Example:
        TODO: Add example
        """
        outputs = self.model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            pixel_values_videos=pixel_values_videos,
            image_grid_thw=image_grid_thw,
            video_grid_thw=video_grid_thw,
            position_ids=position_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs[0]

        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size)

        return Qwen3VLCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            rope_deltas=outputs.rope_deltas,
            position_ids=outputs.position_ids,
            attention_mask=attention_mask,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        position_ids=None,
        use_cache=True,
        pixel_values=None,
        pixel_values_videos=None,
        image_grid_thw=None,
        video_grid_thw=None,
        **kwargs,
    ):
        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model

        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            cache_position=cache_position,
            position_ids=position_ids,
            pixel_values=pixel_values,
            pixel_values_videos=pixel_values_videos,
            image_grid_thw=image_grid_thw,
            video_grid_thw=video_grid_thw,
            use_cache=use_cache,
            **kwargs,
        )

        # Qwen3VL position_ids are prepared with rope_deltas in forward
        model_inputs["position_ids"] = None

        # Vision inputs are only needed at prefill (cache_position[0] == 0).
        if cache_position[0] != 0:
            model_inputs["pixel_values"] = None
            model_inputs["pixel_values_videos"] = None

        return model_inputs

    def _get_image_nums_and_video_nums(
        self,
        input_ids: Optional[torch.LongTensor],
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Get the number of images and videos for each sample to calculate the separation length of the sample tensor.
        These parameters are not passed through the processor to avoid unpredictable impacts from interface modifications.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary.

        Returns:
            image_nums (`torch.LongTensor` of shape `(batch_size, num_images_sample)`)
            video_nums (`torch.LongTensor` of shape `(batch_size, num_videos_sample)`)
        """
        image_token_id = self.config.image_token_id
        video_token_id = self.config.video_token_id
        vision_start_token_id = self.config.vision_start_token_id

        if inputs_embeds is not None:
            # Without token ids, recover token masks by comparing embeddings to
            # the special-token embeddings (first feature dim suffices).
            vision_start_mask = (
                inputs_embeds
                == self.get_input_embeddings()(
                    torch.tensor(vision_start_token_id, dtype=torch.long, device=inputs_embeds.device)
                )
            )[..., 0]
            image_mask = (
                inputs_embeds
                == self.get_input_embeddings()(
                    torch.tensor(image_token_id, dtype=torch.long, device=inputs_embeds.device)
                )
            )[..., 0]
            video_mask = (
                inputs_embeds
                == self.get_input_embeddings()(
                    torch.tensor(video_token_id, dtype=torch.long, device=inputs_embeds.device)
                )
            )[..., 0]
        else:
            vision_start_mask = input_ids == vision_start_token_id
            image_mask = input_ids == image_token_id
            video_mask = input_ids == video_token_id

        # A vision block is counted by the token immediately after <vision_start>.
        vision_first_mask = torch.roll(vision_start_mask, shifts=1, dims=1)
        image_nums = torch.sum(vision_first_mask & image_mask, dim=1)
        video_nums = torch.sum(vision_first_mask & video_mask, dim=1)

        return image_nums, video_nums

    def _expand_inputs_for_generation(
        self,
        expand_size: int = 1,
        is_encoder_decoder: bool = False,
        input_ids: Optional[torch.LongTensor] = None,
        **model_kwargs,
    ) -> tuple[torch.LongTensor, dict[str, Any]]:
        # Overwritten -- Support for expanding tensors without a batch size dimension
        # e.g., pixel_values, image_grid_thw, pixel_values_videos, video_grid_thw, second_per_grid_t
        # pixel_values.shape[0] is sum(seqlen_images for samples)
        # image_grid_thw.shape[0] is sum(num_images for samples)

        if expand_size == 1:
            return input_ids, model_kwargs

        visual_keys = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw", "second_per_grid_ts"]

        def _expand_dict_for_generation_visual(dict_to_expand):
            # Expands flattened visual tensors per sample, since they have no
            # leading batch dimension of their own.
            image_grid_thw = model_kwargs.get("image_grid_thw", None)
            video_grid_thw = model_kwargs.get("video_grid_thw", None)
            image_nums, video_nums = self._get_image_nums_and_video_nums(
                input_ids, inputs_embeds=model_kwargs.get("inputs_embeds", None)
            )

            def _repeat_interleave_samples(x, lengths, repeat_times):
                samples = torch.split(x, lengths)
                repeat_args = [repeat_times] + [1] * (x.dim() - 1)
                result = torch.cat([sample.repeat(*repeat_args) for sample in samples], dim=0)
                return result

            for key in dict_to_expand:
                if key == "pixel_values":
                    # split images into samples
                    samples = torch.split(image_grid_thw, list(image_nums))
                    # compute the sequence length of images for each sample
                    lengths = [torch.prod(sample, dim=1).sum() for sample in samples]
                    dict_to_expand[key] = _repeat_interleave_samples(
                        dict_to_expand[key], lengths=lengths, repeat_times=expand_size
                    )
                elif key == "image_grid_thw":
                    # get the num of images for each sample
                    lengths = list(image_nums)
                    dict_to_expand[key] = _repeat_interleave_samples(
                        dict_to_expand[key], lengths=lengths, repeat_times=expand_size
                    )
                elif key == "pixel_values_videos":
                    samples = torch.split(video_grid_thw, list(video_nums))
                    lengths = [torch.prod(sample, dim=1).sum() for sample in samples]
                    dict_to_expand[key] = _repeat_interleave_samples(
                        dict_to_expand[key], lengths=lengths, repeat_times=expand_size
                    )
                elif key == "video_grid_thw":
                    lengths = list(video_nums)
                    dict_to_expand[key] = _repeat_interleave_samples(
                        dict_to_expand[key], lengths=lengths, repeat_times=expand_size
                    )
                elif key == "second_per_grid_ts":
                    dict_to_expand[key] = _repeat_interleave_samples(
                        dict_to_expand[key], lengths=list(video_nums), repeat_times=expand_size
                    )
            return dict_to_expand

        def _expand_dict_for_generation(dict_to_expand):
            # Standard batch-dim repeat for every non-visual tensor kwarg.
            for key in dict_to_expand:
                if (
                    key != "cache_position"
                    and dict_to_expand[key] is not None
                    and isinstance(dict_to_expand[key], torch.Tensor)
                    and key not in visual_keys
                ):
                    dict_to_expand[key] = dict_to_expand[key].repeat_interleave(expand_size, dim=0)
            return dict_to_expand

        model_kwargs = _expand_dict_for_generation_visual(model_kwargs)

        if input_ids is not None:
            input_ids = input_ids.repeat_interleave(expand_size, dim=0)

        model_kwargs = _expand_dict_for_generation(model_kwargs)

        if is_encoder_decoder:
            if model_kwargs.get("encoder_outputs") is None:
                raise ValueError("If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.")
            model_kwargs["encoder_outputs"] = _expand_dict_for_generation(model_kwargs["encoder_outputs"])

        return input_ids, model_kwargs
+
1530
+ # ================================================================================ #
1531
+
1532
@dataclass
class ActionGenerationOutput(ModelOutput):
    """
    Output type of MiBoTForActionGeneration.
    """
    # Generated action tensor; shape/semantics depend on the action head
    # configuration — TODO confirm against the consumer of this output.
    actions: torch.FloatTensor = None
+
1539
def modulate(x, shift, scale):
    """AdaLN-style modulation: scale ``x`` by ``1 + scale``, then add ``shift``."""
    scaled = x * (scale + 1)
    return scaled + shift
+
1542
class TimestepEmbedder(nn.Module):
    """Embeds scalar diffusion timesteps via sinusoidal features and a small MLP.

    ``forward`` returns a tensor of shape ``(batch, 1, hidden_size)`` in
    ``self.dtype`` (bfloat16 by default).
    """

    def __init__(self, hidden_size, frequency_embedding_size=256, dtype=torch.bfloat16):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=False),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=False),
        )
        self.frequency_embedding_size = frequency_embedding_size
        self.dtype = dtype

    def timestep_embedding(self, t, dim, max_period=10000):
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        half = dim // 2
        # Geometric frequency ladder from 1 down to 1/max_period.
        exponents = torch.arange(start=0, end=half, dtype=torch.float32, device=t.device) / half
        freqs = torch.exp(-math.log(max_period) * exponents)
        angles = t[:, None].float() * freqs[None]
        embedding = torch.cat([torch.cos(angles), torch.sin(angles)], dim=-1)
        if dim % 2:
            # Odd target width: pad a zero column so the output is exactly `dim` wide.
            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
        return embedding.to(self.dtype)

    def forward(self, t):
        freq_features = self.timestep_embedding(t, self.frequency_embedding_size)
        # Insert a singleton sequence dimension: (batch, hidden) -> (batch, 1, hidden).
        return self.mlp(freq_features)[:, None]
+
1570
class MLPProjector(nn.Module):
    """Linear projector from ``input_dim`` to ``output_dim``.

    With ``num_layers == 1`` this is a single linear map; each additional layer
    appends a tanh-approximated GELU followed by an ``output_dim -> output_dim``
    linear layer.
    """

    def __init__(self, input_dim: int, output_dim: int, num_layers: int = 1, bias: bool = False):
        super().__init__()

        # Validate before building any modules.
        if num_layers < 1:
            raise ValueError(f"num_layers must be at least 1, got {num_layers}")
        if input_dim <= 0 or output_dim <= 0:
            raise ValueError(f"input_dim and output_dim must be positive, got {input_dim} and {output_dim}")

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.bias = bias
        self.num_layers = num_layers

        self.layers = self._build_layers()

    def _build_layers(self) -> nn.Sequential:
        # First layer changes dimensionality; every later layer is preceded by
        # a GELU nonlinearity and keeps the output width.
        modules = [nn.Linear(self.input_dim, self.output_dim, bias=self.bias)]
        for _ in range(self.num_layers - 1):
            modules.append(nn.GELU(approximate="tanh"))
            modules.append(nn.Linear(self.output_dim, self.output_dim, bias=self.bias))
        return nn.Sequential(*modules)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.layers(x)
+
1610
+
1611
class Attention(nn.Module):
    # Multi-head self-attention block for the action-generation DiT: fused QKV
    # projection, per-head RMSNorm on queries/keys (QK-norm), rotary position
    # embeddings, and a read-only KV cache that is prepended to the current
    # keys/values so attention spans [cache, current].
    def __init__(
        self,
        config: DiTConfig,
    ):
        super().__init__()
        self.head_dim = config.head_dim
        self.num_heads = config.hidden_size // config.head_dim
        # Group factor used to broadcast cached K/V (which appear to be stored
        # with num_key_value_heads heads) up to num_heads — TODO confirm cache layout.
        self.num_key_value_groups = self.num_heads // config.num_key_value_heads
        self.dropout = 0.0

        # One matmul produces Q, K and V together (3 * hidden_size outputs).
        self.qkv_proj = nn.Linear(config.hidden_size, config.hidden_size * 3, bias=True)
        self.o_proj = nn.Linear(config.hidden_size, config.hidden_size, bias=False)

        # Per-head RMSNorm applied to queries/keys before RoPE.
        self.q_norm = Qwen3VLTextRMSNorm(self.head_dim)
        self.k_norm = Qwen3VLTextRMSNorm(self.head_dim)

    def forward(self, hidden_state, past_key_values, position_embeds, attn_mask=None):
        # hidden_state: (batch, q_len, hidden_size).
        # past_key_values: a (k_cache, v_cache) pair; it is read but never
        # updated here, so it acts as a fixed prefix context.
        batch_size, q_len, _ = hidden_state.size()
        qkv_states = self.qkv_proj(hidden_state).view(batch_size, q_len, 3, self.num_heads, self.head_dim)
        query_states, key_states, value_states = qkv_states.unbind(2)

        query_states = self.q_norm(query_states)
        key_states = self.k_norm(key_states)

        # To (batch, num_heads, seq, head_dim) for attention.
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        cos, sin = position_embeds
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        # Expand cached K/V to num_heads and prepend them along the sequence dim.
        k_cache, v_cache = past_key_values
        k_cache = repeat_kv(k_cache, self.num_key_value_groups)
        v_cache = repeat_kv(v_cache, self.num_key_value_groups)

        key_states = torch.cat([k_cache, key_states], dim=-2)
        value_states = torch.cat([v_cache, value_states], dim=-2)

        attn_output = F.scaled_dot_product_attention(
            query=query_states,
            key=key_states,
            value=value_states,
            attn_mask=attn_mask,
            dropout_p=self.dropout if self.training else 0.0,
        )

        # Back to (batch, q_len, hidden_size), then output projection.
        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, q_len, -1)
        return self.o_proj(attn_output)
+
1661
class ActionMLP(nn.Module):
    """SwiGLU-style feed-forward block: ``down(silu(gate(x)) * up(x))``."""

    def __init__(
        self,
        hidden_size=768,
    ):
        super().__init__()
        # Fixed 4x expansion for the intermediate width.
        expanded = hidden_size * 4
        self.gate_proj = nn.Linear(hidden_size, expanded, bias=False)
        self.up_proj = nn.Linear(hidden_size, expanded, bias=False)
        self.down_proj = nn.Linear(expanded, hidden_size, bias=False)
        self.act_fn = ACT2FN["silu"]

    def forward(self, hidden_state):
        """Gated feed-forward transform; preserves the input's last dimension."""
        gated = self.act_fn(self.gate_proj(hidden_state))
        mixed = gated * self.up_proj(hidden_state)
        return self.down_proj(mixed)
1675
+
1676
class DecoderLayer(nn.Module):
    # One DiT transformer block: AdaLN-modulated self-attention followed by an
    # AdaLN-modulated MLP, each with a gated residual connection.
    def __init__(self, config: DiTConfig):
        super().__init__()
        hidden_size = config.hidden_size

        self.attn = Attention(config)
        self.mlp = ActionMLP(hidden_size=hidden_size)

        self.input_layernorm = Qwen3VLTextRMSNorm(hidden_size)
        self.middle_layernorm = Qwen3VLTextRMSNorm(hidden_size)
        self.post_layernorm = Qwen3VLTextRMSNorm(hidden_size)
        self.final_layernorm = Qwen3VLTextRMSNorm(hidden_size)

        # Learned per-block AdaLN table: 6 vectors (shift/scale/gate for the
        # attention branch, then shift/scale/gate for the MLP branch), added to
        # the timestep embedding on every forward pass. Scaled init ~1/sqrt(d).
        self.adaln_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size**0.5)

    def forward(self, hidden_states, past_key_values, position_embeds, t_embeds, attn_mask=None):
        # t_embeds is produced as (batch, 6, hidden) by the caller; adding the
        # (1, 6, hidden) table and chunking on dim=1 yields six (batch, 1, hidden)
        # modulators that broadcast over the sequence axis.
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.adaln_table[None] + t_embeds).chunk(6, dim=1)

        # Attention branch: pre-norm, AdaLN modulate, attend, gated residual add.
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        hidden_states = modulate(hidden_states, shift_msa, scale_msa)
        hidden_states = self.attn(hidden_states, past_key_values, position_embeds, attn_mask)
        hidden_states = residual + gate_msa * hidden_states
        # NOTE(review): this norm is applied to the residual stream itself (not
        # just a branch input), which is unusual for pre-norm blocks — confirm
        # it matches the training-time architecture before "fixing" it.
        hidden_states = self.middle_layernorm(hidden_states)

        # MLP branch: pre-norm, AdaLN modulate, feed-forward, gated residual add.
        residual = hidden_states
        hidden_states = self.post_layernorm(hidden_states)
        hidden_states = modulate(hidden_states, shift_mlp, scale_mlp)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + gate_mlp * hidden_states
        hidden_states = self.final_layernorm(hidden_states)

        return hidden_states
1709
+
1710
class DiT(PreTrainedModel):
    """Diffusion-transformer policy head: a stack of AdaLN ``DecoderLayer`` blocks."""

    config_class = DiTConfig
    _supports_flash_attn = True
    _no_split_modules = ["DecoderLayer"]

    def __init__(self, config: DiTConfig):
        super().__init__(config)
        self.layers = nn.ModuleList(
            DecoderLayer(config) for _ in range(config.num_hidden_layers)
        )
        self.post_init()

    def forward(self, hidden_states, past_key_values, attn_mask, position_embeds, t_embeds):
        """Run every decoder layer, pairing each with its K/V cache entry.

        The cache may hold entries for more layers than the DiT has; each DiT
        layer consumes one of the LAST ``len(self.layers)`` cache entries.
        """
        offset = len(past_key_values) - len(self.layers)
        for idx, block in enumerate(self.layers, start=offset):
            hidden_states = block(
                hidden_states,
                past_key_values[idx],
                position_embeds,
                t_embeds,
                attn_mask=attn_mask,
            )
        return hidden_states
1733
+
1734
+
1735
class MiBoTForActionGeneration(PreTrainedModel):
    # Vision-language-action model: a Qwen3-VL backbone produces a K/V cache
    # from multimodal inputs, and a DiT head flow-matches actions conditioned
    # on that cache plus the robot proprioceptive state.
    config_class = MiBoTConfig
    base_model_prefix = "vlm"
    supports_gradient_checkpointing = False
    _supports_flash_attn = True
    _no_split_modules = ["DecoderLayer"]

    def __init__(self, config: MiBoTConfig):
        super().__init__(config)
        self.config = config
        # Canonical (1, action_length, action_dim) shape of one action chunk.
        self.action_shape = (1, config.action_length, config.action_dim)

        # Vision-Language Model
        self.vlm = Qwen3VLForConditionalGeneration(config.vlm_config)

        # DiT Policy Head
        self.dit = DiT(config.dit_config)

        # Projectors: state/action into DiT hidden space, and back out to actions.
        self.state_projector = MLPProjector(
            input_dim=config.state_dim,
            output_dim=config.dit_config.hidden_size,
            num_layers=2
        )
        self.action_projector = MLPProjector(
            input_dim=config.action_dim,
            output_dim=config.dit_config.hidden_size,
            num_layers=2
        )
        self.action_output_layer = MLPProjector(
            input_dim=config.dit_config.hidden_size,
            output_dim=config.action_dim,
            num_layers=2
        )

        # Time Embedding: timestep -> hidden, then hidden -> 6*hidden for the
        # per-layer AdaLN shift/scale/gate modulators.
        self.t_embedder = TimestepEmbedder(config.dit_config.hidden_size)
        self.t_projector = MLPProjector(
            input_dim=config.dit_config.hidden_size,
            output_dim=6 * config.dit_config.hidden_size,
            bias=True
        )

        # RoPE (shares the VLM text config so positions line up with the cache)
        self.rotary_emb = Qwen3VLTextRotaryEmbedding(config.vlm_config.text_config)

        # Sink Token: a single learned token prepended to the DiT query sequence.
        self.sink = nn.Embedding(1, config.dit_config.hidden_size)

        self.post_init()

    def dit_forward(self, noisy_action, t, action_mask, state_embed, position_embeds, past_key_values, attn_mask):
        """One DiT denoising evaluation: predict the flow/velocity for `noisy_action`.

        Args:
            noisy_action: current noisy action tensor.
            t: timestep tensor; t[:, 0, 0] is scaled by 1000 before embedding.
            action_mask: multiplies the noisy action (zeroes inactive dims).
            state_embed: projected proprioceptive state tokens.
            position_embeds: (cos, sin) rotary embeddings for the DiT queries.
            past_key_values: per-layer VLM K/V cache the DiT attends over.
            attn_mask: combined cache+query attention mask.
        """
        # time step
        t_embeds = self.t_embedder(t[:, 0, 0] * 1000)
        t_embeds = self.t_projector(t_embeds).view(t_embeds.shape[0], 6, -1)

        # action projection
        noisy_action = noisy_action * action_mask
        noisy_action = self.action_projector(noisy_action)

        # concat: query sequence is [sink, state tokens, action tokens]
        sink = self.sink.weight[None].repeat(state_embed.shape[0], 1, 1)
        hidden_states = torch.cat([sink, state_embed, noisy_action], dim=1).contiguous()

        # dit
        hidden_states = self.dit(hidden_states, past_key_values, attn_mask, position_embeds, t_embeds)

        # action: keep only the trailing action-token positions
        hidden_states = hidden_states[:, -noisy_action.shape[1] :, :]
        output = self.action_output_layer(hidden_states)

        return output

    @torch.no_grad()
    def forward(
        self,
        state,
        action_mask,
        num_steps=5,
        **kwargs
    ):
        """Generate an action chunk by Euler integration of the DiT flow field.

        Args:
            state: (batch, state_length, state_dim) proprioceptive state.
            action_mask: (batch, action_length, action_dim) float mask of
                active action dimensions; also fixes the noise shape/dtype.
            num_steps: number of Euler integration steps from t=0 to t=1.
            **kwargs: forwarded verbatim to the VLM; must include "seed".

        Returns:
            ActionGenerationOutput with the integrated (still normalized) actions.
        """
        vlm_outputs = self.vlm(**kwargs, use_cache=True)

        action_bs, action_length, _ = action_mask.shape
        _, state_length, _ = state.shape

        # +1 accounts for the sink token prepended in dit_forward.
        dit_query_length = action_length + state_length + 1

        ## position_embeds: DiT query positions continue right after the VLM's
        ## maximum position id (3 rows for the mrope position-id channels).
        position_ids = (
            torch.arange(0, dit_query_length, device=action_mask.device).view(1, 1, -1).repeat(3, action_bs, 1)
            + vlm_outputs.position_ids.max(dim=-1)[0][..., None]
            + 1
        )
        position_embeds = self.rotary_emb(action_mask, position_ids)

        ## cache attention mask: full visibility of the (padded) VLM cache plus
        ## a causal mask among the DiT query tokens.
        dit_mask = torch.tril(torch.ones((action_bs, dit_query_length, dit_query_length), device=action_mask.device), diagonal=0)
        cache_mask = vlm_outputs.attention_mask[:, None, :].expand(-1, dit_query_length, -1)
        attn_mask = torch.cat([cache_mask, dit_mask], dim=-1)[:, None]
        attn_mask = attn_mask.bool()

        ## state
        state_embed = self.state_projector(state)

        # generation
        def dit_forward_fn(noisy_action, t):
            return self.dit_forward(
                noisy_action=noisy_action,
                t=t,
                action_mask=action_mask,
                state_embed=state_embed,
                position_embeds=position_embeds,
                past_key_values=vlm_outputs.past_key_values,
                attn_mask=attn_mask,
            )

        # Save current random state so seeding below doesn't perturb callers.
        cpu_rng_state = torch.get_rng_state()
        gpu_rng_state = torch.cuda.get_rng_state(action_mask.device) if action_mask.is_cuda else None

        # Set seed for reproducibility.
        # NOTE(review): "seed" stays inside **kwargs and is therefore ALSO
        # forwarded to self.vlm(**kwargs) above — confirm the VLM tolerates an
        # unexpected `seed` keyword.
        torch.manual_seed(kwargs["seed"])

        ## action: initial noise takes its shape/dtype/device from action_mask
        ## (the mask VALUES are not used here, only its metadata).
        x = torch.randn_like(action_mask) # start from random noise
        dt = 1.0 / num_steps

        # Recover random state
        torch.set_rng_state(cpu_rng_state)
        if gpu_rng_state is not None:
            torch.cuda.set_rng_state(gpu_rng_state, action_mask.device)

        # Forward Euler: x_{k+1} = x_k + v(x_k, t_k) * dt, t_k = k / num_steps.
        for step in range(num_steps):
            t = torch.ones((x.shape[0], 1, 1), device=x.device, dtype=x.dtype) * step / num_steps
            v = dit_forward_fn(x, t)
            x = x + v * dt

        return ActionGenerationOutput(actions=x)
1874
+
1875
+
1876
+
1877
# Public export surface of this module.
__all__ = ["MiBoTForActionGeneration"]
preprocessor_config.json ADDED
@@ -0,0 +1,731 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_config": {
3
+ "calvin_abc_orig": {
4
+ "mean": [
5
+ [
6
+ 0.001086647273041308,
7
+ 0.010116912424564362,
8
+ -0.008357306011021137,
9
+ -0.0026971742045134306,
10
+ 0.0009072717512026429,
11
+ -0.004831478465348482,
12
+ -0.08337152749300003,
13
+ 0.0,
14
+ 0.0,
15
+ 0.0,
16
+ 0.0,
17
+ 0.0,
18
+ 0.0,
19
+ 0.0,
20
+ 0.0,
21
+ 0.0,
22
+ 0.0,
23
+ 0.0,
24
+ 0.0,
25
+ 0.0,
26
+ 0.0,
27
+ 0.0,
28
+ 0.0,
29
+ 0.0,
30
+ 0.0,
31
+ 0.0,
32
+ 0.0,
33
+ 0.0,
34
+ 0.0,
35
+ 0.0,
36
+ 0.0,
37
+ 0.0
38
+ ],
39
+ [
40
+ 0.001086647273041308,
41
+ 0.010116912424564362,
42
+ -0.008357306011021137,
43
+ -0.0026971742045134306,
44
+ 0.0009072717512026429,
45
+ -0.004831478465348482,
46
+ -0.08337152749300003,
47
+ 0.0,
48
+ 0.0,
49
+ 0.0,
50
+ 0.0,
51
+ 0.0,
52
+ 0.0,
53
+ 0.0,
54
+ 0.0,
55
+ 0.0,
56
+ 0.0,
57
+ 0.0,
58
+ 0.0,
59
+ 0.0,
60
+ 0.0,
61
+ 0.0,
62
+ 0.0,
63
+ 0.0,
64
+ 0.0,
65
+ 0.0,
66
+ 0.0,
67
+ 0.0,
68
+ 0.0,
69
+ 0.0,
70
+ 0.0,
71
+ 0.0
72
+ ],
73
+ [
74
+ 0.001086647273041308,
75
+ 0.010116912424564362,
76
+ -0.008357306011021137,
77
+ -0.0026971742045134306,
78
+ 0.0009072717512026429,
79
+ -0.004831478465348482,
80
+ -0.08337152749300003,
81
+ 0.0,
82
+ 0.0,
83
+ 0.0,
84
+ 0.0,
85
+ 0.0,
86
+ 0.0,
87
+ 0.0,
88
+ 0.0,
89
+ 0.0,
90
+ 0.0,
91
+ 0.0,
92
+ 0.0,
93
+ 0.0,
94
+ 0.0,
95
+ 0.0,
96
+ 0.0,
97
+ 0.0,
98
+ 0.0,
99
+ 0.0,
100
+ 0.0,
101
+ 0.0,
102
+ 0.0,
103
+ 0.0,
104
+ 0.0,
105
+ 0.0
106
+ ],
107
+ [
108
+ 0.001086647273041308,
109
+ 0.010116912424564362,
110
+ -0.008357306011021137,
111
+ -0.0026971742045134306,
112
+ 0.0009072717512026429,
113
+ -0.004831478465348482,
114
+ -0.08337152749300003,
115
+ 0.0,
116
+ 0.0,
117
+ 0.0,
118
+ 0.0,
119
+ 0.0,
120
+ 0.0,
121
+ 0.0,
122
+ 0.0,
123
+ 0.0,
124
+ 0.0,
125
+ 0.0,
126
+ 0.0,
127
+ 0.0,
128
+ 0.0,
129
+ 0.0,
130
+ 0.0,
131
+ 0.0,
132
+ 0.0,
133
+ 0.0,
134
+ 0.0,
135
+ 0.0,
136
+ 0.0,
137
+ 0.0,
138
+ 0.0,
139
+ 0.0
140
+ ],
141
+ [
142
+ 0.001086647273041308,
143
+ 0.010116912424564362,
144
+ -0.008357306011021137,
145
+ -0.0026971742045134306,
146
+ 0.0009072717512026429,
147
+ -0.004831478465348482,
148
+ -0.08337152749300003,
149
+ 0.0,
150
+ 0.0,
151
+ 0.0,
152
+ 0.0,
153
+ 0.0,
154
+ 0.0,
155
+ 0.0,
156
+ 0.0,
157
+ 0.0,
158
+ 0.0,
159
+ 0.0,
160
+ 0.0,
161
+ 0.0,
162
+ 0.0,
163
+ 0.0,
164
+ 0.0,
165
+ 0.0,
166
+ 0.0,
167
+ 0.0,
168
+ 0.0,
169
+ 0.0,
170
+ 0.0,
171
+ 0.0,
172
+ 0.0,
173
+ 0.0
174
+ ],
175
+ [
176
+ 0.001086647273041308,
177
+ 0.010116912424564362,
178
+ -0.008357306011021137,
179
+ -0.0026971742045134306,
180
+ 0.0009072717512026429,
181
+ -0.004831478465348482,
182
+ -0.08337152749300003,
183
+ 0.0,
184
+ 0.0,
185
+ 0.0,
186
+ 0.0,
187
+ 0.0,
188
+ 0.0,
189
+ 0.0,
190
+ 0.0,
191
+ 0.0,
192
+ 0.0,
193
+ 0.0,
194
+ 0.0,
195
+ 0.0,
196
+ 0.0,
197
+ 0.0,
198
+ 0.0,
199
+ 0.0,
200
+ 0.0,
201
+ 0.0,
202
+ 0.0,
203
+ 0.0,
204
+ 0.0,
205
+ 0.0,
206
+ 0.0,
207
+ 0.0
208
+ ],
209
+ [
210
+ 0.001086647273041308,
211
+ 0.010116912424564362,
212
+ -0.008357306011021137,
213
+ -0.0026971742045134306,
214
+ 0.0009072717512026429,
215
+ -0.004831478465348482,
216
+ -0.08337152749300003,
217
+ 0.0,
218
+ 0.0,
219
+ 0.0,
220
+ 0.0,
221
+ 0.0,
222
+ 0.0,
223
+ 0.0,
224
+ 0.0,
225
+ 0.0,
226
+ 0.0,
227
+ 0.0,
228
+ 0.0,
229
+ 0.0,
230
+ 0.0,
231
+ 0.0,
232
+ 0.0,
233
+ 0.0,
234
+ 0.0,
235
+ 0.0,
236
+ 0.0,
237
+ 0.0,
238
+ 0.0,
239
+ 0.0,
240
+ 0.0,
241
+ 0.0
242
+ ],
243
+ [
244
+ 0.001086647273041308,
245
+ 0.010116912424564362,
246
+ -0.008357306011021137,
247
+ -0.0026971742045134306,
248
+ 0.0009072717512026429,
249
+ -0.004831478465348482,
250
+ -0.08337152749300003,
251
+ 0.0,
252
+ 0.0,
253
+ 0.0,
254
+ 0.0,
255
+ 0.0,
256
+ 0.0,
257
+ 0.0,
258
+ 0.0,
259
+ 0.0,
260
+ 0.0,
261
+ 0.0,
262
+ 0.0,
263
+ 0.0,
264
+ 0.0,
265
+ 0.0,
266
+ 0.0,
267
+ 0.0,
268
+ 0.0,
269
+ 0.0,
270
+ 0.0,
271
+ 0.0,
272
+ 0.0,
273
+ 0.0,
274
+ 0.0,
275
+ 0.0
276
+ ],
277
+ [
278
+ 0.001086647273041308,
279
+ 0.010116912424564362,
280
+ -0.008357306011021137,
281
+ -0.0026971742045134306,
282
+ 0.0009072717512026429,
283
+ -0.004831478465348482,
284
+ -0.08337152749300003,
285
+ 0.0,
286
+ 0.0,
287
+ 0.0,
288
+ 0.0,
289
+ 0.0,
290
+ 0.0,
291
+ 0.0,
292
+ 0.0,
293
+ 0.0,
294
+ 0.0,
295
+ 0.0,
296
+ 0.0,
297
+ 0.0,
298
+ 0.0,
299
+ 0.0,
300
+ 0.0,
301
+ 0.0,
302
+ 0.0,
303
+ 0.0,
304
+ 0.0,
305
+ 0.0,
306
+ 0.0,
307
+ 0.0,
308
+ 0.0,
309
+ 0.0
310
+ ],
311
+ [
312
+ 0.001086647273041308,
313
+ 0.010116912424564362,
314
+ -0.008357306011021137,
315
+ -0.0026971742045134306,
316
+ 0.0009072717512026429,
317
+ -0.004831478465348482,
318
+ -0.08337152749300003,
319
+ 0.0,
320
+ 0.0,
321
+ 0.0,
322
+ 0.0,
323
+ 0.0,
324
+ 0.0,
325
+ 0.0,
326
+ 0.0,
327
+ 0.0,
328
+ 0.0,
329
+ 0.0,
330
+ 0.0,
331
+ 0.0,
332
+ 0.0,
333
+ 0.0,
334
+ 0.0,
335
+ 0.0,
336
+ 0.0,
337
+ 0.0,
338
+ 0.0,
339
+ 0.0,
340
+ 0.0,
341
+ 0.0,
342
+ 0.0,
343
+ 0.0
344
+ ]
345
+ ],
346
+ "std": [
347
+ [
348
+ 0.24954795837402344,
349
+ 0.20441992580890656,
350
+ 0.2120743840932846,
351
+ 0.15890946984291077,
352
+ 0.17389388382434845,
353
+ 0.35485804080963135,
354
+ 0.9965195655822754,
355
+ 9.999999974752427e-07,
356
+ 9.999999974752427e-07,
357
+ 9.999999974752427e-07,
358
+ 9.999999974752427e-07,
359
+ 9.999999974752427e-07,
360
+ 9.999999974752427e-07,
361
+ 9.999999974752427e-07,
362
+ 9.999999974752427e-07,
363
+ 9.999999974752427e-07,
364
+ 9.999999974752427e-07,
365
+ 9.999999974752427e-07,
366
+ 9.999999974752427e-07,
367
+ 9.999999974752427e-07,
368
+ 9.999999974752427e-07,
369
+ 9.999999974752427e-07,
370
+ 9.999999974752427e-07,
371
+ 9.999999974752427e-07,
372
+ 9.999999974752427e-07,
373
+ 9.999999974752427e-07,
374
+ 9.999999974752427e-07,
375
+ 9.999999974752427e-07,
376
+ 9.999999974752427e-07,
377
+ 9.999999974752427e-07,
378
+ 9.999999974752427e-07,
379
+ 9.999999974752427e-07
380
+ ],
381
+ [
382
+ 0.24954795837402344,
383
+ 0.20441992580890656,
384
+ 0.2120743840932846,
385
+ 0.15890946984291077,
386
+ 0.17389388382434845,
387
+ 0.35485804080963135,
388
+ 0.9965195655822754,
389
+ 9.999999974752427e-07,
390
+ 9.999999974752427e-07,
391
+ 9.999999974752427e-07,
392
+ 9.999999974752427e-07,
393
+ 9.999999974752427e-07,
394
+ 9.999999974752427e-07,
395
+ 9.999999974752427e-07,
396
+ 9.999999974752427e-07,
397
+ 9.999999974752427e-07,
398
+ 9.999999974752427e-07,
399
+ 9.999999974752427e-07,
400
+ 9.999999974752427e-07,
401
+ 9.999999974752427e-07,
402
+ 9.999999974752427e-07,
403
+ 9.999999974752427e-07,
404
+ 9.999999974752427e-07,
405
+ 9.999999974752427e-07,
406
+ 9.999999974752427e-07,
407
+ 9.999999974752427e-07,
408
+ 9.999999974752427e-07,
409
+ 9.999999974752427e-07,
410
+ 9.999999974752427e-07,
411
+ 9.999999974752427e-07,
412
+ 9.999999974752427e-07,
413
+ 9.999999974752427e-07
414
+ ],
415
+ [
416
+ 0.24954795837402344,
417
+ 0.20441992580890656,
418
+ 0.2120743840932846,
419
+ 0.15890946984291077,
420
+ 0.17389388382434845,
421
+ 0.35485804080963135,
422
+ 0.9965195655822754,
423
+ 9.999999974752427e-07,
424
+ 9.999999974752427e-07,
425
+ 9.999999974752427e-07,
426
+ 9.999999974752427e-07,
427
+ 9.999999974752427e-07,
428
+ 9.999999974752427e-07,
429
+ 9.999999974752427e-07,
430
+ 9.999999974752427e-07,
431
+ 9.999999974752427e-07,
432
+ 9.999999974752427e-07,
433
+ 9.999999974752427e-07,
434
+ 9.999999974752427e-07,
435
+ 9.999999974752427e-07,
436
+ 9.999999974752427e-07,
437
+ 9.999999974752427e-07,
438
+ 9.999999974752427e-07,
439
+ 9.999999974752427e-07,
440
+ 9.999999974752427e-07,
441
+ 9.999999974752427e-07,
442
+ 9.999999974752427e-07,
443
+ 9.999999974752427e-07,
444
+ 9.999999974752427e-07,
445
+ 9.999999974752427e-07,
446
+ 9.999999974752427e-07,
447
+ 9.999999974752427e-07
448
+ ],
449
+ [
450
+ 0.24954795837402344,
451
+ 0.20441992580890656,
452
+ 0.2120743840932846,
453
+ 0.15890946984291077,
454
+ 0.17389388382434845,
455
+ 0.35485804080963135,
456
+ 0.9965195655822754,
457
+ 9.999999974752427e-07,
458
+ 9.999999974752427e-07,
459
+ 9.999999974752427e-07,
460
+ 9.999999974752427e-07,
461
+ 9.999999974752427e-07,
462
+ 9.999999974752427e-07,
463
+ 9.999999974752427e-07,
464
+ 9.999999974752427e-07,
465
+ 9.999999974752427e-07,
466
+ 9.999999974752427e-07,
467
+ 9.999999974752427e-07,
468
+ 9.999999974752427e-07,
469
+ 9.999999974752427e-07,
470
+ 9.999999974752427e-07,
471
+ 9.999999974752427e-07,
472
+ 9.999999974752427e-07,
473
+ 9.999999974752427e-07,
474
+ 9.999999974752427e-07,
475
+ 9.999999974752427e-07,
476
+ 9.999999974752427e-07,
477
+ 9.999999974752427e-07,
478
+ 9.999999974752427e-07,
479
+ 9.999999974752427e-07,
480
+ 9.999999974752427e-07,
481
+ 9.999999974752427e-07
482
+ ],
483
+ [
484
+ 0.24954795837402344,
485
+ 0.20441992580890656,
486
+ 0.2120743840932846,
487
+ 0.15890946984291077,
488
+ 0.17389388382434845,
489
+ 0.35485804080963135,
490
+ 0.9965195655822754,
491
+ 9.999999974752427e-07,
492
+ 9.999999974752427e-07,
493
+ 9.999999974752427e-07,
494
+ 9.999999974752427e-07,
495
+ 9.999999974752427e-07,
496
+ 9.999999974752427e-07,
497
+ 9.999999974752427e-07,
498
+ 9.999999974752427e-07,
499
+ 9.999999974752427e-07,
500
+ 9.999999974752427e-07,
501
+ 9.999999974752427e-07,
502
+ 9.999999974752427e-07,
503
+ 9.999999974752427e-07,
504
+ 9.999999974752427e-07,
505
+ 9.999999974752427e-07,
506
+ 9.999999974752427e-07,
507
+ 9.999999974752427e-07,
508
+ 9.999999974752427e-07,
509
+ 9.999999974752427e-07,
510
+ 9.999999974752427e-07,
511
+ 9.999999974752427e-07,
512
+ 9.999999974752427e-07,
513
+ 9.999999974752427e-07,
514
+ 9.999999974752427e-07,
515
+ 9.999999974752427e-07
516
+ ],
517
+ [
518
+ 0.24954795837402344,
519
+ 0.20441992580890656,
520
+ 0.2120743840932846,
521
+ 0.15890946984291077,
522
+ 0.17389388382434845,
523
+ 0.35485804080963135,
524
+ 0.9965195655822754,
525
+ 9.999999974752427e-07,
526
+ 9.999999974752427e-07,
527
+ 9.999999974752427e-07,
528
+ 9.999999974752427e-07,
529
+ 9.999999974752427e-07,
530
+ 9.999999974752427e-07,
531
+ 9.999999974752427e-07,
532
+ 9.999999974752427e-07,
533
+ 9.999999974752427e-07,
534
+ 9.999999974752427e-07,
535
+ 9.999999974752427e-07,
536
+ 9.999999974752427e-07,
537
+ 9.999999974752427e-07,
538
+ 9.999999974752427e-07,
539
+ 9.999999974752427e-07,
540
+ 9.999999974752427e-07,
541
+ 9.999999974752427e-07,
542
+ 9.999999974752427e-07,
543
+ 9.999999974752427e-07,
544
+ 9.999999974752427e-07,
545
+ 9.999999974752427e-07,
546
+ 9.999999974752427e-07,
547
+ 9.999999974752427e-07,
548
+ 9.999999974752427e-07,
549
+ 9.999999974752427e-07
550
+ ],
551
+ [
552
+ 0.24954795837402344,
553
+ 0.20441992580890656,
554
+ 0.2120743840932846,
555
+ 0.15890946984291077,
556
+ 0.17389388382434845,
557
+ 0.35485804080963135,
558
+ 0.9965195655822754,
559
+ 9.999999974752427e-07,
560
+ 9.999999974752427e-07,
561
+ 9.999999974752427e-07,
562
+ 9.999999974752427e-07,
563
+ 9.999999974752427e-07,
564
+ 9.999999974752427e-07,
565
+ 9.999999974752427e-07,
566
+ 9.999999974752427e-07,
567
+ 9.999999974752427e-07,
568
+ 9.999999974752427e-07,
569
+ 9.999999974752427e-07,
570
+ 9.999999974752427e-07,
571
+ 9.999999974752427e-07,
572
+ 9.999999974752427e-07,
573
+ 9.999999974752427e-07,
574
+ 9.999999974752427e-07,
575
+ 9.999999974752427e-07,
576
+ 9.999999974752427e-07,
577
+ 9.999999974752427e-07,
578
+ 9.999999974752427e-07,
579
+ 9.999999974752427e-07,
580
+ 9.999999974752427e-07,
581
+ 9.999999974752427e-07,
582
+ 9.999999974752427e-07,
583
+ 9.999999974752427e-07
584
+ ],
585
+ [
586
+ 0.24954795837402344,
587
+ 0.20441992580890656,
588
+ 0.2120743840932846,
589
+ 0.15890946984291077,
590
+ 0.17389388382434845,
591
+ 0.35485804080963135,
592
+ 0.9965195655822754,
593
+ 9.999999974752427e-07,
594
+ 9.999999974752427e-07,
595
+ 9.999999974752427e-07,
596
+ 9.999999974752427e-07,
597
+ 9.999999974752427e-07,
598
+ 9.999999974752427e-07,
599
+ 9.999999974752427e-07,
600
+ 9.999999974752427e-07,
601
+ 9.999999974752427e-07,
602
+ 9.999999974752427e-07,
603
+ 9.999999974752427e-07,
604
+ 9.999999974752427e-07,
605
+ 9.999999974752427e-07,
606
+ 9.999999974752427e-07,
607
+ 9.999999974752427e-07,
608
+ 9.999999974752427e-07,
609
+ 9.999999974752427e-07,
610
+ 9.999999974752427e-07,
611
+ 9.999999974752427e-07,
612
+ 9.999999974752427e-07,
613
+ 9.999999974752427e-07,
614
+ 9.999999974752427e-07,
615
+ 9.999999974752427e-07,
616
+ 9.999999974752427e-07,
617
+ 9.999999974752427e-07
618
+ ],
619
+ [
620
+ 0.24954795837402344,
621
+ 0.20441992580890656,
622
+ 0.2120743840932846,
623
+ 0.15890946984291077,
624
+ 0.17389388382434845,
625
+ 0.35485804080963135,
626
+ 0.9965195655822754,
627
+ 9.999999974752427e-07,
628
+ 9.999999974752427e-07,
629
+ 9.999999974752427e-07,
630
+ 9.999999974752427e-07,
631
+ 9.999999974752427e-07,
632
+ 9.999999974752427e-07,
633
+ 9.999999974752427e-07,
634
+ 9.999999974752427e-07,
635
+ 9.999999974752427e-07,
636
+ 9.999999974752427e-07,
637
+ 9.999999974752427e-07,
638
+ 9.999999974752427e-07,
639
+ 9.999999974752427e-07,
640
+ 9.999999974752427e-07,
641
+ 9.999999974752427e-07,
642
+ 9.999999974752427e-07,
643
+ 9.999999974752427e-07,
644
+ 9.999999974752427e-07,
645
+ 9.999999974752427e-07,
646
+ 9.999999974752427e-07,
647
+ 9.999999974752427e-07,
648
+ 9.999999974752427e-07,
649
+ 9.999999974752427e-07,
650
+ 9.999999974752427e-07,
651
+ 9.999999974752427e-07
652
+ ],
653
+ [
654
+ 0.24954795837402344,
655
+ 0.20441992580890656,
656
+ 0.2120743840932846,
657
+ 0.15890946984291077,
658
+ 0.17389388382434845,
659
+ 0.35485804080963135,
660
+ 0.9965195655822754,
661
+ 9.999999974752427e-07,
662
+ 9.999999974752427e-07,
663
+ 9.999999974752427e-07,
664
+ 9.999999974752427e-07,
665
+ 9.999999974752427e-07,
666
+ 9.999999974752427e-07,
667
+ 9.999999974752427e-07,
668
+ 9.999999974752427e-07,
669
+ 9.999999974752427e-07,
670
+ 9.999999974752427e-07,
671
+ 9.999999974752427e-07,
672
+ 9.999999974752427e-07,
673
+ 9.999999974752427e-07,
674
+ 9.999999974752427e-07,
675
+ 9.999999974752427e-07,
676
+ 9.999999974752427e-07,
677
+ 9.999999974752427e-07,
678
+ 9.999999974752427e-07,
679
+ 9.999999974752427e-07,
680
+ 9.999999974752427e-07,
681
+ 9.999999974752427e-07,
682
+ 9.999999974752427e-07,
683
+ 9.999999974752427e-07,
684
+ 9.999999974752427e-07,
685
+ 9.999999974752427e-07
686
+ ]
687
+ ]
688
+ }
689
+ },
690
+ "auto_map": {
691
+ "AutoProcessor": "processing_mibot.MiBotProcessor"
692
+ },
693
+ "crop_size": null,
694
+ "data_format": "channels_first",
695
+ "default_to_square": true,
696
+ "device": null,
697
+ "disable_grouping": null,
698
+ "do_center_crop": null,
699
+ "do_convert_rgb": true,
700
+ "do_normalize": true,
701
+ "do_pad": null,
702
+ "do_rescale": true,
703
+ "do_resize": true,
704
+ "image_mean": [
705
+ 0.5,
706
+ 0.5,
707
+ 0.5
708
+ ],
709
+ "image_processor_type": "Qwen2VLImageProcessorFast",
710
+ "image_std": [
711
+ 0.5,
712
+ 0.5,
713
+ 0.5
714
+ ],
715
+ "input_data_format": null,
716
+ "max_pixels": null,
717
+ "merge_size": 2,
718
+ "min_pixels": null,
719
+ "pad_size": null,
720
+ "patch_size": 16,
721
+ "processor_class": "MiBotProcessor",
722
+ "resample": 3,
723
+ "rescale_factor": 0.00392156862745098,
724
+ "return_tensors": null,
725
+ "size": {
726
+ "longest_edge": 90000,
727
+ "shortest_edge": 1024
728
+ },
729
+ "temporal_patch_size": 2,
730
+ "video_processor_type": "qwen3_vl"
731
+ }
processing_mibot.py ADDED
@@ -0,0 +1,343 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright (C) 2026 Xiaomi Corporation.
3
+ # Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ from typing import Optional, Union, List
18
+ import numpy as np
19
+ import torch
20
+
21
+ from transformers.feature_extraction_utils import BatchFeature
22
+ from transformers.image_utils import ImageInput
23
+ from transformers.processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs
24
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
25
+ from transformers.utils import logging
26
+ from transformers.video_utils import VideoInput
27
+
28
+ logger = logging.get_logger(__name__)
29
+
30
class Qwen3VLVideosProcessorKwargs(VideosKwargs, total=False):
    # No video-specific kwargs beyond the base VideosKwargs; kept as a named
    # subclass so the processor kwargs schema below stays explicit.
    pass
32
+
33
class Qwen3VLImagesKwargs(ImagesKwargs):
    # Extra image-processing kwargs accepted on top of the base ImagesKwargs.
    min_pixels: Optional[int]
    max_pixels: Optional[int]
    patch_size: Optional[int]
    temporal_patch_size: Optional[int]
    merge_size: Optional[int]
39
+
40
class Qwen3VLProcessorKwargs(ProcessingKwargs, total=False):
    # Typed kwargs schema for MiBotProcessor.__call__, with per-modality
    # defaults merged in by ProcessorMixin machinery.
    images_kwargs: Qwen3VLImagesKwargs
    videos_kwargs: Qwen3VLVideosProcessorKwargs
    _defaults = {
        "text_kwargs": {
            "padding": False,
            "return_token_type_ids": False,
            "return_mm_token_type_ids": False,
        },
        # Video metadata is needed downstream (e.g. for timestamps/fps handling).
        "videos_kwargs": {"return_metadata": True},
    }
51
+
52
+ class MiBotProcessor(ProcessorMixin):
53
+ r"""
54
+ Constructs a Qwen3VL processor which wraps a Qwen3VL image processor and a Qwen2 tokenizer into a single processor.
55
+ [`Qwen3VLProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the
56
+ [`~Qwen3VLProcessor.__call__`] and [`~Qwen3VLProcessor.decode`] for more information.
57
+ Args:
58
+ image_processor ([`Qwen2VLImageProcessor`], *optional*):
59
+ The image processor is a required input.
60
+ tokenizer ([`Qwen2TokenizerFast`], *optional*):
61
+ The tokenizer is a required input.
62
+ video_processor ([`Qwen3VLVideoProcessor`], *optional*):
63
+ The video processor is a required input.
64
+ chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
65
+ in a chat into a tokenizable string.
66
+ """
67
+ attributes = ["image_processor", "tokenizer", "video_processor"]
68
+ image_processor_class = "AutoImageProcessor"
69
+ video_processor_class = "AutoVideoProcessor"
70
+ tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
71
+
72
    def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs):
        super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)
        # Resolve special vision tokens and their ids, preferring attributes
        # already present on the tokenizer and falling back to Qwen defaults.
        self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
        self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
        # NOTE(review): `if getattr(..., None)` is a truthiness test — a token
        # id of 0 would fall through to convert_tokens_to_ids; confirm the ids
        # can never be 0, or switch to an explicit `is not None` check.
        self.image_token_id = (
            tokenizer.image_token_id
            if getattr(tokenizer, "image_token_id", None)
            else tokenizer.convert_tokens_to_ids(self.image_token)
        )
        self.video_token_id = (
            tokenizer.video_token_id
            if getattr(tokenizer, "video_token_id", None)
            else tokenizer.convert_tokens_to_ids(self.video_token)
        )
        self.vision_start_token = (
            "<|vision_start|>" if not hasattr(tokenizer, "vision_start_token") else tokenizer.vision_start_token
        )
        self.vision_end_token = (
            "<|vision_end|>" if not hasattr(tokenizer, "vision_end_token") else tokenizer.vision_end_token
        )
        self.vision_start_token_id = (
            tokenizer.vision_start_token_id
            if getattr(tokenizer, "vision_start_token_id", None)
            else tokenizer.convert_tokens_to_ids(self.vision_start_token)
        )
        self.vision_end_token_id = (
            tokenizer.vision_end_token_id
            if getattr(tokenizer, "vision_end_token_id", None)
            else tokenizer.convert_tokens_to_ids(self.vision_end_token)
        )

        # Per-robot action normalization statistics. The [None] adds a leading
        # batch axis so mean/std broadcast against (batch, length, dim) actions.
        action_config = image_processor.action_config
        assert action_config is not None, "action_config must be provided"
        self.action_config = {}
        for robot_type, config in action_config.items():
            mean = config.get("mean", 0.0)
            std = config.get("std", 1.0)
            self.action_config[robot_type] = {
                "mean": torch.tensor(mean, dtype=torch.float32)[None],
                "std": torch.tensor(std, dtype=torch.float32)[None]
            }
113
+
114
+ def list_robot_types(self) -> List[str]:
115
+ """Return list of available robot types in action_config."""
116
+ return list(self.action_config.keys())
117
+
118
+ def get_action_mask(self, robot_type: str, batch_size: int = 1):
119
+ """
120
+ Get action mask indicating which action dimensions are active (std > 1e-5) for the given robot type.
121
+ The mask is repeated for the specified batch size.
122
+
123
+ Args:
124
+ robot_type (str): The type of robot (must exist in action_config).
125
+ batch_size (int, optional): Number of copies to repeat along the batch dimension. Default is 1.
126
+
127
+ Returns:
128
+ torch.Tensor: A float mask of shape (batch_size, 1, action_dim), where 1.0 means the action dimension is active.
129
+ """
130
+ return (self.action_config[robot_type]["std"] > 1e-5).float().repeat(batch_size, 1, 1)
131
+
132
+ def decode_action(self, actions, robot_type: str):
133
+ """
134
+ Decode actions using mean/std of the specified robot type.
135
+ Args:
136
+ actions: array-like or torch.Tensor
137
+ robot_type: str
138
+ Returns:
139
+ torch.Tensor: un-normalized actions
140
+ """
141
+ if robot_type not in self.action_config:
142
+ raise KeyError(f"Robot type '{robot_type}' not found. Available: {list(self.action_config.keys())}")
143
+
144
+ config = self.action_config[robot_type]
145
+ mean = config["mean"]
146
+ std = config["std"]
147
+
148
+ mean = mean.to(actions.device)
149
+ std = std.to(actions.device)
150
+ return actions.float() * std + mean
151
+
152
+ def __call__(
153
+ self,
154
+ images: ImageInput = None,
155
+ text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
156
+ videos: VideoInput = None,
157
+ **kwargs: Unpack[Qwen3VLProcessorKwargs],
158
+ ) -> BatchFeature:
159
+ """
160
+ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
161
+ and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
162
+ the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to
163
+ Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.
164
+ Args:
165
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
166
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
167
+ tensor. Both channels-first and channels-last formats are supported.
168
+ text (`str`, `list[str]`, `list[list[str]]`):
169
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
170
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
171
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
172
+ videos (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
173
+ The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
174
+ tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
175
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
176
+ If set, will return tensors of a particular framework. Acceptable values are:
177
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
178
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
179
+ - `'np'`: Return NumPy `np.ndarray` objects.
180
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
181
+ Returns:
182
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
183
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
184
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
185
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
186
+ `None`).
187
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
188
+ - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
189
+ - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
190
+ - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
191
+ """
192
+ output_kwargs = self._merge_kwargs(
193
+ Qwen3VLProcessorKwargs,
194
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
195
+ **kwargs,
196
+ )
197
+ if images is not None:
198
+ image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
199
+ image_grid_thw = image_inputs["image_grid_thw"]
200
+ else:
201
+ image_inputs = {}
202
+ image_grid_thw = None
203
+ if videos is not None:
204
+ videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
205
+ video_grid_thw = videos_inputs["video_grid_thw"]
206
+ # If user has not requested video metadata, pop it
207
+ if "return_metadata" not in kwargs:
208
+ video_metadata = videos_inputs.pop("video_metadata")
209
+ else:
210
+ video_metadata = videos_inputs["video_metadata"]
211
+ video_grid_thw = videos_inputs["video_grid_thw"]
212
+ else:
213
+ videos_inputs = {}
214
+ video_grid_thw = None
215
+ if not isinstance(text, list):
216
+ text = [text]
217
+ text = text.copy() # below lines change text in-place
218
+ if image_grid_thw is not None:
219
+ merge_length = self.image_processor.merge_size**2
220
+ index = 0
221
+ for i in range(len(text)):
222
+ while self.image_token in text[i]:
223
+ num_image_tokens = image_grid_thw[index].prod() // merge_length
224
+ text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
225
+ index += 1
226
+ text[i] = text[i].replace("<|placeholder|>", self.image_token)
227
+ if video_grid_thw is not None:
228
+ merge_length = self.video_processor.merge_size**2
229
+ index = 0
230
+ for i in range(len(text)):
231
+ while self.video_token in text[i]:
232
+ metadata = video_metadata[index]
233
+ if metadata.fps is None:
234
+ logger.warning_once(
235
+ "Qwen3VL requires frame timestamps to construct prompts, but the `fps` of the input video could not be inferred. "
236
+ "Probably `video_metadata` was missing from inputs and you passed pre-sampled frames. "
237
+ "Defaulting to `fps=24`. Please provide `video_metadata` for more accurate results."
238
+ )
239
+ metadata.fps = 24 if metadata.fps is None else metadata.fps
240
+ # if timestamps are not provided, calculate them
241
+ curr_timestamp = self._calculate_timestamps(
242
+ metadata.frames_indices,
243
+ metadata.fps,
244
+ self.video_processor.merge_size,
245
+ )
246
+ video_placeholder = ""
247
+ frame_seqlen = video_grid_thw[index][1:].prod() // merge_length
248
+ for frame_idx in range(video_grid_thw[index][0]):
249
+ curr_time = curr_timestamp[frame_idx]
250
+ video_placeholder += f"<{curr_time:.1f} seconds>"
251
+ video_placeholder += (
252
+ self.vision_start_token + "<|placeholder|>" * frame_seqlen + self.vision_end_token
253
+ )
254
+ if f"{self.vision_start_token}{self.video_token}{self.vision_end_token}" in text[i]:
255
+ text[i] = text[i].replace(
256
+ f"{self.vision_start_token}{self.video_token}{self.vision_end_token}", video_placeholder, 1
257
+ )
258
+ else:
259
+ # vllm may input video token directly
260
+ text[i] = text[i].replace(self.video_token, video_placeholder, 1)
261
+ index += 1
262
+ text[i] = text[i].replace("<|placeholder|>", self.video_token)
263
+ return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
264
+ return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
265
+ text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
266
+ self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])
267
+ if return_mm_token_type_ids:
268
+ array_ids = np.array(text_inputs["input_ids"])
269
+ mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
270
+ mm_token_type_ids[array_ids == self.image_token_id] = 1
271
+ text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
272
+ return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors)
273
+ def _get_num_multimodal_tokens(self, image_sizes=None, video_sizes=None, **kwargs):
274
+ """
275
+ Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
276
+ Args:
277
+ image_sizes (`list[list[int]]`, *optional*):
278
+ The input sizes formatted as (height, width) per each image.
279
+ video_sizes (`list[list[int]]`, *optional*):
280
+ The input sizes formatted as (num_frames, height, width) per each video.
281
+ Returns:
282
+ `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
283
+ input modalities, along with other useful data.
284
+ """
285
+ vision_data = {}
286
+ if image_sizes is not None:
287
+ images_kwargs = Qwen3VLProcessorKwargs._defaults.get("images_kwargs", {})
288
+ images_kwargs.update(kwargs)
289
+ merge_size = images_kwargs.get("merge_size", None) or self.image_processor.merge_size
290
+ num_image_patches = [
291
+ self.image_processor.get_number_of_image_patches(*image_size, images_kwargs)
292
+ for image_size in image_sizes
293
+ ]
294
+ num_image_tokens = [(num_patches // merge_size**2) for num_patches in num_image_patches]
295
+ vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})
296
+ if video_sizes is not None:
297
+ videos_kwargs = Qwen3VLProcessorKwargs._defaults.get("videos_kwargs", {})
298
+ videos_kwargs.update(kwargs)
299
+ num_video_patches = [
300
+ self.video_processor.get_number_of_video_patches(*video_size, videos_kwargs)
301
+ for video_size in video_sizes
302
+ ]
303
+ num_video_tokens = [(num_patches // merge_size**2) for num_patches in num_video_patches]
304
+ vision_data["num_video_tokens"] = num_video_tokens
305
+ return MultiModalData(**vision_data)
306
+ def post_process_image_text_to_text(
307
+ self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs
308
+ ):
309
+ """
310
+ Post-process the output of the model to decode the text.
311
+ Args:
312
+ generated_outputs (`torch.Tensor` or `np.ndarray`):
313
+ The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
314
+ or `(sequence_length,)`.
315
+ skip_special_tokens (`bool`, *optional*, defaults to `True`):
316
+ Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
317
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
318
+ Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
319
+ **kwargs:
320
+ Additional arguments to be passed to the tokenizer's `batch_decode method`.
321
+ Returns:
322
+ `list[str]`: The decoded text.
323
+ """
324
+ return self.tokenizer.batch_decode(
325
+ generated_outputs,
326
+ skip_special_tokens=skip_special_tokens,
327
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
328
+ **kwargs,
329
+ )
330
+ def _calculate_timestamps(self, indices: Union[list[int], np.ndarray], video_fps: float, merge_size: int = 2):
331
+ if not isinstance(indices, list):
332
+ indices = indices.tolist()
333
+ if len(indices) % merge_size != 0:
334
+ indices.extend(indices[-1] for _ in range(merge_size - len(indices) % merge_size))
335
+ timestamps = [idx / video_fps for idx in indices]
336
+ # @JJJYmmm frames are merged by self.merge_size, \
337
+ # so we need to average the timestamps between the first/last frame within the temporal patch
338
+ timestamps = [
339
+ (timestamps[i] + timestamps[i + merge_size - 1]) / 2 for i in range(0, len(timestamps), merge_size)
340
+ ]
341
+ return timestamps
342
+
343
+ __all__ = ["MiBotProcessor"]
processor_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_mibot.MiBotProcessor"
4
+ },
5
+ "processor_class": "MiBotProcessor"
6
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
3
+ size 11422654
tokenizer_config.json ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "auto_map": {
230
+ "AutoProcessor": "processing_mibot.MiBotProcessor"
231
+ },
232
+ "bos_token": null,
233
+ "clean_up_tokenization_spaces": false,
234
+ "eos_token": "<|im_end|>",
235
+ "errors": "replace",
236
+ "extra_special_tokens": {},
237
+ "model_max_length": 262144,
238
+ "pad_token": "<|endoftext|>",
239
+ "processor_class": "MiBotProcessor",
240
+ "split_special_tokens": false,
241
+ "tokenizer_class": "Qwen2Tokenizer",
242
+ "unk_token": null
243
+ }
video_preprocessor_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_mibot.MiBotProcessor"
4
+ },
5
+ "crop_size": null,
6
+ "data_format": "channels_first",
7
+ "default_to_square": true,
8
+ "device": null,
9
+ "do_center_crop": null,
10
+ "do_convert_rgb": true,
11
+ "do_normalize": true,
12
+ "do_rescale": true,
13
+ "do_resize": true,
14
+ "do_sample_frames": true,
15
+ "fps": 2,
16
+ "image_mean": [
17
+ 0.5,
18
+ 0.5,
19
+ 0.5
20
+ ],
21
+ "image_std": [
22
+ 0.5,
23
+ 0.5,
24
+ 0.5
25
+ ],
26
+ "input_data_format": null,
27
+ "max_frames": 768,
28
+ "merge_size": 2,
29
+ "min_frames": 4,
30
+ "num_frames": null,
31
+ "pad_size": null,
32
+ "patch_size": 16,
33
+ "processor_class": "MiBotProcessor",
34
+ "resample": 3,
35
+ "rescale_factor": 0.00392156862745098,
36
+ "return_metadata": false,
37
+ "size": {
38
+ "longest_edge": 25165824,
39
+ "shortest_edge": 4096
40
+ },
41
+ "temporal_patch_size": 2,
42
+ "video_metadata": null,
43
+ "video_processor_type": "Qwen3VLVideoProcessor"
44
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff