HongyuanTao committed on
Commit bd7c462 · verified · 1 Parent(s): 4951d74

Upload 15 files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "</tool_call>": 151658,
+ "<tool_call>": 151657,
+ "<|box_end|>": 151649,
+ "<|box_start|>": 151648,
+ "<|endoftext|>": 151643,
+ "<|file_sep|>": 151664,
+ "<|fim_middle|>": 151660,
+ "<|fim_pad|>": 151662,
+ "<|fim_prefix|>": 151659,
+ "<|fim_suffix|>": 151661,
+ "<|im_end|>": 151645,
+ "<|im_start|>": 151644,
+ "<|image_pad|>": 151655,
+ "<|object_ref_end|>": 151647,
+ "<|object_ref_start|>": 151646,
+ "<|quad_end|>": 151651,
+ "<|quad_start|>": 151650,
+ "<|repo_name|>": 151663,
+ "<|video_pad|>": 151656,
+ "<|vision_end|>": 151653,
+ "<|vision_pad|>": 151654,
+ "<|vision_start|>": 151652
+ }
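The file above pins the chat, tool-call, and vision control tokens to fixed IDs on top of the base Qwen2.5 vocabulary. A minimal sketch of checking that mapping after loading the tokenizer from this repository — assuming the standard `transformers` `AutoTokenizer` API; the repository path below is a placeholder:

```python
from transformers import AutoTokenizer

# Placeholder path; substitute the actual repo id for this model.
tok = AutoTokenizer.from_pretrained("path/to/InfiniteVL", trust_remote_code=True)

# The added tokens should round-trip to the IDs listed in added_tokens.json.
assert tok.convert_tokens_to_ids("<|im_start|>") == 151644
assert tok.convert_tokens_to_ids("<|vision_start|>") == 151652
assert tok.convert_tokens_to_ids("<|image_pad|>") == 151655
assert tok.convert_tokens_to_ids("<|endoftext|>") == 151643
```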
chat_template.json ADDED
@@ -0,0 +1,3 @@
+ {
+ "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
+ }
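The template above injects a default system prompt when the conversation does not start with one, and wraps each image or video in <|vision_start|>…<|vision_end|> around the corresponding pad token. A minimal sketch of rendering a one-image prompt — assuming the processor loads through the standard AutoProcessor API with remote code enabled; the repo path is a placeholder:

```python
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("path/to/InfiniteVL", trust_remote_code=True)

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

# Expected rendering (no tokenization): the default system turn, then
# <|im_start|>user ... <|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>,
# followed by the assistant header because add_generation_prompt=True.
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(text)
```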
config.json ADDED
@@ -0,0 +1,72 @@
+ {
+ "attn_implementation": "flash_attention_2",
+ "architectures": [
+ "InfiniteVLQwen2_5_VLForConditionalGeneration"
+ ],
+ "auto_map": {
+ "AutoConfig": "configuration_infinitevl.InfiniteVLConfig",
+ "AutoModel": "modeling_infinitevl.InfiniteVLQwen2_5_VLForConditionalGeneration",
+ "AutoModelForCausalLM": "modeling_infinitevl.InfiniteVLQwen2_5_VLForConditionalGeneration",
+ "AutoProcessor": "Qwen2_5_VLProcessor"
+ },
+ "attention_dropout": 0.0,
+ "bos_token_id": 151643,
+ "eos_token_id": 151645,
+ "hidden_act": "silu",
+ "hidden_size": 2048,
+ "image_token_id": 151655,
+ "initializer_range": 0.02,
+ "intermediate_size": 11008,
+ "max_position_embeddings": 128000,
+ "max_window_layers": 70,
+ "model_type": "infinite_vl",
+ "num_attention_heads": 16,
+ "num_hidden_layers": 36,
+ "num_key_value_heads": 2,
+ "rms_norm_eps": 1e-06,
+ "rope_scaling": {
+ "mrope_section": [
+ 16,
+ 24,
+ 24
+ ],
+ "rope_type": "default",
+ "type": "default"
+ },
+ "rope_theta": 1000000.0,
+ "sliding_window": 8192,
+ "tie_word_embeddings": true,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.50.0",
+ "use_cache": false,
+ "use_sliding_window": true,
+ "video_token_id": 151656,
+ "vision_config": {
+ "depth": 32,
+ "fullatt_block_indexes": [
+ 7,
+ 15,
+ 23,
+ 31
+ ],
+ "hidden_act": "silu",
+ "hidden_size": 1280,
+ "in_channels": 3,
+ "in_chans": 3,
+ "intermediate_size": 3420,
+ "model_type": "infinite_vl",
+ "num_heads": 16,
+ "out_hidden_size": 2048,
+ "patch_size": 14,
+ "spatial_merge_size": 2,
+ "spatial_patch_size": 14,
+ "temporal_patch_size": 2,
+ "tokens_per_second": 2,
+ "torch_dtype": "bfloat16",
+ "window_size": 112
+ },
+ "vision_end_token_id": 151653,
+ "vision_start_token_id": 151652,
+ "vision_token_id": 151654,
+ "vocab_size": 151936
+ }
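The `auto_map` block routes the Auto classes to the custom code shipped with this repo (`configuration_infinitevl.py` / `modeling_infinitevl.py`), so loading requires `trust_remote_code=True`, and `attn_implementation` is preset to FlashAttention-2. A minimal loading sketch under those assumptions; the repo path is a placeholder and the dtype/attention choices are illustrative:

```python
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("path/to/InfiniteVL", trust_remote_code=True)
print(config.model_type)           # "infinite_vl"
print(config.vision_config.depth)  # 32 vision blocks, full attention at [7, 15, 23, 31]

model = AutoModelForCausalLM.from_pretrained(
    "path/to/InfiniteVL",
    trust_remote_code=True,
    torch_dtype="bfloat16",                    # matches the checkpoint dtype
    attn_implementation="flash_attention_2",   # requires the flash-attn package
)
```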
configuration_infinitevl.py ADDED
@@ -0,0 +1,395 @@
1
+ # coding=utf-8
2
+ # Copyright 2025 The HustVL Team.
3
+ # Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # This code is based on Qwen2.5-VL, which is derived from EleutherAI's GPT-NeoX library
6
+ # and the GPT-NeoX and OPT implementations. It has been modified to create InfiniteVL.
7
+ #
8
+ # Licensed under the Apache License, Version 2.0 (the "License");
9
+ # you may not use this file except in compliance with the License.
10
+ # You may obtain a copy of the License at
11
+ #
12
+ # http://www.apache.org/licenses/LICENSE-2.0
13
+ #
14
+ # Unless required by applicable law or agreed to in writing, software
15
+ # distributed under the License is distributed on an "AS IS" BASIS,
16
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17
+ # See the License for the specific language governing permissions and
18
+ # limitations under the License.
19
+
20
+ from transformers.configuration_utils import PretrainedConfig, layer_type_validation
21
+ from transformers.modeling_rope_utils import rope_config_validation
22
+
23
+
24
+ class InfiniteVLVisionConfig(PretrainedConfig):
25
+ r"""
26
+ This is the configuration class to store the configuration of a [`InfiniteVLVisionModel`].
27
+
28
+ Args:
29
+ depth (`int`, *optional*, defaults to 32):
30
+ The number of layers in the vision transformer.
31
+ hidden_size (`int`, *optional*, defaults to 3584):
32
+ Dimensionality of the encoder layers and the pooler layer.
33
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
34
+ The non-linear activation function (function or string) in the encoder and pooler.
35
+ intermediate_size (`int`, *optional*, defaults to 3420):
36
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
37
+ num_heads (`int`, *optional*, defaults to 16):
38
+ Number of attention heads for each attention layer in the Transformer encoder.
39
+ in_channels (`int`, *optional*, defaults to 3):
40
+ Number of input channels.
41
+ patch_size (`int`, *optional*, defaults to 14):
42
+ The size (resolution) of each patch.
43
+ spatial_merge_size (`int`, *optional*, defaults to 2):
44
+ The scaling factor for spatial merging of patches.
45
+ temporal_patch_size (`int`, *optional*, defaults to 2):
46
+ The size of patches along the temporal dimension.
47
+ tokens_per_second (`int`, *optional*, defaults to 4):
48
+ Number of tokens processed per second for video inputs.
49
+ window_size (`int`, *optional*, defaults to 112):
50
+ The window size for windowed attention mechanisms.
51
+ out_hidden_size (`int`, *optional*, defaults to 3584):
52
+ Dimensionality of the output hidden states.
53
+ fullatt_block_indexes (`list`, *optional*):
54
+ Indices of blocks that use full attention instead of windowed attention.
55
+ initializer_range (`float`, *optional*, defaults to 0.02):
56
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
57
+ """
58
+
59
+ model_type = "infinite_vl"
60
+ base_config_key = "vision_config"
61
+
62
+ def __init__(
63
+ self,
64
+ depth=32,
65
+ hidden_size=3584,
66
+ hidden_act="silu",
67
+ intermediate_size=3420,
68
+ num_heads=16,
69
+ in_channels=3,
70
+ patch_size=14,
71
+ spatial_merge_size=2,
72
+ temporal_patch_size=2,
73
+ tokens_per_second=4,
74
+ window_size=112,
75
+ out_hidden_size=3584,
76
+ fullatt_block_indexes=None,
77
+ initializer_range=0.02,
78
+ **kwargs,
79
+ ):
80
+ super().__init__(**kwargs)
81
+
82
+ if fullatt_block_indexes is None:
83
+ fullatt_block_indexes = [7, 15, 23, 31]
84
+
85
+ self.depth = depth
86
+ self.hidden_size = hidden_size
87
+ self.hidden_act = hidden_act
88
+ self.intermediate_size = intermediate_size
89
+ self.num_heads = num_heads
90
+ self.in_channels = in_channels
91
+ self.patch_size = patch_size
92
+ self.spatial_merge_size = spatial_merge_size
93
+ self.temporal_patch_size = temporal_patch_size
94
+ self.tokens_per_second = tokens_per_second
95
+ self.window_size = window_size
96
+ self.fullatt_block_indexes = fullatt_block_indexes
97
+ self.out_hidden_size = out_hidden_size
98
+ self.initializer_range = initializer_range
99
+
100
+
101
+ class InfiniteVLTextConfig(PretrainedConfig):
102
+ r"""
103
+ This is the configuration class to store the configuration of a [`InfiniteVLTextModel`]. It is used to instantiate an
104
+ InfiniteVL model according to the specified arguments, defining the model architecture.
105
+
106
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
107
+ documentation from [`PretrainedConfig`] for more information.
108
+
109
+ Args:
110
+ vocab_size (`int`, *optional*, defaults to 152064):
111
+ Vocabulary size of the InfiniteVL model. Defines the number of different tokens that can be represented by the
112
+ `inputs_ids` passed when calling [`InfiniteVLModel`]
113
+ hidden_size (`int`, *optional*, defaults to 8192):
114
+ Dimension of the hidden representations.
115
+ intermediate_size (`int`, *optional*, defaults to 29568):
116
+ Dimension of the MLP representations.
117
+ num_hidden_layers (`int`, *optional*, defaults to 80):
118
+ Number of hidden layers in the Transformer encoder.
119
+ num_attention_heads (`int`, *optional*, defaults to 64):
120
+ Number of attention heads for each attention layer in the Transformer encoder.
121
+ num_key_value_heads (`int`, *optional*, defaults to 8):
122
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
123
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
124
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used.
125
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
126
+ The non-linear activation function (function or string) in the decoder.
127
+ max_position_embeddings (`int`, *optional*, defaults to 32768):
128
+ The maximum sequence length that this model might ever be used with.
129
+ initializer_range (`float`, *optional*, defaults to 0.02):
130
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
131
+ rms_norm_eps (`float`, *optional*, defaults to 1e-05):
132
+ The epsilon used by the rms normalization layers.
133
+ use_cache (`bool`, *optional*, defaults to `True`):
134
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
135
+ relevant if `config.is_decoder=True`.
136
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
137
+ Whether the model's input and output word embeddings should be tied.
138
+ rope_theta (`float`, *optional*, defaults to 1000000.0):
139
+ The base period of the RoPE embeddings.
140
+ use_sliding_window (`bool`, *optional*, defaults to `False`):
141
+ Whether to use sliding window attention.
142
+ sliding_window (`int`, *optional*, defaults to 32768):
143
+ Sliding window attention (SWA) window size.
144
+ max_window_layers (`int`, *optional*, defaults to 80):
145
+ The number of layers using full attention. The first `max_window_layers` layers will use full attention, while any
146
+ additional layer afterwards will use SWA (Sliding Window Attention).
147
+ layer_types (`list`, *optional*):
148
+ Attention pattern for each layer.
149
+ attention_dropout (`float`, *optional*, defaults to 0.0):
150
+ The dropout ratio for the attention probabilities.
151
+ rope_scaling (`Dict`, *optional*):
152
+ Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
153
+ and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
154
+ accordingly.
155
+ expand_v (`float`, *optional*, defaults to 2):
156
+ Expansion factor for the value dimension in the linear attention/DeltaNet layer.
157
+ mode (`str`, *optional*, defaults to `"chunk"`):
158
+ Execution mode for the linear attention layer (e.g., "chunk" or "fused_recurrent").
159
+ use_gate (`bool`, *optional*, defaults to `True`):
160
+ Whether to use the gating mechanism in the DeltaNet layer.
161
+ use_short_conv (`bool`, *optional*, defaults to `True`):
162
+ Whether to use short convolution in the linear attention layer.
163
+ conv_size (`int`, *optional*, defaults to 4):
164
+ Kernel size for the short convolution.
165
+ conv_bias (`bool`, *optional*, defaults to `False`):
166
+ Whether to use bias in the short convolution.
167
+ num_linear_key_value_heads (`int`, *optional*, defaults to 16):
168
+ Number of key/value heads used in the linear attention layers.
169
+ num_linear_heads (`int`, *optional*, defaults to 16):
170
+ Number of query heads used in the linear attention layers.
171
+ linear_head_dim (`int`, *optional*, defaults to 128):
172
+ Dimension of each head in the linear attention layers.
173
+ norm_eps (`float`, *optional*, defaults to 1e-5):
174
+ Epsilon value for normalization layers in the linear attention branch.
175
+
176
+ ```python
177
+ >>> from transformers import InfiniteVLTextModel, InfiniteVLConfig
178
+
179
+ >>> # Initializing an InfiniteVL style configuration
180
+ >>> configuration = InfiniteVLConfig()
181
+
182
+ >>> # Initializing a model from the InfiniteVL style configuration
183
+ >>> model = InfiniteVLTextModel(configuration.text_config)
184
+
185
+ >>> # Accessing the model configuration
186
+ >>> configuration = model.config
187
+ ```"""
188
+
189
+ model_type = "infinite_vl_text"
190
+ base_config_key = "text_config"
191
+ keys_to_ignore_at_inference = ["past_key_values"]
192
+ # Default tensor parallel plan for base model `InfiniteVL`
193
+ base_model_tp_plan = {
194
+ "layers.*.self_attn.q_proj": "colwise",
195
+ "layers.*.self_attn.k_proj": "colwise",
196
+ "layers.*.self_attn.v_proj": "colwise",
197
+ "layers.*.self_attn.o_proj": "rowwise",
198
+ "layers.*.mlp.gate_proj": "colwise",
199
+ "layers.*.mlp.up_proj": "colwise",
200
+ "layers.*.mlp.down_proj": "rowwise",
201
+ }
202
+ base_model_pp_plan = {
203
+ "embed_tokens": (["input_ids"], ["inputs_embeds"]),
204
+ "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
205
+ "norm": (["hidden_states"], ["hidden_states"]),
206
+ }
207
+
208
+ def __init__(
209
+ self,
210
+ vocab_size=152064,
211
+ hidden_size=8192,
212
+ intermediate_size=29568,
213
+ num_hidden_layers=80,
214
+ num_attention_heads=64,
215
+ num_key_value_heads=8,
216
+ head_dim=128,
217
+ hidden_act="silu",
218
+ max_position_embeddings=32768,
219
+ initializer_range=0.02,
220
+ rms_norm_eps=1e-05,
221
+ norm_eps=1e-5,
222
+ use_cache=True,
223
+ tie_word_embeddings=False,
224
+ rope_theta=1000000.0,
225
+ use_sliding_window=False,
226
+ sliding_window=32768,
227
+ max_window_layers=80,
228
+ layer_types=None,
229
+ attention_dropout=0.0,
230
+ rope_scaling=None,
231
+ expand_v: float = 2,
232
+ mode: str = "chunk",
233
+ use_gate: bool = True,
234
+ use_short_conv: bool = True,
235
+ conv_size: int = 4,
236
+ conv_bias: bool = False,
237
+ num_linear_key_value_heads: int = 16,
238
+ num_linear_heads: int = 16,
239
+ linear_head_dim: int = 128,
240
+ **kwargs,
241
+ ):
242
+ self.vocab_size = vocab_size
243
+ self.max_position_embeddings = max_position_embeddings
244
+ self.hidden_size = hidden_size
245
+ self.intermediate_size = intermediate_size
246
+ self.num_hidden_layers = num_hidden_layers
247
+ self.num_attention_heads = num_attention_heads
248
+ self.head_dim = head_dim
249
+ self.use_sliding_window = use_sliding_window
250
+ self.sliding_window = sliding_window if self.use_sliding_window else None
251
+ self.max_window_layers = max_window_layers
252
+
253
+ # for backward compatibility
254
+ if num_key_value_heads is None:
255
+ num_key_value_heads = num_attention_heads
256
+
257
+ self.num_key_value_heads = num_key_value_heads
258
+ self.hidden_act = hidden_act
259
+ self.initializer_range = initializer_range
260
+ self.rms_norm_eps = rms_norm_eps
261
+ self.use_cache = use_cache
262
+ self.rope_theta = rope_theta
263
+ self.attention_dropout = attention_dropout
264
+ self.rope_scaling = rope_scaling
265
+
266
+ # DeltaNet / linear branch
267
+ self.expand_v = expand_v
268
+ self.mode = mode
269
+ self.use_gate = use_gate
270
+ self.use_short_conv = use_short_conv
271
+ self.conv_size = conv_size
272
+ self.conv_bias = conv_bias
273
+ self.num_linear_key_value_heads = num_linear_key_value_heads
274
+ self.num_linear_heads = num_linear_heads
275
+ self.linear_head_dim = linear_head_dim
276
+ self.norm_eps = norm_eps
277
+
278
+ self.layer_types = layer_types
279
+ if self.layer_types is None:
280
+ # Default: one sliding_attention layer followed by three linear_attention layers (period = 4)
281
+ self.layer_types = [
282
+ "linear_attention" if bool(i % 4) else "sliding_attention"
283
+ for i in range(self.num_hidden_layers)
284
+ ]
285
+
286
+ layer_type_validation(self.layer_types, self.num_hidden_layers)
287
+
288
+ # Validate the correctness of rotary position embeddings parameters
289
+ # Backward Compatibility: if there is a 'type' field, move it to 'rope_type'.
290
+ # Also change type from 'mrope' to 'default' because `mrope` uses default RoPE calculations in this architecture.
291
+ if self.rope_scaling is not None and "type" in self.rope_scaling:
292
+ if self.rope_scaling["type"] == "mrope":
293
+ self.rope_scaling["type"] = "default"
294
+ self.rope_scaling["rope_type"] = self.rope_scaling["type"]
295
+
296
+ rope_config_validation(self, ignore_keys={"mrope_section"})
297
+ super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
298
+
299
+
300
+ class InfiniteVLConfig(PretrainedConfig):
301
+ r"""
302
+ This is the configuration class to store the configuration of a [`InfiniteVLModel`]. It is used to instantiate an
303
+ InfiniteVL model according to the specified arguments, defining the model architecture.
304
+
305
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
306
+ documentation from [`PretrainedConfig`] for more information.
307
+
308
+ Args:
309
+ text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `InfiniteVLTextConfig`):
310
+ The config object or dictionary of the text backbone.
311
+ vision_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `InfiniteVLVisionConfig`):
312
+ The config object or dictionary of the vision backbone.
313
+ image_token_id (`int`, *optional*, defaults to 151655):
314
+ The image token index to encode the image prompt.
315
+ video_token_id (`int`, *optional*, defaults to 151656):
316
+ The video token index to encode the video prompt.
317
+ vision_start_token_id (`int`, *optional*, defaults to 151652):
318
+ The token index to denote start of vision input.
319
+ vision_end_token_id (`int`, *optional*, defaults to 151653):
320
+ The token index to denote end of vision input.
321
+
322
+ ```python
323
+ >>> from transformers import InfiniteVLQwen2_5_VLForConditionalGeneration, InfiniteVLConfig
324
+
325
+ >>> # Initializing an InfiniteVL style configuration
326
+ >>> configuration = InfiniteVLConfig()
327
+
328
+ >>> # Initializing a model from the InfiniteVL style configuration
329
+ >>> model = InfiniteVLQwen2_5_VLForConditionalGeneration(configuration)
330
+
331
+ >>> # Accessing the model configuration
332
+ >>> configuration = model.config
333
+ ```"""
334
+
335
+ model_type = "infinite_vl"
336
+ sub_configs = {"vision_config": InfiniteVLVisionConfig, "text_config": InfiniteVLTextConfig}
337
+ keys_to_ignore_at_inference = ["past_key_values"]
338
+
339
+ def __init__(
340
+ self,
341
+ text_config=None,
342
+ vision_config=None,
343
+ image_token_id=151655,
344
+ video_token_id=151656,
345
+ vision_start_token_id=151652,
346
+ vision_end_token_id=151653,
347
+ **kwargs,
348
+ ):
349
+ # We need to init super() here so that it does not reset values
350
+ # that are in text config to the BaseClass defaults. The Base
351
+ # config has many text related defaults and not all defaults are same as for `InfiniteVLTextConfig`
352
+ super().__init__(**kwargs)
353
+
354
+ if isinstance(vision_config, dict):
355
+ self.vision_config = self.sub_configs["vision_config"](**vision_config)
356
+ elif vision_config is None:
357
+ self.vision_config = self.sub_configs["vision_config"]()
358
+
359
+ if isinstance(text_config, dict):
360
+ self.text_config = self.sub_configs["text_config"](**text_config)
361
+ elif text_config is None:
362
+ # For BC use all kwargs to init `TextConfig`
363
+ self.text_config = self.sub_configs["text_config"](**kwargs)
364
+
365
+ self.image_token_id = image_token_id
366
+ self.video_token_id = video_token_id
367
+ self.vision_start_token_id = vision_start_token_id
368
+ self.vision_end_token_id = vision_end_token_id
369
+
370
+ # Attention implementation to use. It sets it recursively on sub-configs so we call it again in the end
371
+ self._attn_implementation = kwargs.pop("attn_implementation", None)
372
+
373
+ def __setattr__(self, key, value):
374
+ if (
375
+ (text_config := super().__getattribute__("__dict__").get("text_config")) is not None
376
+ and key not in ["dtype", "_attn_implementation_internal"]
377
+ and key in text_config.__dict__
378
+ ):
379
+ setattr(text_config, key, value)
380
+ else:
381
+ super().__setattr__(key, value)
382
+
383
+ def __getattribute__(self, key):
384
+ if "text_config" in super().__getattribute__("__dict__") and key not in [
385
+ "dtype",
386
+ "_attn_implementation_internal",
387
+ ]:
388
+ text_config = super().__getattribute__("text_config")
389
+ if key in text_config.__dict__:
390
+ return getattr(text_config, key)
391
+
392
+ return super().__getattribute__(key)
393
+
394
+
395
+ __all__ = ["InfiniteVLConfig", "InfiniteVLTextConfig", "InfiniteVLVisionConfig"]
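When `layer_types` is not given, the text config above interleaves attention types with a period of four: one `sliding_attention` layer followed by three `linear_attention` (DeltaNet-style) layers. A minimal sketch of the resulting pattern for the 36-layer checkpoint in config.json, mirroring the list comprehension in `__init__` in plain Python:

```python
num_hidden_layers = 36  # from config.json

layer_types = [
    "linear_attention" if bool(i % 4) else "sliding_attention"
    for i in range(num_hidden_layers)
]

print(layer_types[:8])
# ['sliding_attention', 'linear_attention', 'linear_attention', 'linear_attention',
#  'sliding_attention', 'linear_attention', 'linear_attention', 'linear_attention']
print(layer_types.count("sliding_attention"))  # 9 softmax-attention layers (indices 0, 4, ..., 32)
```

This matches the weight map in model.safetensors.index.json, where layers 0, 4, 8, ... carry the usual q/k/v projection biases while the remaining layers carry the A_log, dt_bias, and conv1d parameters of the linear-attention branch.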
generation_config.json ADDED
@@ -0,0 +1,12 @@
+ {
+ "bos_token_id": 151643,
+ "do_sample": true,
+ "eos_token_id": [
+ 151645,
+ 151643
+ ],
+ "pad_token_id": 151643,
+ "repetition_penalty": 1.05,
+ "temperature": 1e-06,
+ "transformers_version": "4.57.0"
+ }
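With `do_sample: true` but a temperature of 1e-06, decoding is effectively greedy; the 1.05 repetition penalty is applied on top, and both `<|im_end|>` (151645) and `<|endoftext|>` (151643) terminate generation. A minimal sketch of overriding these defaults at call time — `model` and `inputs` are assumed to come from a loading step like the one sketched after config.json:

```python
# Hypothetical inputs; in practice these come from the processor.
output_ids = model.generate(
    **inputs,
    max_new_tokens=256,
    do_sample=False,             # equivalent in practice to temperature ~= 1e-6
    repetition_penalty=1.05,
    eos_token_id=[151645, 151643],
)
```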
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors.index.json ADDED
@@ -0,0 +1,994 @@
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 4408381536,
4
+ "total_size": 8816763072
5
+ },
6
+ "weight_map": {
7
+ "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
13
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
14
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
15
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
16
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
17
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
18
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
19
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
20
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
21
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
22
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
23
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
24
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
25
+ "model.layers.1.self_attn.A_log": "model-00001-of-00002.safetensors",
26
+ "model.layers.1.self_attn.a_proj.weight": "model-00001-of-00002.safetensors",
27
+ "model.layers.1.self_attn.b_proj.weight": "model-00001-of-00002.safetensors",
28
+ "model.layers.1.self_attn.dt_bias": "model-00001-of-00002.safetensors",
29
+ "model.layers.1.self_attn.g_proj.weight": "model-00001-of-00002.safetensors",
30
+ "model.layers.1.self_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
31
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
32
+ "model.layers.1.self_attn.o_norm.weight": "model-00001-of-00002.safetensors",
33
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
34
+ "model.layers.1.self_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
35
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
36
+ "model.layers.1.self_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
37
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
38
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
39
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
40
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
41
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
42
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
43
+ "model.layers.10.self_attn.A_log": "model-00001-of-00002.safetensors",
44
+ "model.layers.10.self_attn.a_proj.weight": "model-00001-of-00002.safetensors",
45
+ "model.layers.10.self_attn.b_proj.weight": "model-00001-of-00002.safetensors",
46
+ "model.layers.10.self_attn.dt_bias": "model-00001-of-00002.safetensors",
47
+ "model.layers.10.self_attn.g_proj.weight": "model-00001-of-00002.safetensors",
48
+ "model.layers.10.self_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
49
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
50
+ "model.layers.10.self_attn.o_norm.weight": "model-00001-of-00002.safetensors",
51
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
52
+ "model.layers.10.self_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
53
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
54
+ "model.layers.10.self_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
55
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
56
+ "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
57
+ "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
58
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
59
+ "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
60
+ "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
61
+ "model.layers.11.self_attn.A_log": "model-00001-of-00002.safetensors",
62
+ "model.layers.11.self_attn.a_proj.weight": "model-00001-of-00002.safetensors",
63
+ "model.layers.11.self_attn.b_proj.weight": "model-00001-of-00002.safetensors",
64
+ "model.layers.11.self_attn.dt_bias": "model-00001-of-00002.safetensors",
65
+ "model.layers.11.self_attn.g_proj.weight": "model-00001-of-00002.safetensors",
66
+ "model.layers.11.self_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
67
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
68
+ "model.layers.11.self_attn.o_norm.weight": "model-00001-of-00002.safetensors",
69
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
70
+ "model.layers.11.self_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
71
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
72
+ "model.layers.11.self_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
73
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
74
+ "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
75
+ "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
76
+ "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
77
+ "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
78
+ "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
79
+ "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
80
+ "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
81
+ "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
82
+ "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
83
+ "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
84
+ "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
85
+ "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
86
+ "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
87
+ "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
88
+ "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
89
+ "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
90
+ "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
91
+ "model.layers.13.self_attn.A_log": "model-00001-of-00002.safetensors",
92
+ "model.layers.13.self_attn.a_proj.weight": "model-00001-of-00002.safetensors",
93
+ "model.layers.13.self_attn.b_proj.weight": "model-00001-of-00002.safetensors",
94
+ "model.layers.13.self_attn.dt_bias": "model-00001-of-00002.safetensors",
95
+ "model.layers.13.self_attn.g_proj.weight": "model-00001-of-00002.safetensors",
96
+ "model.layers.13.self_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
97
+ "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
98
+ "model.layers.13.self_attn.o_norm.weight": "model-00001-of-00002.safetensors",
99
+ "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
100
+ "model.layers.13.self_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
101
+ "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
102
+ "model.layers.13.self_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
103
+ "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
104
+ "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
105
+ "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
106
+ "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
107
+ "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
108
+ "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
109
+ "model.layers.14.self_attn.A_log": "model-00001-of-00002.safetensors",
110
+ "model.layers.14.self_attn.a_proj.weight": "model-00001-of-00002.safetensors",
111
+ "model.layers.14.self_attn.b_proj.weight": "model-00001-of-00002.safetensors",
112
+ "model.layers.14.self_attn.dt_bias": "model-00001-of-00002.safetensors",
113
+ "model.layers.14.self_attn.g_proj.weight": "model-00001-of-00002.safetensors",
114
+ "model.layers.14.self_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
115
+ "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
116
+ "model.layers.14.self_attn.o_norm.weight": "model-00001-of-00002.safetensors",
117
+ "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
118
+ "model.layers.14.self_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
119
+ "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
120
+ "model.layers.14.self_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
121
+ "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
122
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00002.safetensors",
123
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
124
+ "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
125
+ "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
126
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
127
+ "model.layers.15.self_attn.A_log": "model-00001-of-00002.safetensors",
128
+ "model.layers.15.self_attn.a_proj.weight": "model-00001-of-00002.safetensors",
129
+ "model.layers.15.self_attn.b_proj.weight": "model-00001-of-00002.safetensors",
130
+ "model.layers.15.self_attn.dt_bias": "model-00001-of-00002.safetensors",
131
+ "model.layers.15.self_attn.g_proj.weight": "model-00001-of-00002.safetensors",
132
+ "model.layers.15.self_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
133
+ "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
134
+ "model.layers.15.self_attn.o_norm.weight": "model-00001-of-00002.safetensors",
135
+ "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
136
+ "model.layers.15.self_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
137
+ "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
138
+ "model.layers.15.self_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
139
+ "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
140
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00002.safetensors",
141
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
142
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
143
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
144
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
145
+ "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
146
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
147
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
148
+ "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
149
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
150
+ "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
151
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
152
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00002.safetensors",
153
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
154
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
155
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
156
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
157
+ "model.layers.17.self_attn.A_log": "model-00002-of-00002.safetensors",
158
+ "model.layers.17.self_attn.a_proj.weight": "model-00002-of-00002.safetensors",
159
+ "model.layers.17.self_attn.b_proj.weight": "model-00002-of-00002.safetensors",
160
+ "model.layers.17.self_attn.dt_bias": "model-00002-of-00002.safetensors",
161
+ "model.layers.17.self_attn.g_proj.weight": "model-00002-of-00002.safetensors",
162
+ "model.layers.17.self_attn.k_conv1d.weight": "model-00002-of-00002.safetensors",
163
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
164
+ "model.layers.17.self_attn.o_norm.weight": "model-00002-of-00002.safetensors",
165
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
166
+ "model.layers.17.self_attn.q_conv1d.weight": "model-00002-of-00002.safetensors",
167
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
168
+ "model.layers.17.self_attn.v_conv1d.weight": "model-00002-of-00002.safetensors",
169
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
170
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00002.safetensors",
171
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
172
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
173
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
174
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
175
+ "model.layers.18.self_attn.A_log": "model-00002-of-00002.safetensors",
176
+ "model.layers.18.self_attn.a_proj.weight": "model-00002-of-00002.safetensors",
177
+ "model.layers.18.self_attn.b_proj.weight": "model-00002-of-00002.safetensors",
178
+ "model.layers.18.self_attn.dt_bias": "model-00002-of-00002.safetensors",
179
+ "model.layers.18.self_attn.g_proj.weight": "model-00002-of-00002.safetensors",
180
+ "model.layers.18.self_attn.k_conv1d.weight": "model-00002-of-00002.safetensors",
181
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
182
+ "model.layers.18.self_attn.o_norm.weight": "model-00002-of-00002.safetensors",
183
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
184
+ "model.layers.18.self_attn.q_conv1d.weight": "model-00002-of-00002.safetensors",
185
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
186
+ "model.layers.18.self_attn.v_conv1d.weight": "model-00002-of-00002.safetensors",
187
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
188
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00002.safetensors",
189
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
190
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
191
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
192
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
193
+ "model.layers.19.self_attn.A_log": "model-00002-of-00002.safetensors",
194
+ "model.layers.19.self_attn.a_proj.weight": "model-00002-of-00002.safetensors",
195
+ "model.layers.19.self_attn.b_proj.weight": "model-00002-of-00002.safetensors",
196
+ "model.layers.19.self_attn.dt_bias": "model-00002-of-00002.safetensors",
197
+ "model.layers.19.self_attn.g_proj.weight": "model-00002-of-00002.safetensors",
198
+ "model.layers.19.self_attn.k_conv1d.weight": "model-00002-of-00002.safetensors",
199
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
200
+ "model.layers.19.self_attn.o_norm.weight": "model-00002-of-00002.safetensors",
201
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
202
+ "model.layers.19.self_attn.q_conv1d.weight": "model-00002-of-00002.safetensors",
203
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
204
+ "model.layers.19.self_attn.v_conv1d.weight": "model-00002-of-00002.safetensors",
205
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
206
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
207
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
208
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
209
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
210
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
211
+ "model.layers.2.self_attn.A_log": "model-00001-of-00002.safetensors",
212
+ "model.layers.2.self_attn.a_proj.weight": "model-00001-of-00002.safetensors",
213
+ "model.layers.2.self_attn.b_proj.weight": "model-00001-of-00002.safetensors",
214
+ "model.layers.2.self_attn.dt_bias": "model-00001-of-00002.safetensors",
215
+ "model.layers.2.self_attn.g_proj.weight": "model-00001-of-00002.safetensors",
216
+ "model.layers.2.self_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
217
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
218
+ "model.layers.2.self_attn.o_norm.weight": "model-00001-of-00002.safetensors",
219
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
220
+ "model.layers.2.self_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
221
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
222
+ "model.layers.2.self_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
223
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
224
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
225
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
226
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
227
+ "model.layers.20.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
228
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
229
+ "model.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
230
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
231
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
232
+ "model.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
233
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
234
+ "model.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
235
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
236
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
237
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
238
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
239
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
240
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
241
+ "model.layers.21.self_attn.A_log": "model-00002-of-00002.safetensors",
242
+ "model.layers.21.self_attn.a_proj.weight": "model-00002-of-00002.safetensors",
243
+ "model.layers.21.self_attn.b_proj.weight": "model-00002-of-00002.safetensors",
244
+ "model.layers.21.self_attn.dt_bias": "model-00002-of-00002.safetensors",
245
+ "model.layers.21.self_attn.g_proj.weight": "model-00002-of-00002.safetensors",
246
+ "model.layers.21.self_attn.k_conv1d.weight": "model-00002-of-00002.safetensors",
247
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
248
+ "model.layers.21.self_attn.o_norm.weight": "model-00002-of-00002.safetensors",
249
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
250
+ "model.layers.21.self_attn.q_conv1d.weight": "model-00002-of-00002.safetensors",
251
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
252
+ "model.layers.21.self_attn.v_conv1d.weight": "model-00002-of-00002.safetensors",
253
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
254
+ "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
255
+ "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
256
+ "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
257
+ "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
258
+ "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
259
+ "model.layers.22.self_attn.A_log": "model-00002-of-00002.safetensors",
260
+ "model.layers.22.self_attn.a_proj.weight": "model-00002-of-00002.safetensors",
261
+ "model.layers.22.self_attn.b_proj.weight": "model-00002-of-00002.safetensors",
262
+ "model.layers.22.self_attn.dt_bias": "model-00002-of-00002.safetensors",
263
+ "model.layers.22.self_attn.g_proj.weight": "model-00002-of-00002.safetensors",
264
+ "model.layers.22.self_attn.k_conv1d.weight": "model-00002-of-00002.safetensors",
265
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
266
+ "model.layers.22.self_attn.o_norm.weight": "model-00002-of-00002.safetensors",
267
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
268
+ "model.layers.22.self_attn.q_conv1d.weight": "model-00002-of-00002.safetensors",
269
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
270
+ "model.layers.22.self_attn.v_conv1d.weight": "model-00002-of-00002.safetensors",
271
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
272
+ "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
273
+ "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
274
+ "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
275
+ "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
276
+ "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
277
+ "model.layers.23.self_attn.A_log": "model-00002-of-00002.safetensors",
278
+ "model.layers.23.self_attn.a_proj.weight": "model-00002-of-00002.safetensors",
279
+ "model.layers.23.self_attn.b_proj.weight": "model-00002-of-00002.safetensors",
280
+ "model.layers.23.self_attn.dt_bias": "model-00002-of-00002.safetensors",
281
+ "model.layers.23.self_attn.g_proj.weight": "model-00002-of-00002.safetensors",
282
+ "model.layers.23.self_attn.k_conv1d.weight": "model-00002-of-00002.safetensors",
283
+ "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
284
+ "model.layers.23.self_attn.o_norm.weight": "model-00002-of-00002.safetensors",
285
+ "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
286
+ "model.layers.23.self_attn.q_conv1d.weight": "model-00002-of-00002.safetensors",
287
+ "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
288
+ "model.layers.23.self_attn.v_conv1d.weight": "model-00002-of-00002.safetensors",
289
+ "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
290
+ "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
291
+ "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
292
+ "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
293
+ "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
294
+ "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
295
+ "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
296
+ "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
297
+ "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
298
+ "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
299
+ "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
300
+ "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
301
+ "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
302
+ "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
303
+ "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
304
+ "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
305
+ "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
306
+ "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
307
+ "model.layers.25.self_attn.A_log": "model-00002-of-00002.safetensors",
308
+ "model.layers.25.self_attn.a_proj.weight": "model-00002-of-00002.safetensors",
309
+ "model.layers.25.self_attn.b_proj.weight": "model-00002-of-00002.safetensors",
310
+ "model.layers.25.self_attn.dt_bias": "model-00002-of-00002.safetensors",
311
+ "model.layers.25.self_attn.g_proj.weight": "model-00002-of-00002.safetensors",
312
+ "model.layers.25.self_attn.k_conv1d.weight": "model-00002-of-00002.safetensors",
313
+ "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
314
+ "model.layers.25.self_attn.o_norm.weight": "model-00002-of-00002.safetensors",
315
+ "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
316
+ "model.layers.25.self_attn.q_conv1d.weight": "model-00002-of-00002.safetensors",
317
+ "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
318
+ "model.layers.25.self_attn.v_conv1d.weight": "model-00002-of-00002.safetensors",
319
+ "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
320
+ "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
321
+ "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
322
+ "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
323
+ "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
324
+ "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
325
+ "model.layers.26.self_attn.A_log": "model-00002-of-00002.safetensors",
326
+ "model.layers.26.self_attn.a_proj.weight": "model-00002-of-00002.safetensors",
327
+ "model.layers.26.self_attn.b_proj.weight": "model-00002-of-00002.safetensors",
328
+ "model.layers.26.self_attn.dt_bias": "model-00002-of-00002.safetensors",
329
+ "model.layers.26.self_attn.g_proj.weight": "model-00002-of-00002.safetensors",
330
+ "model.layers.26.self_attn.k_conv1d.weight": "model-00002-of-00002.safetensors",
331
+ "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
332
+ "model.layers.26.self_attn.o_norm.weight": "model-00002-of-00002.safetensors",
333
+ "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
334
+ "model.layers.26.self_attn.q_conv1d.weight": "model-00002-of-00002.safetensors",
335
+ "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
336
+ "model.layers.26.self_attn.v_conv1d.weight": "model-00002-of-00002.safetensors",
337
+ "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
338
+ "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
339
+ "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
340
+ "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
341
+ "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
342
+ "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
343
+ "model.layers.27.self_attn.A_log": "model-00002-of-00002.safetensors",
344
+ "model.layers.27.self_attn.a_proj.weight": "model-00002-of-00002.safetensors",
345
+ "model.layers.27.self_attn.b_proj.weight": "model-00002-of-00002.safetensors",
346
+ "model.layers.27.self_attn.dt_bias": "model-00002-of-00002.safetensors",
347
+ "model.layers.27.self_attn.g_proj.weight": "model-00002-of-00002.safetensors",
348
+ "model.layers.27.self_attn.k_conv1d.weight": "model-00002-of-00002.safetensors",
349
+ "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
350
+ "model.layers.27.self_attn.o_norm.weight": "model-00002-of-00002.safetensors",
351
+ "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
352
+ "model.layers.27.self_attn.q_conv1d.weight": "model-00002-of-00002.safetensors",
353
+ "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
354
+ "model.layers.27.self_attn.v_conv1d.weight": "model-00002-of-00002.safetensors",
355
+ "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
356
+ "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
357
+ "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
358
+ "model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
359
+ "model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
360
+ "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
361
+ "model.layers.28.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
362
+ "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
363
+ "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
364
+ "model.layers.28.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
365
+ "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
366
+ "model.layers.28.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
367
+ "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
368
+ "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
369
+ "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
370
+ "model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
371
+ "model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
372
+ "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
373
+ "model.layers.29.self_attn.A_log": "model-00002-of-00002.safetensors",
374
+ "model.layers.29.self_attn.a_proj.weight": "model-00002-of-00002.safetensors",
375
+ "model.layers.29.self_attn.b_proj.weight": "model-00002-of-00002.safetensors",
376
+ "model.layers.29.self_attn.dt_bias": "model-00002-of-00002.safetensors",
377
+ "model.layers.29.self_attn.g_proj.weight": "model-00002-of-00002.safetensors",
378
+ "model.layers.29.self_attn.k_conv1d.weight": "model-00002-of-00002.safetensors",
379
+ "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
380
+ "model.layers.29.self_attn.o_norm.weight": "model-00002-of-00002.safetensors",
381
+ "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
382
+ "model.layers.29.self_attn.q_conv1d.weight": "model-00002-of-00002.safetensors",
383
+ "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
384
+ "model.layers.29.self_attn.v_conv1d.weight": "model-00002-of-00002.safetensors",
385
+ "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
386
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
387
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
388
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
389
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
390
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
391
+ "model.layers.3.self_attn.A_log": "model-00001-of-00002.safetensors",
392
+ "model.layers.3.self_attn.a_proj.weight": "model-00001-of-00002.safetensors",
393
+ "model.layers.3.self_attn.b_proj.weight": "model-00001-of-00002.safetensors",
394
+ "model.layers.3.self_attn.dt_bias": "model-00001-of-00002.safetensors",
395
+ "model.layers.3.self_attn.g_proj.weight": "model-00001-of-00002.safetensors",
396
+ "model.layers.3.self_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
397
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
398
+ "model.layers.3.self_attn.o_norm.weight": "model-00001-of-00002.safetensors",
399
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
400
+ "model.layers.3.self_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
401
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
402
+ "model.layers.3.self_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
403
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
404
+ "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
405
+ "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
406
+ "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
407
+ "model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
408
+ "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
409
+ "model.layers.30.self_attn.A_log": "model-00002-of-00002.safetensors",
410
+ "model.layers.30.self_attn.a_proj.weight": "model-00002-of-00002.safetensors",
411
+ "model.layers.30.self_attn.b_proj.weight": "model-00002-of-00002.safetensors",
412
+ "model.layers.30.self_attn.dt_bias": "model-00002-of-00002.safetensors",
413
+ "model.layers.30.self_attn.g_proj.weight": "model-00002-of-00002.safetensors",
414
+ "model.layers.30.self_attn.k_conv1d.weight": "model-00002-of-00002.safetensors",
415
+ "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
416
+ "model.layers.30.self_attn.o_norm.weight": "model-00002-of-00002.safetensors",
417
+ "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
418
+ "model.layers.30.self_attn.q_conv1d.weight": "model-00002-of-00002.safetensors",
419
+ "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
420
+ "model.layers.30.self_attn.v_conv1d.weight": "model-00002-of-00002.safetensors",
421
+ "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
422
+ "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
423
+ "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
424
+ "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
425
+ "model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
426
+ "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
427
+ "model.layers.31.self_attn.A_log": "model-00002-of-00002.safetensors",
428
+ "model.layers.31.self_attn.a_proj.weight": "model-00002-of-00002.safetensors",
429
+ "model.layers.31.self_attn.b_proj.weight": "model-00002-of-00002.safetensors",
430
+ "model.layers.31.self_attn.dt_bias": "model-00002-of-00002.safetensors",
431
+ "model.layers.31.self_attn.g_proj.weight": "model-00002-of-00002.safetensors",
432
+ "model.layers.31.self_attn.k_conv1d.weight": "model-00002-of-00002.safetensors",
433
+ "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
434
+ "model.layers.31.self_attn.o_norm.weight": "model-00002-of-00002.safetensors",
435
+ "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
436
+ "model.layers.31.self_attn.q_conv1d.weight": "model-00002-of-00002.safetensors",
437
+ "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
438
+ "model.layers.31.self_attn.v_conv1d.weight": "model-00002-of-00002.safetensors",
439
+ "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
440
+ "model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors",
441
+ "model.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
442
+ "model.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
443
+ "model.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
444
+ "model.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
445
+ "model.layers.32.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
446
+ "model.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
447
+ "model.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
448
+ "model.layers.32.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
449
+ "model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
450
+ "model.layers.32.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
451
+ "model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
452
+ "model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors",
453
+ "model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
454
+ "model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
455
+ "model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
456
+ "model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
457
+ "model.layers.33.self_attn.A_log": "model-00002-of-00002.safetensors",
458
+ "model.layers.33.self_attn.a_proj.weight": "model-00002-of-00002.safetensors",
459
+ "model.layers.33.self_attn.b_proj.weight": "model-00002-of-00002.safetensors",
460
+ "model.layers.33.self_attn.dt_bias": "model-00002-of-00002.safetensors",
461
+ "model.layers.33.self_attn.g_proj.weight": "model-00002-of-00002.safetensors",
462
+ "model.layers.33.self_attn.k_conv1d.weight": "model-00002-of-00002.safetensors",
463
+ "model.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
464
+ "model.layers.33.self_attn.o_norm.weight": "model-00002-of-00002.safetensors",
465
+ "model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
466
+ "model.layers.33.self_attn.q_conv1d.weight": "model-00002-of-00002.safetensors",
467
+ "model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
468
+ "model.layers.33.self_attn.v_conv1d.weight": "model-00002-of-00002.safetensors",
469
+ "model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
470
+ "model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors",
471
+ "model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
472
+ "model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
473
+ "model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
474
+ "model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
475
+ "model.layers.34.self_attn.A_log": "model-00002-of-00002.safetensors",
476
+ "model.layers.34.self_attn.a_proj.weight": "model-00002-of-00002.safetensors",
477
+ "model.layers.34.self_attn.b_proj.weight": "model-00002-of-00002.safetensors",
478
+ "model.layers.34.self_attn.dt_bias": "model-00002-of-00002.safetensors",
479
+ "model.layers.34.self_attn.g_proj.weight": "model-00002-of-00002.safetensors",
480
+ "model.layers.34.self_attn.k_conv1d.weight": "model-00002-of-00002.safetensors",
481
+ "model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
482
+ "model.layers.34.self_attn.o_norm.weight": "model-00002-of-00002.safetensors",
483
+ "model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
484
+ "model.layers.34.self_attn.q_conv1d.weight": "model-00002-of-00002.safetensors",
485
+ "model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
486
+ "model.layers.34.self_attn.v_conv1d.weight": "model-00002-of-00002.safetensors",
487
+ "model.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
488
+ "model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
489
+ "model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
490
+ "model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
491
+ "model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
492
+ "model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
493
+ "model.layers.35.self_attn.A_log": "model-00002-of-00002.safetensors",
494
+ "model.layers.35.self_attn.a_proj.weight": "model-00002-of-00002.safetensors",
495
+ "model.layers.35.self_attn.b_proj.weight": "model-00002-of-00002.safetensors",
496
+ "model.layers.35.self_attn.dt_bias": "model-00002-of-00002.safetensors",
497
+ "model.layers.35.self_attn.g_proj.weight": "model-00002-of-00002.safetensors",
498
+ "model.layers.35.self_attn.k_conv1d.weight": "model-00002-of-00002.safetensors",
499
+ "model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
500
+ "model.layers.35.self_attn.o_norm.weight": "model-00002-of-00002.safetensors",
501
+ "model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
502
+ "model.layers.35.self_attn.q_conv1d.weight": "model-00002-of-00002.safetensors",
503
+ "model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
504
+ "model.layers.35.self_attn.v_conv1d.weight": "model-00002-of-00002.safetensors",
505
+ "model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
506
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
507
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
508
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
509
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
510
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
511
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
512
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
513
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
514
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
515
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
516
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
517
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
518
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
519
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
520
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
521
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
522
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
523
+ "model.layers.5.self_attn.A_log": "model-00001-of-00002.safetensors",
524
+ "model.layers.5.self_attn.a_proj.weight": "model-00001-of-00002.safetensors",
525
+ "model.layers.5.self_attn.b_proj.weight": "model-00001-of-00002.safetensors",
526
+ "model.layers.5.self_attn.dt_bias": "model-00001-of-00002.safetensors",
527
+ "model.layers.5.self_attn.g_proj.weight": "model-00001-of-00002.safetensors",
528
+ "model.layers.5.self_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
529
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
530
+ "model.layers.5.self_attn.o_norm.weight": "model-00001-of-00002.safetensors",
531
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
532
+ "model.layers.5.self_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
533
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
534
+ "model.layers.5.self_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
535
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
536
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
537
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
538
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
539
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
540
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
541
+ "model.layers.6.self_attn.A_log": "model-00001-of-00002.safetensors",
542
+ "model.layers.6.self_attn.a_proj.weight": "model-00001-of-00002.safetensors",
543
+ "model.layers.6.self_attn.b_proj.weight": "model-00001-of-00002.safetensors",
544
+ "model.layers.6.self_attn.dt_bias": "model-00001-of-00002.safetensors",
545
+ "model.layers.6.self_attn.g_proj.weight": "model-00001-of-00002.safetensors",
546
+ "model.layers.6.self_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
547
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
548
+ "model.layers.6.self_attn.o_norm.weight": "model-00001-of-00002.safetensors",
549
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
550
+ "model.layers.6.self_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
551
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
552
+ "model.layers.6.self_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
553
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
554
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
555
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
556
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
557
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
558
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
559
+ "model.layers.7.self_attn.A_log": "model-00001-of-00002.safetensors",
560
+ "model.layers.7.self_attn.a_proj.weight": "model-00001-of-00002.safetensors",
561
+ "model.layers.7.self_attn.b_proj.weight": "model-00001-of-00002.safetensors",
562
+ "model.layers.7.self_attn.dt_bias": "model-00001-of-00002.safetensors",
563
+ "model.layers.7.self_attn.g_proj.weight": "model-00001-of-00002.safetensors",
564
+ "model.layers.7.self_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
565
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
566
+ "model.layers.7.self_attn.o_norm.weight": "model-00001-of-00002.safetensors",
567
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
568
+ "model.layers.7.self_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
569
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
570
+ "model.layers.7.self_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
571
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
572
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
573
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
574
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
575
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
576
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
577
+ "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
578
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
579
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
580
+ "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
581
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
582
+ "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
583
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
584
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
585
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
586
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
587
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
588
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
589
+ "model.layers.9.self_attn.A_log": "model-00001-of-00002.safetensors",
590
+ "model.layers.9.self_attn.a_proj.weight": "model-00001-of-00002.safetensors",
591
+ "model.layers.9.self_attn.b_proj.weight": "model-00001-of-00002.safetensors",
592
+ "model.layers.9.self_attn.dt_bias": "model-00001-of-00002.safetensors",
593
+ "model.layers.9.self_attn.g_proj.weight": "model-00001-of-00002.safetensors",
594
+ "model.layers.9.self_attn.k_conv1d.weight": "model-00001-of-00002.safetensors",
595
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
596
+ "model.layers.9.self_attn.o_norm.weight": "model-00001-of-00002.safetensors",
597
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
598
+ "model.layers.9.self_attn.q_conv1d.weight": "model-00001-of-00002.safetensors",
599
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
600
+ "model.layers.9.self_attn.v_conv1d.weight": "model-00001-of-00002.safetensors",
601
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
602
+ "model.norm.weight": "model-00002-of-00002.safetensors",
603
+ "visual.blocks.0.attn.proj.bias": "model-00001-of-00002.safetensors",
604
+ "visual.blocks.0.attn.proj.weight": "model-00001-of-00002.safetensors",
605
+ "visual.blocks.0.attn.qkv.bias": "model-00001-of-00002.safetensors",
606
+ "visual.blocks.0.attn.qkv.weight": "model-00001-of-00002.safetensors",
607
+ "visual.blocks.0.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
608
+ "visual.blocks.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
609
+ "visual.blocks.0.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
610
+ "visual.blocks.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
611
+ "visual.blocks.0.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
612
+ "visual.blocks.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
613
+ "visual.blocks.0.norm1.weight": "model-00001-of-00002.safetensors",
614
+ "visual.blocks.0.norm2.weight": "model-00001-of-00002.safetensors",
615
+ "visual.blocks.1.attn.proj.bias": "model-00001-of-00002.safetensors",
616
+ "visual.blocks.1.attn.proj.weight": "model-00001-of-00002.safetensors",
617
+ "visual.blocks.1.attn.qkv.bias": "model-00001-of-00002.safetensors",
618
+ "visual.blocks.1.attn.qkv.weight": "model-00001-of-00002.safetensors",
619
+ "visual.blocks.1.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
620
+ "visual.blocks.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
621
+ "visual.blocks.1.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
622
+ "visual.blocks.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
623
+ "visual.blocks.1.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
624
+ "visual.blocks.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
625
+ "visual.blocks.1.norm1.weight": "model-00001-of-00002.safetensors",
626
+ "visual.blocks.1.norm2.weight": "model-00001-of-00002.safetensors",
627
+ "visual.blocks.10.attn.proj.bias": "model-00001-of-00002.safetensors",
628
+ "visual.blocks.10.attn.proj.weight": "model-00001-of-00002.safetensors",
629
+ "visual.blocks.10.attn.qkv.bias": "model-00001-of-00002.safetensors",
630
+ "visual.blocks.10.attn.qkv.weight": "model-00001-of-00002.safetensors",
631
+ "visual.blocks.10.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
632
+ "visual.blocks.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
633
+ "visual.blocks.10.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
634
+ "visual.blocks.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
635
+ "visual.blocks.10.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
636
+ "visual.blocks.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
637
+ "visual.blocks.10.norm1.weight": "model-00001-of-00002.safetensors",
638
+ "visual.blocks.10.norm2.weight": "model-00001-of-00002.safetensors",
639
+ "visual.blocks.11.attn.proj.bias": "model-00001-of-00002.safetensors",
640
+ "visual.blocks.11.attn.proj.weight": "model-00001-of-00002.safetensors",
641
+ "visual.blocks.11.attn.qkv.bias": "model-00001-of-00002.safetensors",
642
+ "visual.blocks.11.attn.qkv.weight": "model-00001-of-00002.safetensors",
643
+ "visual.blocks.11.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
644
+ "visual.blocks.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
645
+ "visual.blocks.11.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
646
+ "visual.blocks.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
647
+ "visual.blocks.11.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
648
+ "visual.blocks.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
649
+ "visual.blocks.11.norm1.weight": "model-00001-of-00002.safetensors",
650
+ "visual.blocks.11.norm2.weight": "model-00001-of-00002.safetensors",
651
+ "visual.blocks.12.attn.proj.bias": "model-00001-of-00002.safetensors",
652
+ "visual.blocks.12.attn.proj.weight": "model-00001-of-00002.safetensors",
653
+ "visual.blocks.12.attn.qkv.bias": "model-00001-of-00002.safetensors",
654
+ "visual.blocks.12.attn.qkv.weight": "model-00001-of-00002.safetensors",
655
+ "visual.blocks.12.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
656
+ "visual.blocks.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
657
+ "visual.blocks.12.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
658
+ "visual.blocks.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
659
+ "visual.blocks.12.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
660
+ "visual.blocks.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
661
+ "visual.blocks.12.norm1.weight": "model-00001-of-00002.safetensors",
662
+ "visual.blocks.12.norm2.weight": "model-00001-of-00002.safetensors",
663
+ "visual.blocks.13.attn.proj.bias": "model-00001-of-00002.safetensors",
664
+ "visual.blocks.13.attn.proj.weight": "model-00001-of-00002.safetensors",
665
+ "visual.blocks.13.attn.qkv.bias": "model-00001-of-00002.safetensors",
666
+ "visual.blocks.13.attn.qkv.weight": "model-00001-of-00002.safetensors",
667
+ "visual.blocks.13.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
668
+ "visual.blocks.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
669
+ "visual.blocks.13.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
670
+ "visual.blocks.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
671
+ "visual.blocks.13.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
672
+ "visual.blocks.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
673
+ "visual.blocks.13.norm1.weight": "model-00001-of-00002.safetensors",
674
+ "visual.blocks.13.norm2.weight": "model-00001-of-00002.safetensors",
675
+ "visual.blocks.14.attn.proj.bias": "model-00001-of-00002.safetensors",
676
+ "visual.blocks.14.attn.proj.weight": "model-00001-of-00002.safetensors",
677
+ "visual.blocks.14.attn.qkv.bias": "model-00001-of-00002.safetensors",
678
+ "visual.blocks.14.attn.qkv.weight": "model-00001-of-00002.safetensors",
679
+ "visual.blocks.14.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
680
+ "visual.blocks.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
681
+ "visual.blocks.14.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
682
+ "visual.blocks.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
683
+ "visual.blocks.14.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
684
+ "visual.blocks.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
685
+ "visual.blocks.14.norm1.weight": "model-00001-of-00002.safetensors",
686
+ "visual.blocks.14.norm2.weight": "model-00001-of-00002.safetensors",
687
+ "visual.blocks.15.attn.proj.bias": "model-00001-of-00002.safetensors",
688
+ "visual.blocks.15.attn.proj.weight": "model-00001-of-00002.safetensors",
689
+ "visual.blocks.15.attn.qkv.bias": "model-00001-of-00002.safetensors",
690
+ "visual.blocks.15.attn.qkv.weight": "model-00001-of-00002.safetensors",
691
+ "visual.blocks.15.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
692
+ "visual.blocks.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
693
+ "visual.blocks.15.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
694
+ "visual.blocks.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
695
+ "visual.blocks.15.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
696
+ "visual.blocks.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
697
+ "visual.blocks.15.norm1.weight": "model-00001-of-00002.safetensors",
698
+ "visual.blocks.15.norm2.weight": "model-00001-of-00002.safetensors",
699
+ "visual.blocks.16.attn.proj.bias": "model-00001-of-00002.safetensors",
700
+ "visual.blocks.16.attn.proj.weight": "model-00001-of-00002.safetensors",
701
+ "visual.blocks.16.attn.qkv.bias": "model-00001-of-00002.safetensors",
702
+ "visual.blocks.16.attn.qkv.weight": "model-00001-of-00002.safetensors",
703
+ "visual.blocks.16.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
704
+ "visual.blocks.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
705
+ "visual.blocks.16.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
706
+ "visual.blocks.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
707
+ "visual.blocks.16.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
708
+ "visual.blocks.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
709
+ "visual.blocks.16.norm1.weight": "model-00001-of-00002.safetensors",
710
+ "visual.blocks.16.norm2.weight": "model-00001-of-00002.safetensors",
711
+ "visual.blocks.17.attn.proj.bias": "model-00001-of-00002.safetensors",
712
+ "visual.blocks.17.attn.proj.weight": "model-00001-of-00002.safetensors",
713
+ "visual.blocks.17.attn.qkv.bias": "model-00001-of-00002.safetensors",
714
+ "visual.blocks.17.attn.qkv.weight": "model-00001-of-00002.safetensors",
715
+ "visual.blocks.17.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
716
+ "visual.blocks.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
717
+ "visual.blocks.17.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
718
+ "visual.blocks.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
719
+ "visual.blocks.17.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
720
+ "visual.blocks.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
721
+ "visual.blocks.17.norm1.weight": "model-00001-of-00002.safetensors",
722
+ "visual.blocks.17.norm2.weight": "model-00001-of-00002.safetensors",
723
+ "visual.blocks.18.attn.proj.bias": "model-00001-of-00002.safetensors",
724
+ "visual.blocks.18.attn.proj.weight": "model-00001-of-00002.safetensors",
725
+ "visual.blocks.18.attn.qkv.bias": "model-00001-of-00002.safetensors",
726
+ "visual.blocks.18.attn.qkv.weight": "model-00001-of-00002.safetensors",
727
+ "visual.blocks.18.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
728
+ "visual.blocks.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
729
+ "visual.blocks.18.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
730
+ "visual.blocks.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
731
+ "visual.blocks.18.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
732
+ "visual.blocks.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
733
+ "visual.blocks.18.norm1.weight": "model-00001-of-00002.safetensors",
734
+ "visual.blocks.18.norm2.weight": "model-00001-of-00002.safetensors",
735
+ "visual.blocks.19.attn.proj.bias": "model-00001-of-00002.safetensors",
736
+ "visual.blocks.19.attn.proj.weight": "model-00001-of-00002.safetensors",
737
+ "visual.blocks.19.attn.qkv.bias": "model-00001-of-00002.safetensors",
738
+ "visual.blocks.19.attn.qkv.weight": "model-00001-of-00002.safetensors",
739
+ "visual.blocks.19.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
740
+ "visual.blocks.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
741
+ "visual.blocks.19.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
742
+ "visual.blocks.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
743
+ "visual.blocks.19.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
744
+ "visual.blocks.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
745
+ "visual.blocks.19.norm1.weight": "model-00001-of-00002.safetensors",
746
+ "visual.blocks.19.norm2.weight": "model-00001-of-00002.safetensors",
747
+ "visual.blocks.2.attn.proj.bias": "model-00001-of-00002.safetensors",
748
+ "visual.blocks.2.attn.proj.weight": "model-00001-of-00002.safetensors",
749
+ "visual.blocks.2.attn.qkv.bias": "model-00001-of-00002.safetensors",
750
+ "visual.blocks.2.attn.qkv.weight": "model-00001-of-00002.safetensors",
751
+ "visual.blocks.2.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
752
+ "visual.blocks.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
753
+ "visual.blocks.2.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
754
+ "visual.blocks.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
755
+ "visual.blocks.2.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
756
+ "visual.blocks.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
757
+ "visual.blocks.2.norm1.weight": "model-00001-of-00002.safetensors",
758
+ "visual.blocks.2.norm2.weight": "model-00001-of-00002.safetensors",
759
+ "visual.blocks.20.attn.proj.bias": "model-00001-of-00002.safetensors",
760
+ "visual.blocks.20.attn.proj.weight": "model-00001-of-00002.safetensors",
761
+ "visual.blocks.20.attn.qkv.bias": "model-00001-of-00002.safetensors",
762
+ "visual.blocks.20.attn.qkv.weight": "model-00001-of-00002.safetensors",
763
+ "visual.blocks.20.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
764
+ "visual.blocks.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
765
+ "visual.blocks.20.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
766
+ "visual.blocks.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
767
+ "visual.blocks.20.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
768
+ "visual.blocks.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
769
+ "visual.blocks.20.norm1.weight": "model-00001-of-00002.safetensors",
770
+ "visual.blocks.20.norm2.weight": "model-00001-of-00002.safetensors",
771
+ "visual.blocks.21.attn.proj.bias": "model-00001-of-00002.safetensors",
772
+ "visual.blocks.21.attn.proj.weight": "model-00001-of-00002.safetensors",
773
+ "visual.blocks.21.attn.qkv.bias": "model-00001-of-00002.safetensors",
774
+ "visual.blocks.21.attn.qkv.weight": "model-00001-of-00002.safetensors",
775
+ "visual.blocks.21.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
776
+ "visual.blocks.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
777
+ "visual.blocks.21.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
778
+ "visual.blocks.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
779
+ "visual.blocks.21.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
780
+ "visual.blocks.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
781
+ "visual.blocks.21.norm1.weight": "model-00001-of-00002.safetensors",
782
+ "visual.blocks.21.norm2.weight": "model-00001-of-00002.safetensors",
783
+ "visual.blocks.22.attn.proj.bias": "model-00001-of-00002.safetensors",
784
+ "visual.blocks.22.attn.proj.weight": "model-00001-of-00002.safetensors",
785
+ "visual.blocks.22.attn.qkv.bias": "model-00001-of-00002.safetensors",
786
+ "visual.blocks.22.attn.qkv.weight": "model-00001-of-00002.safetensors",
787
+ "visual.blocks.22.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
788
+ "visual.blocks.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
789
+ "visual.blocks.22.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
790
+ "visual.blocks.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
791
+ "visual.blocks.22.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
792
+ "visual.blocks.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
793
+ "visual.blocks.22.norm1.weight": "model-00001-of-00002.safetensors",
794
+ "visual.blocks.22.norm2.weight": "model-00001-of-00002.safetensors",
795
+ "visual.blocks.23.attn.proj.bias": "model-00001-of-00002.safetensors",
796
+ "visual.blocks.23.attn.proj.weight": "model-00001-of-00002.safetensors",
797
+ "visual.blocks.23.attn.qkv.bias": "model-00001-of-00002.safetensors",
798
+ "visual.blocks.23.attn.qkv.weight": "model-00001-of-00002.safetensors",
799
+ "visual.blocks.23.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
800
+ "visual.blocks.23.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
801
+ "visual.blocks.23.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
802
+ "visual.blocks.23.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
803
+ "visual.blocks.23.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
804
+ "visual.blocks.23.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
805
+ "visual.blocks.23.norm1.weight": "model-00001-of-00002.safetensors",
806
+ "visual.blocks.23.norm2.weight": "model-00001-of-00002.safetensors",
807
+ "visual.blocks.24.attn.proj.bias": "model-00001-of-00002.safetensors",
808
+ "visual.blocks.24.attn.proj.weight": "model-00001-of-00002.safetensors",
809
+ "visual.blocks.24.attn.qkv.bias": "model-00001-of-00002.safetensors",
810
+ "visual.blocks.24.attn.qkv.weight": "model-00001-of-00002.safetensors",
811
+ "visual.blocks.24.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
812
+ "visual.blocks.24.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
813
+ "visual.blocks.24.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
814
+ "visual.blocks.24.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
815
+ "visual.blocks.24.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
816
+ "visual.blocks.24.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
817
+ "visual.blocks.24.norm1.weight": "model-00001-of-00002.safetensors",
818
+ "visual.blocks.24.norm2.weight": "model-00001-of-00002.safetensors",
819
+ "visual.blocks.25.attn.proj.bias": "model-00001-of-00002.safetensors",
820
+ "visual.blocks.25.attn.proj.weight": "model-00001-of-00002.safetensors",
821
+ "visual.blocks.25.attn.qkv.bias": "model-00001-of-00002.safetensors",
822
+ "visual.blocks.25.attn.qkv.weight": "model-00001-of-00002.safetensors",
823
+ "visual.blocks.25.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
824
+ "visual.blocks.25.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
825
+ "visual.blocks.25.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
826
+ "visual.blocks.25.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
827
+ "visual.blocks.25.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
828
+ "visual.blocks.25.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
829
+ "visual.blocks.25.norm1.weight": "model-00001-of-00002.safetensors",
830
+ "visual.blocks.25.norm2.weight": "model-00001-of-00002.safetensors",
831
+ "visual.blocks.26.attn.proj.bias": "model-00001-of-00002.safetensors",
832
+ "visual.blocks.26.attn.proj.weight": "model-00001-of-00002.safetensors",
833
+ "visual.blocks.26.attn.qkv.bias": "model-00001-of-00002.safetensors",
834
+ "visual.blocks.26.attn.qkv.weight": "model-00001-of-00002.safetensors",
835
+ "visual.blocks.26.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
836
+ "visual.blocks.26.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
837
+ "visual.blocks.26.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
838
+ "visual.blocks.26.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
839
+ "visual.blocks.26.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
840
+ "visual.blocks.26.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
841
+ "visual.blocks.26.norm1.weight": "model-00001-of-00002.safetensors",
842
+ "visual.blocks.26.norm2.weight": "model-00001-of-00002.safetensors",
843
+ "visual.blocks.27.attn.proj.bias": "model-00001-of-00002.safetensors",
844
+ "visual.blocks.27.attn.proj.weight": "model-00001-of-00002.safetensors",
845
+ "visual.blocks.27.attn.qkv.bias": "model-00001-of-00002.safetensors",
846
+ "visual.blocks.27.attn.qkv.weight": "model-00001-of-00002.safetensors",
847
+ "visual.blocks.27.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
848
+ "visual.blocks.27.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
849
+ "visual.blocks.27.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
850
+ "visual.blocks.27.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
851
+ "visual.blocks.27.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
852
+ "visual.blocks.27.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
853
+ "visual.blocks.27.norm1.weight": "model-00001-of-00002.safetensors",
854
+ "visual.blocks.27.norm2.weight": "model-00001-of-00002.safetensors",
855
+ "visual.blocks.28.attn.proj.bias": "model-00001-of-00002.safetensors",
856
+ "visual.blocks.28.attn.proj.weight": "model-00001-of-00002.safetensors",
857
+ "visual.blocks.28.attn.qkv.bias": "model-00001-of-00002.safetensors",
858
+ "visual.blocks.28.attn.qkv.weight": "model-00001-of-00002.safetensors",
859
+ "visual.blocks.28.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
860
+ "visual.blocks.28.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
861
+ "visual.blocks.28.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
862
+ "visual.blocks.28.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
863
+ "visual.blocks.28.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
864
+ "visual.blocks.28.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
865
+ "visual.blocks.28.norm1.weight": "model-00001-of-00002.safetensors",
866
+ "visual.blocks.28.norm2.weight": "model-00001-of-00002.safetensors",
867
+ "visual.blocks.29.attn.proj.bias": "model-00001-of-00002.safetensors",
868
+ "visual.blocks.29.attn.proj.weight": "model-00001-of-00002.safetensors",
869
+ "visual.blocks.29.attn.qkv.bias": "model-00001-of-00002.safetensors",
870
+ "visual.blocks.29.attn.qkv.weight": "model-00001-of-00002.safetensors",
871
+ "visual.blocks.29.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
872
+ "visual.blocks.29.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
873
+ "visual.blocks.29.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
874
+ "visual.blocks.29.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
875
+ "visual.blocks.29.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
876
+ "visual.blocks.29.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
877
+ "visual.blocks.29.norm1.weight": "model-00001-of-00002.safetensors",
878
+ "visual.blocks.29.norm2.weight": "model-00001-of-00002.safetensors",
879
+ "visual.blocks.3.attn.proj.bias": "model-00001-of-00002.safetensors",
880
+ "visual.blocks.3.attn.proj.weight": "model-00001-of-00002.safetensors",
881
+ "visual.blocks.3.attn.qkv.bias": "model-00001-of-00002.safetensors",
882
+ "visual.blocks.3.attn.qkv.weight": "model-00001-of-00002.safetensors",
883
+ "visual.blocks.3.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
884
+ "visual.blocks.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
885
+ "visual.blocks.3.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
886
+ "visual.blocks.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
887
+ "visual.blocks.3.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
888
+ "visual.blocks.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
889
+ "visual.blocks.3.norm1.weight": "model-00001-of-00002.safetensors",
890
+ "visual.blocks.3.norm2.weight": "model-00001-of-00002.safetensors",
891
+ "visual.blocks.30.attn.proj.bias": "model-00001-of-00002.safetensors",
892
+ "visual.blocks.30.attn.proj.weight": "model-00001-of-00002.safetensors",
893
+ "visual.blocks.30.attn.qkv.bias": "model-00001-of-00002.safetensors",
894
+ "visual.blocks.30.attn.qkv.weight": "model-00001-of-00002.safetensors",
895
+ "visual.blocks.30.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
896
+ "visual.blocks.30.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
897
+ "visual.blocks.30.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
898
+ "visual.blocks.30.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
899
+ "visual.blocks.30.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
900
+ "visual.blocks.30.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
901
+ "visual.blocks.30.norm1.weight": "model-00001-of-00002.safetensors",
902
+ "visual.blocks.30.norm2.weight": "model-00001-of-00002.safetensors",
903
+ "visual.blocks.31.attn.proj.bias": "model-00001-of-00002.safetensors",
904
+ "visual.blocks.31.attn.proj.weight": "model-00001-of-00002.safetensors",
905
+ "visual.blocks.31.attn.qkv.bias": "model-00001-of-00002.safetensors",
906
+ "visual.blocks.31.attn.qkv.weight": "model-00001-of-00002.safetensors",
907
+ "visual.blocks.31.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
908
+ "visual.blocks.31.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
909
+ "visual.blocks.31.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
910
+ "visual.blocks.31.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
911
+ "visual.blocks.31.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
912
+ "visual.blocks.31.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
913
+ "visual.blocks.31.norm1.weight": "model-00001-of-00002.safetensors",
914
+ "visual.blocks.31.norm2.weight": "model-00001-of-00002.safetensors",
915
+ "visual.blocks.4.attn.proj.bias": "model-00001-of-00002.safetensors",
916
+ "visual.blocks.4.attn.proj.weight": "model-00001-of-00002.safetensors",
917
+ "visual.blocks.4.attn.qkv.bias": "model-00001-of-00002.safetensors",
918
+ "visual.blocks.4.attn.qkv.weight": "model-00001-of-00002.safetensors",
919
+ "visual.blocks.4.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
920
+ "visual.blocks.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
921
+ "visual.blocks.4.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
922
+ "visual.blocks.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
923
+ "visual.blocks.4.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
924
+ "visual.blocks.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
925
+ "visual.blocks.4.norm1.weight": "model-00001-of-00002.safetensors",
926
+ "visual.blocks.4.norm2.weight": "model-00001-of-00002.safetensors",
927
+ "visual.blocks.5.attn.proj.bias": "model-00001-of-00002.safetensors",
928
+ "visual.blocks.5.attn.proj.weight": "model-00001-of-00002.safetensors",
929
+ "visual.blocks.5.attn.qkv.bias": "model-00001-of-00002.safetensors",
930
+ "visual.blocks.5.attn.qkv.weight": "model-00001-of-00002.safetensors",
931
+ "visual.blocks.5.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
932
+ "visual.blocks.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
933
+ "visual.blocks.5.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
934
+ "visual.blocks.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
935
+ "visual.blocks.5.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
936
+ "visual.blocks.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
937
+ "visual.blocks.5.norm1.weight": "model-00001-of-00002.safetensors",
938
+ "visual.blocks.5.norm2.weight": "model-00001-of-00002.safetensors",
939
+ "visual.blocks.6.attn.proj.bias": "model-00001-of-00002.safetensors",
940
+ "visual.blocks.6.attn.proj.weight": "model-00001-of-00002.safetensors",
941
+ "visual.blocks.6.attn.qkv.bias": "model-00001-of-00002.safetensors",
942
+ "visual.blocks.6.attn.qkv.weight": "model-00001-of-00002.safetensors",
943
+ "visual.blocks.6.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
944
+ "visual.blocks.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
945
+ "visual.blocks.6.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
946
+ "visual.blocks.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
947
+ "visual.blocks.6.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
948
+ "visual.blocks.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
949
+ "visual.blocks.6.norm1.weight": "model-00001-of-00002.safetensors",
950
+ "visual.blocks.6.norm2.weight": "model-00001-of-00002.safetensors",
951
+ "visual.blocks.7.attn.proj.bias": "model-00001-of-00002.safetensors",
952
+ "visual.blocks.7.attn.proj.weight": "model-00001-of-00002.safetensors",
953
+ "visual.blocks.7.attn.qkv.bias": "model-00001-of-00002.safetensors",
954
+ "visual.blocks.7.attn.qkv.weight": "model-00001-of-00002.safetensors",
955
+ "visual.blocks.7.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
956
+ "visual.blocks.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
957
+ "visual.blocks.7.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
958
+ "visual.blocks.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
959
+ "visual.blocks.7.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
960
+ "visual.blocks.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
961
+ "visual.blocks.7.norm1.weight": "model-00001-of-00002.safetensors",
962
+ "visual.blocks.7.norm2.weight": "model-00001-of-00002.safetensors",
963
+ "visual.blocks.8.attn.proj.bias": "model-00001-of-00002.safetensors",
964
+ "visual.blocks.8.attn.proj.weight": "model-00001-of-00002.safetensors",
965
+ "visual.blocks.8.attn.qkv.bias": "model-00001-of-00002.safetensors",
966
+ "visual.blocks.8.attn.qkv.weight": "model-00001-of-00002.safetensors",
967
+ "visual.blocks.8.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
968
+ "visual.blocks.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
969
+ "visual.blocks.8.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
970
+ "visual.blocks.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
971
+ "visual.blocks.8.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
972
+ "visual.blocks.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
973
+ "visual.blocks.8.norm1.weight": "model-00001-of-00002.safetensors",
974
+ "visual.blocks.8.norm2.weight": "model-00001-of-00002.safetensors",
975
+ "visual.blocks.9.attn.proj.bias": "model-00001-of-00002.safetensors",
976
+ "visual.blocks.9.attn.proj.weight": "model-00001-of-00002.safetensors",
977
+ "visual.blocks.9.attn.qkv.bias": "model-00001-of-00002.safetensors",
978
+ "visual.blocks.9.attn.qkv.weight": "model-00001-of-00002.safetensors",
979
+ "visual.blocks.9.mlp.down_proj.bias": "model-00001-of-00002.safetensors",
980
+ "visual.blocks.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
981
+ "visual.blocks.9.mlp.gate_proj.bias": "model-00001-of-00002.safetensors",
982
+ "visual.blocks.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
983
+ "visual.blocks.9.mlp.up_proj.bias": "model-00001-of-00002.safetensors",
984
+ "visual.blocks.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
985
+ "visual.blocks.9.norm1.weight": "model-00001-of-00002.safetensors",
986
+ "visual.blocks.9.norm2.weight": "model-00001-of-00002.safetensors",
987
+ "visual.merger.ln_q.weight": "model-00001-of-00002.safetensors",
988
+ "visual.merger.mlp.0.bias": "model-00001-of-00002.safetensors",
989
+ "visual.merger.mlp.0.weight": "model-00001-of-00002.safetensors",
990
+ "visual.merger.mlp.2.bias": "model-00001-of-00002.safetensors",
991
+ "visual.merger.mlp.2.weight": "model-00001-of-00002.safetensors",
992
+ "visual.patch_embed.proj.weight": "model-00001-of-00002.safetensors"
993
+ }
994
+ }
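The weight_map above pairs every parameter name with the shard file that stores it, so individual tensors can be pulled from the right shard without loading the whole checkpoint. A minimal sketch of that lookup, assuming the index and both shard files have been downloaded to a local directory (the directory name and the chosen tensor are illustrative, not part of this commit):

import json
import os

from safetensors import safe_open

model_dir = "./InfiniteVL"  # hypothetical local copy of this repository

# The index maps "parameter name" -> "shard file name".
with open(os.path.join(model_dir, "model.safetensors.index.json")) as f:
    weight_map = json.load(f)["weight_map"]

name = "model.norm.weight"
shard_file = weight_map[name]  # "model-00002-of-00002.safetensors" per the map above

# Open only the shard that holds this tensor and read it lazily.
with safe_open(os.path.join(model_dir, shard_file), framework="pt") as shard:
    tensor = shard.get_tensor(name)

print(name, tuple(tensor.shape))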
modeling_infinitevl.py ADDED
The diff for this file is too large to render. See raw diff
 
modular_infinitevl.py ADDED
@@ -0,0 +1,1089 @@
1
+ # coding=utf-8
2
+ # Copyright 2025 The HustVL Team.
3
+ # Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # This code is based on Qwen2.5-VL, which is derived from EleutherAI's GPT-NeoX library
6
+ # and the GPT-NeoX and OPT implementations. It has been modified to create InfiniteVL.
7
+ #
8
+ # Licensed under the Apache License, Version 2.0 (the "License");
9
+ # you may not use this file except in compliance with the License.
10
+ # You may obtain a copy of the License at
11
+ #
12
+ # http://www.apache.org/licenses/LICENSE-2.0
13
+ #
14
+ # Unless required by applicable law or agreed to in writing, software
15
+ # distributed under the License is distributed on an "AS IS" BASIS,
16
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17
+ # See the License for the specific language governing permissions and
18
+ # limitations under the License.
19
+ """PyTorch InfiniteVL model (built on top of Qwen2-VL/Qwen2.5-VL)."""
20
+
21
+ from typing import List, Optional, Tuple, Union
22
+
23
+ import numpy as np
24
+ import torch
25
+ import torch.nn as nn
26
+ import torch.nn.functional as F
27
+
28
+ from transformers.activations import ACT2FN
29
+ from transformers.cache_utils import Cache
30
+ from transformers.configuration_utils import PretrainedConfig
31
+ from transformers.feature_extraction_utils import BatchFeature
32
+ from transformers.image_utils import ImageInput
33
+ from transformers.modeling_flash_attention_utils import is_flash_attn_available
34
+ from transformers.modeling_layers import GradientCheckpointingLayer
35
+ from transformers.processing_utils import MultiModalData, ProcessingKwargs, Unpack, VideosKwargs
36
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
37
+ from transformers.utils import is_torchdynamo_compiling, logging
38
+ from transformers.video_utils import VideoInput
39
+
40
+ # Import base Qwen2-VL components to extend/wrap
41
+ from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLConfig, Qwen2VLTextConfig
42
+ from transformers.models.qwen2_vl.modeling_qwen2_vl import (
43
+ PatchEmbed,
44
+ PatchMerger,
45
+ Qwen2RMSNorm,
46
+ Qwen2VLCausalLMOutputWithPast,
47
+ Qwen2VLForConditionalGeneration,
48
+ Qwen2VLModel,
49
+ Qwen2VLModelOutputWithPast,
50
+ Qwen2VLPreTrainedModel,
51
+ TransformersKwargs,
52
+ VisionAttention,
53
+ VisionRotaryEmbedding,
54
+ )
55
+ from transformers.models.qwen2_vl.processing_qwen2_vl import Qwen2VLImagesKwargs, Qwen2VLProcessor
56
+
57
+
58
+ if is_flash_attn_available():
59
+ # We keep this conditional import pattern for future flash-attn
60
+ # specific branches without changing the public API.
61
+ pass
62
+
63
+
64
+ logger = logging.get_logger(__name__)
65
+
66
+
67
+ # ---------------------------------------------------------------------------
68
+ # Configs
69
+ # ---------------------------------------------------------------------------
70
+
71
+
72
+ class InfiniteVLVisionConfig(PretrainedConfig):
73
+ """
74
+ Vision backbone configuration for InfiniteVL.
75
+
76
+ This mirrors the Qwen2.5-VL vision encoder but is exposed under the
77
+ InfiniteVL naming for clarity. It is used as a sub-config inside
78
+ :class:`InfiniteVLConfig`.
79
+ """
80
+
81
+ model_type = "infinite_vl"
82
+ base_config_key = "vision_config"
83
+
84
+ def __init__(
85
+ self,
86
+ depth: int = 32,
87
+ hidden_size: int = 3584,
88
+ hidden_act: str = "silu",
89
+ intermediate_size: int = 3420,
90
+ num_heads: int = 16,
91
+ in_channels: int = 3,
92
+ patch_size: int = 14,
93
+ spatial_merge_size: int = 2,
94
+ temporal_patch_size: int = 2,
95
+ tokens_per_second: int = 4,
96
+ window_size: int = 112,
97
+ out_hidden_size: int = 3584,
98
+ fullatt_block_indexes: Optional[List[int]] = None,
99
+ initializer_range: float = 0.02,
100
+ **kwargs,
101
+ ):
102
+ super().__init__(**kwargs)
103
+
104
+ if fullatt_block_indexes is None:
105
+ fullatt_block_indexes = [7, 15, 23, 31]
106
+
107
+ self.depth = depth
108
+ self.hidden_size = hidden_size
109
+ self.hidden_act = hidden_act
110
+ self.intermediate_size = intermediate_size
111
+ self.num_heads = num_heads
112
+ self.in_channels = in_channels
113
+ self.patch_size = patch_size
114
+ self.spatial_merge_size = spatial_merge_size
115
+ self.temporal_patch_size = temporal_patch_size
116
+ self.tokens_per_second = tokens_per_second
117
+ self.window_size = window_size
118
+ self.fullatt_block_indexes = list(fullatt_block_indexes)
119
+ self.out_hidden_size = out_hidden_size
120
+ self.initializer_range = initializer_range
121
+
122
+
123
+ class InfiniteVLTextConfig(Qwen2VLTextConfig):
124
+ """
125
+ Text backbone configuration for InfiniteVL.
126
+
127
+ This class currently reuses :class:`Qwen2VLTextConfig` as a base and
128
+ only overrides the model_type to keep InfiniteVL text separate at
129
+ the configuration level, while remaining fully compatible with
130
+ the parent implementation.
131
+ """
132
+
133
+ model_type = "infinite_vl_text"
134
+
135
+
136
+ class InfiniteVLConfig(Qwen2VLConfig):
137
+ """
138
+ Top-level InfiniteVL configuration.
139
+
140
+ This extends :class:`Qwen2VLConfig` and swaps in the InfiniteVL
141
+ vision/text config classes via ``sub_configs`` so that downstream
142
+ models can transparently use InfiniteVL while remaining compatible
143
+ with Qwen2-VL tooling and loading code.
144
+ """
145
+
146
+ model_type = "infinite_vl"
147
+ sub_configs = {"vision_config": InfiniteVLVisionConfig, "text_config": InfiniteVLTextConfig}
148
+
149
+
150
+ # ---------------------------------------------------------------------------
151
+ # Vision backbone
152
+ # ---------------------------------------------------------------------------
153
+
154
+
155
+ class InfiniteVLMLP(nn.Module):
156
+ """
157
+ Standard gated MLP used in the InfiniteVL vision backbone.
158
+ """
159
+
160
+ def __init__(self, config: InfiniteVLVisionConfig, bias: bool = False):
161
+ super().__init__()
162
+ self.hidden_size = config.hidden_size
163
+ self.intermediate_size = config.intermediate_size
164
+
165
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=bias)
166
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=bias)
167
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=bias)
168
+ self.act_fn = ACT2FN[config.hidden_act]
169
+
170
+ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
171
+ gated = self.act_fn(self.gate_proj(hidden_state))
172
+ return self.down_proj(gated * self.up_proj(hidden_state))
173
+
174
+
175
+ class InfiniteVisionPatchEmbed(PatchEmbed):
176
+ """
177
+ Wrapper around the Qwen2-VL patch embedder kept for naming
178
+ consistency in the InfiniteVL codebase.
179
+ """
180
+
181
+ pass
182
+
183
+
184
+ class InfiniteVisionRotaryEmbedding(VisionRotaryEmbedding):
185
+ """
186
+ Rotary embedding for the InfiniteVL vision backbone. This is a direct
187
+ alias for the Qwen2-VL implementation, exposed under an InfiniteVL
188
+ name for clarity.
189
+ """
190
+
191
+ pass
192
+
193
+
194
+ class InfiniteVLPatchMerger(PatchMerger):
195
+ """
196
+ Patch merger with Qwen2-style RMSNorm on the query side.
197
+ """
198
+
199
+ def __init__(self, dim: int, context_dim: int, spatial_merge_size: int = 2) -> None:
200
+ super().__init__(dim, context_dim, spatial_merge_size)
201
+ self.ln_q = Qwen2RMSNorm(context_dim, eps=1e-6)
202
+
203
+
204
+ class InfiniteVLVisionAttention(VisionAttention):
205
+ """
206
+ Vision attention wrapper that exposes the hidden size via ``dim``
207
+ for convenience.
208
+ """
209
+
210
+ def __init__(self, config: InfiniteVLVisionConfig) -> None:
211
+ super().__init__(config)
212
+ self.dim = config.hidden_size
213
+
214
+
215
+ class InfiniteVLVisionBlock(GradientCheckpointingLayer):
216
+ """
217
+ A single InfiniteVL vision transformer block consisting of:
218
+ - Qwen2-style RMSNorm
219
+ - multi-head attention
220
+ - gated MLP
221
+ """
222
+
223
+ def __init__(self, config: InfiniteVLVisionConfig, attn_implementation: str = "sdpa") -> None:
224
+ super().__init__()
225
+ self.norm1 = Qwen2RMSNorm(config.hidden_size, eps=1e-6)
226
+ self.norm2 = Qwen2RMSNorm(config.hidden_size, eps=1e-6)
227
+ self.attn = InfiniteVLVisionAttention(config=config)
228
+ self.mlp = InfiniteVLMLP(config, bias=True)
229
+
230
+ def forward(
231
+ self,
232
+ hidden_states: torch.Tensor,
233
+ cu_seqlens: torch.Tensor,
234
+ rotary_pos_emb: Optional[torch.Tensor] = None,
235
+ position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
236
+ **kwargs,
237
+ ) -> torch.Tensor:
238
+ hidden_states = hidden_states + self.attn(
239
+ self.norm1(hidden_states),
240
+ cu_seqlens=cu_seqlens,
241
+ rotary_pos_emb=rotary_pos_emb,
242
+ position_embeddings=position_embeddings,
243
+ **kwargs,
244
+ )
245
+ hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
246
+ return hidden_states
247
+
248
+
249
+ # ---------------------------------------------------------------------------
250
+ # Base model wrappers
251
+ # ---------------------------------------------------------------------------
252
+
253
+
254
+ class InfiniteVLPreTrainedModel(Qwen2VLPreTrainedModel):
255
+ """
256
+ Pretrained model wrapper so that InfiniteVL can plug into the same
257
+ utilities as Qwen2-VL.
258
+ """
259
+
260
+ pass
261
+
262
+
263
+ class InfiniteVisionTransformerPretrainedModel(InfiniteVLPreTrainedModel):
264
+ """
265
+ InfiniteVL vision transformer that adapts the Qwen2.5-VL visual
266
+ encoder to the modular InfiniteVL stack.
267
+ """
268
+
269
+ config: InfiniteVLVisionConfig
270
+ _no_split_modules = ["InfiniteVLVisionBlock"]
271
+
272
+ def __init__(self, config: InfiniteVLVisionConfig, *inputs, **kwargs) -> None:
273
+ super().__init__(config, *inputs, **kwargs)
274
+ self.spatial_merge_size = config.spatial_merge_size
275
+ self.patch_size = config.patch_size
276
+ self.fullatt_block_indexes = config.fullatt_block_indexes
277
+ self.window_size = config.window_size
278
+ self.spatial_merge_unit = self.spatial_merge_size * self.spatial_merge_size
279
+
280
+ self.patch_embed = InfiniteVisionPatchEmbed(
281
+ patch_size=config.patch_size,
282
+ temporal_patch_size=config.temporal_patch_size,
283
+ in_channels=config.in_channels,
284
+ embed_dim=config.hidden_size,
285
+ )
286
+
287
+ head_dim = config.hidden_size // config.num_heads
288
+ self.rotary_pos_emb = InfiniteVisionRotaryEmbedding(head_dim // 2)
289
+
290
+ self.blocks = nn.ModuleList([InfiniteVLVisionBlock(config) for _ in range(config.depth)])
291
+ self.merger = InfiniteVLPatchMerger(
292
+ dim=config.out_hidden_size,
293
+ context_dim=config.hidden_size,
294
+ spatial_merge_size=config.spatial_merge_size,
295
+ )
296
+ self.gradient_checkpointing = False
297
+
298
+ def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor:
299
+ pos_ids = []
300
+ for t, h, w in grid_thw:
301
+ hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
302
+ hpos_ids = hpos_ids.reshape(
303
+ h // self.spatial_merge_size,
304
+ self.spatial_merge_size,
305
+ w // self.spatial_merge_size,
306
+ self.spatial_merge_size,
307
+ )
308
+ hpos_ids = hpos_ids.permute(0, 2, 1, 3)
309
+ hpos_ids = hpos_ids.flatten()
310
+
311
+ wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
312
+ wpos_ids = wpos_ids.reshape(
313
+ h // self.spatial_merge_size,
314
+ self.spatial_merge_size,
315
+ w // self.spatial_merge_size,
316
+ self.spatial_merge_size,
317
+ )
318
+ wpos_ids = wpos_ids.permute(0, 2, 1, 3)
319
+ wpos_ids = wpos_ids.flatten()
320
+ pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
321
+
322
+ pos_ids = torch.cat(pos_ids, dim=0)
323
+ max_grid_size = grid_thw[:, 1:].max()
324
+ rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
325
+ rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
326
+ return rotary_pos_emb
327
+
328
+ def get_window_index(self, grid_thw: torch.Tensor) -> Tuple[torch.Tensor, List[int]]:
329
+ window_index: List[torch.Tensor] = []
330
+ cu_window_seqlens: List[int] = [0]
331
+ window_index_id = 0
332
+ vit_merger_window_size = self.window_size // self.spatial_merge_size // self.patch_size
333
+
334
+ for grid_t, grid_h, grid_w in grid_thw:
335
+ llm_grid_h, llm_grid_w = (
336
+ grid_h // self.spatial_merge_size,
337
+ grid_w // self.spatial_merge_size,
338
+ )
339
+ index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape(grid_t, llm_grid_h, llm_grid_w)
340
+ pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size
341
+ pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size
342
+ num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size
343
+ num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size
344
+ index_padded = F.pad(index, (0, pad_w, 0, pad_h), "constant", -100)
345
+ index_padded = index_padded.reshape(
346
+ grid_t,
347
+ num_windows_h,
348
+ vit_merger_window_size,
349
+ num_windows_w,
350
+ vit_merger_window_size,
351
+ )
352
+ index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape(
353
+ grid_t,
354
+ num_windows_h * num_windows_w,
355
+ vit_merger_window_size,
356
+ vit_merger_window_size,
357
+ )
358
+ seqlens = (index_padded != -100).sum([2, 3]).reshape(-1)
359
+ index_padded = index_padded.reshape(-1)
360
+ index_new = index_padded[index_padded != -100]
361
+ window_index.append(index_new + window_index_id)
362
+ cu_seqlens_tmp = seqlens.cumsum(0) * self.spatial_merge_unit + cu_window_seqlens[-1]
363
+ cu_window_seqlens.extend(cu_seqlens_tmp.tolist())
364
+ window_index_id += (grid_t * llm_grid_h * llm_grid_w).item()
365
+ window_index_tensor = torch.cat(window_index, dim=0)
366
+
367
+ return window_index_tensor, cu_window_seqlens
368
+
369
+ def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs) -> torch.Tensor:
370
+ """
371
+ Args:
372
+ hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
373
+ The final hidden states of the model.
374
+ grid_thw (`torch.Tensor` of shape `(num_images_or_videos, 3)`):
375
+ The temporal, height and width of feature shape of each image in LLM.
376
+
377
+ Returns:
378
+ `torch.Tensor`: hidden_states.
379
+ """
380
+ hidden_states = self.patch_embed(hidden_states)
381
+ rotary_pos_emb = self.rot_pos_emb(grid_thw)
382
+ window_index, cu_window_seqlens = self.get_window_index(grid_thw)
383
+ cu_window_seqlens_tensor = torch.tensor(
384
+ cu_window_seqlens,
385
+ device=hidden_states.device,
386
+ dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
387
+ )
388
+ cu_window_seqlens_tensor = torch.unique_consecutive(cu_window_seqlens_tensor)
389
+
390
+ seq_len, _ = hidden_states.size()
391
+ hidden_states = hidden_states.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
392
+ hidden_states = hidden_states[window_index, :, :]
393
+ hidden_states = hidden_states.reshape(seq_len, -1)
394
+
395
+ rotary_pos_emb = rotary_pos_emb.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
396
+ rotary_pos_emb = rotary_pos_emb[window_index, :, :]
397
+ rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1)
398
+ emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
399
+ position_embeddings = (emb.cos(), emb.sin())
400
+
401
+ cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
402
+ dim=0,
403
+ # Select dtype based on the following factors:
404
+ # - FA2 requires that cu_seqlens_q must have dtype int32
405
+ # - torch.onnx.export requires that cu_seqlens_q must have same dtype as grid_thw
406
+ # See https://github.com/huggingface/transformers/pull/34852 for more information
407
+ dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
408
+ )
409
+ cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
410
+
411
+ for layer_num, blk in enumerate(self.blocks):
412
+ if layer_num in self.fullatt_block_indexes:
413
+ cu_seqlens_now = cu_seqlens
414
+ else:
415
+ cu_seqlens_now = cu_window_seqlens_tensor
416
+
417
+ hidden_states = blk(
418
+ hidden_states,
419
+ cu_seqlens=cu_seqlens_now,
420
+ position_embeddings=position_embeddings,
421
+ **kwargs,
422
+ )
423
+
424
+ hidden_states = self.merger(hidden_states)
425
+ reverse_indices = torch.argsort(window_index)
426
+ hidden_states = hidden_states[reverse_indices, :]
427
+
428
+ return hidden_states
429
+
430
+
431
+ # ---------------------------------------------------------------------------
432
+ # Language model wrappers
433
+ # ---------------------------------------------------------------------------
434
+
435
+
436
+ class InfiniteVLModelOutputWithPast(Qwen2VLModelOutputWithPast):
437
+ """
438
+ Output type for :class:`InfiniteVLModel`. This simply extends the
439
+ Qwen2-VL output to also track ``rope_deltas``.
440
+ """
441
+
442
+ pass
443
+
444
+
445
+ class InfiniteVLModel(Qwen2VLModel):
446
+ """
447
+ InfiniteVL multimodal model that reuses the Qwen2-VL language model,
448
+ but swaps in the InfiniteVL vision encoder and a custom 3D RoPE
449
+ indexing strategy.
450
+ """
451
+
452
+ config: InfiniteVLConfig
453
+ base_model_prefix = ""
454
+ _no_split_modules = ["InfiniteVLDecoderLayer", "InfiniteVLVisionBlock"]
455
+ # Reference: fix gemma3 grad acc #37208
456
+ accepts_loss_kwargs = False
457
+
458
+ def __init__(self, config: InfiniteVLConfig):
459
+ super().__init__(config)
460
+ self.visual = InfiniteVisionTransformerPretrainedModel._from_config(config.vision_config)
461
+
462
+ def get_rope_index(
463
+ self,
464
+ input_ids: Optional[torch.LongTensor] = None,
465
+ image_grid_thw: Optional[torch.LongTensor] = None,
466
+ video_grid_thw: Optional[torch.LongTensor] = None,
467
+ second_per_grid_ts: Optional[torch.Tensor] = None,
468
+ attention_mask: Optional[torch.Tensor] = None,
469
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
470
+ """
471
+ Calculate the 3D RoPE index based on image and video temporal, height
472
+ and width in the LLM token space.
473
+
474
+ See the original Qwen2.5-VL paper and implementation for more
475
+ background on the 3D M-ROPE design.
476
+ """
477
+ spatial_merge_size = self.config.vision_config.spatial_merge_size
478
+ image_token_id = self.config.image_token_id
479
+ video_token_id = self.config.video_token_id
480
+ vision_start_token_id = self.config.vision_start_token_id
481
+ mrope_position_deltas = []
482
+
483
+ if input_ids is not None and (image_grid_thw is not None or video_grid_thw is not None):
484
+ total_input_ids = input_ids
485
+ if attention_mask is not None:
486
+ attention_mask = attention_mask == 1
487
+ position_ids = torch.ones(
488
+ 3,
489
+ input_ids.shape[0],
490
+ input_ids.shape[1],
491
+ dtype=input_ids.dtype,
492
+ device=input_ids.device,
493
+ )
494
+ image_index, video_index = 0, 0
495
+ for i, input_ids_row in enumerate(total_input_ids):
496
+ if attention_mask is not None:
497
+ input_ids_row = input_ids_row[attention_mask[i]]
498
+
499
+ image_nums, video_nums = 0, 0
500
+ vision_start_indices = torch.argwhere(input_ids_row == vision_start_token_id).squeeze(1)
501
+ vision_tokens = input_ids_row[vision_start_indices + 1]
502
+ image_nums = (vision_tokens == image_token_id).sum()
503
+ video_nums = (vision_tokens == video_token_id).sum()
504
+ input_tokens = input_ids_row.tolist()
505
+
506
+ llm_pos_ids_list: List[torch.Tensor] = []
507
+ st = 0
508
+ remain_images, remain_videos = image_nums, video_nums
509
+ for _ in range(image_nums + video_nums):
510
+ if image_token_id in input_tokens and remain_images > 0:
511
+ ed_image = input_tokens.index(image_token_id, st)
512
+ else:
513
+ ed_image = len(input_tokens) + 1
514
+ if video_token_id in input_tokens and remain_videos > 0:
515
+ ed_video = input_tokens.index(video_token_id, st)
516
+ else:
517
+ ed_video = len(input_tokens) + 1
518
+ if ed_image < ed_video:
519
+ t, h, w = (
520
+ image_grid_thw[image_index][0],
521
+ image_grid_thw[image_index][1],
522
+ image_grid_thw[image_index][2],
523
+ )
524
+ second_per_grid_t = 0
525
+ image_index += 1
526
+ remain_images -= 1
527
+ ed = ed_image
528
+ else:
529
+ t, h, w = (
530
+ video_grid_thw[video_index][0],
531
+ video_grid_thw[video_index][1],
532
+ video_grid_thw[video_index][2],
533
+ )
534
+ if second_per_grid_ts is not None:
535
+ second_per_grid_t = second_per_grid_ts[video_index]
536
+ else:
537
+ second_per_grid_t = 1.0
538
+ video_index += 1
539
+ remain_videos -= 1
540
+ ed = ed_video
541
+
542
+ llm_grid_t, llm_grid_h, llm_grid_w = (
543
+ t.item(),
544
+ h.item() // spatial_merge_size,
545
+ w.item() // spatial_merge_size,
546
+ )
547
+ text_len = ed - st
548
+
549
+ st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
550
+ llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
551
+
552
+ range_tensor = torch.arange(llm_grid_t).view(-1, 1)
553
+ expanded_range = range_tensor.expand(-1, llm_grid_h * llm_grid_w)
554
+
555
+ # normalize type, send to device
556
+ second_per_grid_t = torch.as_tensor(
557
+ second_per_grid_t,
558
+ dtype=range_tensor.dtype,
559
+ device=range_tensor.device,
560
+ )
561
+
562
+ time_tensor = expanded_range * second_per_grid_t * self.config.vision_config.tokens_per_second
563
+ time_tensor_long = time_tensor.long()
564
+ t_index = time_tensor_long.flatten()
565
+
566
+ h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
567
+ w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
568
+ llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
569
+ st = ed + llm_grid_t * llm_grid_h * llm_grid_w
570
+
571
+ if st < len(input_tokens):
572
+ st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
573
+ text_len = len(input_tokens) - st
574
+ llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
575
+
576
+ llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
577
+ if attention_mask is not None:
578
+ position_ids[..., i, attention_mask[i]] = llm_positions.to(position_ids.device)
579
+ else:
580
+ position_ids[..., i, :] = llm_positions.to(position_ids.device)
581
+ mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i]))
582
+
583
+ mrope_position_deltas_tensor = torch.tensor(mrope_position_deltas).unsqueeze(1).to(
584
+ device=input_ids.device
585
+ )
586
+ return position_ids, mrope_position_deltas_tensor
587
+
588
+ # Pure text case – fall back to standard 1D RoPE indexing.
589
+ if attention_mask is not None:
590
+ position_ids = attention_mask.long().cumsum(-1) - 1
591
+ position_ids.masked_fill_(attention_mask == 0, 1)
592
+ position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device)
593
+ max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0]
594
+ mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
595
+ else:
596
+ position_ids = (
597
+ torch.arange(input_ids.shape[1], device=input_ids.device)
598
+ .view(1, 1, -1)
599
+ .expand(3, input_ids.shape[0], -1)
600
+ )
601
+ mrope_position_deltas = torch.zeros(
602
+ [input_ids.shape[0], 1],
603
+ device=input_ids.device,
604
+ dtype=input_ids.dtype,
605
+ )
606
+
607
+ return position_ids, mrope_position_deltas
608
+
609
+ def forward(
610
+ self,
611
+ input_ids: Optional[torch.LongTensor] = None,
612
+ attention_mask: Optional[torch.Tensor] = None,
613
+ position_ids: Optional[torch.LongTensor] = None,
614
+ past_key_values: Optional[Cache] = None,
615
+ inputs_embeds: Optional[torch.FloatTensor] = None,
616
+ use_cache: Optional[bool] = None,
617
+ output_attentions: Optional[bool] = None,
618
+ output_hidden_states: Optional[bool] = None,
619
+ return_dict: Optional[bool] = None,
620
+ pixel_values: Optional[torch.Tensor] = None,
621
+ pixel_values_videos: Optional[torch.FloatTensor] = None,
622
+ image_grid_thw: Optional[torch.LongTensor] = None,
623
+ video_grid_thw: Optional[torch.LongTensor] = None,
624
+ rope_deltas: Optional[torch.LongTensor] = None,
625
+ cache_position: Optional[torch.LongTensor] = None,
626
+ second_per_grid_ts: Optional[torch.Tensor] = None,
627
+ **kwargs: Unpack[TransformersKwargs],
628
+ ) -> Union[tuple, InfiniteVLModelOutputWithPast]:
629
+ r"""
630
+ image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
631
+ The temporal, height and width of feature shape of each image in LLM.
632
+ video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
633
+ The temporal, height and width of feature shape of each video in LLM.
634
+ rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
635
+ The RoPE index difference between sequence length and multimodal RoPE.
636
+ second_per_grid_ts (`torch.Tensor` of shape `(num_videos)`, *optional*):
637
+ The time interval (in seconds) for each grid along the temporal dimension
638
+ in the 3D position IDs.
639
+ """
640
+
641
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
642
+ output_hidden_states = (
643
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
644
+ )
645
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
646
+
647
+ if inputs_embeds is None:
648
+ inputs_embeds = self.get_input_embeddings()(input_ids)
649
+
650
+ if pixel_values is not None:
651
+ image_embeds = self.get_image_features(pixel_values, image_grid_thw)
652
+ image_embeds = torch.cat(image_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
653
+ image_mask, _ = self.get_placeholder_mask(
654
+ input_ids, inputs_embeds=inputs_embeds, image_features=image_embeds
655
+ )
656
+ inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
657
+
658
+ if pixel_values_videos is not None:
659
+ video_embeds = self.get_video_features(pixel_values_videos, video_grid_thw)
660
+ video_embeds = torch.cat(video_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
661
+ _, video_mask = self.get_placeholder_mask(
662
+ input_ids, inputs_embeds=inputs_embeds, video_features=video_embeds
663
+ )
664
+ inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
665
+
666
+ if position_ids is None:
667
+ # Calculate RoPE index once per generation in the pre-fill stage only.
668
+ # When compiling, we can't check tensor values, so we only check the input length.
669
+ # It is safe to assume that `length!=1` means we're in pre-fill because compiled
670
+ # models currently cannot do assisted decoding.
671
+ prefill_compiled_stage = is_torchdynamo_compiling() and (
672
+ (input_ids is not None and input_ids.shape[1] != 1)
673
+ or (inputs_embeds is not None and inputs_embeds.shape[1] != 1)
674
+ )
675
+ prefill_noncompiled_stage = not is_torchdynamo_compiling() and (
676
+ (cache_position is not None and cache_position[0] == 0)
677
+ or (past_key_values is None or past_key_values.get_seq_length() == 0)
678
+ )
679
+ if (prefill_compiled_stage or prefill_noncompiled_stage) or self.rope_deltas is None:
680
+ position_ids, rope_deltas = self.get_rope_index(
681
+ input_ids,
682
+ image_grid_thw,
683
+ video_grid_thw,
684
+ second_per_grid_ts=second_per_grid_ts,
685
+ attention_mask=attention_mask,
686
+ )
687
+ self.rope_deltas = rope_deltas
688
+ else:
689
+ batch_size, seq_length, _ = inputs_embeds.shape
690
+ position_ids = torch.arange(seq_length, device=inputs_embeds.device)
691
+ position_ids = position_ids.view(1, 1, -1).expand(3, batch_size, -1)
692
+ if cache_position is not None:
693
+ delta = (cache_position[0] + self.rope_deltas).to(inputs_embeds.device)
694
+ else:
695
+ delta = torch.zeros((batch_size, seq_length), device=inputs_embeds.device)
696
+ delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=1)
697
+ position_ids = position_ids + delta.to(position_ids.device)
698
+
699
+ outputs = self.language_model(
700
+ input_ids=None,
701
+ position_ids=position_ids,
702
+ attention_mask=attention_mask,
703
+ past_key_values=past_key_values,
704
+ inputs_embeds=inputs_embeds,
705
+ use_cache=use_cache,
706
+ output_attentions=output_attentions,
707
+ output_hidden_states=output_hidden_states,
708
+ return_dict=True,
709
+ cache_position=cache_position,
710
+ **kwargs,
711
+ )
712
+
713
+ output = InfiniteVLModelOutputWithPast(
714
+ last_hidden_state=outputs.last_hidden_state,
715
+ past_key_values=outputs.past_key_values,
716
+ hidden_states=outputs.hidden_states,
717
+ attentions=outputs.attentions,
718
+ rope_deltas=self.rope_deltas,
719
+ )
720
+ return output if return_dict else output.to_tuple()
721
+
722
+
723
+ # ---------------------------------------------------------------------------
724
+ # Causal LM wrapper
725
+ # ---------------------------------------------------------------------------
726
+
727
+
728
+ class InfiniteVLCausalLMOutputWithPast(Qwen2VLCausalLMOutputWithPast):
729
+ """
730
+ Output type for :class:`InfiniteVLQwen2_5_VLForConditionalGeneration`.
731
+ """
732
+
733
+ pass
734
+
735
+
736
+ class InfiniteVLQwen2_5_VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
737
+ """
738
+ InfiniteVL causal language model head on top of :class:`InfiniteVLModel`.
739
+ """
740
+
741
+ # Reference: fix gemma3 grad acc #37208
742
+ accepts_loss_kwargs = False
743
+
744
+ def forward(
745
+ self,
746
+ input_ids: Optional[torch.LongTensor] = None,
747
+ attention_mask: Optional[torch.Tensor] = None,
748
+ position_ids: Optional[torch.LongTensor] = None,
749
+ past_key_values: Optional[Cache] = None,
750
+ inputs_embeds: Optional[torch.FloatTensor] = None,
751
+ labels: Optional[torch.LongTensor] = None,
752
+ use_cache: Optional[bool] = None,
753
+ output_attentions: Optional[bool] = None,
754
+ output_hidden_states: Optional[bool] = None,
755
+ pixel_values: Optional[torch.Tensor] = None,
756
+ pixel_values_videos: Optional[torch.FloatTensor] = None,
757
+ image_grid_thw: Optional[torch.LongTensor] = None,
758
+ video_grid_thw: Optional[torch.LongTensor] = None,
759
+ rope_deltas: Optional[torch.LongTensor] = None,
760
+ cache_position: Optional[torch.LongTensor] = None,
761
+ second_per_grid_ts: Optional[torch.Tensor] = None,
762
+ logits_to_keep: Union[int, torch.Tensor] = 0,
763
+ **kwargs: Unpack[TransformersKwargs],
764
+ ) -> Union[tuple, InfiniteVLCausalLMOutputWithPast]:
765
+ r"""
766
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
767
+ Labels for computing the masked language modeling loss. Indices should either be in
768
+ ``[0, ..., config.vocab_size]`` or ``-100`` (see ``input_ids`` docstring). Tokens with indices set to
769
+ ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in
770
+ ``[0, ..., config.vocab_size]``.
771
+ image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
772
+ The temporal, height and width of feature shape of each image in LLM.
773
+ video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
774
+ The temporal, height and width of feature shape of each video in LLM.
775
+ rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
776
+ The RoPE index difference between sequence length and multimodal RoPE.
777
+ second_per_grid_ts (`torch.Tensor` of shape `(num_videos)`, *optional*):
778
+ The time interval (in seconds) for each grid along the temporal dimension in the 3D position IDs.
779
+ """
780
+
781
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
782
+ output_hidden_states = (
783
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
784
+ )
785
+
786
+ outputs = self.model(
787
+ input_ids=input_ids,
788
+ pixel_values=pixel_values,
789
+ pixel_values_videos=pixel_values_videos,
790
+ image_grid_thw=image_grid_thw,
791
+ video_grid_thw=video_grid_thw,
792
+ second_per_grid_ts=second_per_grid_ts,
793
+ position_ids=position_ids,
794
+ attention_mask=attention_mask,
795
+ past_key_values=past_key_values,
796
+ inputs_embeds=inputs_embeds,
797
+ use_cache=use_cache,
798
+ output_attentions=output_attentions,
799
+ output_hidden_states=output_hidden_states,
800
+ return_dict=True,
801
+ cache_position=cache_position,
802
+ **kwargs,
803
+ )
804
+
805
+ hidden_states = outputs[0]
806
+
807
+ # Only compute necessary logits, and do not upcast them to float
808
+ # if we are not computing the loss.
809
+ slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
810
+ logits = self.lm_head(hidden_states[:, slice_indices, :])
811
+
812
+ loss = None
813
+ if labels is not None:
814
+ loss = self.loss_function(
815
+ logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
816
+ )
817
+
818
+ return InfiniteVLCausalLMOutputWithPast(
819
+ loss=loss,
820
+ logits=logits,
821
+ past_key_values=outputs.past_key_values,
822
+ hidden_states=outputs.hidden_states,
823
+ attentions=outputs.attentions,
824
+ rope_deltas=outputs.rope_deltas,
825
+ )
826
+
827
+ def prepare_inputs_for_generation(
828
+ self,
829
+ input_ids,
830
+ past_key_values=None,
831
+ attention_mask=None,
832
+ inputs_embeds=None,
833
+ cache_position=None,
834
+ position_ids=None,
835
+ use_cache=True,
836
+ pixel_values=None,
837
+ pixel_values_videos=None,
838
+ image_grid_thw=None,
839
+ video_grid_thw=None,
840
+ second_per_grid_ts=None,
841
+ **kwargs,
842
+ ):
843
+ # Overwritten -- in specific circumstances we don't want to forward image inputs to the model.
844
+ model_inputs = super().prepare_inputs_for_generation(
845
+ input_ids,
846
+ past_key_values=past_key_values,
847
+ attention_mask=attention_mask,
848
+ inputs_embeds=inputs_embeds,
849
+ cache_position=cache_position,
850
+ position_ids=position_ids,
851
+ pixel_values=pixel_values,
852
+ pixel_values_videos=pixel_values_videos,
853
+ image_grid_thw=image_grid_thw,
854
+ video_grid_thw=video_grid_thw,
855
+ second_per_grid_ts=second_per_grid_ts,
856
+ use_cache=use_cache,
857
+ **kwargs,
858
+ )
859
+
860
+ # InfiniteVL position_ids are prepared with rope_deltas
861
+ if position_ids is None:
862
+ # Calculate RoPE index once per generation in the pre-fill stage only.
863
+ # When compiling, we can't check tensor values, so we only check the input length.
864
+ # It is safe to assume that `length!=1` means we're in pre-fill because compiled
865
+ # models currently cannot do assisted decoding.
866
+ if cache_position[0] == 0 or self.model.rope_deltas is None:
867
+ vision_positions, rope_deltas = self.model.get_rope_index(
868
+ model_inputs.get("input_ids", None),
869
+ image_grid_thw=image_grid_thw,
870
+ video_grid_thw=video_grid_thw,
871
+ second_per_grid_ts=second_per_grid_ts,
872
+ attention_mask=attention_mask,
873
+ )
874
+ self.model.rope_deltas = rope_deltas
875
+ # then use the previous pre-calculated rope-deltas to get the correct position ids
876
+ elif "position_ids" in model_inputs:
877
+ batch_size, seq_length = model_inputs["position_ids"].shape
878
+ device = model_inputs["position_ids"].device
879
+ position_ids = torch.arange(seq_length, device=device)
880
+ position_ids = position_ids.view(1, 1, -1).expand(3, batch_size, -1)
881
+ delta = cache_position[0] + self.model.rope_deltas
882
+ delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
883
+ vision_positions = position_ids + delta.expand_as(position_ids)
884
+
885
+ # Concatenate "text + vision" positions into [4, bs, seq-len]
886
+ text_positions = model_inputs["position_ids"][None, ...]
887
+ model_inputs["position_ids"] = torch.cat([text_positions, vision_positions], dim=0)
888
+
889
+ if cache_position[0] != 0:
890
+ model_inputs["pixel_values"] = None
891
+ model_inputs["pixel_values_videos"] = None
892
+
893
+ return model_inputs
894
+
895
+
896
+ # ---------------------------------------------------------------------------
897
+ # Processor
898
+ # ---------------------------------------------------------------------------
899
+
900
+
901
+ class InfiniteVLVideosProcessorKwargs(VideosKwargs, total=False):
902
+ fps: Union[list[float], float]
903
+
904
+
905
+ class InfiniteVLImagesKwargs(Qwen2VLImagesKwargs):
906
+ pass
907
+
908
+
909
+ class InfiniteVLProcessorKwargs(ProcessingKwargs, total=False):
910
+ images_kwargs: InfiniteVLImagesKwargs
911
+ videos_kwargs: InfiniteVLVideosProcessorKwargs
912
+ _defaults = {
913
+ "text_kwargs": {
914
+ "padding": False,
915
+ "return_mm_token_type_ids": False,
916
+ },
917
+ }
918
+
919
+
920
+ class InfiniteVLProcessor(Qwen2VLProcessor):
921
+ r"""
922
+ Constructs an InfiniteVL processor which wraps a Qwen2-VL image processor
923
+ and a Qwen2 tokenizer into a single processor.
924
+
925
+ :class:`InfiniteVLProcessor` offers all the functionalities of
926
+ :class:`Qwen2VLImageProcessor` and :class:`Qwen2TokenizerFast`. See
927
+ :meth:`InfiniteVLProcessor.__call__` and :meth:`InfiniteVLProcessor.decode`
928
+ for more information.
929
+
930
+ Args:
931
+ image_processor (:class:`Qwen2VLImageProcessor`, *optional*):
932
+ The image processor is a required input.
933
+ tokenizer (:class:`Qwen2TokenizerFast`, *optional*):
934
+ The tokenizer is a required input.
935
+ video_processor (:class:`InfiniteVLVideoProcessor`, *optional*):
936
+ The video processor is a required input.
937
+ chat_template (`str`, *optional*):
938
+ A Jinja template which will be used to convert lists of messages
939
+ in a chat into a tokenizable string.
940
+ """
941
+
942
+ image_processor_class = "AutoImageProcessor"
943
+
944
+ @property
945
+ def model_input_names(self):
946
+ tokenizer_input_names = self.tokenizer.model_input_names
947
+ image_processor_input_names = self.image_processor.model_input_names
948
+ names_from_processor = list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
949
+ return names_from_processor + ["second_per_grid_ts"]
950
+
951
+ def __call__(
952
+ self,
953
+ images: Optional[ImageInput] = None,
954
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
955
+ videos: Optional[VideoInput] = None,
956
+ **kwargs: Unpack[InfiniteVLProcessorKwargs],
957
+ ) -> BatchFeature:
958
+ """
959
+ Main method to prepare for the model one or several sequence(s) and image(s).
960
+
961
+ This method forwards the ``text`` and ``kwargs`` arguments to
962
+ :class:`Qwen2TokenizerFast.__call__` if ``text`` is not ``None``
963
+ to encode the text. To prepare the vision inputs, this method
964
+ forwards the ``images`` / ``videos`` and ``kwargs`` arguments to
965
+ :class:`Qwen2VLImageProcessor.__call__` and the corresponding
966
+ video processor when they are not ``None``.
967
+ """
968
+ output_kwargs = self._merge_kwargs(
969
+ InfiniteVLProcessorKwargs,
970
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
971
+ **kwargs,
972
+ )
973
+
974
+ image_inputs = videos_inputs = {}
975
+ if images is not None:
976
+ image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
977
+ image_grid_thw = image_inputs["image_grid_thw"]
978
+
979
+ if videos is not None:
980
+ fps = output_kwargs["videos_kwargs"].get("fps", 2.0)
981
+ videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
982
+ video_grid_thw = videos_inputs["video_grid_thw"]
983
+
984
+ if isinstance(fps, (int, float)):
985
+ second_per_grid_ts = [self.video_processor.temporal_patch_size / fps] * len(video_grid_thw)
986
+ elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw):
987
+ second_per_grid_ts = [self.video_processor.temporal_patch_size / tmp for tmp in fps]
988
+ else:
989
+ raise ValueError(
990
+ f"The length of fps ({len(fps) if hasattr(fps, '__len__') else fps}) must be equal to the "
991
+ f"length of video_grid_thw ({len(video_grid_thw)}) or fps should be a single number."
992
+ )
993
+ videos_inputs.update({"second_per_grid_ts": second_per_grid_ts})
994
+
995
+ if not isinstance(text, list):
996
+ text = [text]
997
+
998
+ # below lines change text in-place
999
+ text = text.copy()
1000
+ if images is not None:
1001
+ merge_length = self.image_processor.merge_size**2
1002
+ index = 0
1003
+ for i in range(len(text)):
1004
+ while self.image_token in text[i]:
1005
+ num_image_tokens = image_grid_thw[index].prod() // merge_length
1006
+ text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
1007
+ index += 1
1008
+ text[i] = text[i].replace("<|placeholder|>", self.image_token)
1009
+
1010
+ if videos is not None:
1011
+ merge_length = self.video_processor.merge_size**2
1012
+ index = 0
1013
+ for i in range(len(text)):
1014
+ while self.video_token in text[i]:
1015
+ num_video_tokens = video_grid_thw[index].prod() // merge_length
1016
+ text[i] = text[i].replace(self.video_token, "<|placeholder|>" * num_video_tokens, 1)
1017
+ index += 1
1018
+ text[i] = text[i].replace("<|placeholder|>", self.video_token)
1019
+
1020
+ return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
1021
+ return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
1022
+ text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
1023
+ self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])
1024
+
1025
+ if return_mm_token_type_ids:
1026
+ array_ids = np.array(text_inputs["input_ids"])
1027
+ mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
1028
+ mm_token_type_ids[array_ids == self.image_token_id] = 1
1029
+ text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
1030
+
1031
+ return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors)
1032
+
1033
+ def _get_num_multimodal_tokens(self, image_sizes=None, video_sizes=None, **kwargs) -> MultiModalData:
1034
+ """
1035
+ Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
1036
+
1037
+ Args:
1038
+ image_sizes (`list[list[int]]`, *optional*):
1039
+ The input sizes formatted as (height, width) per each image.
1040
+ video_sizes (`list[list[int]]`, *optional*):
1041
+ The input sizes formatted as (num_frames, height, width) per each video.
1042
+
1043
+ Returns:
1044
+ :class:`MultiModalData`: A :class:`MultiModalData` object holding number of tokens per each of the provided
1045
+ input modalities, along with other useful data.
1046
+ """
1047
+
1048
+ vision_data = {}
1049
+ merge_size: Optional[int] = None
1050
+
1051
+ if image_sizes is not None:
1052
+ images_kwargs = InfiniteVLProcessorKwargs._defaults.get("images_kwargs", {})
1053
+ images_kwargs.update(kwargs)
1054
+ merge_size = images_kwargs.get("merge_size", None) or self.image_processor.merge_size
1055
+
1056
+ num_image_patches = [
1057
+ self.image_processor.get_number_of_image_patches(*image_size, images_kwargs)
1058
+ for image_size in image_sizes
1059
+ ]
1060
+ num_image_tokens = [(num_patches // merge_size**2) for num_patches in num_image_patches]
1061
+ vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})
1062
+
1063
+ if video_sizes is not None:
1064
+ videos_kwargs = InfiniteVLProcessorKwargs._defaults.get("videos_kwargs", {})
1065
+ videos_kwargs.update(kwargs)
1066
+ # For videos we should also respect a potential merge_size override.
1067
+ video_merge_size = videos_kwargs.get("merge_size", None) or self.video_processor.merge_size
1068
+
1069
+ num_video_patches = [
1070
+ self.video_processor.get_number_of_video_patches(*video_size, videos_kwargs)
1071
+ for video_size in video_sizes
1072
+ ]
1073
+ num_video_tokens = [
1074
+ (num_patches // video_merge_size**2) for num_patches in num_video_patches
1075
+ ]
1076
+ vision_data["num_video_tokens"] = num_video_tokens
1077
+
1078
+ return MultiModalData(**vision_data)
1079
+
1080
+
1081
+ __all__ = [
1082
+ # Preferred InfiniteVL names
1083
+ "InfiniteVLConfig",
1084
+ "InfiniteVLTextConfig",
1085
+ "InfiniteVLQwen2_5_VLForConditionalGeneration",
1086
+ "InfiniteVLModel",
1087
+ "InfiniteVLPreTrainedModel",
1088
+ "InfiniteVLProcessor",
1089
+ ]
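For orientation, a minimal inference sketch for the classes exported above. The repo id, image path, and prompt are placeholders, and the flow assumes the `auto_map` entries resolve `InfiniteVLQwen2_5_VLForConditionalGeneration` and a Qwen2.5-VL-style processor via `trust_remote_code`; treat it as an assumption-laden sketch rather than canonical usage.

# Hedged usage sketch: repo id and image path are placeholders, and the call
# pattern mirrors standard Qwen2.5-VL inference rather than anything this file guarantees.
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

repo = "path/or/repo-id"  # placeholder: the actual model repo id or a local checkout
model = AutoModelForCausalLM.from_pretrained(
    repo,
    trust_remote_code=True,        # needed so AutoModel picks up modeling_infinitevl.py
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(repo, trust_remote_code=True)

messages = [
    {"role": "user", "content": [
        {"type": "image", "image": "demo.jpg"},            # placeholder image
        {"type": "text", "text": "Describe this image."},
    ]},
]
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(text=[prompt], images=[Image.open("demo.jpg")], return_tensors="pt").to(model.device)

with torch.no_grad():
    generated = model.generate(**inputs, max_new_tokens=128)
# Strip the prompt tokens before decoding the answer.
print(processor.batch_decode(generated[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0])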
preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
1
+ {
2
+ "do_convert_rgb": true,
3
+ "do_normalize": true,
4
+ "do_rescale": true,
5
+ "do_resize": true,
6
+ "image_mean": [
7
+ 0.48145466,
8
+ 0.4578275,
9
+ 0.40821073
10
+ ],
11
+ "image_processor_type": "Qwen2VLImageProcessor",
12
+ "image_std": [
13
+ 0.26862954,
14
+ 0.26130258,
15
+ 0.27577711
16
+ ],
17
+ "max_pixels": 12845056,
18
+ "merge_size": 2,
19
+ "min_pixels": 3136,
20
+ "patch_size": 14,
21
+ "processor_class": "Qwen2_5_VLProcessor",
22
+ "resample": 3,
23
+ "rescale_factor": 0.00392156862745098,
24
+ "size": {
25
+ "longest_edge": 12845056,
26
+ "shortest_edge": 3136
27
+ },
28
+ "temporal_patch_size": 2
29
+ }
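As a rough rule of thumb implied by this config, an image is resized so each side is a multiple of patch_size * merge_size = 28 pixels and its area lands inside [min_pixels, max_pixels]; each 28x28 block then becomes one visual token. The sketch below approximates that smart-resize logic with simple rounding; it is an approximation for intuition, not the exact image-processor code.

import math

def approx_visual_tokens(height, width, patch_size=14, merge_size=2,
                         min_pixels=3136, max_pixels=12845056):
    # Snap each side to a multiple of patch_size * merge_size (28 px).
    factor = patch_size * merge_size
    h = max(factor, round(height / factor) * factor)
    w = max(factor, round(width / factor) * factor)
    # Rescale so the total pixel count stays inside [min_pixels, max_pixels].
    if h * w > max_pixels:
        scale = math.sqrt(max_pixels / (h * w))
        h = math.floor(h * scale / factor) * factor
        w = math.floor(w * scale / factor) * factor
    elif h * w < min_pixels:
        scale = math.sqrt(min_pixels / (h * w))
        h = math.ceil(h * scale / factor) * factor
        w = math.ceil(w * scale / factor) * factor
    # One token per merge_size x merge_size group of 14x14 patches.
    return (h // patch_size) * (w // patch_size) // (merge_size ** 2)

print(approx_visual_tokens(1288, 952))  # roughly 1564 visual tokens for a 1288x952 image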
processing_infinitevl.py ADDED
@@ -0,0 +1,272 @@
1
+ # coding=utf-8
2
+ # Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+
21
+ from typing import Optional, Union
22
+
23
+ import numpy as np
24
+
25
+ from transformers.feature_extraction_utils import BatchFeature
26
+ from transformers.image_utils import ImageInput
27
+ from transformers.processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs
28
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
29
+ from transformers.video_utils import VideoInput
30
+
31
+
32
+ class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False):
33
+ fps: Union[list[float], float]
34
+
35
+
36
+ class Qwen2_5_VLImagesKwargs(ImagesKwargs):
37
+ min_pixels: Optional[int]
38
+ max_pixels: Optional[int]
39
+ patch_size: Optional[int]
40
+ temporal_patch_size: Optional[int]
41
+ merge_size: Optional[int]
42
+
43
+
44
+ class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False):
45
+ images_kwargs: Qwen2_5_VLImagesKwargs
46
+ videos_kwargs: Qwen2_5_VLVideosProcessorKwargs
47
+ _defaults = {
48
+ "text_kwargs": {
49
+ "padding": False,
50
+ "return_mm_token_type_ids": False,
51
+ },
52
+ }
53
+
54
+
55
+ class Qwen2_5_VLProcessor(ProcessorMixin):
56
+ r"""
57
+ Constructs a Qwen2.5-VL processor which wraps a Qwen2.5-VL image processor and a Qwen2 tokenizer into a single processor.
58
+ [`Qwen2_5_VLProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the
59
+ [`~Qwen2_5_VLProcessor.__call__`] and [`~Qwen2_5_VLProcessor.decode`] for more information.
60
+ Args:
61
+ image_processor ([`Qwen2VLImageProcessor`], *optional*):
62
+ The image processor is a required input.
63
+ tokenizer ([`Qwen2TokenizerFast`], *optional*):
64
+ The tokenizer is a required input.
65
+ video_processor ([`Qwen2_5_VLVideoProcessor`], *optional*):
66
+ The video processor is a required input.
67
+ chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
68
+ in a chat into a tokenizable string.
69
+ """
70
+
71
+ attributes = ["image_processor", "tokenizer", "video_processor"]
72
+
73
+ image_processor_class = "AutoImageProcessor"
74
+ video_processor_class = "AutoVideoProcessor"
75
+ tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
76
+
77
+ def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs):
78
+ self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
79
+ self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
80
+ self.image_token_id = (
81
+ tokenizer.image_token_id
82
+ if getattr(tokenizer, "image_token_id", None)
83
+ else tokenizer.convert_tokens_to_ids(self.image_token)
84
+ )
85
+ self.video_token_id = (
86
+ tokenizer.video_token_id
87
+ if getattr(tokenizer, "video_token_id", None)
88
+ else tokenizer.convert_tokens_to_ids(self.video_token)
89
+ )
90
+ super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)
91
+
92
+ def __call__(
93
+ self,
94
+ images: Optional[ImageInput] = None,
95
+ text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
96
+ videos: Optional[VideoInput] = None,
97
+ **kwargs: Unpack[Qwen2_5_VLProcessorKwargs],
98
+ ) -> BatchFeature:
99
+ """
100
+ Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text`
101
+ and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
102
+ the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwargs` arguments to
103
+ Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.
104
+
105
+ Args:
106
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
107
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
108
+ tensor. Both channels-first and channels-last formats are supported.
109
+ text (`str`, `list[str]`, `list[list[str]]`):
110
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
111
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
112
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
113
+ videos (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
114
+ The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
115
+ tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
116
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
117
+ If set, will return tensors of a particular framework. Acceptable values are:
118
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
119
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
120
+ - `'np'`: Return NumPy `np.ndarray` objects.
121
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
122
+
123
+ Returns:
124
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
125
+
126
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
127
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
128
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
129
+ `None`).
130
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
131
+ - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
132
+ - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
133
+ - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
134
+ - **second_per_grid_ts** -- List of video seconds per time grid. Returned when `videos` is not `None`.
135
+ """
136
+ output_kwargs = self._merge_kwargs(
137
+ Qwen2_5_VLProcessorKwargs,
138
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
139
+ **kwargs,
140
+ )
141
+
142
+ image_inputs = videos_inputs = {}
143
+ if images is not None:
144
+ image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
145
+ image_grid_thw = image_inputs["image_grid_thw"]
146
+
147
+ if videos is not None:
148
+ fps = output_kwargs["videos_kwargs"].get("fps", 2.0)
149
+ videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
150
+ video_grid_thw = videos_inputs["video_grid_thw"]
151
+
152
+ if isinstance(fps, (int, float)):
153
+ second_per_grid_ts = [self.video_processor.temporal_patch_size / fps] * len(video_grid_thw)
154
+ elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw):
155
+ second_per_grid_ts = [self.video_processor.temporal_patch_size / tmp for tmp in fps]
156
+ else:
157
+ raise ValueError(
158
+ f"The length of fps ({len(fps) if hasattr(fps, '__len__') else fps}) must be equal to the length of video_grid_thw ({len(video_grid_thw)}) or fps should be a single number."
159
+ )
160
+ videos_inputs.update({"second_per_grid_ts": second_per_grid_ts})
161
+
162
+ if not isinstance(text, list):
163
+ text = [text]
164
+
165
+ text = text.copy() # below lines change text in-place
166
+ if images is not None:
167
+ merge_length = self.image_processor.merge_size**2
168
+ index = 0
169
+ for i in range(len(text)):
170
+ while self.image_token in text[i]:
171
+ num_image_tokens = image_grid_thw[index].prod() // merge_length
172
+ text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
173
+ index += 1
174
+ text[i] = text[i].replace("<|placeholder|>", self.image_token)
175
+
176
+ if videos is not None:
177
+ merge_length = self.video_processor.merge_size**2
178
+ index = 0
179
+ for i in range(len(text)):
180
+ while self.video_token in text[i]:
181
+ num_video_tokens = video_grid_thw[index].prod() // merge_length
182
+ text[i] = text[i].replace(self.video_token, "<|placeholder|>" * num_video_tokens, 1)
183
+ index += 1
184
+ text[i] = text[i].replace("<|placeholder|>", self.video_token)
185
+
186
+ return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
187
+ return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
188
+ text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
189
+ self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])
190
+
191
+ if return_mm_token_type_ids:
192
+ array_ids = np.array(text_inputs["input_ids"])
193
+ mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
194
+ mm_token_type_ids[array_ids == self.image_token_id] = 1
195
+ text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
196
+
197
+ return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors)
198
+
199
+ def _get_num_multimodal_tokens(self, image_sizes=None, video_sizes=None, **kwargs):
200
+ """
201
+ Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
202
+ Args:
203
+ image_sizes (`list[list[int]]`, *optional*):
204
+ The input sizes formatted as (height, width) per each image.
205
+ video_sizes (`list[list[int]]`, *optional*):
206
+ The input sizes formatted as (num_frames, height, width) per each video.
207
+ Returns:
208
+ `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
209
+ input modalities, along with other useful data.
210
+ """
211
+
212
+ vision_data = {}
213
+ if image_sizes is not None:
214
+ images_kwargs = Qwen2_5_VLProcessorKwargs._defaults.get("images_kwargs", {})
215
+ images_kwargs.update(kwargs)
216
+ merge_size = images_kwargs.get("merge_size", None) or self.image_processor.merge_size
217
+
218
+ num_image_patches = [
219
+ self.image_processor.get_number_of_image_patches(*image_size, images_kwargs)
220
+ for image_size in image_sizes
221
+ ]
222
+ num_image_tokens = [(num_patches // merge_size**2) for num_patches in num_image_patches]
223
+ vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})
224
+
225
+ if video_sizes is not None:
226
+ videos_kwargs = Qwen2_5_VLProcessorKwargs._defaults.get("videos_kwargs", {})
227
+ videos_kwargs.update(kwargs)
228
+ num_video_patches = [
229
+ self.video_processor.get_number_of_video_patches(*video_size, videos_kwargs)
230
+ for video_size in video_sizes
231
+ ]
232
+ num_video_tokens = [(num_patches // merge_size**2) for num_patches in num_video_patches]
233
+ vision_data["num_video_tokens"] = num_video_tokens
234
+
235
+ return MultiModalData(**vision_data)
236
+
237
+ def post_process_image_text_to_text(
238
+ self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs
239
+ ):
240
+ """
241
+ Post-process the output of the model to decode the text.
242
+
243
+ Args:
244
+ generated_outputs (`torch.Tensor` or `np.ndarray`):
245
+ The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
246
+ or `(sequence_length,)`.
247
+ skip_special_tokens (`bool`, *optional*, defaults to `True`):
248
+ Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
249
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
250
+ Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
251
+ **kwargs:
252
+ Additional arguments to be passed to the tokenizer's `batch_decode` method.
253
+
254
+ Returns:
255
+ `list[str]`: The decoded text.
256
+ """
257
+ return self.tokenizer.batch_decode(
258
+ generated_outputs,
259
+ skip_special_tokens=skip_special_tokens,
260
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
261
+ **kwargs,
262
+ )
263
+
264
+ @property
265
+ def model_input_names(self):
266
+ tokenizer_input_names = self.tokenizer.model_input_names
267
+ image_processor_input_names = self.image_processor.model_input_names
268
+ names_from_processor = list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
269
+ return names_from_processor + ["second_per_grid_ts"]
270
+
271
+
272
+ __all__ = ["Qwen2_5_VLProcessor"]
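To make the placeholder expansion above concrete, here is a small, assumption-laden sanity check: it presumes the processor loads from this repo via AutoProcessor with trust_remote_code, and the repo id and image are placeholders. The point is that every <|image_pad|> in the prompt is expanded to image_grid_thw.prod() // merge_size**2 image tokens before tokenization, so input_ids and pixel_values stay aligned.

from PIL import Image
from transformers import AutoProcessor

# Placeholders: swap in the real repo id / image.
processor = AutoProcessor.from_pretrained("path/or/repo-id", trust_remote_code=True)
image = Image.open("demo.jpg")

prompt = (
    "<|im_start|>user\n"
    "<|vision_start|><|image_pad|><|vision_end|>What is shown here?<|im_end|>\n"
    "<|im_start|>assistant\n"
)
batch = processor(text=[prompt], images=[image], return_tensors="pt")

grid_t, grid_h, grid_w = batch["image_grid_thw"][0].tolist()
expected = (grid_t * grid_h * grid_w) // processor.image_processor.merge_size**2
actual = (batch["input_ids"][0] == processor.image_token_id).sum().item()
assert actual == expected  # one image token per merged 2x2 patch group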
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
tokenizer_config.json ADDED
@@ -0,0 +1,209 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "padding_side": "right",
205
+ "processor_class": "Qwen2_5_VLProcessor",
206
+ "split_special_tokens": false,
207
+ "tokenizer_class": "Qwen2Tokenizer",
208
+ "unk_token": null
209
+ }
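A quick, hedged sanity check of the special-token wiring above (the repo path is a placeholder): loading the tokenizer should surface <|im_end|> as the EOS token, <|endoftext|> as the pad token, and the vision pad tokens at the ids listed in added_tokens_decoder.

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("path/or/repo-id")  # placeholder path
assert tok.eos_token == "<|im_end|>"
assert tok.pad_token == "<|endoftext|>"
assert tok.convert_tokens_to_ids("<|image_pad|>") == 151655
assert tok.convert_tokens_to_ids("<|video_pad|>") == 151656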
vocab.json ADDED
The diff for this file is too large to render. See raw diff