xiazhi committed (verified)
Commit 8dcc5d2 · 1 Parent(s): f7e3b14

Upload folder using huggingface_hub

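The commit message indicates the files were pushed with `huggingface_hub`; below is a minimal sketch of such an upload call, with placeholder values for the local folder and repo id (neither is recorded in this commit).

```python
from huggingface_hub import HfApi

api = HfApi()
# Hypothetical local folder and repo id; substitute the real ones.
api.upload_folder(
    folder_path="./DiffusionVL-Qwen2.5",
    repo_id="your-username/DiffusionVL-Qwen2.5",
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)
```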
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,68 @@
- ---
- license: apache-2.0
- ---
+ ---
+ license: apache-2.0
+ tags:
+ - diffusion
+ - vision-language
+ - qwen2.5
+ - siglip
+ ---
+
+ # DiffusionVL-Qwen2.5
+
+ DiffusionVL model with a SigLIP vision encoder, a PoolerProjector, and a Qwen2.5 LLM using BD3LM diffusion-based generation.
+
+ ## Usage
+
+ ```python
+ from transformers import AutoModelForCausalLM, AutoProcessor
+ import torch
+ from PIL import Image
+
+ # Load model
+ model = AutoModelForCausalLM.from_pretrained(
+     "path/to/model",
+     torch_dtype=torch.bfloat16,
+     device_map="auto",
+     trust_remote_code=True
+ )
+
+ # Load processor
+ processor = AutoProcessor.from_pretrained("path/to/model", trust_remote_code=True)
+
+ # Prepare inputs
+ image = Image.open("image.jpg").convert("RGB")
+ messages = [
+     {"role": "user", "content": [
+         {"type": "image"},
+         {"type": "text", "text": "Describe this image."}
+     ]}
+ ]
+ text = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+ inputs = processor(text=[text], images=[image], return_tensors="pt", padding=True)
+ inputs = {k: v.to(model.device) if hasattr(v, 'to') else v for k, v in inputs.items()}
+
+ # Generate
+ output_ids = model.generate(
+     inputs=inputs["input_ids"],
+     images=inputs.get("pixel_values"),
+     gen_length=256,
+     steps=8,
+     temperature=0.0,
+     remasking_strategy="low_confidence_static",
+ )
+
+ # Decode
+ output_text = processor.decode(output_ids[0], skip_special_tokens=True)
+ print(output_text)
+ ```
+
+ ## Model Configuration
+
+ - **Architecture**: DiffusionVL_Qwen2_5_ForConditionalGeneration
+ - **Vision Encoder**: SigLIP (384x384, patch_size=14)
+ - **MM Projector**: PoolerProjector (Conv2d + MLP)
+ - **LLM**: Qwen2.5 (standard RoPE)
+ - **BD3LM Enabled**: True
+ - **Block Size**: 8
+ - **Hidden Size**: 3584
+ - **Num Layers**: 28
added_tokens.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "</tool_call>": 151658,
+   "<tool_call>": 151657,
+   "<|box_end|>": 151649,
+   "<|box_start|>": 151648,
+   "<|endoftext|>": 151643,
+   "<|file_sep|>": 151664,
+   "<|fim_middle|>": 151660,
+   "<|fim_pad|>": 151662,
+   "<|fim_prefix|>": 151659,
+   "<|fim_suffix|>": 151661,
+   "<|im_end|>": 151645,
+   "<|im_start|>": 151644,
+   "<|image_pad|>": 151655,
+   "<|object_ref_end|>": 151647,
+   "<|object_ref_start|>": 151646,
+   "<|quad_end|>": 151651,
+   "<|quad_start|>": 151650,
+   "<|repo_name|>": 151663,
+   "<|video_pad|>": 151656,
+   "<|vision_end|>": 151653,
+   "<|vision_pad|>": 151654,
+   "<|vision_start|>": 151652
+ }
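These entries pin the IDs of the Qwen2.5 special tokens; a quick sanity check against the tokenizer shipped in the same folder (the local path is a placeholder, as in the README):

```python
from transformers import AutoTokenizer

# Placeholder path; point this at the downloaded repo directory or the hub id.
tokenizer = AutoTokenizer.from_pretrained("path/to/model", trust_remote_code=True)

# Each entry in added_tokens.json should resolve to the same id at runtime.
for token, expected_id in [
    ("<|im_start|>", 151644),
    ("<|image_pad|>", 151655),
    ("<|vision_start|>", 151652),
]:
    assert tokenizer.convert_tokens_to_ids(token) == expected_id, token
```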
chat_template.jinja ADDED
@@ -0,0 +1,7 @@
+ {% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
+ You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
+ {% endif %}<|im_start|>{{ message['role'] }}
+ {% if message['content'] is string %}{{ message['content'] }}<|im_end|>
+ {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}<image>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}<video>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
+ {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
+ {% endif %}
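The template emits the default system preamble, an <image> or <video> placeholder per multimodal content item, and an assistant header when add_generation_prompt is set. A short sketch of rendering it through the tokenizer (placeholder path, messages as in the README example):

```python
from transformers import AutoTokenizer

# Placeholder path; substitute the downloaded repo directory or the hub id.
tokenizer = AutoTokenizer.from_pretrained("path/to/model", trust_remote_code=True)

messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "Describe this image."},
    ]},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)  # system preamble, <image> placeholder, user text, then "<|im_start|>assistant"
```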
config.json ADDED
@@ -0,0 +1,273 @@
+ {
+   "add_faster_video": false,
+   "add_time_instruction": false,
+   "anneal_start_block_size": 1,
+   "architectures": [
+     "DiffusionVL_Qwen2_5_ForConditionalGeneration"
+   ],
+   "attention_dropout": 0.0,
+   "bd3lm_antithetic_sampling": true,
+   "bd3lm_attn_backend": "sdpa",
+   "bd3lm_block_aligned_eos": true,
+   "bd3lm_block_size": 8,
+   "bd3lm_complementary_mask": false,
+   "bd3lm_cross_attn": true,
+   "bd3lm_ignore_bos": true,
+   "bd3lm_noise_granularity": "block",
+   "bd3lm_noise_type": "loglinear",
+   "bd3lm_parameterization": "subs",
+   "bd3lm_resample": false,
+   "bd3lm_sampling_eps_max": 1.0,
+   "bd3lm_sampling_eps_min": 0.001,
+   "bd3lm_time_conditioning": false,
+   "bd3lm_token_shift_prediction": false,
+   "bd3lm_var_min": true,
+   "bos_token_id": 151643,
+   "enable_bd3lm": true,
+   "enable_block_size_annealing": false,
+   "enable_noise_level_annealing": false,
+   "eos_token_id": 151645,
+   "faster_token_stride": 10,
+   "force_sample": false,
+   "hidden_act": "silu",
+   "hidden_size": 3584,
+   "image_aspect_ratio": "anyres_max_4",
+   "image_crop_resolution": null,
+   "image_grid_pinpoints": [
+     [
+       384,
+       384
+     ],
+     [
+       384,
+       768
+     ],
+     [
+       384,
+       1152
+     ],
+     [
+       384,
+       1536
+     ],
+     [
+       384,
+       1920
+     ],
+     [
+       384,
+       2304
+     ],
+     [
+       768,
+       384
+     ],
+     [
+       768,
+       768
+     ],
+     [
+       768,
+       1152
+     ],
+     [
+       768,
+       1536
+     ],
+     [
+       768,
+       1920
+     ],
+     [
+       768,
+       2304
+     ],
+     [
+       1152,
+       384
+     ],
+     [
+       1152,
+       768
+     ],
+     [
+       1152,
+       1152
+     ],
+     [
+       1152,
+       1536
+     ],
+     [
+       1152,
+       1920
+     ],
+     [
+       1152,
+       2304
+     ],
+     [
+       1536,
+       384
+     ],
+     [
+       1536,
+       768
+     ],
+     [
+       1536,
+       1152
+     ],
+     [
+       1536,
+       1536
+     ],
+     [
+       1536,
+       1920
+     ],
+     [
+       1536,
+       2304
+     ],
+     [
+       1920,
+       384
+     ],
+     [
+       1920,
+       768
+     ],
+     [
+       1920,
+       1152
+     ],
+     [
+       1920,
+       1536
+     ],
+     [
+       1920,
+       1920
+     ],
+     [
+       1920,
+       2304
+     ],
+     [
+       2304,
+       384
+     ],
+     [
+       2304,
+       768
+     ],
+     [
+       2304,
+       1152
+     ],
+     [
+       2304,
+       1536
+     ],
+     [
+       2304,
+       1920
+     ],
+     [
+       2304,
+       2304
+     ]
+   ],
+   "image_split_resolution": null,
+   "initializer_range": 0.02,
+   "intermediate_size": 18944,
+   "layer_types": [
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention"
+   ],
+   "max_pixels": 262144,
+   "max_position_embeddings": 32768,
+   "max_window_layers": 28,
+   "min_pixels": 147456,
+   "mm_hidden_size": 1152,
+   "mm_newline_position": "grid",
+   "mm_patch_merge_type": "spatial_unpad",
+   "mm_projector_lr": null,
+   "mm_projector_type": "mlp2x_gelu",
+   "mm_resampler_type": null,
+   "mm_spatial_pool_mode": "bilinear",
+   "mm_spatial_pool_stride": null,
+   "mm_tunable_parts": "mm_vision_tower,mm_mlp_adapter,mm_language_model",
+   "mm_use_im_patch_token": false,
+   "mm_use_im_start_end": false,
+   "mm_vision_select_feature": "patch",
+   "mm_vision_select_layer": -2,
+   "mm_vision_tower": "/data/minimax-dialogue/users/qingke/results/hf_models/siglip2-so400m-patch14-384",
+   "mm_vision_tower_lr": 2e-06,
+   "model_max_length": 8192,
+   "model_type": "diffusionvl_qwen2_5",
+   "num_attention_heads": 28,
+   "num_hidden_layers": 28,
+   "num_key_value_heads": 4,
+   "pos_skipping_range": 4096,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 1000000.0,
+   "sliding_window": null,
+   "tie_word_embeddings": false,
+   "tokenizer_model_max_length": 8192,
+   "tokenizer_padding_side": "right",
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.55.0",
+   "use_cache": true,
+   "use_mm_proj": true,
+   "use_pos_skipping": false,
+   "use_sliding_window": false,
+   "vision_tower_pretrained": null,
+   "vocab_size": 152064,
+   "mask_token_id": 151671,
+   "vision_config": {
+     "hidden_size": 1152,
+     "intermediate_size": 4304,
+     "num_hidden_layers": 26,
+     "num_attention_heads": 16,
+     "num_channels": 3,
+     "image_size": 384,
+     "patch_size": 14,
+     "hidden_act": "gelu_pytorch_tanh",
+     "layer_norm_eps": 1e-06,
+     "attention_dropout": 0.0
+   },
+   "auto_map": {
+     "AutoConfig": "configuration_diffusionvl_qwen2_5.DiffusionVL_Qwen2_5_Config",
+     "AutoModelForCausalLM": "modeling_diffusionvl_qwen2_5.DiffusionVL_Qwen2_5_ForConditionalGeneration",
+     "AutoProcessor": "processing_diffusionvl_qwen2_5.DiffusionVL_Qwen2_5_Processor"
+   }
+ }
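Because config.json declares an auto_map, the custom configuration class resolves through AutoConfig when trust_remote_code is enabled; a quick sketch of inspecting the diffusion-related fields (placeholder path):

```python
from transformers import AutoConfig

# Placeholder path; point this at the downloaded repo or the hub id.
config = AutoConfig.from_pretrained("path/to/model", trust_remote_code=True)

print(config.model_type)        # "diffusionvl_qwen2_5"
print(config.enable_bd3lm)      # True
print(config.bd3lm_block_size)  # 8
print(config.mask_token_id)     # 151671
print(config.vision_config.image_size, config.vision_config.patch_size)  # 384 14
```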
configuration_diffusionvl_qwen2_5.py ADDED
@@ -0,0 +1,189 @@
+ # coding=utf-8
+ # Copyright 2025 The HustVL Team and The HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """DiffusionVL-Qwen2.5 model configuration."""
+
+ from typing import List, Optional, Union
+
+ from transformers.configuration_utils import PretrainedConfig
+
+
+ class DiffusionVL_Qwen2_5_VisionConfig(PretrainedConfig):
+     """
+     Configuration for SigLIP vision encoder used in DiffusionVL-Qwen2.5.
+
+     Args:
+         hidden_size: Dimension of the encoder layers (1152 for SigLIP-SO400M).
+         intermediate_size: Dimension of the MLP layers.
+         num_hidden_layers: Number of transformer layers.
+         num_attention_heads: Number of attention heads.
+         num_channels: Number of input channels.
+         image_size: Input image resolution.
+         patch_size: Patch size for patch embedding.
+         hidden_act: Activation function.
+         layer_norm_eps: Layer normalization epsilon.
+         attention_dropout: Attention dropout probability.
+     """
+
+     model_type = "diffusionvl_qwen2_5_vision"
+     base_config_key = "vision_config"
+
+     def __init__(
+         self,
+         hidden_size: int = 1152,
+         intermediate_size: int = 4304,
+         num_hidden_layers: int = 26,  # SigLIP uses 27 layers, but last one is removed
+         num_attention_heads: int = 16,
+         num_channels: int = 3,
+         image_size: int = 384,
+         patch_size: int = 14,
+         hidden_act: str = "gelu_pytorch_tanh",
+         layer_norm_eps: float = 1e-6,
+         attention_dropout: float = 0.0,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.num_channels = num_channels
+         self.image_size = image_size
+         self.patch_size = patch_size
+         self.hidden_act = hidden_act
+         self.layer_norm_eps = layer_norm_eps
+         self.attention_dropout = attention_dropout
+
+
+ class DiffusionVL_Qwen2_5_Config(PretrainedConfig):
+     """
+     Configuration for DiffusionVL-Qwen2.5 model.
+
+     This model uses:
+     - SigLIP as the vision encoder (external ViT)
+     - PoolerProjector as the MM projector (Conv2d + MLP)
+     - Qwen2.5 as the LLM backbone (standard RoPE, not M-RoPE)
+     - BD3LM for diffusion-based generation
+
+     Args:
+         vocab_size: Vocabulary size.
+         hidden_size: Dimension of the hidden representations.
+         intermediate_size: Dimension of the MLP representations.
+         num_hidden_layers: Number of hidden layers.
+         num_attention_heads: Number of attention heads.
+         num_key_value_heads: Number of key-value heads for GQA.
+         hidden_act: Activation function.
+         max_position_embeddings: Maximum sequence length.
+         initializer_range: Standard deviation for weight initialization.
+         rms_norm_eps: Epsilon for RMS normalization.
+         use_cache: Whether to use KV cache.
+         tie_word_embeddings: Whether to tie input and output embeddings.
+         attention_dropout: Attention dropout probability.
+         vision_config: Vision encoder configuration.
+         mm_hidden_size: Vision encoder hidden size for projector.
+         enable_bd3lm: Whether to enable BD3LM.
+         bd3lm_block_size: Block size for BD3LM.
+         bd3lm_cross_attn: Whether to use cross-attention in BD3LM.
+         mask_token_id: Token ID for mask token.
+         rope_theta: RoPE base period.
+         sliding_window: Sliding window size for attention.
+     """
+
+     model_type = "diffusionvl_qwen2_5"
+     sub_configs = {"vision_config": DiffusionVL_Qwen2_5_VisionConfig}
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     def __init__(
+         self,
+         # Text model parameters (Qwen2.5 compatible)
+         vocab_size: int = 152064,
+         hidden_size: int = 3584,
+         intermediate_size: int = 18944,
+         num_hidden_layers: int = 28,
+         num_attention_heads: int = 28,
+         num_key_value_heads: int = 4,
+         hidden_act: str = "silu",
+         max_position_embeddings: int = 32768,
+         initializer_range: float = 0.02,
+         rms_norm_eps: float = 1e-6,
+         use_cache: bool = True,
+         tie_word_embeddings: bool = False,
+         attention_dropout: float = 0.0,
+         # Vision configuration
+         vision_config: Optional[Union[DiffusionVL_Qwen2_5_VisionConfig, dict]] = None,
+         # MM projector
+         mm_hidden_size: int = 1152,  # SigLIP hidden size
+         # BD3LM diffusion parameters
+         enable_bd3lm: bool = True,
+         bd3lm_block_size: int = 8,
+         bd3lm_cross_attn: bool = True,
+         bd3lm_antithetic_sampling: bool = True,
+         bd3lm_sampling_eps_min: float = 1e-3,
+         bd3lm_sampling_eps_max: float = 1.0,
+         mask_token_id: int = 151671,
+         # RoPE parameters (standard RoPE, not M-RoPE)
+         rope_theta: float = 1000000.0,
+         rope_scaling: Optional[dict] = None,
+         # Sliding window attention
+         sliding_window: int = 32768,
+         max_window_layers: int = 28,
+         use_sliding_window: bool = False,
+         **kwargs,
+     ):
+         # Text model configuration
+         self.vocab_size = vocab_size
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.num_key_value_heads = num_key_value_heads
+         self.hidden_act = hidden_act
+         self.max_position_embeddings = max_position_embeddings
+         self.initializer_range = initializer_range
+         self.rms_norm_eps = rms_norm_eps
+         self.use_cache = use_cache
+         self.attention_dropout = attention_dropout
+         self.rope_theta = rope_theta
+         self.rope_scaling = rope_scaling
+         self.sliding_window = sliding_window
+         self.max_window_layers = max_window_layers
+         self.use_sliding_window = use_sliding_window
+
+         # Vision configuration
+         if vision_config is None:
+             self.vision_config = DiffusionVL_Qwen2_5_VisionConfig()
+         elif isinstance(vision_config, dict):
+             self.vision_config = DiffusionVL_Qwen2_5_VisionConfig(**vision_config)
+         elif isinstance(vision_config, DiffusionVL_Qwen2_5_VisionConfig):
+             self.vision_config = vision_config
+         else:
+             self.vision_config = DiffusionVL_Qwen2_5_VisionConfig()
+
+         # MM projector
+         self.mm_hidden_size = mm_hidden_size
+
+         # BD3LM diffusion configuration
+         self.enable_bd3lm = enable_bd3lm
+         self.bd3lm_block_size = bd3lm_block_size
+         self.bd3lm_cross_attn = bd3lm_cross_attn
+         self.bd3lm_antithetic_sampling = bd3lm_antithetic_sampling
+         self.bd3lm_sampling_eps_min = bd3lm_sampling_eps_min
+         self.bd3lm_sampling_eps_max = bd3lm_sampling_eps_max
+         self.mask_token_id = mask_token_id
+
+         super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+
+
+ __all__ = ["DiffusionVL_Qwen2_5_Config", "DiffusionVL_Qwen2_5_VisionConfig"]
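The defaults above are meant to match config.json, so the class can be instantiated directly for a quick consistency check; a sketch assuming the repo directory is on sys.path (or is the current working directory):

```python
from configuration_diffusionvl_qwen2_5 import DiffusionVL_Qwen2_5_Config

# Defaults should mirror the values shipped in config.json.
config = DiffusionVL_Qwen2_5_Config()
assert config.hidden_size == 3584
assert config.enable_bd3lm and config.bd3lm_block_size == 8
assert config.mask_token_id == 151671
assert config.vision_config.hidden_size == 1152

# Serialization goes through the standard PretrainedConfig helpers.
config.save_pretrained("./diffusionvl_config_check")  # writes a config.json with these values
```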
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:96613e0135affeb37783597f8e0dd15985bfa4a5eb4cae4f6059e3cc2f22f693
+ size 4877668008
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ee0edc9a3304655a5eaed32f7198cc2b037c8d620e6fb2985e6f383002edc061
+ size 4932751008
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b65fc332179e0afa2f9f154afeecfeea0ea7c4ff1f82b73281a8dbafca5c26a5
+ size 4994571904
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:565699fa87e0b2bc0eea6a3242f6f64fa92550a0f35bbbad1e8128bf28674ab2
+ size 1255812224
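The four entries above are Git LFS pointers rather than the weights themselves; once the actual shards are downloaded, the oid/size fields can be verified locally. A sketch (the directory is a placeholder; shards 2 and 3 are checked the same way using the oids listed above):

```python
import hashlib
from pathlib import Path

# Placeholder directory containing the downloaded shards.
repo_dir = Path("path/to/model")

expected = {
    "model-00001-of-00004.safetensors": (
        "96613e0135affeb37783597f8e0dd15985bfa4a5eb4cae4f6059e3cc2f22f693", 4877668008),
    "model-00004-of-00004.safetensors": (
        "565699fa87e0b2bc0eea6a3242f6f64fa92550a0f35bbbad1e8128bf28674ab2", 1255812224),
}

for name, (oid, size) in expected.items():
    path = repo_dir / name
    assert path.stat().st_size == size, f"size mismatch for {name}"
    h = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            h.update(chunk)
    assert h.hexdigest() == oid, f"sha256 mismatch for {name}"
```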
model.safetensors.index.json ADDED
@@ -0,0 +1,773 @@
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 8030348832,
4
+ "total_size": 16060697672
5
+ },
6
+ "weight_map": {
7
+ "lm_head.weight": "model-00004-of-00004.safetensors",
8
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
9
+ "model.image_newline": "model-00001-of-00004.safetensors",
10
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
11
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
12
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
13
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
14
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
15
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
16
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
17
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
18
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
19
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
20
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
21
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
22
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
23
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
24
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
25
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
26
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
27
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
28
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
29
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
30
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
31
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
32
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
33
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
34
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
35
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
36
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
37
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
38
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
39
+ "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
40
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
41
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
42
+ "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
43
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
44
+ "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
45
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
46
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
47
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
48
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
49
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
50
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
51
+ "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
52
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
53
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
54
+ "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
55
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
56
+ "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
57
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
58
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
59
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
60
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
61
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
62
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
63
+ "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
64
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
65
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
66
+ "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
67
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
68
+ "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
69
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
70
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
71
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
72
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
73
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
74
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
75
+ "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
76
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
77
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
78
+ "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
79
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
80
+ "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
81
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
82
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
83
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
84
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
85
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
86
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
87
+ "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
88
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
89
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
90
+ "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
91
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
92
+ "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
93
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
94
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
95
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
96
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
97
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
98
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
99
+ "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
100
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
101
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
102
+ "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
103
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
104
+ "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
105
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
106
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
107
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
108
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
109
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
110
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
111
+ "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
112
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
113
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
114
+ "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
115
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
116
+ "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
117
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
118
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
119
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
120
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
121
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
122
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
123
+ "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
124
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
125
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
126
+ "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
127
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
128
+ "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
129
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
130
+ "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
131
+ "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
132
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
133
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
134
+ "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
135
+ "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
136
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
137
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
138
+ "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
139
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
140
+ "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
141
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
142
+ "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
143
+ "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
144
+ "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
145
+ "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
146
+ "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
147
+ "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
148
+ "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
149
+ "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
150
+ "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
151
+ "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
152
+ "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
153
+ "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
154
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
155
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
156
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
157
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
158
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
159
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
160
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
161
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
162
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
163
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
164
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
165
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
166
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
167
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
168
+ "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
169
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
170
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
171
+ "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
172
+ "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
173
+ "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
174
+ "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
175
+ "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
176
+ "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
177
+ "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
178
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
179
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
180
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
181
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
182
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
183
+ "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
184
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
185
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
186
+ "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
187
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
188
+ "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
189
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
190
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
191
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
192
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
193
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
194
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
195
+ "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
196
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
197
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
198
+ "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
199
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
200
+ "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
201
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
202
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
203
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
204
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
205
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
206
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
207
+ "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
208
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
209
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
210
+ "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
211
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
212
+ "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
213
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
214
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
215
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
216
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
217
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
218
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
219
+ "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
220
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
221
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
222
+ "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
223
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
224
+ "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
225
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
226
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
227
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
228
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
229
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
230
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
231
+ "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
232
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
233
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
234
+ "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
235
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
236
+ "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
237
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
238
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
239
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
240
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
241
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
242
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
243
+ "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
244
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
245
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
246
+ "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
247
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
248
+ "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
249
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
250
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
251
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
252
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
253
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
254
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
255
+ "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
256
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
257
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
258
+ "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
259
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
260
+ "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
261
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
262
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
263
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
264
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
265
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
266
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
267
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
268
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
269
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
270
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
271
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
272
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
273
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
274
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
275
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
276
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
277
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
278
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
279
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
280
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
281
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
282
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
283
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
284
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
285
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
286
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
287
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
288
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
289
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
290
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
291
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
292
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
293
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
294
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
295
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
296
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
297
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
298
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
299
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
300
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
301
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
302
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
303
+ "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
304
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
305
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
306
+ "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
307
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
308
+ "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
309
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
310
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
311
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
312
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
313
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
314
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
315
+ "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
316
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
317
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
318
+ "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
319
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
320
+ "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
321
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
322
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
323
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
324
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
325
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
326
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
327
+ "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
328
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
329
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
330
+ "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
331
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
332
+ "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
333
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
334
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
335
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
336
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
337
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
338
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
339
+ "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
340
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
341
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
342
+ "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
343
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
344
+ "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
345
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
346
+ "model.mm_projector.0.bias": "model-00004-of-00004.safetensors",
347
+ "model.mm_projector.0.weight": "model-00004-of-00004.safetensors",
348
+ "model.mm_projector.2.bias": "model-00004-of-00004.safetensors",
349
+ "model.mm_projector.2.weight": "model-00004-of-00004.safetensors",
350
+ "model.norm.weight": "model-00003-of-00004.safetensors",
351
+ "model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00004.safetensors",
352
+ "model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00004.safetensors",
353
+ "model.vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00004.safetensors",
354
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00004.safetensors",
355
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00004.safetensors",
356
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00004.safetensors",
357
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00004.safetensors",
358
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00004.safetensors",
359
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00004.safetensors",
360
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00004.safetensors",
361
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00004.safetensors",
362
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
363
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
364
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
365
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
366
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
367
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
368
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
369
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
370
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00004.safetensors",
371
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00004.safetensors",
372
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00004.safetensors",
373
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00004.safetensors",
374
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00004.safetensors",
375
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00004.safetensors",
376
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00004.safetensors",
377
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00004.safetensors",
378
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
379
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
380
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
381
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
382
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
383
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
384
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
385
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
386
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00004.safetensors",
387
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00004.safetensors",
388
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00004.safetensors",
389
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00004.safetensors",
390
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00004.safetensors",
391
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00004.safetensors",
392
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00004.safetensors",
393
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00004.safetensors",
394
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
395
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
396
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
397
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
398
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
399
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
400
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
401
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
402
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00004.safetensors",
403
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00004.safetensors",
404
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00004.safetensors",
405
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00004.safetensors",
406
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00004.safetensors",
407
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00004.safetensors",
408
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00004.safetensors",
409
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00004.safetensors",
410
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
411
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
412
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
413
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
414
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
415
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
416
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
417
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
418
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00004.safetensors",
419
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00004.safetensors",
420
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00004.safetensors",
421
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00004.safetensors",
422
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00004.safetensors",
423
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00004.safetensors",
424
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00004.safetensors",
425
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00004.safetensors",
426
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
427
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
428
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
429
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
430
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
431
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
432
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
433
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
434
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00004.safetensors",
435
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00004.safetensors",
436
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00004.safetensors",
437
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00004.safetensors",
438
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00004.safetensors",
439
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00004.safetensors",
440
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00004.safetensors",
441
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00004.safetensors",
442
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
443
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
444
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
445
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
446
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
447
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
448
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
449
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
450
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00004.safetensors",
451
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00004.safetensors",
452
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00004.safetensors",
453
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00004.safetensors",
454
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00004.safetensors",
455
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00004.safetensors",
456
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00004.safetensors",
457
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00004.safetensors",
458
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
459
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
460
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
461
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
462
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
463
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
464
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
465
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
466
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00004.safetensors",
467
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00004.safetensors",
468
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00004.safetensors",
469
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00004.safetensors",
470
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00004.safetensors",
471
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00004.safetensors",
472
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00004.safetensors",
473
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00004.safetensors",
474
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
475
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
476
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
477
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
478
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
479
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
480
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
481
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
482
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00004.safetensors",
483
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00004.safetensors",
484
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00004.safetensors",
485
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00004.safetensors",
486
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00004.safetensors",
487
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00004.safetensors",
488
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00004.safetensors",
489
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00004.safetensors",
490
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
491
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
492
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
493
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
494
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
495
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
496
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
497
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
498
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00004.safetensors",
499
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00004.safetensors",
500
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00004.safetensors",
501
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00004.safetensors",
502
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00004.safetensors",
503
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00004.safetensors",
504
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00004.safetensors",
505
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00004.safetensors",
506
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
507
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
508
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
509
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
510
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
511
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
512
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
513
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
514
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00004.safetensors",
515
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00004.safetensors",
516
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00004.safetensors",
517
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00004.safetensors",
518
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00004.safetensors",
519
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00004.safetensors",
520
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00004.safetensors",
521
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00004.safetensors",
522
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
523
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
524
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
525
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
526
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
527
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
528
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
529
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
530
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00004.safetensors",
531
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00004.safetensors",
532
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00004.safetensors",
533
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00004.safetensors",
534
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00004.safetensors",
535
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00004.safetensors",
536
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00004.safetensors",
537
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00004.safetensors",
538
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
539
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
540
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
541
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
542
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
543
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
544
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
545
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
546
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00004.safetensors",
547
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00004.safetensors",
548
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00004.safetensors",
549
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00004.safetensors",
550
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00004.safetensors",
551
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00004.safetensors",
552
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00004.safetensors",
553
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00004.safetensors",
554
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
555
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
556
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
557
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
558
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
559
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
560
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
561
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
562
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00004.safetensors",
563
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00004.safetensors",
564
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00004.safetensors",
565
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00004.safetensors",
566
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00004.safetensors",
567
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00004.safetensors",
568
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00004.safetensors",
569
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00004.safetensors",
570
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
571
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
572
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
573
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
574
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
575
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
576
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
577
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
578
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00004.safetensors",
579
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00004.safetensors",
580
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00004-of-00004.safetensors",
581
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00004-of-00004.safetensors",
582
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00004.safetensors",
583
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00004.safetensors",
584
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00004-of-00004.safetensors",
585
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00004-of-00004.safetensors",
586
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
587
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
588
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
589
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
590
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
591
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
592
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
593
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
594
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00004-of-00004.safetensors",
595
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00004-of-00004.safetensors",
596
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00004-of-00004.safetensors",
597
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00004-of-00004.safetensors",
598
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00004-of-00004.safetensors",
599
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00004-of-00004.safetensors",
600
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00004-of-00004.safetensors",
601
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00004-of-00004.safetensors",
602
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
603
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
604
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
605
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
606
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
607
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
608
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
609
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
610
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00004-of-00004.safetensors",
611
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00004-of-00004.safetensors",
612
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00004-of-00004.safetensors",
613
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00004-of-00004.safetensors",
614
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00004-of-00004.safetensors",
615
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00004-of-00004.safetensors",
616
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00004-of-00004.safetensors",
617
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00004-of-00004.safetensors",
618
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
619
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
620
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
621
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
622
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
623
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
624
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
625
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
626
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00004-of-00004.safetensors",
627
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00004-of-00004.safetensors",
628
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00004-of-00004.safetensors",
629
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00004-of-00004.safetensors",
630
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00004-of-00004.safetensors",
631
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00004-of-00004.safetensors",
632
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00004-of-00004.safetensors",
633
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00004-of-00004.safetensors",
634
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
635
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
636
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
637
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
638
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
639
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
640
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
641
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
642
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00004-of-00004.safetensors",
643
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00004-of-00004.safetensors",
644
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00004-of-00004.safetensors",
645
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00004-of-00004.safetensors",
646
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00004-of-00004.safetensors",
647
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00004-of-00004.safetensors",
648
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00004-of-00004.safetensors",
649
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00004-of-00004.safetensors",
650
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
651
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
652
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
653
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
654
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
655
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
656
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
657
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
658
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00004.safetensors",
659
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00004.safetensors",
660
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00004.safetensors",
661
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00004.safetensors",
662
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00004.safetensors",
663
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00004.safetensors",
664
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00004.safetensors",
665
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00004.safetensors",
666
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
667
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
668
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
669
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
670
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
671
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
672
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
673
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
674
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00004.safetensors",
675
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00004.safetensors",
676
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00004.safetensors",
677
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00004.safetensors",
678
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00004.safetensors",
679
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00004.safetensors",
680
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00004.safetensors",
681
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00004.safetensors",
682
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
683
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
684
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
685
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
686
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
687
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
688
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
689
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
690
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00004.safetensors",
691
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00004.safetensors",
692
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00004.safetensors",
693
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00004.safetensors",
694
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00004.safetensors",
695
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00004.safetensors",
696
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00004.safetensors",
697
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00004.safetensors",
698
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
699
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
700
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
701
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
702
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
703
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
704
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
705
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
706
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00004.safetensors",
707
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00004.safetensors",
708
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00004.safetensors",
709
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00004.safetensors",
710
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00004.safetensors",
711
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00004.safetensors",
712
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00004.safetensors",
713
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00004.safetensors",
714
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
715
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
716
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
717
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
718
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
719
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
720
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
721
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
722
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00004.safetensors",
723
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00004.safetensors",
724
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00004.safetensors",
725
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00004.safetensors",
726
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00004.safetensors",
727
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00004.safetensors",
728
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00004.safetensors",
729
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00004.safetensors",
730
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
731
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
732
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
733
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
734
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
735
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
736
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
737
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
738
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00004.safetensors",
739
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00004.safetensors",
740
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00004.safetensors",
741
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00004.safetensors",
742
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00004.safetensors",
743
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00004.safetensors",
744
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00004.safetensors",
745
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00004.safetensors",
746
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
747
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
748
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
749
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
750
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
751
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
752
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
753
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
754
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00004.safetensors",
755
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00004.safetensors",
756
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00004.safetensors",
757
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00004.safetensors",
758
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00004.safetensors",
759
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00004.safetensors",
760
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00004.safetensors",
761
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00004.safetensors",
762
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
763
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
764
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
765
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
766
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
767
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
768
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
769
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
770
+ "model.vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00004-of-00004.safetensors",
771
+ "model.vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00004-of-00004.safetensors"
772
+ }
773
+ }
modeling_diffusionvl_qwen2_5.py ADDED
@@ -0,0 +1,1225 @@
+ # coding=utf-8
+ # Copyright 2025 The HustVL Team and The HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """
+ DiffusionVL-Qwen2.5 model implementation.
+
+ This model uses:
+ - SigLIP as the vision encoder (external ViT)
+ - mlp2x_gelu as the MM projector (2-layer MLP with GELU)
+ - Qwen2.5 as the LLM backbone (standard RoPE)
+ - BD3LM for diffusion-based generation
+ """
+
+ import math
+ from typing import Callable, Dict, List, Optional, Tuple, Union
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ from transformers.activations import ACT2FN
+ from transformers.modeling_utils import PreTrainedModel
+ from transformers.cache_utils import Cache, DynamicCache
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+ from transformers.utils import logging
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
+
+ from .configuration_diffusionvl_qwen2_5 import DiffusionVL_Qwen2_5_Config, DiffusionVL_Qwen2_5_VisionConfig
+
+ logger = logging.get_logger(__name__)
+
+ IMAGE_TOKEN_INDEX = -200
+
+
+ # ============================================================================
+ # Image Processing Utilities (matching training code)
+ # ============================================================================
+
+ def select_best_resolution(original_size, possible_resolutions):
+     """
+     Selects the best resolution from a list of possible resolutions based on the original size.
+     """
+     original_width, original_height = original_size
+     best_fit = None
+     max_effective_resolution = 0
+     min_wasted_resolution = float("inf")
+
+     for width, height in possible_resolutions:
+         scale = min(width / original_width, height / original_height)
+         downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
+         effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
+         wasted_resolution = (width * height) - effective_resolution
+
+         if effective_resolution > max_effective_resolution or (effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution):
+             max_effective_resolution = effective_resolution
+             min_wasted_resolution = wasted_resolution
+             best_fit = (width, height)
+
+     return best_fit
+
+
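+ # Worked example for select_best_resolution (illustrative values only):
+ #   select_best_resolution((1000, 700), [(384, 384), (768, 384), (768, 768)])
+ #   keeps the most original pixels after an aspect-preserving downscale at the
+ #   (768, 768) candidate, so it returns (768, 768).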
+ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
+     """
+     Calculate the shape of the image patch grid after the preprocessing for images of any resolution.
+     """
+     import re
+     import ast
+
+     if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
+         assert patch_size in [224, 336, 384, 448, 512], "patch_size should be in [224, 336, 384, 448, 512]"
+         matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
+         range_start = tuple(map(int, matches[0]))
+         range_end = tuple(map(int, matches[-1]))
+         grid_pinpoints = [(i, j) for i in range(range_start[0], range_end[0] + 1) for j in range(range_start[1], range_end[1] + 1)]
+         grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints]
+     if type(grid_pinpoints) is list:
+         possible_resolutions = grid_pinpoints
+     else:
+         possible_resolutions = ast.literal_eval(grid_pinpoints)
+     width, height = select_best_resolution(image_size, possible_resolutions)
+     return width // patch_size, height // patch_size
+
+
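+ # Worked example for get_anyres_image_grid_shape (illustrative values only):
+ #   get_anyres_image_grid_shape((1000, 700), [[384, 768], [768, 384], [768, 768]], 384)
+ #   selects the (768, 768) resolution and therefore returns a (2, 2) patch grid.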
+ def unpad_image(tensor, original_size):
+     """
+     Unpads a PyTorch tensor of a padded and resized image.
+
+     Args:
+         tensor: The image tensor in CxHxW format.
+         original_size: The original size of the image (width, height).
+
+     Returns:
+         The unpadded image tensor.
+     """
+     original_width, original_height = original_size
+     current_height, current_width = tensor.shape[1:]
+
+     original_aspect_ratio = original_width / original_height
+     current_aspect_ratio = current_width / current_height
+
+     if original_aspect_ratio > current_aspect_ratio:
+         scale_factor = current_width / original_width
+         new_height = int(original_height * scale_factor)
+         padding = (current_height - new_height) // 2
+         unpadded_tensor = tensor[:, padding : current_height - padding, :]
+     else:
+         scale_factor = current_height / original_height
+         new_width = int(original_width * scale_factor)
+         padding = (current_width - new_width) // 2
+         unpadded_tensor = tensor[:, :, padding : current_width - padding]
+
+     return unpadded_tensor
+
+
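+ # Worked example for unpad_image (illustrative values only): for a (3, 336, 336)
+ # tensor produced from an original 1000x500 image, the original aspect ratio is
+ # wider than the padded square, so 84 rows of padding are cropped from the top
+ # and bottom, giving a (3, 168, 336) tensor.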
+ # ============================================================================
+ # Vision Encoder (SigLIP)
+ # ============================================================================
+
+ class SigLipVisionEmbeddings(nn.Module):
+     """Patch embedding for SigLIP vision encoder."""
+
+     def __init__(self, config: DiffusionVL_Qwen2_5_VisionConfig):
+         super().__init__()
+         self.config = config
+         self.embed_dim = config.hidden_size
+         self.image_size = config.image_size
+         self.patch_size = config.patch_size
+
+         self.patch_embedding = nn.Conv2d(
+             in_channels=config.num_channels,
+             out_channels=self.embed_dim,
+             kernel_size=self.patch_size,
+             stride=self.patch_size,
+             padding="valid",
+         )
+
+         self.num_patches = (self.image_size // self.patch_size) ** 2
+         self.num_positions = self.num_patches
+         self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+         self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
+
+     def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
+         patch_embeds = self.patch_embedding(pixel_values)
+         embeddings = patch_embeds.flatten(2).transpose(1, 2)
+         embeddings = embeddings + self.position_embedding(self.position_ids)
+         return embeddings
+
+
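+ # Shape note (assuming the 384x384, patch-size-14 SigLIP from the model card):
+ # 384 // 14 = 27, so the embedding layer produces 27 * 27 = 729 patch tokens,
+ # i.e. pixel_values of shape (B, 3, 384, 384) map to embeddings of shape
+ # (B, 729, hidden_size).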
+ class SigLipAttention(nn.Module):
+     """Multi-headed attention for SigLIP."""
+
+     def __init__(self, config: DiffusionVL_Qwen2_5_VisionConfig):
+         super().__init__()
+         self.config = config
+         self.embed_dim = config.hidden_size
+         self.num_heads = config.num_attention_heads
+         self.head_dim = self.embed_dim // self.num_heads
+         self.scale = self.head_dim ** -0.5
+         self.dropout = config.attention_dropout
+
+         self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
+         self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
+         self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
+         self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         attention_mask: Optional[torch.Tensor] = None,
+         output_attentions: Optional[bool] = False,
+     ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+         batch_size, q_len, _ = hidden_states.size()
+
+         query_states = self.q_proj(hidden_states)
+         key_states = self.k_proj(hidden_states)
+         value_states = self.v_proj(hidden_states)
+
+         query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+         key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+         value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+
+         attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale
+
+         if attention_mask is not None:
+             attn_weights = attn_weights + attention_mask
+
+         attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+         attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+         attn_output = torch.matmul(attn_weights, value_states)
+
+         attn_output = attn_output.transpose(1, 2).contiguous()
+         attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim)
+         attn_output = self.out_proj(attn_output)
+
+         return attn_output, attn_weights
+
+
+ class SigLipMLP(nn.Module):
211
+ """MLP for SigLIP."""
212
+
213
+ def __init__(self, config: DiffusionVL_Qwen2_5_VisionConfig):
214
+ super().__init__()
215
+ self.config = config
216
+ self.activation_fn = ACT2FN[config.hidden_act]
217
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
218
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
219
+
220
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
221
+ hidden_states = self.fc1(hidden_states)
222
+ hidden_states = self.activation_fn(hidden_states)
223
+ hidden_states = self.fc2(hidden_states)
224
+ return hidden_states
225
+
226
+
227
+ class SigLipEncoderLayer(nn.Module):
228
+ """Transformer encoder layer for SigLIP."""
229
+
230
+ def __init__(self, config: DiffusionVL_Qwen2_5_VisionConfig):
231
+ super().__init__()
232
+ self.embed_dim = config.hidden_size
233
+ self.self_attn = SigLipAttention(config)
234
+ self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
235
+ self.mlp = SigLipMLP(config)
236
+ self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
237
+
238
+ def forward(
239
+ self,
240
+ hidden_states: torch.Tensor,
241
+ attention_mask: Optional[torch.Tensor] = None,
242
+ output_attentions: Optional[bool] = False,
243
+ ) -> Tuple[torch.FloatTensor]:
244
+ residual = hidden_states
245
+ hidden_states = self.layer_norm1(hidden_states)
246
+ hidden_states, attn_weights = self.self_attn(
247
+ hidden_states=hidden_states,
248
+ attention_mask=attention_mask,
249
+ output_attentions=output_attentions,
250
+ )
251
+ hidden_states = residual + hidden_states
252
+
253
+ residual = hidden_states
254
+ hidden_states = self.layer_norm2(hidden_states)
255
+ hidden_states = self.mlp(hidden_states)
256
+ hidden_states = residual + hidden_states
257
+
258
+ outputs = (hidden_states,)
259
+ if output_attentions:
260
+ outputs += (attn_weights,)
261
+ return outputs
262
+
263
+
264
+ class SigLipEncoder(nn.Module):
265
+ """Transformer encoder for SigLIP."""
266
+
267
+ def __init__(self, config: DiffusionVL_Qwen2_5_VisionConfig):
268
+ super().__init__()
269
+ self.config = config
270
+ self.layers = nn.ModuleList([SigLipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
271
+
272
+ def forward(
273
+ self,
274
+ inputs_embeds: torch.Tensor,
275
+ attention_mask: Optional[torch.Tensor] = None,
276
+ output_attentions: Optional[bool] = None,
277
+ output_hidden_states: Optional[bool] = None,
278
+ ) -> Tuple:
279
+ hidden_states = inputs_embeds
280
+ all_hidden_states = () if output_hidden_states else None
281
+
282
+ for encoder_layer in self.layers:
283
+ if output_hidden_states:
284
+ all_hidden_states = all_hidden_states + (hidden_states,)
285
+ layer_outputs = encoder_layer(hidden_states, attention_mask, output_attentions)
286
+ hidden_states = layer_outputs[0]
287
+
288
+ if output_hidden_states:
289
+ all_hidden_states = all_hidden_states + (hidden_states,)
290
+
291
+ return hidden_states, all_hidden_states
292
+
293
+
294
+ class SigLipVisionTransformer(nn.Module):
295
+ """SigLIP Vision Transformer."""
296
+
297
+ def __init__(self, config: DiffusionVL_Qwen2_5_VisionConfig):
298
+ super().__init__()
299
+ self.config = config
300
+ self.embeddings = SigLipVisionEmbeddings(config)
301
+ self.encoder = SigLipEncoder(config)
302
+ self.post_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
303
+
304
+ def forward(
305
+ self,
306
+ pixel_values: torch.FloatTensor,
307
+ output_hidden_states: Optional[bool] = True,
308
+ ) -> torch.Tensor:
309
+ hidden_states = self.embeddings(pixel_values)
310
+ hidden_states, all_hidden_states = self.encoder(
311
+ inputs_embeds=hidden_states,
312
+ output_hidden_states=output_hidden_states,
313
+ )
314
+ # Return the last hidden state (before post_layernorm, matching SigLIP behavior)
315
+ # Use hidden_states from the last layer
316
+ if output_hidden_states and all_hidden_states:
317
+ return all_hidden_states[-1]
318
+ return hidden_states
319
+
320
+
321
+ class SigLipVisionModel(nn.Module):
322
+ """Wrapper to match training code structure: vision_model contains the transformer."""
323
+
324
+ def __init__(self, config: DiffusionVL_Qwen2_5_VisionConfig):
325
+ super().__init__()
326
+ self.vision_model = SigLipVisionTransformer(config)
327
+ self.config = config
328
+
329
+ def forward(self, pixel_values: torch.FloatTensor, output_hidden_states: bool = True) -> torch.Tensor:
330
+ return self.vision_model(pixel_values, output_hidden_states=output_hidden_states)
331
+
332
+
333
+ class DiffusionVL_Qwen2_5_VisionTower(nn.Module):
334
+ """Vision tower wrapping SigLIP. Matches training code: vision_tower.vision_tower.vision_model.xxx"""
335
+
336
+ def __init__(self, config: DiffusionVL_Qwen2_5_VisionConfig):
337
+ super().__init__()
338
+ self.vision_tower = SigLipVisionModel(config)
339
+ self.config = config
340
+
341
+ @property
342
+ def dtype(self):
343
+ return self.vision_tower.vision_model.embeddings.patch_embedding.weight.dtype
344
+
345
+ @property
346
+ def device(self):
347
+ return self.vision_tower.vision_model.embeddings.patch_embedding.weight.device
348
+
349
+ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
350
+ # Match training code: convert to vision tower dtype/device
351
+ pixel_values = pixel_values.to(device=self.device, dtype=self.dtype)
352
+ return self.vision_tower(pixel_values, output_hidden_states=True)
353
+
354
+
355
+ # ============================================================================
356
+ # MM Projector (mlp2x_gelu - matches training code)
357
+ # ============================================================================
358
+
359
+ def build_mm_projector(config: DiffusionVL_Qwen2_5_Config) -> nn.Module:
360
+ """
361
+ Build MM projector matching training code's mlp2x_gelu structure.
362
+
363
+ Structure:
364
+ 0: nn.Linear(mm_hidden_size, hidden_size) # 1152 -> 3584
365
+ 1: nn.GELU()
366
+ 2: nn.Linear(hidden_size, hidden_size) # 3584 -> 3584
367
+ """
368
+ return nn.Sequential(
369
+ nn.Linear(config.mm_hidden_size, config.hidden_size),
370
+ nn.GELU(),
371
+ nn.Linear(config.hidden_size, config.hidden_size),
372
+ )
373
+
374
+
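# Shape sketch for the projector above (illustrative; sizes taken from the docstring:
# mm_hidden_size=1152, hidden_size=3584). It shows how SigLIP tokens are mapped into the
# LLM embedding space; the Sequential mirrors build_mm_projector rather than calling it.
import torch
import torch.nn as nn
projector = nn.Sequential(nn.Linear(1152, 3584), nn.GELU(), nn.Linear(3584, 3584))
vision_tokens = torch.randn(1, 729, 1152)   # 729 = (384 // 14) ** 2 patch tokens per crop
llm_tokens = projector(vision_tokens)       # llm_tokens.shape == (1, 729, 3584)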
375
+ # ============================================================================
376
+ # LLM Components (Qwen2.5 based)
377
+ # ============================================================================
378
+
379
+ class DiffusionVL_Qwen2_5_RMSNorm(nn.Module):
380
+ def __init__(self, hidden_size, eps=1e-6):
381
+ super().__init__()
382
+ self.weight = nn.Parameter(torch.ones(hidden_size))
383
+ self.variance_epsilon = eps
384
+
385
+ def forward(self, hidden_states):
386
+ input_dtype = hidden_states.dtype
387
+ hidden_states = hidden_states.to(torch.float32)
388
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
389
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
390
+ return self.weight * hidden_states.to(input_dtype)
391
+
392
+
393
+ def rotate_half(x: torch.Tensor) -> torch.Tensor:
394
+ """Rotates half the hidden dims of the input."""
395
+ x1 = x[..., : x.shape[-1] // 2]
396
+ x2 = x[..., x.shape[-1] // 2 :]
397
+ return torch.cat((-x2, x1), dim=-1)
398
+
399
+
400
+ def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
401
+ """Apply standard rotary position embedding (not M-RoPE)."""
402
+ cos = cos.unsqueeze(unsqueeze_dim)
403
+ sin = sin.unsqueeze(unsqueeze_dim)
404
+ q_embed = (q * cos) + (rotate_half(q) * sin)
405
+ k_embed = (k * cos) + (rotate_half(k) * sin)
406
+ return q_embed, k_embed
407
+
408
+
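# Minimal RoPE sketch (illustrative shapes; rope_theta=10000 is assumed here, the real value
# comes from the config consumed by the rotary embedding class below). cos/sin have shape
# (batch, seq_len, head_dim) and unsqueeze_dim=1 broadcasts them across attention heads.
import torch
b, h, s, d = 1, 2, 4, 8
q, k = torch.randn(b, h, s, d), torch.randn(b, h, s, d)
inv_freq = 1.0 / (10000 ** (torch.arange(0, d, 2, dtype=torch.float32) / d))
freqs = torch.outer(torch.arange(s, dtype=torch.float32), inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)[None]            # (1, s, d)
q_rot, k_rot = apply_rotary_pos_emb(q, k, emb.cos(), emb.sin())
# Shapes are unchanged: q_rot.shape == (1, 2, 4, 8); only the pairwise-rotated values differ.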
409
+ class DiffusionVL_Qwen2_5_RotaryEmbedding(nn.Module):
410
+ """Standard rotary position embedding for Qwen2.5."""
411
+
412
+ def __init__(self, config: DiffusionVL_Qwen2_5_Config):
413
+ super().__init__()
414
+ self.config = config
415
+ dim = config.hidden_size // config.num_attention_heads
416
+ inv_freq = 1.0 / (config.rope_theta ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
417
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
418
+
419
+ @torch.no_grad()
420
+ def forward(self, x, position_ids):
421
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
422
+ position_ids_expanded = position_ids[:, None, :].float()
423
+
424
+ device_type = x.device.type
425
+ with torch.autocast(device_type=device_type, enabled=False):
426
+ freqs = (inv_freq_expanded @ position_ids_expanded).transpose(1, 2)
427
+ emb = torch.cat((freqs, freqs), dim=-1)
428
+ cos = emb.cos()
429
+ sin = emb.sin()
430
+
431
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
432
+
433
+
434
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
435
+ """Repeat key/value heads for GQA."""
436
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
437
+ if n_rep == 1:
438
+ return hidden_states
439
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
440
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
441
+
442
+
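# GQA sketch with assumed sizes (4 KV heads shared by 28 query heads, head_dim 128; the real
# values come from the loaded config). Each KV head is repeated n_rep = 28 // 4 = 7 times so
# key/value tensors line up with the query heads before attention.
import torch
kv = torch.randn(1, 4, 16, 128)             # (batch, num_kv_heads, seq_len, head_dim)
expanded = repeat_kv(kv, n_rep=7)
# expanded.shape == (1, 28, 16, 128): one copy of each KV head per query-head group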
443
+ class DiffusionVL_Qwen2_5_MLP(nn.Module):
444
+ def __init__(self, config: DiffusionVL_Qwen2_5_Config):
445
+ super().__init__()
446
+ self.hidden_size = config.hidden_size
447
+ self.intermediate_size = config.intermediate_size
448
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
449
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
450
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
451
+ self.act_fn = ACT2FN[config.hidden_act]
452
+
453
+ def forward(self, hidden_states):
454
+ return self.down_proj(self.act_fn(self.gate_proj(hidden_states)) * self.up_proj(hidden_states))
455
+
456
+
457
+ class DiffusionVL_Qwen2_5_Attention(nn.Module):
458
+ """Attention with BD3LM store_kv support."""
459
+
460
+ def __init__(self, config: DiffusionVL_Qwen2_5_Config, layer_idx: int):
461
+ super().__init__()
462
+ self.config = config
463
+ self.layer_idx = layer_idx
464
+
465
+ self.hidden_size = config.hidden_size
466
+ self.num_heads = config.num_attention_heads
467
+ self.head_dim = self.hidden_size // self.num_heads
468
+ self.num_key_value_heads = config.num_key_value_heads
469
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
470
+ self.scaling = self.head_dim ** -0.5
471
+ self.attention_dropout = config.attention_dropout
472
+ self.is_causal = False # BD3LM uses block causal mask
473
+
474
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
475
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
476
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
477
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
478
+
479
+ # Sliding window
480
+ self.sliding_window = config.sliding_window if (
481
+ config.use_sliding_window and layer_idx < config.max_window_layers
482
+ ) else None
483
+
484
+ def forward(
485
+ self,
486
+ hidden_states: torch.Tensor,
487
+ attention_mask: Optional[torch.Tensor] = None,
488
+ position_ids: Optional[torch.LongTensor] = None,
489
+ past_key_values: Optional[Cache] = None,
490
+ cache_position: Optional[torch.LongTensor] = None,
491
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
492
+ store_kv: bool = True,
493
+ **kwargs,
494
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
495
+ bsz, q_len, _ = hidden_states.size()
496
+
497
+ query_states = self.q_proj(hidden_states)
498
+ key_states = self.k_proj(hidden_states)
499
+ value_states = self.v_proj(hidden_states)
500
+
501
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
502
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
503
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
504
+
505
+ # Apply rotary embeddings
506
+ cos, sin = position_embeddings
507
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
508
+
509
+ # Handle KV cache
510
+ if past_key_values is not None:
511
+ if store_kv:
512
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
513
+ key_states, value_states = past_key_values.update(
514
+ key_states, value_states, self.layer_idx, cache_kwargs
515
+ )
516
+ else:
517
+ # Read-only: concatenate with cached KV
518
+ if self.layer_idx < len(past_key_values):
519
+ past_key_states, past_value_states = past_key_values[self.layer_idx]
520
+ key_states = torch.cat([past_key_states, key_states], dim=2)
521
+ value_states = torch.cat([past_value_states, value_states], dim=2)
522
+
523
+ # GQA: repeat KV heads
524
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
525
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
526
+
527
+ # Compute attention with SDPA
528
+ if attention_mask is not None:
529
+ attn_output = F.scaled_dot_product_attention(
530
+ query_states,
531
+ key_states,
532
+ value_states,
533
+ attn_mask=attention_mask,
534
+ dropout_p=0.0,
535
+ is_causal=False,
536
+ scale=self.scaling,
537
+ )
538
+ else:
539
+ attn_output = F.scaled_dot_product_attention(
540
+ query_states,
541
+ key_states,
542
+ value_states,
543
+ dropout_p=0.0,
544
+ is_causal=False,
545
+ scale=self.scaling,
546
+ )
547
+
548
+ attn_output = attn_output.transpose(1, 2).contiguous()
549
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
550
+ attn_output = self.o_proj(attn_output)
551
+
552
+ return attn_output, None
553
+
554
+
555
+ class DiffusionVL_Qwen2_5_DecoderLayer(nn.Module):
556
+ def __init__(self, config: DiffusionVL_Qwen2_5_Config, layer_idx: int):
557
+ super().__init__()
558
+ self.hidden_size = config.hidden_size
559
+ self.self_attn = DiffusionVL_Qwen2_5_Attention(config, layer_idx)
560
+ self.mlp = DiffusionVL_Qwen2_5_MLP(config)
561
+ self.input_layernorm = DiffusionVL_Qwen2_5_RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
562
+ self.post_attention_layernorm = DiffusionVL_Qwen2_5_RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
563
+
564
+ def forward(
565
+ self,
566
+ hidden_states: torch.Tensor,
567
+ attention_mask: Optional[torch.Tensor] = None,
568
+ position_ids: Optional[torch.LongTensor] = None,
569
+ past_key_values: Optional[Cache] = None,
570
+ cache_position: Optional[torch.LongTensor] = None,
571
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
572
+ store_kv: bool = True,
573
+ **kwargs,
574
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
575
+ residual = hidden_states
576
+ hidden_states = self.input_layernorm(hidden_states)
577
+
578
+ hidden_states, attn_weights = self.self_attn(
579
+ hidden_states=hidden_states,
580
+ attention_mask=attention_mask,
581
+ position_ids=position_ids,
582
+ past_key_values=past_key_values,
583
+ cache_position=cache_position,
584
+ position_embeddings=position_embeddings,
585
+ store_kv=store_kv,
586
+ **kwargs,
587
+ )
588
+ hidden_states = residual + hidden_states
589
+
590
+ residual = hidden_states
591
+ hidden_states = self.post_attention_layernorm(hidden_states)
592
+ hidden_states = self.mlp(hidden_states)
593
+ hidden_states = residual + hidden_states
594
+
595
+ return hidden_states, attn_weights
596
+
597
+
598
+ # ============================================================================
599
+ # Main Model Classes
600
+ # ============================================================================
601
+
602
+ class DiffusionVL_Qwen2_5_PreTrainedModel(PreTrainedModel):
603
+ config_class = DiffusionVL_Qwen2_5_Config
604
+ base_model_prefix = "model"
605
+ supports_gradient_checkpointing = True
606
+ _no_split_modules = ["DiffusionVL_Qwen2_5_DecoderLayer", "SigLipEncoderLayer"]
607
+
608
+ def _init_weights(self, module: nn.Module) -> None:
609
+ std = self.config.initializer_range
610
+ if isinstance(module, nn.Linear):
611
+ module.weight.data.normal_(mean=0.0, std=std)
612
+ if module.bias is not None:
613
+ module.bias.data.zero_()
614
+ elif isinstance(module, nn.Embedding):
615
+ module.weight.data.normal_(mean=0.0, std=std)
616
+
617
+
618
+ class DiffusionVL_Qwen2_5_Model(DiffusionVL_Qwen2_5_PreTrainedModel):
619
+ """Base model with vision tower, projector, and LLM layers."""
620
+
621
+ def __init__(self, config: DiffusionVL_Qwen2_5_Config):
622
+ super().__init__(config)
623
+ self.config = config
624
+
625
+ # Vision components
626
+ self.vision_tower = DiffusionVL_Qwen2_5_VisionTower(config.vision_config)
627
+ self.mm_projector = build_mm_projector(config)
628
+ self.image_newline = nn.Parameter(torch.zeros(config.hidden_size))
629
+
630
+ # LLM components
631
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
632
+ self.layers = nn.ModuleList([
633
+ DiffusionVL_Qwen2_5_DecoderLayer(config, layer_idx)
634
+ for layer_idx in range(config.num_hidden_layers)
635
+ ])
636
+ self.norm = DiffusionVL_Qwen2_5_RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
637
+ self.rotary_emb = DiffusionVL_Qwen2_5_RotaryEmbedding(config)
638
+
639
+ # BD3LM components
640
+ self.block_size = config.bd3lm_block_size
641
+ self.mask_token_id = config.mask_token_id
642
+
643
+ self.gradient_checkpointing = False
644
+ self.post_init()
645
+
646
+ @property
647
+ def num_patches_per_side(self):
648
+ """Number of patches per side for the vision encoder."""
649
+ image_size = getattr(self.config.vision_config, "image_size", 384)
650
+ patch_size = getattr(self.config.vision_config, "patch_size", 14)
651
+ return image_size // patch_size
652
+
653
+ def get_image_features(self, pixel_values: torch.Tensor) -> torch.Tensor:
654
+ """Get image features through vision tower and projector."""
655
+ vision_output = self.vision_tower(pixel_values)
656
+ image_features = self.mm_projector(vision_output)
657
+ return image_features
658
+
659
+ def forward(
660
+ self,
661
+ input_ids: Optional[torch.LongTensor] = None,
662
+ attention_mask: Optional[torch.Tensor] = None,
663
+ position_ids: Optional[torch.LongTensor] = None,
664
+ past_key_values: Optional[Cache] = None,
665
+ inputs_embeds: Optional[torch.FloatTensor] = None,
666
+ use_cache: Optional[bool] = None,
667
+ return_dict: Optional[bool] = None,
668
+ cache_position: Optional[torch.LongTensor] = None,
669
+ store_kv: bool = True,
670
+ **kwargs,
671
+ ):
672
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
673
+ return_dict = return_dict if return_dict is not None else True
674
+
675
+ if inputs_embeds is None:
676
+ inputs_embeds = self.embed_tokens(input_ids)
677
+
678
+ past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
679
+
680
+ if cache_position is None:
681
+ cache_position = torch.arange(
682
+ past_key_values_length, past_key_values_length + inputs_embeds.shape[1],
683
+ device=inputs_embeds.device
684
+ )
685
+
686
+ if position_ids is None:
687
+ position_ids = cache_position.unsqueeze(0)
688
+
689
+ hidden_states = inputs_embeds
690
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
691
+
692
+ for decoder_layer in self.layers:
693
+ hidden_states, _ = decoder_layer(
694
+ hidden_states,
695
+ attention_mask=attention_mask,
696
+ position_ids=position_ids,
697
+ past_key_values=past_key_values,
698
+ cache_position=cache_position,
699
+ position_embeddings=position_embeddings,
700
+ store_kv=store_kv,
701
+ )
702
+
703
+ hidden_states = self.norm(hidden_states)
704
+
705
+ if not return_dict:
706
+ return (hidden_states, past_key_values if use_cache else None)
707
+
708
+ return BaseModelOutputWithPast(
709
+ last_hidden_state=hidden_states,
710
+ past_key_values=past_key_values if use_cache else None,
711
+ )
712
+
713
+
714
+ class DiffusionVL_Qwen2_5_ForConditionalGeneration(DiffusionVL_Qwen2_5_PreTrainedModel):
715
+ """DiffusionVL-Qwen2.5 with LM head for diffusion-based generation."""
716
+
717
+ _tied_weights_keys = ["lm_head.weight"]
718
+
719
+ def __init__(self, config: DiffusionVL_Qwen2_5_Config):
720
+ super().__init__(config)
721
+ self.model = DiffusionVL_Qwen2_5_Model(config)
722
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
723
+
724
+ # BD3LM attributes
725
+ self.block_size = config.bd3lm_block_size
726
+ self.mask_token_id = config.mask_token_id
727
+
728
+ self.post_init()
729
+
730
+ def get_input_embeddings(self):
731
+ return self.model.embed_tokens
732
+
733
+ def set_input_embeddings(self, value):
734
+ self.model.embed_tokens = value
735
+
736
+ def get_output_embeddings(self):
737
+ return self.lm_head
738
+
739
+ def set_output_embeddings(self, new_embeddings):
740
+ self.lm_head = new_embeddings
741
+
742
+ @torch.no_grad()
743
+ def generate(
744
+ self,
745
+ inputs: Optional[torch.Tensor] = None,
746
+ images: Optional[torch.Tensor] = None,
747
+ image_sizes: Optional[List[Tuple[int, int]]] = None,
748
+ num_patches_per_image: Optional[List[int]] = None,
749
+ gen_length: int = 256,
750
+ steps: int = 8,
751
+ temperature: float = 0.0,
752
+ **kwargs,
753
+ ):
754
+ """Diffusion-based generation."""
755
+ if images is not None:
756
+ inputs_embeds = self.prepare_inputs_labels_for_multimodal(
757
+ input_ids=inputs,
758
+ images=images,
759
+ image_sizes=image_sizes,
760
+ num_patches_per_image=num_patches_per_image,
761
+ )
762
+ else:
763
+ inputs_embeds = self.get_input_embeddings()(inputs)
764
+
765
+ return self.generate_with_bd3lm(
766
+ inputs_embeds=inputs_embeds,
767
+ gen_length=gen_length,
768
+ steps=steps,
769
+ temperature=temperature,
770
+ **kwargs,
771
+ )
772
+
773
+ def prepare_inputs_labels_for_multimodal(
774
+ self,
775
+ input_ids: torch.Tensor,
776
+ images: torch.Tensor,
777
+ image_sizes: Optional[List[Tuple[int, int]]] = None,
778
+ num_patches_per_image: Optional[List[int]] = None,
779
+ ) -> torch.Tensor:
780
+ """
781
+ Prepare inputs by merging text embeddings with image features.
782
+
783
+ Implements spatial_unpad merge type matching training code:
784
+ - For single-patch images: just add image_newline
785
+ - For multi-patch images (anyres): unpad, interpolate, add newline per row
786
+
787
+ Args:
788
+ input_ids: Token IDs with IMAGE_TOKEN_INDEX placeholders
789
+ images: Tensor of shape (total_patches, C, H, W) containing all patches
790
+ image_sizes: List of (width, height) tuples for each original image
791
+ num_patches_per_image: List of patch counts per image (from processor)
792
+ """
793
+ import re as regex_module
794
+
795
+ device = input_ids.device
796
+ batch_size = input_ids.shape[0]
797
+
798
+ # Get raw image features from vision tower + projector
799
+ # images shape: (total_patches, C, H, W)
800
+ # raw_image_features shape: (total_patches, num_tokens, hidden_size)
801
+ raw_image_features = self.model.get_image_features(images)
802
+
803
+ # Determine split sizes for per-image features
804
+ if num_patches_per_image is not None:
805
+ split_sizes = num_patches_per_image
806
+ else:
807
+ # Fallback: assume 1 patch per image
808
+ split_sizes = [1] * images.shape[0]
809
+
810
+ # Split features per image
811
+ image_features_list = list(torch.split(raw_image_features, split_sizes, dim=0))
812
+
813
+ # Process image features with spatial_unpad logic
814
+ mm_patch_merge_type = getattr(self.config, "mm_patch_merge_type", "spatial_unpad")
815
+ image_aspect_ratio = getattr(self.config, "image_aspect_ratio", "anyres_max_4")
816
+
817
+ processed_image_features = []
818
+ for image_idx, image_feature in enumerate(image_features_list):
819
+ # image_feature shape: (num_patches, num_tokens, hidden_size)
820
+
821
+ if image_feature.shape[0] > 1: # Multi-patch (anyres) image
822
+ base_image_feature = image_feature[0] # Base patch: (num_tokens, hidden_size)
823
+ image_feature = image_feature[1:] # Additional patches: (num_grid_patches, num_tokens, hidden_size)
824
+
825
+ # Get patch grid dimensions
826
+ height = width = self.model.num_patches_per_side # e.g., 27 for SigLIP 384
827
+
828
+ # Get max num patches for interpolation
829
+ max_num_patches = 4
830
+ if "anyres_max" in image_aspect_ratio:
831
+ matched = regex_module.match(r"anyres_max_(\d+)", image_aspect_ratio)
832
+ if matched:
833
+ max_num_patches = int(matched.group(1))
834
+
835
+ # Determine grid shape - matching training code logic exactly
836
+ num_grid_patches = image_feature.shape[0] # Actual grid patch count
837
+
838
+ if image_sizes is not None and image_idx < len(image_sizes):
839
+ image_size = image_sizes[image_idx]
840
+ grid_pinpoints = getattr(self.config, "image_grid_pinpoints", "(1x1),...,(2x2)")
841
+ vision_tower_image_size = 384  # SigLIP input resolution (image size, not the 14-pixel patch size)
842
+
843
+ try:
844
+ num_patch_width, num_patch_height = get_anyres_image_grid_shape(
845
+ image_size, grid_pinpoints, vision_tower_image_size
846
+ )
847
+ # Verify calculated shape matches actual patch count
848
+ expected_patches = num_patch_width * num_patch_height
849
+ if expected_patches != num_grid_patches:
850
+ logger.warning(
851
+ f"Grid shape mismatch! image_size={image_size}, "
852
+ f"expected {num_patch_width}x{num_patch_height}={expected_patches} patches, "
853
+ f"but got {num_grid_patches} patches. Using actual count."
854
+ )
855
+ # Infer grid shape from actual patch count
856
+ # Try to find factors that match the image aspect ratio
857
+ img_w, img_h = image_size
858
+ for h in range(1, num_grid_patches + 1):
859
+ if num_grid_patches % h == 0:
860
+ w = num_grid_patches // h
861
+ # Check if this matches aspect ratio direction
862
+ if (img_w >= img_h and w >= h) or (img_w < img_h and w < h):
863
+ num_patch_height, num_patch_width = h, w
864
+ break
865
+ else:
866
+ num_patch_height = num_patch_width = int(math.sqrt(num_grid_patches))
867
+ except Exception as e:
868
+ # Fallback to 2x2, matching training code
869
+ logger.warning(f"get_anyres_image_grid_shape error: {e}; falling back to 2x2")
870
+ num_patch_width, num_patch_height = 2, 2
871
+ else:
872
+ # Fallback to 2x2, matching training code
873
+ num_patch_width, num_patch_height = 2, 2
874
+
875
+ # Reshape: (num_grid_patches, num_tokens, hidden) -> (patch_h, patch_w, h, w, hidden)
876
+ image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1)
877
+
878
+ if "unpad" in mm_patch_merge_type:
879
+ # Permute to (hidden, patch_h, h, patch_w, w) then flatten
880
+ image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
881
+ image_feature = image_feature.flatten(1, 2).flatten(2, 3) # (hidden, H, W)
882
+
883
+ # Unpad if image_sizes available
884
+ if image_sizes is not None and image_idx < len(image_sizes):
885
+ image_feature = unpad_image(image_feature, image_sizes[image_idx])
886
+
887
+ c, h, w = image_feature.shape
888
+
889
+ # Interpolate if too large
890
+ if "anyres_max" in image_aspect_ratio:
891
+ unit = height # num_patches_per_side
892
+ times = math.sqrt(h * w / (max_num_patches * unit**2))
893
+ if times > 1.1:
894
+ image_feature = image_feature[None]
895
+ image_feature = F.interpolate(
896
+ image_feature,
897
+ [int(h // times), int(w // times)],
898
+ mode="bilinear"
899
+ )[0]
900
+
901
+ # Add image_newline per row
902
+ # image_feature: (hidden, h, w) -> add newline: (hidden, h, w+1)
903
+ image_feature = torch.cat([
904
+ image_feature,
905
+ self.model.image_newline[:, None, None].expand(image_feature.shape[0], image_feature.shape[1], 1).to(image_feature.device)
906
+ ], dim=-1)
907
+ # Flatten and transpose: (hidden, h, w+1) -> (h*(w+1), hidden)
908
+ image_feature = image_feature.flatten(1, 2).transpose(0, 1)
909
+ else:
910
+ # Flat merge without unpad
911
+ image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous()
912
+ image_feature = image_feature.flatten(0, 3)
913
+
914
+ # Concatenate base image feature
915
+ image_feature = torch.cat([base_image_feature, image_feature], dim=0)
916
+ processed_image_features.append(image_feature)
917
+
918
+ else: # Single-patch image
919
+ image_feature = image_feature[0] # Remove batch dim: (num_tokens, hidden_size)
920
+ if "unpad" in mm_patch_merge_type:
921
+ image_feature = torch.cat([
922
+ image_feature,
923
+ self.model.image_newline[None].to(image_feature.device)
924
+ ], dim=0)
925
+ processed_image_features.append(image_feature)
926
+
927
+ # Build embeddings with image tokens replaced
928
+ new_input_embeds_list = []
929
+
930
+ for batch_idx in range(batch_size):
931
+ cur_input_ids = input_ids[batch_idx]
932
+ num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum().item()
933
+
934
+ if num_images == 0:
935
+ cur_input_embeds = self.get_input_embeddings()(cur_input_ids)
936
+ new_input_embeds_list.append(cur_input_embeds)
937
+ continue
938
+
939
+ # LLaVA format: IMAGE_TOKEN_INDEX (-200) as placeholder
940
+ image_token_indices = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [len(cur_input_ids)]
941
+
942
+ cur_input_ids_noim = []
943
+ for idx in range(len(image_token_indices) - 1):
944
+ start = image_token_indices[idx] + 1
945
+ end = image_token_indices[idx + 1]
946
+ if start < end:
947
+ cur_input_ids_noim.append(cur_input_ids[start:end])
948
+
949
+ if cur_input_ids_noim:
950
+ cur_input_embeds_noim = self.get_input_embeddings()(torch.cat(cur_input_ids_noim).to(device))
951
+ split_sizes_text = [x.shape[0] for x in cur_input_ids_noim]
952
+ cur_input_embeds_noim_split = list(torch.split(cur_input_embeds_noim, split_sizes_text))
953
+ else:
954
+ cur_input_embeds_noim_split = []
955
+
956
+ cur_new_input_embeds = []
957
+ cur_image_idx = 0
958
+
959
+ for idx in range(num_images + 1):
960
+ if idx < len(cur_input_embeds_noim_split):
961
+ cur_new_input_embeds.append(cur_input_embeds_noim_split[idx].to(device))
962
+ if idx < num_images and cur_image_idx < len(processed_image_features):
963
+ cur_new_input_embeds.append(processed_image_features[cur_image_idx].to(device))
964
+ cur_image_idx += 1
965
+
966
+ if cur_new_input_embeds:
967
+ cur_new_input_embeds = torch.cat(cur_new_input_embeds, dim=0)
968
+ else:
969
+ cur_new_input_embeds = self.get_input_embeddings()(cur_input_ids.to(device))
970
+
971
+ new_input_embeds_list.append(cur_new_input_embeds)
972
+
973
+ # Pad to same length
974
+ max_len = max(x.shape[0] for x in new_input_embeds_list)
975
+ hidden_size = new_input_embeds_list[0].shape[-1]
976
+ dtype = new_input_embeds_list[0].dtype
977
+
978
+ inputs_embeds = torch.zeros(batch_size, max_len, hidden_size, dtype=dtype, device=device)
979
+ for i, embed in enumerate(new_input_embeds_list):
980
+ inputs_embeds[i, :embed.shape[0]] = embed
981
+
982
+ return inputs_embeds
983
+
984
+ @torch.no_grad()
985
+ def generate_with_bd3lm(
986
+ self,
987
+ inputs_embeds: torch.FloatTensor,
988
+ gen_length: int = 256,
989
+ steps: int = 8,
990
+ temperature: float = 0.0,
991
+ top_k: int = 0,
992
+ top_p: float = 1.0,
993
+ remasking_strategy: str = 'low_confidence_static',
994
+ use_kv_cache: bool = True,
995
+ confidence_threshold: float = 0.85,
996
+ **kwargs,
997
+ ):
998
+ """BD3LM generation with KV-cache support."""
999
+ device = inputs_embeds.device
1000
+ batch_size = inputs_embeds.shape[0]
1001
+ prompt_len = inputs_embeds.shape[1]
1002
+ block_size = self.block_size
1003
+ mask_id = self.mask_token_id
1004
+
1005
+ num_blocks = (prompt_len + gen_length + block_size - 1) // block_size
1006
+ total_length = num_blocks * block_size
1007
+
1008
+ # Initialize with mask tokens
1009
+ x_ids = torch.full((batch_size, total_length), mask_id, dtype=torch.long, device=device)
1010
+ mask_embed = self.get_input_embeddings()(torch.tensor([mask_id], device=device))
1011
+ x_embeds = mask_embed.repeat(batch_size, total_length, 1)
1012
+ x_embeds[:, :prompt_len] = inputs_embeds.clone()
1013
+
1014
+ # Reconstruct prompt IDs via argmax over the LM head (approximate; these positions are dropped from the returned ids)
1015
+ prompt_logits = self.lm_head(inputs_embeds)
1016
+ prompt_ids = torch.argmax(prompt_logits, dim=-1)
1017
+ x_ids[:, :prompt_len] = prompt_ids
1018
+
1019
+ # Block causal mask
1020
+ dtype = inputs_embeds.dtype
1021
+ block_mask = torch.tril(torch.ones(num_blocks, num_blocks, device=device, dtype=dtype))
1022
+ block_diffusion_mask = block_mask.repeat_interleave(block_size, dim=0).repeat_interleave(block_size, dim=1)
1023
+ block_diffusion_mask = block_diffusion_mask.unsqueeze(0).unsqueeze(1)
1024
+ block_diffusion_mask = torch.where(
1025
+ block_diffusion_mask == 0.,
1026
+ torch.tensor(float('-inf'), device=device, dtype=dtype),
1027
+ torch.tensor(0., device=device, dtype=dtype)
1028
+ )
1029
+
1030
+ position_ids = torch.arange(total_length, device=device).unsqueeze(0).expand(batch_size, -1)
1031
+
1032
+ # KV-cache prefill
1033
+ prefill_blocks = prompt_len // block_size
1034
+ prefill_length = prefill_blocks * block_size
1035
+
1036
+ past_key_values = DynamicCache() if use_kv_cache else None
1037
+
1038
+ if use_kv_cache and prefill_length > 0:
1039
+ prefill_embeds = x_embeds[:, :prefill_length]
1040
+ prefill_mask = block_diffusion_mask[:, :, :prefill_length, :prefill_length]
1041
+ prefill_pos_ids = position_ids[:, :prefill_length]
1042
+
1043
+ prefill_outputs = self.model(
1044
+ inputs_embeds=prefill_embeds,
1045
+ attention_mask=prefill_mask,
1046
+ position_ids=prefill_pos_ids,
1047
+ past_key_values=past_key_values,
1048
+ use_cache=True,
1049
+ store_kv=True,
1050
+ )
1051
+ past_key_values = prefill_outputs.past_key_values
1052
+
1053
+ num_transfer_tokens = self._get_num_transfer_tokens(block_size, steps)
1054
+
1055
+ # Generate block by block
1056
+ for block_idx in range(prefill_blocks, num_blocks):
1057
+ block_start = block_idx * block_size
1058
+ block_end = block_start + block_size
1059
+
1060
+ cur_block_embeds = x_embeds[:, block_start:block_end].clone()
1061
+ cur_block_ids = x_ids[:, block_start:block_end]
1062
+ cur_mask = block_diffusion_mask[:, :, block_start:block_end, :block_end]
1063
+ cur_pos_ids = position_ids[:, block_start:block_end]
1064
+
1065
+ for step in range(steps + 1):
1066
+ is_mask = torch.all(torch.abs(cur_block_embeds - mask_embed) < 1e-5, dim=-1)
1067
+ if not is_mask.any():
1068
+ if use_kv_cache:
1069
+ _ = self.model(
1070
+ inputs_embeds=cur_block_embeds,
1071
+ attention_mask=cur_mask,
1072
+ position_ids=cur_pos_ids,
1073
+ past_key_values=past_key_values,
1074
+ use_cache=True,
1075
+ store_kv=True,
1076
+ )
1077
+ break
1078
+
1079
+ if use_kv_cache:
1080
+ outputs = self.model(
1081
+ inputs_embeds=cur_block_embeds,
1082
+ attention_mask=cur_mask,
1083
+ position_ids=cur_pos_ids,
1084
+ past_key_values=past_key_values,
1085
+ use_cache=True,
1086
+ store_kv=False,
1087
+ )
1088
+ logits = self.lm_head(outputs.last_hidden_state).float()
1089
+ else:
1090
+ context_embeds = x_embeds[:, :block_end].clone()
1091
+ context_embeds[:, block_start:block_end] = cur_block_embeds
1092
+ context_mask = block_diffusion_mask[:, :, :block_end, :block_end]
1093
+ context_pos_ids = position_ids[:, :block_end]
1094
+
1095
+ outputs = self.model(
1096
+ inputs_embeds=context_embeds,
1097
+ attention_mask=context_mask,
1098
+ position_ids=context_pos_ids,
1099
+ past_key_values=None,
1100
+ use_cache=False,
1101
+ store_kv=False,
1102
+ )
1103
+ logits = self.lm_head(outputs.last_hidden_state[:, block_start:block_end]).float()
1104
+
1105
+ x0, x0_p = self._sample_with_temperature(logits, temperature, top_k, top_p)
1106
+
1107
+ # Ensure tensors are on the same device (for device_map="auto")
1108
+ output_device = x0.device
1109
+ is_mask_on_device = is_mask.to(output_device)
1110
+
1111
+ num_to_transfer = num_transfer_tokens[step].item()
1112
+ transfer_mask = self._get_transfer_mask(
1113
+ is_mask_on_device, x0_p, num_to_transfer, remasking_strategy, confidence_threshold, output_device
1114
+ )
1115
+
1116
+ cur_block_ids = torch.where(transfer_mask, x0, cur_block_ids)
1117
+ x0_embeds = self.get_input_embeddings()(x0)
1118
+ cur_block_embeds = torch.where(transfer_mask.unsqueeze(-1), x0_embeds, cur_block_embeds)
1119
+
1120
+ x_embeds[:, block_start:block_end] = cur_block_embeds
1121
+ x_ids[:, block_start:block_end] = cur_block_ids
1122
+
1123
+ # EOS check: stop generation if EOS token is generated
1124
+ if block_end > prompt_len:
1125
+ gen_start_in_block = max(prompt_len, block_start)
1126
+ gen_ids_check = x_ids[:, gen_start_in_block:block_end]
1127
+ eos_token_id = self.config.eos_token_id if hasattr(self.config, 'eos_token_id') else 151645
1128
+ if eos_token_id in gen_ids_check:
1129
+ break
1130
+
1131
+ return x_ids[:, prompt_len:prompt_len + gen_length]
1132
+
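# Sketch of the block-causal mask that generate_with_bd3lm builds above, for num_blocks=3 and
# block_size=2 (illustrative sizes). Positions may attend to every position in their own block
# and in earlier blocks, never to later blocks.
import torch
num_blocks, block_size = 3, 2
block_mask = torch.tril(torch.ones(num_blocks, num_blocks))
mask = block_mask.repeat_interleave(block_size, dim=0).repeat_interleave(block_size, dim=1)
# mask is 6x6; e.g. row 2 (first token of block 1) == [1, 1, 1, 1, 0, 0]
additive = torch.where(mask == 0., torch.tensor(float("-inf")), torch.tensor(0.))
# additive is 0 where attention is allowed and -inf where it is blocked, as consumed by SDPA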
1133
+ def _get_num_transfer_tokens(self, block_length: int, steps: int) -> torch.Tensor:
1134
+ if steps == 0:
1135
+ return torch.zeros(0, dtype=torch.int64)
1136
+ base = block_length // steps
1137
+ remainder = block_length % steps
1138
+ num_transfer_tokens = torch.zeros(steps + 1, dtype=torch.int64) + base
1139
+ num_transfer_tokens[:remainder] += 1
1140
+ return num_transfer_tokens
1141
+
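# Worked example of the transfer schedule (illustrative): block_length=8, steps=8 gives base=1
# and remainder=0, so one token is unmasked per step across the steps+1 iterations; the final
# iteration finds no masked positions left and exits through the early break in the loop above.
import torch
block_length, steps = 8, 8
schedule = torch.zeros(steps + 1, dtype=torch.int64) + block_length // steps
schedule[: block_length % steps] += 1
# schedule == tensor([1, 1, 1, 1, 1, 1, 1, 1, 1]); its running sum covers the whole block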
1142
+ def _sample_with_temperature(self, logits, temperature, top_k, top_p):
1143
+ vocab_size = logits.shape[-1]
1144
+ logits_2d = logits.reshape(-1, vocab_size)
1145
+ probs_original = F.softmax(logits_2d, dim=-1)
1146
+
1147
+ if temperature == 0:
1148
+ token = torch.argmax(logits_2d, dim=-1, keepdim=True)
1149
+ else:
1150
+ logits_modified = logits_2d / temperature
1151
+ if top_k > 0:
1152
+ logits_modified = self._top_k_logits(logits_modified, top_k)
1153
+ if top_p < 1.0:
1154
+ logits_modified = self._top_p_logits(logits_modified, top_p)
1155
+ probs_modified = F.softmax(logits_modified, dim=-1)
1156
+ token = torch.multinomial(probs_modified, num_samples=1)
1157
+
1158
+ token_prob = torch.gather(probs_original, -1, token)
1159
+ orig_shape = logits.shape[:-1]
1160
+ return token.view(*orig_shape), token_prob.view(*orig_shape)
1161
+
1162
+ @staticmethod
1163
+ def _top_k_logits(logits, k):
1164
+ if k <= 0:
1165
+ return logits
1166
+ values, _ = torch.topk(logits, k)
1167
+ min_values = values[..., -1, None]
1168
+ return torch.where(logits < min_values, float('-inf'), logits)
1169
+
1170
+ @staticmethod
1171
+ def _top_p_logits(logits, p):
1172
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
1173
+ cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
1174
+ sorted_mask = cumulative_probs > p
1175
+ sorted_mask[..., 1:] = sorted_mask[..., :-1].clone()
1176
+ sorted_mask[..., 0] = False
1177
+ mask_indices = torch.scatter(
1178
+ torch.full_like(logits, False, dtype=torch.bool),
1179
+ -1, sorted_indices, sorted_mask
1180
+ )
1181
+ return logits.masked_fill(mask_indices, float('-inf'))
1182
+
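# Nucleus (top-p) filtering sketch with made-up probabilities: with p=0.9 only the 0.05 token
# falls outside the nucleus and is masked to -inf; the remaining tokens survive unchanged.
import torch
import torch.nn.functional as F
logits = torch.log(torch.tensor([[0.50, 0.30, 0.15, 0.05]]))
filtered = DiffusionVL_Qwen2_5_ForConditionalGeneration._top_p_logits(logits, 0.9)
# F.softmax(filtered, dim=-1) renormalizes over the surviving tokens {0.50, 0.30, 0.15}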
1183
+ def _get_transfer_mask(self, is_mask, x0_p, num_to_transfer, strategy, threshold, device):
1184
+ transfer_mask = torch.zeros_like(is_mask, dtype=torch.bool)
1185
+
1186
+ if strategy == 'sequential':
1187
+ for j in range(is_mask.shape[0]):
1188
+ if is_mask[j].any():
1189
+ mask_positions = is_mask[j].nonzero(as_tuple=True)[0]
1190
+ num_to_select = min(num_to_transfer, len(mask_positions))
1191
+ selected_positions = mask_positions[:num_to_select]
1192
+ transfer_mask[j, selected_positions] = True
1193
+
1194
+ elif strategy == 'low_confidence_static':
1195
+ confidence = torch.where(is_mask, x0_p, float('-inf'))
1196
+ for j in range(confidence.shape[0]):
1197
+ num_masks = is_mask[j].sum().item()
1198
+ k = min(num_to_transfer, num_masks)
1199
+ if k > 0 and not torch.all(torch.isinf(confidence[j])):
1200
+ _, idx = torch.topk(confidence[j], k)
1201
+ transfer_mask[j, idx] = True
1202
+
1203
+ elif strategy == 'low_confidence_dynamic':
1204
+ confidence = torch.where(is_mask, x0_p, float('-inf'))
1205
+ for j in range(confidence.shape[0]):
1206
+ high_conf_mask = confidence[j] > threshold
1207
+ num_high = high_conf_mask.sum().item()
1208
+ if num_high >= num_to_transfer:
1209
+ transfer_mask[j] = high_conf_mask
1210
+ else:
1211
+ num_masks = is_mask[j].sum().item()
1212
+ k = min(num_to_transfer, num_masks)
1213
+ if k > 0:
1214
+ _, idx = torch.topk(confidence[j], k)
1215
+ transfer_mask[j, idx] = True
1216
+
1217
+ return transfer_mask
1218
+
1219
+
1220
+ __all__ = [
1221
+ "DiffusionVL_Qwen2_5_Config",
1222
+ "DiffusionVL_Qwen2_5_VisionConfig",
1223
+ "DiffusionVL_Qwen2_5_Model",
1224
+ "DiffusionVL_Qwen2_5_ForConditionalGeneration",
1225
+ ]
preprocessor_config.json ADDED
@@ -0,0 +1,21 @@
1
+ {
2
+ "image_mean": [
3
+ 0.5,
4
+ 0.5,
5
+ 0.5
6
+ ],
7
+ "image_std": [
8
+ 0.5,
9
+ 0.5,
10
+ 0.5
11
+ ],
12
+ "size": [
13
+ 384,
14
+ 384
15
+ ],
16
+ "rescale_factor": 0.00392156862745098,
17
+ "processor_class": "DiffusionVL_Qwen2_5_Processor",
18
+ "auto_map": {
19
+ "AutoProcessor": "processing_diffusionvl_qwen2_5.DiffusionVL_Qwen2_5_Processor"
20
+ }
21
+ }
processing_diffusionvl_qwen2_5.py ADDED
@@ -0,0 +1,485 @@
1
+ # coding=utf-8
2
+ # Copyright 2025 The HustVL Team and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ DiffusionVL-Qwen2.5 Processor - Self-contained image processing matching training code.
17
+
18
+ This processor implements the same image processing pipeline as the training code:
19
+ - process_images with anyres support
20
+ - tokenizer_image_token for proper <image> token handling
21
+ - Uses SiglipImageProcessor for the underlying image preprocessing
22
+ """
23
+
24
+ import ast
25
+ import math
26
+ import re
27
+ from typing import List, Optional, Tuple, Union
28
+
29
+ import torch
30
+ import numpy as np
31
+ from PIL import Image
32
+
33
+ from transformers.feature_extraction_utils import BatchFeature
34
+ from transformers.processing_utils import ProcessorMixin
35
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
36
+ from transformers import SiglipImageProcessor
37
+
38
+
39
+ # Image token for LLaVA format
40
+ DEFAULT_IMAGE_TOKEN = "<image>"
41
+ IMAGE_TOKEN_INDEX = -200
42
+
43
+
44
+ # ============================================================================
45
+ # Image Processing Utilities (matching training code mm_utils.py)
46
+ # ============================================================================
47
+
48
+ def select_best_resolution(original_size: Tuple[int, int], possible_resolutions: List[Tuple[int, int]]) -> Tuple[int, int]:
49
+ """
50
+ Selects the best resolution from a list of possible resolutions based on the original size.
51
+ Matching training code: llava/mm_utils.py::select_best_resolution
52
+ """
53
+ original_width, original_height = original_size
54
+ best_fit = None
55
+ max_effective_resolution = 0
56
+ min_wasted_resolution = float("inf")
57
+
58
+ for width, height in possible_resolutions:
59
+ scale = min(width / original_width, height / original_height)
60
+ downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
61
+ effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
62
+ wasted_resolution = (width * height) - effective_resolution
63
+
64
+ if effective_resolution > max_effective_resolution or (effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution):
65
+ max_effective_resolution = effective_resolution
66
+ min_wasted_resolution = wasted_resolution
67
+ best_fit = (width, height)
68
+
69
+ return best_fit
70
+
71
+
72
+ def resize_and_pad_image(image: Image.Image, target_resolution: Tuple[int, int]) -> Image.Image:
73
+ """
74
+ Resize and pad an image to a target resolution while maintaining aspect ratio.
75
+ Matching training code: llava/mm_utils.py::resize_and_pad_image
76
+ """
77
+ original_width, original_height = image.size
78
+ target_width, target_height = target_resolution
79
+
80
+ scale_w = target_width / original_width
81
+ scale_h = target_height / original_height
82
+
83
+ if scale_w < scale_h:
84
+ new_width = target_width
85
+ new_height = min(math.ceil(original_height * scale_w), target_height)
86
+ else:
87
+ new_height = target_height
88
+ new_width = min(math.ceil(original_width * scale_h), target_width)
89
+
90
+ resized_image = image.resize((new_width, new_height))
91
+ new_image = Image.new("RGB", (target_width, target_height), (0, 0, 0))
92
+ paste_x = (target_width - new_width) // 2
93
+ paste_y = (target_height - new_height) // 2
94
+ new_image.paste(resized_image, (paste_x, paste_y))
95
+
96
+ return new_image
97
+
98
+
99
+ def divide_to_patches(image: Image.Image, patch_size: int) -> List[Image.Image]:
100
+ """
101
+ Divides an image into patches of a specified size.
102
+ Matching training code: llava/mm_utils.py::divide_to_patches
103
+ """
104
+ patches = []
105
+ width, height = image.size
106
+ for i in range(0, height, patch_size):
107
+ for j in range(0, width, patch_size):
108
+ box = (j, i, j + patch_size, i + patch_size)
109
+ patch = image.crop(box)
110
+ patches.append(patch)
111
+ return patches
112
+
113
+
114
+ def expand2square(pil_img: Image.Image, background_color: Tuple[int, int, int]) -> Image.Image:
115
+ """
116
+ Expand image to square by padding.
117
+ Matching training code: llava/mm_utils.py::expand2square
118
+ """
119
+ width, height = pil_img.size
120
+ if width == height:
121
+ return pil_img
122
+ elif width > height:
123
+ result = Image.new(pil_img.mode, (width, width), background_color)
124
+ result.paste(pil_img, (0, (width - height) // 2))
125
+ return result
126
+ else:
127
+ result = Image.new(pil_img.mode, (height, height), background_color)
128
+ result.paste(pil_img, ((height - width) // 2, 0))
129
+ return result
130
+
131
+
132
+ def get_anyres_image_grid_shape(image_size: Tuple[int, int], grid_pinpoints, patch_size: int) -> Tuple[int, int]:
133
+ """
134
+ Calculate the shape of the image patch grid after preprocessing an image of any resolution.
135
+ Matching training code: llava/mm_utils.py::get_anyres_image_grid_shape
136
+ """
137
+ if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
138
+ assert patch_size in [224, 336, 384, 448, 512], "patch_size should be in [224, 336, 384, 448, 512]"
139
+ matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
140
+ range_start = tuple(map(int, matches[0]))
141
+ range_end = tuple(map(int, matches[-1]))
142
+ grid_pinpoints = [(i, j) for i in range(range_start[0], range_end[0] + 1) for j in range(range_start[1], range_end[1] + 1)]
143
+ grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints]
144
+ if isinstance(grid_pinpoints, list):
145
+ possible_resolutions = grid_pinpoints
146
+ else:
147
+ possible_resolutions = ast.literal_eval(grid_pinpoints)
148
+ width, height = select_best_resolution(image_size, possible_resolutions)
149
+ return width // patch_size, height // patch_size
150
+
151
+
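# Worked example (illustrative input size): with grid_pinpoints "(1x1),...,(2x2)" and a
# 384-pixel vision input, the candidate canvases are 384x384, 384x768, 768x384 and 768x768.
# An 800x600 image keeps the most effective pixels on the 768x768 canvas, so:
grid = get_anyres_image_grid_shape((800, 600), "(1x1),...,(2x2)", 384)
# grid == (2, 2): the padded image will be split into a 2x2 grid of 384x384 patches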
152
+ def process_anyres_image(image: Image.Image, processor: SiglipImageProcessor, grid_pinpoints: str) -> torch.Tensor:
153
+ """
154
+ Process an image with variable resolutions (anyres).
155
+ Matching training code: llava/mm_utils.py::process_anyres_image
156
+
157
+ Returns: torch.Tensor of shape (num_patches, C, H, W) where num_patches = 1 + grid_patches
158
+ """
159
+ # Get patch size from processor
160
+ if isinstance(processor.size, dict):
161
+ patch_size = processor.size.get("shortest_edge", processor.size.get("height", 384))
162
+ else:
163
+ patch_size = processor.size[0] if hasattr(processor.size, '__getitem__') else 384
164
+
165
+ crop_size = processor.crop_size.get("height", patch_size) if hasattr(processor, 'crop_size') else patch_size
166
+
167
+ # Parse grid pinpoints
168
+ if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
169
+ assert patch_size in [224, 336, 384, 448, 512], f"patch_size {patch_size} should be in [224, 336, 384, 448, 512]"
170
+ matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
171
+ range_start = tuple(map(int, matches[0]))
172
+ range_end = tuple(map(int, matches[-1]))
173
+ grid_pinpoints_list = [(i, j) for i in range(range_start[0], range_end[0] + 1) for j in range(range_start[1], range_end[1] + 1)]
174
+ possible_resolutions = [[dim * patch_size for dim in pair] for pair in grid_pinpoints_list]
175
+ elif isinstance(grid_pinpoints, list):
176
+ possible_resolutions = grid_pinpoints
177
+ else:
178
+ possible_resolutions = ast.literal_eval(grid_pinpoints)
179
+
180
+ best_resolution = select_best_resolution(image.size, possible_resolutions)
181
+ image_padded = resize_and_pad_image(image, best_resolution)
182
+ patches = divide_to_patches(image_padded, crop_size)
183
+
184
+ # Base image (resized to patch size) - matching training code behavior
185
+ if isinstance(processor.size, dict):
186
+ shortest_edge = processor.size.get("shortest_edge", processor.size.get("height", 384))
187
+ else:
188
+ shortest_edge = min(processor.size) if hasattr(processor.size, '__iter__') else 384
189
+ image_original_resize = image.resize((shortest_edge, shortest_edge))
190
+
191
+ # Combine: base image + grid patches (same order as training code)
192
+ image_patches = [image_original_resize] + patches
193
+
194
+ # Preprocess all patches using the HF processor
195
+ processed_patches = [processor.preprocess(patch, return_tensors="pt")["pixel_values"][0] for patch in image_patches]
196
+
197
+ return torch.stack(processed_patches, dim=0)
198
+
199
+
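# End-to-end sketch (illustrative; downloads the SigLIP processor already referenced in this
# repo): an 800x600 image under "(1x1),...,(2x2)" is padded to 768x768, cut into four 384x384
# grid patches, and prepended with a 384x384 resize of the whole image.
from PIL import Image
from transformers import SiglipImageProcessor
proc = SiglipImageProcessor.from_pretrained("google/siglip-so400m-patch14-384")
img = Image.new("RGB", (800, 600))
patches = process_anyres_image(img, proc, "(1x1),...,(2x2)")
# patches.shape == (5, 3, 384, 384): index 0 is the base resize, 1..4 are the grid patches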
200
+ def process_images(images: List[Image.Image], image_processor: SiglipImageProcessor, model_cfg) -> torch.Tensor:
201
+ """
202
+ Process images matching the training code pipeline.
203
+ Matching training code: llava/mm_utils.py::process_images
204
+
205
+ Args:
206
+ images: List of PIL Images
207
+ image_processor: SiglipImageProcessor instance
208
+ model_cfg: Model config with image_aspect_ratio and image_grid_pinpoints
209
+
210
+ Returns:
211
+ torch.Tensor or List[torch.Tensor] of processed image patches
212
+ """
213
+ image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None)
214
+ new_images = []
215
+
216
+ if image_aspect_ratio == "anyres" or (image_aspect_ratio and "anyres" in image_aspect_ratio):
217
+ grid_pinpoints = getattr(model_cfg, "image_grid_pinpoints", "(1x1),...,(2x2)")
218
+ for image in images:
219
+ processed = process_anyres_image(image, image_processor, grid_pinpoints)
220
+ new_images.append(processed)
221
+ elif image_aspect_ratio == "pad":
222
+ for image in images:
223
+ image = expand2square(image, tuple(int(x * 255) for x in image_processor.image_mean))
224
+ processed = image_processor.preprocess(image, return_tensors="pt")["pixel_values"][0]
225
+ new_images.append(processed)
226
+ else:
227
+ # Default: simple preprocessing
228
+ return image_processor.preprocess(images, return_tensors="pt")["pixel_values"]
229
+
230
+ # Stack if all same shape, otherwise return list
231
+ if all(x.shape == new_images[0].shape for x in new_images):
232
+ new_images = torch.stack(new_images, dim=0)
233
+ return new_images
234
+
235
+
236
+ def tokenizer_image_token(prompt: str, tokenizer, image_token_index: int = IMAGE_TOKEN_INDEX, return_tensors: str = None):
237
+ """
238
+ Tokenize prompt with proper handling of <image> tokens.
239
+ Matching training code: llava/mm_utils.py::tokenizer_image_token
240
+
241
+ Args:
242
+ prompt: Text prompt containing <image> placeholders
243
+ tokenizer: Tokenizer instance
244
+ image_token_index: Index to use for image tokens (default: -200)
245
+ return_tensors: If "pt", return PyTorch tensor
246
+
247
+ Returns:
248
+ List of token IDs or torch.Tensor
249
+ """
250
+ prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<image>")]
251
+
252
+ def insert_separator(X, sep):
253
+ return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1]
254
+
255
+ input_ids = []
256
+ offset = 0
257
+ if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
258
+ offset = 1
259
+ input_ids.append(prompt_chunks[0][0])
260
+
261
+ for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
262
+ input_ids.extend(x[offset:])
263
+
264
+ if return_tensors is not None:
265
+ if return_tensors == "pt":
266
+ return torch.tensor(input_ids, dtype=torch.long)
267
+ raise ValueError(f"Unsupported tensor type: {return_tensors}")
268
+ return input_ids
269
+
270
+
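# Minimal sketch (the tokenizer path is a placeholder): "<image>" never reaches the tokenizer;
# the surrounding text chunks are tokenized separately and IMAGE_TOKEN_INDEX (-200) is spliced
# in between, to be replaced later by projected vision features.
import torch
from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained("path/to/model", trust_remote_code=True)
ids = tokenizer_image_token("<|im_start|>user\n<image>\nDescribe this image.<|im_end|>", tok, return_tensors="pt")
assert (ids == IMAGE_TOKEN_INDEX).sum() == 1   # exactly one image placeholder in the sequence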
271
+ # ============================================================================
272
+ # Conversation Templates (matching training code)
273
+ # ============================================================================
274
+
275
+ class Conversation:
276
+ """Simple conversation class matching LLaVA's conv_templates."""
277
+
278
+ def __init__(self, system: str, roles: Tuple[str, str], sep: str, sep2: str = None):
279
+ self.system = system
280
+ self.roles = roles
281
+ self.sep = sep
282
+ self.sep2 = sep2
283
+ self.messages = []
284
+
285
+ def copy(self):
286
+ return Conversation(
287
+ system=self.system,
288
+ roles=self.roles,
289
+ sep=self.sep,
290
+ sep2=self.sep2,
291
+ )
292
+
293
+ def append_message(self, role: str, message: str):
294
+ self.messages.append([role, message])
295
+
296
+ def get_prompt(self) -> str:
297
+ """Build the prompt string."""
298
+ ret = ""
299
+ if self.system:
300
+ ret = f"<|im_start|>system\n{self.system}<|im_end|>\n"
301
+
302
+ for role, message in self.messages:
303
+ if message:
304
+ ret += f"<|im_start|>{role}\n{message}<|im_end|>\n"
305
+ else:
306
+ ret += f"<|im_start|>{role}\n"
307
+ return ret
308
+
309
+
310
+ # Pre-defined conversation template for Qwen2.5
311
+ CONV_QWEN_2_5 = Conversation(
312
+ system="You are Qwen, created by Alibaba Cloud. You are a helpful assistant.",
313
+ roles=("user", "assistant"),
314
+ sep="<|im_end|>",
315
+ sep2=None,
316
+ )
317
+
318
+
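# Illustrative single-turn prompt built with the template above; the trailing
# "<|im_start|>assistant\n" leaves the assistant turn open for generation.
conv = CONV_QWEN_2_5.copy()
conv.append_message("user", "<image>\nDescribe this image.")
conv.append_message("assistant", None)
prompt = conv.get_prompt()
# prompt == "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n"
#           "<|im_start|>user\n<image>\nDescribe this image.<|im_end|>\n<|im_start|>assistant\n"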
319
+ # ============================================================================
320
+ # Main Processor Class
321
+ # ============================================================================
322
+
323
+ class DiffusionVL_Qwen2_5_Processor(ProcessorMixin):
324
+ """
325
+ Processor for DiffusionVL-Qwen2.5 model.
326
+
327
+ Self-contained implementation matching the training code pipeline:
328
+ - Uses SiglipImageProcessor for image preprocessing
329
+ - Implements process_images with anyres support
330
+ - Implements tokenizer_image_token for proper <image> token handling
331
+
332
+ The processor stores model config for anyres parameters. Config can be:
333
+ 1. Passed during __init__ via config parameter
334
+ 2. Set after loading via set_config() method
335
+ 3. Passed per-call via model_cfg parameter in __call__
336
+ """
337
+
338
+ attributes = ["tokenizer"]
339
+ tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
340
+
341
+ def __init__(
342
+ self,
343
+ tokenizer=None,
344
+ image_processor=None,
345
+ config=None,
346
+ **kwargs
347
+ ):
348
+ # Use provided image_processor or create default SiglipImageProcessor
349
+ if image_processor is None:
350
+ self.image_processor = SiglipImageProcessor.from_pretrained("google/siglip-so400m-patch14-384")
351
+ else:
352
+ self.image_processor = image_processor
353
+
354
+ # Store config for anyres processing
355
+ self._config = config
356
+
357
+ super().__init__(tokenizer)
358
+
359
+ def set_config(self, config):
360
+ """Set model config for anyres image processing."""
361
+ self._config = config
362
+
363
+ def __call__(
364
+ self,
365
+ text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
366
+ images: Optional[Union[Image.Image, List[Image.Image]]] = None,
367
+ model_cfg=None,
368
+ return_tensors: Optional[str] = "pt",
369
+ **kwargs,
370
+ ) -> BatchFeature:
371
+ """
372
+ Process text and images for model input.
373
+
374
+ Args:
375
+ text: Input text or list of texts with <image> placeholder.
376
+ images: PIL Image or list of PIL Images.
377
+ model_cfg: Model config (needed for anyres parameters).
378
+ return_tensors: Return type ("pt" for PyTorch).
379
+
380
+ Returns:
381
+ BatchFeature with input_ids and pixel_values.
382
+ """
383
+ if text is None and images is None:
384
+ raise ValueError("You must provide either text or images.")
385
+
386
+ # Process text using tokenizer_image_token
387
+ if text is not None:
388
+ if isinstance(text, str):
389
+ text = [text]
390
+
391
+ all_input_ids = []
392
+ for t in text:
393
+ input_ids = tokenizer_image_token(t, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
394
+ all_input_ids.append(input_ids)
395
+
396
+ # Pad sequences if multiple
397
+ if len(all_input_ids) > 1:
398
+ max_len = max(ids.shape[0] for ids in all_input_ids)
399
+ padded_input_ids = []
400
+ for ids in all_input_ids:
401
+ if ids.shape[0] < max_len:
402
+ padding = torch.full((max_len - ids.shape[0],), self.tokenizer.pad_token_id, dtype=torch.long)
403
+ ids = torch.cat([ids, padding])
404
+ padded_input_ids.append(ids)
405
+ input_ids = torch.stack(padded_input_ids)
406
+ else:
407
+ input_ids = all_input_ids[0].unsqueeze(0)
408
+
409
+ text_inputs = {"input_ids": input_ids}
410
+ else:
411
+ text_inputs = {}
412
+
413
+ # Process images using process_images
414
+ if images is not None:
415
+ if isinstance(images, Image.Image):
416
+ images = [images]
417
+
418
+ # Get image sizes before processing
419
+ image_sizes = [img.size for img in images]
420
+
421
+ # Use model_cfg if provided, otherwise use stored config
422
+ cfg = model_cfg if model_cfg is not None else self._config
423
+
424
+ if cfg is not None:
425
+ pixel_values = process_images(images, self.image_processor, cfg)
426
+ # Calculate num_patches_per_image for anyres
427
+ if isinstance(pixel_values, list):
428
+ num_patches_per_image = [t.shape[0] for t in pixel_values]
429
+ # Concatenate all patches into single tensor
430
+ pixel_values = torch.cat(pixel_values, dim=0)
431
+ elif pixel_values.dim() == 5:
432
+ # Shape: (num_images, num_patches, C, H, W)
433
+ num_patches_per_image = [pixel_values.shape[1]] * pixel_values.shape[0]
434
+ pixel_values = pixel_values.view(-1, *pixel_values.shape[2:])
435
+ else:
436
+ # Shape: (total_patches, C, H, W) - 1 patch per image
437
+ num_patches_per_image = [1] * len(images)
438
+ else:
439
+ # Fallback to simple preprocessing if no config
440
+ pixel_values = self.image_processor.preprocess(images, return_tensors="pt")["pixel_values"]
441
+ num_patches_per_image = [1] * len(images)
442
+
443
+ image_inputs = {
444
+ "pixel_values": pixel_values,
445
+ "image_sizes": image_sizes,
446
+ }
447
+ else:
448
+ image_inputs = {}
449
+ num_patches_per_image = None
450
+
451
+ # Create BatchFeature first
452
+ result = BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
453
+
454
+ # Add num_patches_per_image as plain Python list (not converted to tensor)
455
+ # This is needed for prepare_inputs_labels_for_multimodal
456
+ if num_patches_per_image is not None:
457
+ result["num_patches_per_image"] = num_patches_per_image
458
+
459
+ return result
460
+
461
+ def batch_decode(self, *args, **kwargs):
462
+ """Decode token IDs to text."""
463
+ return self.tokenizer.batch_decode(*args, **kwargs)
464
+
465
+ def decode(self, *args, **kwargs):
466
+ """Decode token IDs to text."""
467
+ return self.tokenizer.decode(*args, **kwargs)
468
+
469
+ @property
470
+ def model_input_names(self):
471
+ tokenizer_input_names = self.tokenizer.model_input_names
472
+ image_processor_input_names = ["pixel_values", "image_sizes", "num_patches_per_image"]
473
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
474
+
475
+
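To tie the pieces together, a minimal sketch of driving the processor directly follows. It assumes the checkpoint directory (`path/to/model` is a placeholder) exposes a tokenizer and a config loadable with `trust_remote_code`, and that the config carries the anyres parameters the processor expects; the example shapes in the comments are illustrative.

```python
# Sketch only: exercise DiffusionVL_Qwen2_5_Processor defined above.
from PIL import Image
from transformers import AutoConfig, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("path/to/model", trust_remote_code=True)  # placeholder path
config = AutoConfig.from_pretrained("path/to/model", trust_remote_code=True)

processor = DiffusionVL_Qwen2_5_Processor(tokenizer=tokenizer, config=config)

conv = CONV_QWEN_2_5.copy()
conv.append_message(conv.roles[0], "<image>\nDescribe this image.")
conv.append_message(conv.roles[1], None)

image = Image.open("image.jpg").convert("RGB")
batch = processor(text=conv.get_prompt(), images=image)

print(batch["input_ids"].shape)        # (1, seq_len), with one -200 image sentinel inside
print(batch["pixel_values"].shape)     # (total_patches, 3, 384, 384) after anyres tiling
print(batch["num_patches_per_image"])  # plain Python list, e.g. [5]
```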
476
+ __all__ = [
477
+ "DiffusionVL_Qwen2_5_Processor",
478
+ "process_images",
479
+ "tokenizer_image_token",
480
+ "get_anyres_image_grid_shape",
481
+ "Conversation",
482
+ "CONV_QWEN_2_5",
483
+ "DEFAULT_IMAGE_TOKEN",
484
+ "IMAGE_TOKEN_INDEX",
485
+ ]
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 8192,
203
+ "pad_token": "<|endoftext|>",
204
+ "padding_side": "right",
205
+ "split_special_tokens": false,
206
+ "tokenizer_class": "Qwen2Tokenizer",
207
+ "unk_token": null
208
+ }
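Given the special-token layout declared above (`<|im_end|>` as EOS, `<|endoftext|>` as padding, vision/image pad tokens in the 151652–151656 range), a quick sanity check after loading the tokenizer might look like the sketch below; the checkpoint path is a placeholder.

```python
# Sketch only: verify the mapping from special_tokens_map.json / tokenizer_config.json.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("path/to/model")  # placeholder path

assert tok.eos_token == "<|im_end|>" and tok.eos_token_id == 151645
assert tok.pad_token == "<|endoftext|>" and tok.pad_token_id == 151643
assert tok.convert_tokens_to_ids("<|image_pad|>") == 151655
assert tok.model_max_length == 8192
```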
vocab.json ADDED
The diff for this file is too large to render. See raw diff