xiazhi committed on
Commit
a471b31
·
verified ·
1 Parent(s): 8dcc5d2

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -64,5 +64,5 @@ print(output_text)
64
  - **LLM**: Qwen2.5 (standard RoPE)
65
  - **BD3LM Enabled**: True
66
  - **Block Size**: 8
67
- - **Hidden Size**: 3584
68
- - **Num Layers**: 28
 
64
  - **LLM**: Qwen2.5 (standard RoPE)
65
  - **BD3LM Enabled**: True
66
  - **Block Size**: 8
67
+ - **Hidden Size**: 2048
68
+ - **Num Layers**: 36
config.json CHANGED
@@ -13,6 +13,7 @@
13
  "bd3lm_complementary_mask": false,
14
  "bd3lm_cross_attn": true,
15
  "bd3lm_ignore_bos": true,
 
16
  "bd3lm_noise_granularity": "block",
17
  "bd3lm_noise_type": "loglinear",
18
  "bd3lm_parameterization": "subs",
@@ -25,163 +26,20 @@
25
  "bos_token_id": 151643,
26
  "enable_bd3lm": true,
27
  "enable_block_size_annealing": false,
 
28
  "enable_noise_level_annealing": false,
29
  "eos_token_id": 151645,
30
  "faster_token_stride": 10,
31
  "force_sample": false,
32
  "hidden_act": "silu",
33
- "hidden_size": 3584,
34
- "image_aspect_ratio": "anyres_max_4",
35
  "image_crop_resolution": null,
36
- "image_grid_pinpoints": [
37
- [
38
- 384,
39
- 384
40
- ],
41
- [
42
- 384,
43
- 768
44
- ],
45
- [
46
- 384,
47
- 1152
48
- ],
49
- [
50
- 384,
51
- 1536
52
- ],
53
- [
54
- 384,
55
- 1920
56
- ],
57
- [
58
- 384,
59
- 2304
60
- ],
61
- [
62
- 768,
63
- 384
64
- ],
65
- [
66
- 768,
67
- 768
68
- ],
69
- [
70
- 768,
71
- 1152
72
- ],
73
- [
74
- 768,
75
- 1536
76
- ],
77
- [
78
- 768,
79
- 1920
80
- ],
81
- [
82
- 768,
83
- 2304
84
- ],
85
- [
86
- 1152,
87
- 384
88
- ],
89
- [
90
- 1152,
91
- 768
92
- ],
93
- [
94
- 1152,
95
- 1152
96
- ],
97
- [
98
- 1152,
99
- 1536
100
- ],
101
- [
102
- 1152,
103
- 1920
104
- ],
105
- [
106
- 1152,
107
- 2304
108
- ],
109
- [
110
- 1536,
111
- 384
112
- ],
113
- [
114
- 1536,
115
- 768
116
- ],
117
- [
118
- 1536,
119
- 1152
120
- ],
121
- [
122
- 1536,
123
- 1536
124
- ],
125
- [
126
- 1536,
127
- 1920
128
- ],
129
- [
130
- 1536,
131
- 2304
132
- ],
133
- [
134
- 1920,
135
- 384
136
- ],
137
- [
138
- 1920,
139
- 768
140
- ],
141
- [
142
- 1920,
143
- 1152
144
- ],
145
- [
146
- 1920,
147
- 1536
148
- ],
149
- [
150
- 1920,
151
- 1920
152
- ],
153
- [
154
- 1920,
155
- 2304
156
- ],
157
- [
158
- 2304,
159
- 384
160
- ],
161
- [
162
- 2304,
163
- 768
164
- ],
165
- [
166
- 2304,
167
- 1152
168
- ],
169
- [
170
- 2304,
171
- 1536
172
- ],
173
- [
174
- 2304,
175
- 1920
176
- ],
177
- [
178
- 2304,
179
- 2304
180
- ]
181
- ],
182
  "image_split_resolution": null,
 
183
  "initializer_range": 0.02,
184
- "intermediate_size": 18944,
185
  "layer_types": [
186
  "full_attention",
187
  "full_attention",
@@ -210,17 +68,25 @@
210
  "full_attention",
211
  "full_attention",
212
  "full_attention",
 
 
 
 
 
 
 
 
213
  "full_attention"
214
  ],
215
  "max_pixels": 262144,
216
- "max_position_embeddings": 32768,
217
- "max_window_layers": 28,
218
  "min_pixels": 147456,
219
- "mm_hidden_size": 1152,
220
  "mm_newline_position": "grid",
221
- "mm_patch_merge_type": "spatial_unpad",
222
  "mm_projector_lr": null,
223
- "mm_projector_type": "mlp2x_gelu",
224
  "mm_resampler_type": null,
225
  "mm_spatial_pool_mode": "bilinear",
226
  "mm_spatial_pool_stride": null,
@@ -229,19 +95,27 @@
229
  "mm_use_im_start_end": false,
230
  "mm_vision_select_feature": "patch",
231
  "mm_vision_select_layer": -2,
232
- "mm_vision_tower": "/data/minimax-dialogue/users/qingke/results/hf_models/siglip2-so400m-patch14-384",
233
  "mm_vision_tower_lr": 2e-06,
234
  "model_max_length": 8192,
235
  "model_type": "diffusionvl_qwen2_5",
236
- "num_attention_heads": 28,
237
- "num_hidden_layers": 28,
238
- "num_key_value_heads": 4,
239
  "pos_skipping_range": 4096,
240
  "rms_norm_eps": 1e-06,
241
- "rope_scaling": null,
 
 
 
 
 
 
 
 
242
  "rope_theta": 1000000.0,
243
  "sliding_window": null,
244
- "tie_word_embeddings": false,
245
  "tokenizer_model_max_length": 8192,
246
  "tokenizer_padding_side": "right",
247
  "torch_dtype": "bfloat16",
@@ -250,21 +124,38 @@
250
  "use_mm_proj": true,
251
  "use_pos_skipping": false,
252
  "use_sliding_window": false,
253
- "vision_tower_pretrained": null,
254
- "vocab_size": 152064,
255
- "mask_token_id": 151671,
256
  "vision_config": {
257
- "hidden_size": 1152,
258
- "intermediate_size": 4304,
259
- "num_hidden_layers": 26,
260
- "num_attention_heads": 16,
261
- "num_channels": 3,
262
- "image_size": 384,
 
 
 
 
 
 
 
 
 
 
263
  "patch_size": 14,
264
- "hidden_act": "gelu_pytorch_tanh",
265
- "layer_norm_eps": 1e-06,
266
- "attention_dropout": 0.0
 
 
 
267
  },
 
 
 
 
 
 
268
  "auto_map": {
269
  "AutoConfig": "configuration_diffusionvl_qwen2_5.DiffusionVL_Qwen2_5_Config",
270
  "AutoModelForCausalLM": "modeling_diffusionvl_qwen2_5.DiffusionVL_Qwen2_5_ForConditionalGeneration",
 
13
  "bd3lm_complementary_mask": false,
14
  "bd3lm_cross_attn": true,
15
  "bd3lm_ignore_bos": true,
16
+ "bd3lm_mask_prob": 0.5,
17
  "bd3lm_noise_granularity": "block",
18
  "bd3lm_noise_type": "loglinear",
19
  "bd3lm_parameterization": "subs",
 
26
  "bos_token_id": 151643,
27
  "enable_bd3lm": true,
28
  "enable_block_size_annealing": false,
29
+ "enable_mtd": false,
30
  "enable_noise_level_annealing": false,
31
  "eos_token_id": 151645,
32
  "faster_token_stride": 10,
33
  "force_sample": false,
34
  "hidden_act": "silu",
35
+ "hidden_size": 2048,
36
+ "image_aspect_ratio": "pad",
37
  "image_crop_resolution": null,
38
+ "image_grid_pinpoints": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  "image_split_resolution": null,
40
+ "image_token_id": null,
41
  "initializer_range": 0.02,
42
+ "intermediate_size": 11008,
43
  "layer_types": [
44
  "full_attention",
45
  "full_attention",
 
68
  "full_attention",
69
  "full_attention",
70
  "full_attention",
71
+ "full_attention",
72
+ "full_attention",
73
+ "full_attention",
74
+ "full_attention",
75
+ "full_attention",
76
+ "full_attention",
77
+ "full_attention",
78
+ "full_attention",
79
  "full_attention"
80
  ],
81
  "max_pixels": 262144,
82
+ "max_position_embeddings": 128000,
83
+ "max_window_layers": 70,
84
  "min_pixels": 147456,
85
+ "mm_hidden_size": 1280,
86
  "mm_newline_position": "grid",
87
+ "mm_patch_merge_type": "flat",
88
  "mm_projector_lr": null,
89
+ "mm_projector_type": "qwen_merger",
90
  "mm_resampler_type": null,
91
  "mm_spatial_pool_mode": "bilinear",
92
  "mm_spatial_pool_stride": null,
 
95
  "mm_use_im_start_end": false,
96
  "mm_vision_select_feature": "patch",
97
  "mm_vision_select_layer": -2,
98
+ "mm_vision_tower": "/data/minimax-dialogue/users/qingke/results/hf_models/Qwen2.5-VL-3B-Instruct-Reformat",
99
  "mm_vision_tower_lr": 2e-06,
100
  "model_max_length": 8192,
101
  "model_type": "diffusionvl_qwen2_5",
102
+ "num_attention_heads": 16,
103
+ "num_hidden_layers": 36,
104
+ "num_key_value_heads": 2,
105
  "pos_skipping_range": 4096,
106
  "rms_norm_eps": 1e-06,
107
+ "rope_scaling": {
108
+ "mrope_section": [
109
+ 16,
110
+ 24,
111
+ 24
112
+ ],
113
+ "rope_type": "default",
114
+ "type": "default"
115
+ },
116
  "rope_theta": 1000000.0,
117
  "sliding_window": null,
118
+ "tie_word_embeddings": true,
119
  "tokenizer_model_max_length": 8192,
120
  "tokenizer_padding_side": "right",
121
  "torch_dtype": "bfloat16",
 
124
  "use_mm_proj": true,
125
  "use_pos_skipping": false,
126
  "use_sliding_window": false,
127
+ "video_token_id": null,
 
 
128
  "vision_config": {
129
+ "depth": 32,
130
+ "fullatt_block_indexes": [
131
+ 7,
132
+ 15,
133
+ 23,
134
+ 31
135
+ ],
136
+ "hidden_act": "silu",
137
+ "hidden_size": 1280,
138
+ "in_channels": 3,
139
+ "in_chans": 3,
140
+ "initializer_range": 0.02,
141
+ "intermediate_size": 3420,
142
+ "model_type": "",
143
+ "num_heads": 16,
144
+ "out_hidden_size": 2048,
145
  "patch_size": 14,
146
+ "spatial_merge_size": 2,
147
+ "spatial_patch_size": 14,
148
+ "temporal_patch_size": 2,
149
+ "tokens_per_second": 2,
150
+ "torch_dtype": "float32",
151
+ "window_size": 112
152
  },
153
+ "vision_end_token_id": 151653,
154
+ "vision_start_token_id": 151652,
155
+ "vision_token_id": 151654,
156
+ "vision_tower_pretrained": null,
157
+ "vocab_size": 151936,
158
+ "mask_token_id": 151671,
159
  "auto_map": {
160
  "AutoConfig": "configuration_diffusionvl_qwen2_5.DiffusionVL_Qwen2_5_Config",
161
  "AutoModelForCausalLM": "modeling_diffusionvl_qwen2_5.DiffusionVL_Qwen2_5_ForConditionalGeneration",
configuration_diffusionvl_qwen2_5.py CHANGED
@@ -1,6 +1,8 @@
1
  # coding=utf-8
2
  # Copyright 2025 The HustVL Team and The HuggingFace Inc. team. All rights reserved.
3
  #
 
 
4
  # Licensed under the Apache License, Version 2.0 (the "License");
5
  # you may not use this file except in compliance with the License.
6
  # You may obtain a copy of the License at
@@ -12,7 +14,8 @@
12
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
15
- """DiffusionVL-Qwen2.5 model configuration."""
 
16
 
17
  from typing import List, Optional, Union
18
 
 
1
  # coding=utf-8
2
  # Copyright 2025 The HustVL Team and The HuggingFace Inc. team. All rights reserved.
3
  #
4
+ # This code is based on Qwen2.5 and SigLIP. It has been modified to create DiffusionVL.
5
+ #
6
  # Licensed under the Apache License, Version 2.0 (the "License");
7
  # you may not use this file except in compliance with the License.
8
  # You may obtain a copy of the License at
 
14
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
  # See the License for the specific language governing permissions and
16
  # limitations under the License.
17
+
18
+ """DiffusionVL-Qwen2.5 (SigLIP + Qwen2.5) model configuration."""
19
 
20
  from typing import List, Optional, Union
21
 
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c6252727577975883dcd7a6b253f1d03c1a6e3bf3f78082ffe00f702f31e0d3
3
+ size 4957560272
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02e807b6f6e298f3472a3c9b2afc0610ac9a42fe2b38300f80e56bfc3956743c
3
+ size 2551787344
model.safetensors.index.json CHANGED
The diff for this file is too large to render. See raw diff
 
modeling_diffusionvl_qwen2_5.py CHANGED
@@ -1,6 +1,8 @@
1
  # coding=utf-8
2
  # Copyright 2025 The HustVL Team and The HuggingFace Inc. team. All rights reserved.
3
  #
 
 
4
  # Licensed under the Apache License, Version 2.0 (the "License");
5
  # you may not use this file except in compliance with the License.
6
  # You may obtain a copy of the License at
@@ -13,15 +15,7 @@
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
15
 
16
- """
17
- DiffusionVL-Qwen2.5 model implementation.
18
-
19
- This model uses:
20
- - SigLIP as the vision encoder (external ViT)
21
- - mlp2x_gelu as the MM projector (2-layer MLP with GELU)
22
- - Qwen2.5 as the LLM backbone (standard RoPE)
23
- - BD3LM for diffusion-based generation
24
- """
25
 
26
  import math
27
  from typing import Callable, Dict, List, Optional, Tuple, Union
@@ -1107,6 +1101,8 @@ class DiffusionVL_Qwen2_5_ForConditionalGeneration(DiffusionVL_Qwen2_5_PreTraine
1107
  # Ensure tensors are on the same device (for device_map="auto")
1108
  output_device = x0.device
1109
  is_mask_on_device = is_mask.to(output_device)
 
 
1110
 
1111
  num_to_transfer = num_transfer_tokens[step].item()
1112
  transfer_mask = self._get_transfer_mask(
 
1
  # coding=utf-8
2
  # Copyright 2025 The HustVL Team and The HuggingFace Inc. team. All rights reserved.
3
  #
4
+ # This code is based on Qwen2.5 and SigLIP. It has been modified to create DiffusionVL.
5
+ #
6
  # Licensed under the Apache License, Version 2.0 (the "License");
7
  # you may not use this file except in compliance with the License.
8
  # You may obtain a copy of the License at
 
15
  # See the License for the specific language governing permissions and
16
  # limitations under the License.
17
 
18
+ """DiffusionVL-Qwen2.5 (SigLIP + Qwen2.5) model implementation."""
 
 
 
 
 
 
 
 
19
 
20
  import math
21
  from typing import Callable, Dict, List, Optional, Tuple, Union
 
1101
  # Ensure tensors are on the same device (for device_map="auto")
1102
  output_device = x0.device
1103
  is_mask_on_device = is_mask.to(output_device)
1104
+ cur_block_ids = cur_block_ids.to(output_device)
1105
+ cur_block_embeds = cur_block_embeds.to(output_device)
1106
 
1107
  num_to_transfer = num_transfer_tokens[step].item()
1108
  transfer_mask = self._get_transfer_mask(
processing_diffusionvl_qwen2_5.py CHANGED
@@ -1,6 +1,8 @@
1
  # coding=utf-8
2
  # Copyright 2025 The HustVL Team and The HuggingFace Inc. team. All rights reserved.
3
  #
 
 
4
  # Licensed under the Apache License, Version 2.0 (the "License");
5
  # you may not use this file except in compliance with the License.
6
  # You may obtain a copy of the License at
@@ -12,14 +14,8 @@
12
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
15
- """
16
- DiffusionVL-Qwen2.5 Processor - Self-contained image processing matching training code.
17
-
18
- This processor implements the same image processing pipeline as the training code:
19
- - process_images with anyres support
20
- - tokenizer_image_token for proper <image> token handling
21
- - Uses SiglipImageProcessor for the underlying image preprocessing
22
- """
23
 
24
  import ast
25
  import math
 
1
  # coding=utf-8
2
  # Copyright 2025 The HustVL Team and The HuggingFace Inc. team. All rights reserved.
3
  #
4
+ # This code is based on Qwen2.5 and SigLIP. It has been modified to create DiffusionVL.
5
+ #
6
  # Licensed under the Apache License, Version 2.0 (the "License");
7
  # you may not use this file except in compliance with the License.
8
  # You may obtain a copy of the License at
 
14
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
  # See the License for the specific language governing permissions and
16
  # limitations under the License.
17
+
18
+ """DiffusionVL-Qwen2.5 Processor - Combines image processor and tokenizer."""
 
 
 
 
 
 
19
 
20
  import ast
21
  import math