Upload folder using huggingface_hub

Browse files

Files changed (8)

README.md +2 -2
config.json +63 -172
configuration_diffusionvl_qwen2_5.py +4 -1
model-00001-of-00002.safetensors +3 -0
model-00002-of-00002.safetensors +3 -0
model.safetensors.index.json +0 -0
modeling_diffusionvl_qwen2_5.py +5 -9
processing_diffusionvl_qwen2_5.py +4 -8

README.md CHANGED Viewed

@@ -64,5 +64,5 @@ print(output_text)
 - **LLM**: Qwen2.5 (standard RoPE)
 - **BD3LM Enabled**: True
 - **Block Size**: 8
-- **Hidden Size**: 3584
-- **Num Layers**: 28

 - **LLM**: Qwen2.5 (standard RoPE)
 - **BD3LM Enabled**: True
 - **Block Size**: 8
+- **Hidden Size**: 2048
+- **Num Layers**: 36

config.json CHANGED Viewed

@@ -13,6 +13,7 @@
   "bd3lm_complementary_mask": false,
   "bd3lm_cross_attn": true,
   "bd3lm_ignore_bos": true,
   "bd3lm_noise_granularity": "block",
   "bd3lm_noise_type": "loglinear",
   "bd3lm_parameterization": "subs",
@@ -25,163 +26,20 @@
   "bos_token_id": 151643,
   "enable_bd3lm": true,
   "enable_block_size_annealing": false,
   "enable_noise_level_annealing": false,
   "eos_token_id": 151645,
   "faster_token_stride": 10,
   "force_sample": false,
   "hidden_act": "silu",
-  "hidden_size": 3584,
-  "image_aspect_ratio": "anyres_max_4",
   "image_crop_resolution": null,
-  "image_grid_pinpoints": [
-    [
-      384,
-      384
-    ],
-    [
-      384,
-      768
-    ],
-    [
-      384,
-      1152
-    ],
-    [
-      384,
-      1536
-    ],
-    [
-      384,
-      1920
-    ],
-    [
-      384,
-      2304
-    ],
-    [
-      768,
-      384
-    ],
-    [
-      768,
-      768
-    ],
-    [
-      768,
-      1152
-    ],
-    [
-      768,
-      1536
-    ],
-    [
-      768,
-      1920
-    ],
-    [
-      768,
-      2304
-    ],
-    [
-      1152,
-      384
-    ],
-    [
-      1152,
-      768
-    ],
-    [
-      1152,
-      1152
-    ],
-    [
-      1152,
-      1536
-    ],
-    [
-      1152,
-      1920
-    ],
-    [
-      1152,
-      2304
-    ],
-    [
-      1536,
-      384
-    ],
-    [
-      1536,
-      768
-    ],
-    [
-      1536,
-      1152
-    ],
-    [
-      1536,
-      1536
-    ],
-    [
-      1536,
-      1920
-    ],
-    [
-      1536,
-      2304
-    ],
-    [
-      1920,
-      384
-    ],
-    [
-      1920,
-      768
-    ],
-    [
-      1920,
-      1152
-    ],
-    [
-      1920,
-      1536
-    ],
-    [
-      1920,
-      1920
-    ],
-    [
-      1920,
-      2304
-    ],
-    [
-      2304,
-      384
-    ],
-    [
-      2304,
-      768
-    ],
-    [
-      2304,
-      1152
-    ],
-    [
-      2304,
-      1536
-    ],
-    [
-      2304,
-      1920
-    ],
-    [
-      2304,
-      2304
-    ]
-  ],
   "image_split_resolution": null,
   "initializer_range": 0.02,
-  "intermediate_size": 18944,
   "layer_types": [
     "full_attention",
     "full_attention",
@@ -210,17 +68,25 @@
     "full_attention",
     "full_attention",
     "full_attention",
     "full_attention"
   ],
   "max_pixels": 262144,
-  "max_position_embeddings": 32768,
-  "max_window_layers": 28,
   "min_pixels": 147456,
-  "mm_hidden_size": 1152,
   "mm_newline_position": "grid",
-  "mm_patch_merge_type": "spatial_unpad",
   "mm_projector_lr": null,
-  "mm_projector_type": "mlp2x_gelu",
   "mm_resampler_type": null,
   "mm_spatial_pool_mode": "bilinear",
   "mm_spatial_pool_stride": null,
@@ -229,19 +95,27 @@
   "mm_use_im_start_end": false,
   "mm_vision_select_feature": "patch",
   "mm_vision_select_layer": -2,
-  "mm_vision_tower": "/data/minimax-dialogue/users/qingke/results/hf_models/siglip2-so400m-patch14-384",
   "mm_vision_tower_lr": 2e-06,
   "model_max_length": 8192,
   "model_type": "diffusionvl_qwen2_5",
-  "num_attention_heads": 28,
-  "num_hidden_layers": 28,
-  "num_key_value_heads": 4,
   "pos_skipping_range": 4096,
   "rms_norm_eps": 1e-06,
-  "rope_scaling": null,
   "rope_theta": 1000000.0,
   "sliding_window": null,
-  "tie_word_embeddings": false,
   "tokenizer_model_max_length": 8192,
   "tokenizer_padding_side": "right",
   "torch_dtype": "bfloat16",
@@ -250,21 +124,38 @@
   "use_mm_proj": true,
   "use_pos_skipping": false,
   "use_sliding_window": false,
-  "vision_tower_pretrained": null,
-  "vocab_size": 152064,
-  "mask_token_id": 151671,
   "vision_config": {
-    "hidden_size": 1152,
-    "intermediate_size": 4304,
-    "num_hidden_layers": 26,
-    "num_attention_heads": 16,
-    "num_channels": 3,
-    "image_size": 384,
     "patch_size": 14,
-    "hidden_act": "gelu_pytorch_tanh",
-    "layer_norm_eps": 1e-06,
-    "attention_dropout": 0.0
   },
   "auto_map": {
     "AutoConfig": "configuration_diffusionvl_qwen2_5.DiffusionVL_Qwen2_5_Config",
     "AutoModelForCausalLM": "modeling_diffusionvl_qwen2_5.DiffusionVL_Qwen2_5_ForConditionalGeneration",

   "bd3lm_complementary_mask": false,
   "bd3lm_cross_attn": true,
   "bd3lm_ignore_bos": true,
+  "bd3lm_mask_prob": 0.5,
   "bd3lm_noise_granularity": "block",
   "bd3lm_noise_type": "loglinear",
   "bd3lm_parameterization": "subs",
   "bos_token_id": 151643,
   "enable_bd3lm": true,
   "enable_block_size_annealing": false,
+  "enable_mtd": false,
   "enable_noise_level_annealing": false,
   "eos_token_id": 151645,
   "faster_token_stride": 10,
   "force_sample": false,
   "hidden_act": "silu",
+  "hidden_size": 2048,
+  "image_aspect_ratio": "pad",
   "image_crop_resolution": null,
+  "image_grid_pinpoints": null,
   "image_split_resolution": null,
+  "image_token_id": null,
   "initializer_range": 0.02,
+  "intermediate_size": 11008,
   "layer_types": [
     "full_attention",
     "full_attention",
     "full_attention",
     "full_attention",
     "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
     "full_attention"
   ],
   "max_pixels": 262144,
+  "max_position_embeddings": 128000,
+  "max_window_layers": 70,
   "min_pixels": 147456,
+  "mm_hidden_size": 1280,
   "mm_newline_position": "grid",
+  "mm_patch_merge_type": "flat",
   "mm_projector_lr": null,
+  "mm_projector_type": "qwen_merger",
   "mm_resampler_type": null,
   "mm_spatial_pool_mode": "bilinear",
   "mm_spatial_pool_stride": null,
   "mm_use_im_start_end": false,
   "mm_vision_select_feature": "patch",
   "mm_vision_select_layer": -2,
+  "mm_vision_tower": "/data/minimax-dialogue/users/qingke/results/hf_models/Qwen2.5-VL-3B-Instruct-Reformat",
   "mm_vision_tower_lr": 2e-06,
   "model_max_length": 8192,
   "model_type": "diffusionvl_qwen2_5",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 2,
   "pos_skipping_range": 4096,
   "rms_norm_eps": 1e-06,
+  "rope_scaling": {
+    "mrope_section": [
+      16,
+      24,
+      24
+    ],
+    "rope_type": "default",
+    "type": "default"
+  },
   "rope_theta": 1000000.0,
   "sliding_window": null,
+  "tie_word_embeddings": true,
   "tokenizer_model_max_length": 8192,
   "tokenizer_padding_side": "right",
   "torch_dtype": "bfloat16",
   "use_mm_proj": true,
   "use_pos_skipping": false,
   "use_sliding_window": false,
+  "video_token_id": null,
   "vision_config": {
+    "depth": 32,
+    "fullatt_block_indexes": [
+      7,
+      15,
+      23,
+      31
+    ],
+    "hidden_act": "silu",
+    "hidden_size": 1280,
+    "in_channels": 3,
+    "in_chans": 3,
+    "initializer_range": 0.02,
+    "intermediate_size": 3420,
+    "model_type": "",
+    "num_heads": 16,
+    "out_hidden_size": 2048,
     "patch_size": 14,
+    "spatial_merge_size": 2,
+    "spatial_patch_size": 14,
+    "temporal_patch_size": 2,
+    "tokens_per_second": 2,
+    "torch_dtype": "float32",
+    "window_size": 112
   },
+  "vision_end_token_id": 151653,
+  "vision_start_token_id": 151652,
+  "vision_token_id": 151654,
+  "vision_tower_pretrained": null,
+  "vocab_size": 151936,
+  "mask_token_id": 151671,
   "auto_map": {
     "AutoConfig": "configuration_diffusionvl_qwen2_5.DiffusionVL_Qwen2_5_Config",
     "AutoModelForCausalLM": "modeling_diffusionvl_qwen2_5.DiffusionVL_Qwen2_5_ForConditionalGeneration",

configuration_diffusionvl_qwen2_5.py CHANGED Viewed

@@ -1,6 +1,8 @@
 # coding=utf-8
 # Copyright 2025 The HustVL Team and The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -12,7 +14,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""DiffusionVL-Qwen2.5 model configuration."""
 from typing import List, Optional, Union

 # coding=utf-8
 # Copyright 2025 The HustVL Team and The HuggingFace Inc. team. All rights reserved.
 #
+# This code is based on Qwen2.5 and SigLIP. It has been modified to create DiffusionVL.
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""DiffusionVL-Qwen2.5 (SigLIP + Qwen2.5) model configuration."""
 from typing import List, Optional, Union

model-00001-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0c6252727577975883dcd7a6b253f1d03c1a6e3bf3f78082ffe00f702f31e0d3
+size 4957560272

model-00002-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:02e807b6f6e298f3472a3c9b2afc0610ac9a42fe2b38300f80e56bfc3956743c
+size 2551787344

model.safetensors.index.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

modeling_diffusionvl_qwen2_5.py CHANGED Viewed

@@ -1,6 +1,8 @@
 # coding=utf-8
 # Copyright 2025 The HustVL Team and The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -13,15 +15,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""
-DiffusionVL-Qwen2.5 model implementation.
-This model uses:
-- SigLIP as the vision encoder (external ViT)
-- mlp2x_gelu as the MM projector (2-layer MLP with GELU)
-- Qwen2.5 as the LLM backbone (standard RoPE)
-- BD3LM for diffusion-based generation
-"""
 import math
 from typing import Callable, Dict, List, Optional, Tuple, Union
@@ -1107,6 +1101,8 @@ class DiffusionVL_Qwen2_5_ForConditionalGeneration(DiffusionVL_Qwen2_5_PreTraine
                 # Ensure tensors are on the same device (for device_map="auto")
                 output_device = x0.device
                 is_mask_on_device = is_mask.to(output_device)
                 num_to_transfer = num_transfer_tokens[step].item()
                 transfer_mask = self._get_transfer_mask(

 # coding=utf-8
 # Copyright 2025 The HustVL Team and The HuggingFace Inc. team. All rights reserved.
 #
+# This code is based on Qwen2.5 and SigLIP. It has been modified to create DiffusionVL.
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""DiffusionVL-Qwen2.5 (SigLIP + Qwen2.5) model implementation."""
 import math
 from typing import Callable, Dict, List, Optional, Tuple, Union
                 # Ensure tensors are on the same device (for device_map="auto")
                 output_device = x0.device
                 is_mask_on_device = is_mask.to(output_device)
+                cur_block_ids = cur_block_ids.to(output_device)
+                cur_block_embeds = cur_block_embeds.to(output_device)
                 num_to_transfer = num_transfer_tokens[step].item()
                 transfer_mask = self._get_transfer_mask(

processing_diffusionvl_qwen2_5.py CHANGED Viewed

@@ -1,6 +1,8 @@
 # coding=utf-8
 # Copyright 2025 The HustVL Team and The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -12,14 +14,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""
-DiffusionVL-Qwen2.5 Processor - Self-contained image processing matching training code.
-This processor implements the same image processing pipeline as the training code:
-- process_images with anyres support
-- tokenizer_image_token for proper <image> token handling
-- Uses SiglipImageProcessor for the underlying image preprocessing
-"""
 import ast
 import math

 # coding=utf-8
 # Copyright 2025 The HustVL Team and The HuggingFace Inc. team. All rights reserved.
 #
+# This code is based on Qwen2.5 and SigLIP. It has been modified to create DiffusionVL.
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""DiffusionVL-Qwen2.5 Processor - Combines image processor and tokenizer."""
 import ast
 import math