Upload folder using huggingface_hub
- config.json +1 -1
- configuration_diffusionvl_qwen2_5.py +5 -4
- modeling_diffusionvl_qwen2_5.py +15 -53
- processing_diffusionvl_qwen2_5.py +0 -13
config.json CHANGED
@@ -232,7 +232,7 @@
   "mm_vision_tower": "/data/minimax-dialogue/users/qingke/results/hf_models/siglip2-so400m-patch14-384",
   "mm_vision_tower_lr": 2e-06,
   "model_max_length": 8192,
-  "model_type": "
+  "model_type": "diffusionvl_qwen",
   "num_attention_heads": 28,
   "num_hidden_layers": 28,
   "num_key_value_heads": 4,
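The only change in this file is the model_type value. "diffusionvl_qwen" is not a built-in transformers architecture, so loading depends on the custom classes shipped alongside this config being picked up as remote code. A minimal round-trip check, assuming the config declares the usual auto_map entries (the repo id below is a placeholder, not the actual repo):

from transformers import AutoConfig

# trust_remote_code=True lets transformers import configuration_diffusionvl_qwen2_5.py
cfg = AutoConfig.from_pretrained("<org>/<repo>", trust_remote_code=True)
assert cfg.model_type == "diffusionvl_qwen"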
configuration_diffusionvl_qwen2_5.py CHANGED
@@ -98,13 +98,12 @@ class DiffusionVL_Qwen2_5_Config(PretrainedConfig):
         mm_hidden_size: Vision encoder hidden size for projector.
         enable_bd3lm: Whether to enable BD3LM.
         bd3lm_block_size: Block size for BD3LM.
-        bd3lm_cross_attn: Whether to use cross-attention in BD3LM.
         mask_token_id: Token ID for mask token.
         rope_theta: RoPE base period.
         sliding_window: Sliding window size for attention.
     """

-    model_type = "
+    model_type = "diffusionvl_qwen"
     sub_configs = {"vision_config": DiffusionVL_Qwen2_5_VisionConfig}
     keys_to_ignore_at_inference = ["past_key_values"]

@@ -131,7 +130,6 @@ class DiffusionVL_Qwen2_5_Config(PretrainedConfig):
         # BD3LM diffusion parameters
         enable_bd3lm: bool = True,
         bd3lm_block_size: int = 8,
-        bd3lm_cross_attn: bool = True,
         bd3lm_antithetic_sampling: bool = True,
         bd3lm_sampling_eps_min: float = 1e-3,
         bd3lm_sampling_eps_max: float = 1.0,
@@ -145,6 +143,10 @@ class DiffusionVL_Qwen2_5_Config(PretrainedConfig):
         use_sliding_window: bool = False,
         **kwargs,
     ):
+        # Remove text_config from kwargs to avoid GenerationConfig issues
+        # (text_config is only needed for train code, HF config uses flattened params)
+        kwargs.pop("text_config", None)
+
         # Text model configuration
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
@@ -180,7 +182,6 @@ class DiffusionVL_Qwen2_5_Config(PretrainedConfig):
         # BD3LM diffusion configuration
         self.enable_bd3lm = enable_bd3lm
         self.bd3lm_block_size = bd3lm_block_size
-        self.bd3lm_cross_attn = bd3lm_cross_attn
         self.bd3lm_antithetic_sampling = bd3lm_antithetic_sampling
         self.bd3lm_sampling_eps_min = bd3lm_sampling_eps_min
         self.bd3lm_sampling_eps_max = bd3lm_sampling_eps_max
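The constructor addition is the substantive fix here: a checkpoint saved by the training code can carry a nested text_config dict, and anything left in **kwargs is stored as an attribute by PretrainedConfig.__init__, where it can later interfere with GenerationConfig handling. A standalone toy (not the real class) showing what the guard prevents:

from transformers import PretrainedConfig

class ToyConfig(PretrainedConfig):
    model_type = "toy"

    def __init__(self, hidden_size=1024, **kwargs):
        kwargs.pop("text_config", None)  # same guard as in the commit
        self.hidden_size = hidden_size
        super().__init__(**kwargs)

# A config dict from the training code may still contain the nested text_config;
# with the pop in place it never becomes an attribute of the HF config object.
cfg = ToyConfig(hidden_size=2048, text_config={"hidden_size": 2048})
assert not hasattr(cfg, "text_config")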
modeling_diffusionvl_qwen2_5.py CHANGED
@@ -38,10 +38,6 @@ logger = logging.get_logger(__name__)
 IMAGE_TOKEN_INDEX = -200


-# ============================================================================
-# Image Processing Utilities (matching training code)
-# ============================================================================
-
 def select_best_resolution(original_size, possible_resolutions):
     """
     Selects the best resolution from a list of possible resolutions based on the original size.
@@ -118,10 +114,6 @@ def unpad_image(tensor, original_size):
     return unpadded_tensor


-# ============================================================================
-# Vision Encoder (SigLIP)
-# ============================================================================
-
 class SigLipVisionEmbeddings(nn.Module):
     """Patch embedding for SigLIP vision encoder."""

@@ -346,10 +338,6 @@ class DiffusionVL_Qwen2_5_VisionTower(nn.Module):
         return self.vision_tower(pixel_values, output_hidden_states=True)


-# ============================================================================
-# MM Projector (mlp2x_gelu - matches training code)
-# ============================================================================
-
 def build_mm_projector(config: DiffusionVL_Qwen2_5_Config) -> nn.Module:
     """
     Build MM projector matching training code's mlp2x_gelu structure.
@@ -366,10 +354,6 @@ def build_mm_projector(config: DiffusionVL_Qwen2_5_Config) -> nn.Module:
     )


-# ============================================================================
-# LLM Components (Qwen2.5 based)
-# ============================================================================
-
 class DiffusionVL_Qwen2_5_RMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         super().__init__()
@@ -589,10 +573,6 @@ class DiffusionVL_Qwen2_5_DecoderLayer(nn.Module):
         return hidden_states, attn_weights


-# ============================================================================
-# Main Model Classes
-# ============================================================================
-
 class DiffusionVL_Qwen2_5_PreTrainedModel(PreTrainedModel):
     config_class = DiffusionVL_Qwen2_5_Config
     base_model_prefix = "model"
@@ -985,7 +965,6 @@ class DiffusionVL_Qwen2_5_ForConditionalGeneration(DiffusionVL_Qwen2_5_PreTrainedModel):
         top_k: int = 0,
         top_p: float = 1.0,
         remasking_strategy: str = 'low_confidence_static',
-        use_kv_cache: bool = True,
         confidence_threshold: float = 0.85,
         **kwargs,
     ):
@@ -1027,9 +1006,9 @@ class DiffusionVL_Qwen2_5_ForConditionalGeneration(DiffusionVL_Qwen2_5_PreTrainedModel):
         prefill_blocks = prompt_len // block_size
         prefill_length = prefill_blocks * block_size

-        past_key_values = DynamicCache()
+        past_key_values = DynamicCache()

-        if
+        if prefill_length > 0:
             prefill_embeds = x_embeds[:, :prefill_length]
             prefill_mask = block_diffusion_mask[:, :, :prefill_length, :prefill_length]
             prefill_pos_ids = position_ids[:, :prefill_length]
@@ -1061,42 +1040,25 @@ class DiffusionVL_Qwen2_5_ForConditionalGeneration(DiffusionVL_Qwen2_5_PreTrainedModel):
             mask_embed_local = mask_embed.to(cur_block_embeds.device)
             is_mask = torch.all(torch.abs(cur_block_embeds - mask_embed_local) < 1e-5, dim=-1)
             if not is_mask.any():
-
-                _ = self.model(
-                    inputs_embeds=cur_block_embeds,
-                    attention_mask=cur_mask,
-                    position_ids=cur_pos_ids,
-                    past_key_values=past_key_values,
-                    use_cache=True,
-                    store_kv=True,
-                )
-                break
-
-            if use_kv_cache:
-                outputs = self.model(
-                    inputs_embeds=cur_block_embeds,
-                    attention_mask=cur_mask,
-                    position_ids=cur_pos_ids,
-                    past_key_values=past_key_values,
-                    use_cache=True,
-                    store_kv=
-                )
-                logits = self.lm_head(outputs.last_hidden_state).float()
-            else:
-                context_embeds = x_embeds[:, :block_end].clone()
-                context_embeds[:, block_start:block_end] = cur_block_embeds
-                context_mask = block_diffusion_mask[:, :, :block_end, :block_end]
-                context_pos_ids = position_ids[:, :block_end]
-
-                outputs = self.model(
-                    inputs_embeds=context_embeds,
-                    attention_mask=context_mask,
-                    position_ids=context_pos_ids,
-                    past_key_values=None,
-                    use_cache=False,
-                    store_kv=False,
-                )
-
+                _ = self.model(
+                    inputs_embeds=cur_block_embeds,
+                    attention_mask=cur_mask,
+                    position_ids=cur_pos_ids,
+                    past_key_values=past_key_values,
+                    use_cache=True,
+                    store_kv=True,
+                )
+                break
+
+            outputs = self.model(
+                inputs_embeds=cur_block_embeds,
+                attention_mask=cur_mask,
+                position_ids=cur_pos_ids,
+                past_key_values=past_key_values,
+                use_cache=True,
+                store_kv=False,
+            )
+            logits = self.lm_head(outputs.last_hidden_state).float()

             x0, x0_p = self._sample_with_temperature(logits, temperature, top_k, top_p)
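Net effect of the modeling changes: the use_kv_cache=False fallback, which re-ran the full context (context_embeds, no cache) at every denoising step, is gone, and generation always takes the cached path. Each block is denoised repeatedly with store_kv=False so probe passes never pollute the cache, then committed exactly once with store_kv=True when no mask tokens remain. A runnable toy of that control flow (StubModel and the unmasking rule are invented for illustration; only the store_kv pattern mirrors the diff):

import torch

class StubModel:
    """Stand-in for self.model: only tracks how much KV has been committed."""
    def __init__(self):
        self.cache_len = 0

    def __call__(self, block, store_kv):
        if store_kv:
            self.cache_len += block.numel()  # commit this block's KV to the cache
        return torch.randn(block.numel(), 8)  # fake hidden states

MASK = -1
model = StubModel()
blocks = [torch.full((4,), MASK), torch.full((4,), MASK)]  # two all-masked blocks

for block in blocks:
    while True:
        if (block != MASK).all():
            model(block, store_kv=True)   # block finished: commit KV, advance
            break
        _hidden = model(block, store_kv=False)  # probe against the frozen cache
        idx = (block == MASK).nonzero()[0]      # toy rule: unmask lowest index
        block[idx] = torch.randint(0, 10, (1,))

print("cached tokens:", model.cache_len)  # 8: each block committed exactly once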
processing_diffusionvl_qwen2_5.py CHANGED
@@ -36,11 +36,6 @@ from transformers import SiglipImageProcessor
 DEFAULT_IMAGE_TOKEN = "<image>"
 IMAGE_TOKEN_INDEX = -200

-
-# ============================================================================
-# Image Processing Utilities (matching training code mm_utils.py)
-# ============================================================================
-
 def select_best_resolution(original_size: Tuple[int, int], possible_resolutions: List[Tuple[int, int]]) -> Tuple[int, int]:
     """
     Selects the best resolution from a list of possible resolutions based on the original size.
@@ -264,10 +259,6 @@ def tokenizer_image_token(prompt: str, tokenizer, image_token_index: int = IMAGE_TOKEN_INDEX
     return input_ids


-# ============================================================================
-# Conversation Templates (matching training code)
-# ============================================================================
-
 class Conversation:
     """Simple conversation class matching LLaVA's conv_templates."""

@@ -312,10 +303,6 @@ CONV_QWEN_2_5 = Conversation(
 )


-# ============================================================================
-# Main Processor Class
-# ============================================================================
-
 class DiffusionVL_Qwen2_5_Processor(ProcessorMixin):
     """
     Processor for DiffusionVL-Qwen2.5 model.