xiazhi committed
Commit eb188ee · verified · 1 Parent(s): 5b4caff

Upload folder using huggingface_hub
config.json CHANGED
@@ -232,7 +232,7 @@
   "mm_vision_tower": "/data/minimax-dialogue/users/qingke/results/hf_models/siglip2-so400m-patch14-384",
   "mm_vision_tower_lr": 2e-06,
   "model_max_length": 8192,
-  "model_type": "diffusionvl_qwen2_5",
+  "model_type": "diffusionvl_qwen",
   "num_attention_heads": 28,
   "num_hidden_layers": 28,
   "num_key_value_heads": 4,
configuration_diffusionvl_qwen2_5.py CHANGED
@@ -98,13 +98,12 @@ class DiffusionVL_Qwen2_5_Config(PretrainedConfig):
         mm_hidden_size: Vision encoder hidden size for projector.
         enable_bd3lm: Whether to enable BD3LM.
         bd3lm_block_size: Block size for BD3LM.
-        bd3lm_cross_attn: Whether to use cross-attention in BD3LM.
         mask_token_id: Token ID for mask token.
         rope_theta: RoPE base period.
         sliding_window: Sliding window size for attention.
     """
 
-    model_type = "diffusionvl_qwen2_5"
+    model_type = "diffusionvl_qwen"
     sub_configs = {"vision_config": DiffusionVL_Qwen2_5_VisionConfig}
     keys_to_ignore_at_inference = ["past_key_values"]
 
@@ -131,7 +130,6 @@ class DiffusionVL_Qwen2_5_Config(PretrainedConfig):
         # BD3LM diffusion parameters
         enable_bd3lm: bool = True,
         bd3lm_block_size: int = 8,
-        bd3lm_cross_attn: bool = True,
         bd3lm_antithetic_sampling: bool = True,
         bd3lm_sampling_eps_min: float = 1e-3,
         bd3lm_sampling_eps_max: float = 1.0,
@@ -145,6 +143,10 @@ class DiffusionVL_Qwen2_5_Config(PretrainedConfig):
         use_sliding_window: bool = False,
         **kwargs,
     ):
+        # Remove text_config from kwargs to avoid GenerationConfig issues
+        # (text_config is only needed for train code, HF config uses flattened params)
+        kwargs.pop("text_config", None)
+
         # Text model configuration
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
@@ -180,7 +182,6 @@ class DiffusionVL_Qwen2_5_Config(PretrainedConfig):
         # BD3LM diffusion configuration
         self.enable_bd3lm = enable_bd3lm
         self.bd3lm_block_size = bd3lm_block_size
-        self.bd3lm_cross_attn = bd3lm_cross_attn
         self.bd3lm_antithetic_sampling = bd3lm_antithetic_sampling
         self.bd3lm_sampling_eps_min = bd3lm_sampling_eps_min
         self.bd3lm_sampling_eps_max = bd3lm_sampling_eps_max
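The added `kwargs.pop` is the usual guard against a training-only field leaking into the serialized config: `PretrainedConfig.__init__` attaches every leftover kwarg to the instance as an attribute, so a stale `text_config` dict would be written back into config.json and, per the commit's comment, can confuse GenerationConfig. A minimal sketch of the pattern, with a hypothetical `ToyConfig` standing in for `DiffusionVL_Qwen2_5_Config`:

```python
# Minimal sketch; ToyConfig and its fields are hypothetical stand-ins.
from transformers import PretrainedConfig

class ToyConfig(PretrainedConfig):
    model_type = "toy"

    def __init__(self, hidden_size: int = 64, **kwargs):
        # Drop the training-only nested dict before PretrainedConfig
        # attaches every remaining kwarg to the instance.
        kwargs.pop("text_config", None)
        self.hidden_size = hidden_size
        super().__init__(**kwargs)

# A checkpoint saved by the training code may still carry text_config;
# after the pop it is simply ignored instead of being serialized back out.
cfg = ToyConfig(hidden_size=128, text_config={"vocab_size": 151936})
assert "text_config" not in cfg.to_dict()
```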
modeling_diffusionvl_qwen2_5.py CHANGED
@@ -38,10 +38,6 @@ logger = logging.get_logger(__name__)
 IMAGE_TOKEN_INDEX = -200
 
 
-# ============================================================================
-# Image Processing Utilities (matching training code)
-# ============================================================================
-
 def select_best_resolution(original_size, possible_resolutions):
     """
     Selects the best resolution from a list of possible resolutions based on the original size.
@@ -118,10 +114,6 @@ def unpad_image(tensor, original_size):
     return unpadded_tensor
 
 
-# ============================================================================
-# Vision Encoder (SigLIP)
-# ============================================================================
-
 class SigLipVisionEmbeddings(nn.Module):
     """Patch embedding for SigLIP vision encoder."""
 
@@ -346,10 +338,6 @@ class DiffusionVL_Qwen2_5_VisionTower(nn.Module):
         return self.vision_tower(pixel_values, output_hidden_states=True)
 
 
-# ============================================================================
-# MM Projector (mlp2x_gelu - matches training code)
-# ============================================================================
-
 def build_mm_projector(config: DiffusionVL_Qwen2_5_Config) -> nn.Module:
     """
     Build MM projector matching training code's mlp2x_gelu structure.
@@ -366,10 +354,6 @@ def build_mm_projector(config: DiffusionVL_Qwen2_5_Config) -> nn.Module:
     )
 
 
-# ============================================================================
-# LLM Components (Qwen2.5 based)
-# ============================================================================
-
 class DiffusionVL_Qwen2_5_RMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         super().__init__()
@@ -589,10 +573,6 @@ class DiffusionVL_Qwen2_5_DecoderLayer(nn.Module):
         return hidden_states, attn_weights
 
 
-# ============================================================================
-# Main Model Classes
-# ============================================================================
-
 class DiffusionVL_Qwen2_5_PreTrainedModel(PreTrainedModel):
     config_class = DiffusionVL_Qwen2_5_Config
     base_model_prefix = "model"
@@ -985,7 +965,6 @@ class DiffusionVL_Qwen2_5_ForConditionalGeneration(DiffusionVL_Qwen2_5_PreTrainedModel):
         top_k: int = 0,
         top_p: float = 1.0,
         remasking_strategy: str = 'low_confidence_static',
-        use_kv_cache: bool = True,
         confidence_threshold: float = 0.85,
         **kwargs,
     ):
@@ -1027,9 +1006,9 @@ class DiffusionVL_Qwen2_5_ForConditionalGeneration(DiffusionVL_Qwen2_5_PreTrainedModel):
         prefill_blocks = prompt_len // block_size
         prefill_length = prefill_blocks * block_size
 
-        past_key_values = DynamicCache() if use_kv_cache else None
+        past_key_values = DynamicCache()
 
-        if use_kv_cache and prefill_length > 0:
+        if prefill_length > 0:
             prefill_embeds = x_embeds[:, :prefill_length]
             prefill_mask = block_diffusion_mask[:, :, :prefill_length, :prefill_length]
             prefill_pos_ids = position_ids[:, :prefill_length]
@@ -1061,42 +1040,25 @@ class DiffusionVL_Qwen2_5_ForConditionalGeneration(DiffusionVL_Qwen2_5_PreTrainedModel):
             mask_embed_local = mask_embed.to(cur_block_embeds.device)
             is_mask = torch.all(torch.abs(cur_block_embeds - mask_embed_local) < 1e-5, dim=-1)
             if not is_mask.any():
-                if use_kv_cache:
-                    _ = self.model(
-                        inputs_embeds=cur_block_embeds,
-                        attention_mask=cur_mask,
-                        position_ids=cur_pos_ids,
-                        past_key_values=past_key_values,
-                        use_cache=True,
-                        store_kv=True,
-                    )
-                break
-
-            if use_kv_cache:
-                outputs = self.model(
-                    inputs_embeds=cur_block_embeds,
-                    attention_mask=cur_mask,
-                    position_ids=cur_pos_ids,
-                    past_key_values=past_key_values,
-                    use_cache=True,
-                    store_kv=False,
-                )
-                logits = self.lm_head(outputs.last_hidden_state).float()
-            else:
-                context_embeds = x_embeds[:, :block_end].clone()
-                context_embeds[:, block_start:block_end] = cur_block_embeds
-                context_mask = block_diffusion_mask[:, :, :block_end, :block_end]
-                context_pos_ids = position_ids[:, :block_end]
-
-                outputs = self.model(
-                    inputs_embeds=context_embeds,
-                    attention_mask=context_mask,
-                    position_ids=context_pos_ids,
-                    past_key_values=None,
-                    use_cache=False,
-                    store_kv=False,
-                )
-                logits = self.lm_head(outputs.last_hidden_state[:, block_start:block_end]).float()
+                _ = self.model(
+                    inputs_embeds=cur_block_embeds,
+                    attention_mask=cur_mask,
+                    position_ids=cur_pos_ids,
+                    past_key_values=past_key_values,
+                    use_cache=True,
+                    store_kv=True,
+                )
+                break
+
+            outputs = self.model(
+                inputs_embeds=cur_block_embeds,
+                attention_mask=cur_mask,
+                position_ids=cur_pos_ids,
+                past_key_values=past_key_values,
+                use_cache=True,
+                store_kv=False,
+            )
+            logits = self.lm_head(outputs.last_hidden_state).float()
 
             x0, x0_p = self._sample_with_temperature(logits, temperature, top_k, top_p)
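With the `use_kv_cache` flag removed, decoding always runs against a DynamicCache: fully-denoised blocks are committed with `store_kv=True`, while each per-step forward over the block still being denoised passes `store_kv=False` so provisional states never enter the cache. The prefill rounds the prompt down to whole blocks (`prompt_len // block_size * block_size`, e.g. a 21-token prompt with block_size 8 prefills 16 tokens), leaving the remainder to the block loop. A stray `use_kv_cache=...` argument from an older call site is now silently absorbed by `**kwargs` rather than raising. A hypothetical call site under these assumptions (the method name `generate` and the model/input setup are assumed, not shown in this diff; keyword names come from the signature above):

```python
# Hypothetical call site after this commit. `model` and `inputs` setup is
# assumed; keyword names are taken from the signature shown in the diff.
outputs = model.generate(
    **inputs,
    temperature=0.0,
    top_k=0,
    top_p=1.0,
    remasking_strategy="low_confidence_static",
    confidence_threshold=0.85,
    # use_kv_cache=True,  # removed in this commit: caching is always on
)
```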
 
processing_diffusionvl_qwen2_5.py CHANGED
@@ -36,11 +36,6 @@ from transformers import SiglipImageProcessor
 DEFAULT_IMAGE_TOKEN = "<image>"
 IMAGE_TOKEN_INDEX = -200
 
-
-# ============================================================================
-# Image Processing Utilities (matching training code mm_utils.py)
-# ============================================================================
-
 def select_best_resolution(original_size: Tuple[int, int], possible_resolutions: List[Tuple[int, int]]) -> Tuple[int, int]:
     """
     Selects the best resolution from a list of possible resolutions based on the original size.
@@ -264,10 +259,6 @@ def tokenizer_image_token(prompt: str, tokenizer, image_token_index: int = IMAGE_TOKEN_INDEX
     return input_ids
 
 
-# ============================================================================
-# Conversation Templates (matching training code)
-# ============================================================================
-
 class Conversation:
     """Simple conversation class matching LLaVA's conv_templates."""
 
@@ -312,10 +303,6 @@ CONV_QWEN_2_5 = Conversation(
 )
 
 
-# ============================================================================
-# Main Processor Class
-# ============================================================================
-
 class DiffusionVL_Qwen2_5_Processor(ProcessorMixin):
     """
     Processor for DiffusionVL-Qwen2.5 model.