Upload folder using huggingface_hub
- config.json +1 -1
- configuration_diffusionvl_qwen2_5.py +5 -4
- modeling_diffusionvl_qwen2_5.py +15 -53
- processing_diffusionvl_qwen2_5.py +0 -13
config.json CHANGED
@@ -232,7 +232,7 @@
   "mm_vision_tower": "/data/minimax-dialogue/users/qingke/results/hf_models/siglip2-so400m-patch14-384",
   "mm_vision_tower_lr": 2e-06,
   "model_max_length": 8192,
-  "model_type": "
+  "model_type": "diffusionvl_qwen",
   "num_attention_heads": 28,
   "num_hidden_layers": 28,
   "num_key_value_heads": 4,
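The only change in this file is the model_type value. "diffusionvl_qwen" is not a built-in transformers architecture, so loading depends on the custom classes shipped alongside this config being picked up as remote code. A minimal round-trip check, assuming the config declares the usual auto_map entries (the repo id below is a placeholder, not the actual repo):

from transformers import AutoConfig

# trust_remote_code=True lets transformers import configuration_diffusionvl_qwen2_5.py
cfg = AutoConfig.from_pretrained("<org>/<repo>", trust_remote_code=True)
assert cfg.model_type == "diffusionvl_qwen"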
configuration_diffusionvl_qwen2_5.py CHANGED
@@ -98,13 +98,12 @@ class DiffusionVL_Qwen2_5_Config(PretrainedConfig):
         mm_hidden_size: Vision encoder hidden size for projector.
         enable_bd3lm: Whether to enable BD3LM.
         bd3lm_block_size: Block size for BD3LM.
-        bd3lm_cross_attn: Whether to use cross-attention in BD3LM.
         mask_token_id: Token ID for mask token.
         rope_theta: RoPE base period.
         sliding_window: Sliding window size for attention.
     """

-    model_type = "
+    model_type = "diffusionvl_qwen"
     sub_configs = {"vision_config": DiffusionVL_Qwen2_5_VisionConfig}
     keys_to_ignore_at_inference = ["past_key_values"]

@@ -131,7 +130,6 @@ class DiffusionVL_Qwen2_5_Config(PretrainedConfig):
         # BD3LM diffusion parameters
         enable_bd3lm: bool = True,
         bd3lm_block_size: int = 8,
-        bd3lm_cross_attn: bool = True,
         bd3lm_antithetic_sampling: bool = True,
         bd3lm_sampling_eps_min: float = 1e-3,
         bd3lm_sampling_eps_max: float = 1.0,
@@ -145,6 +143,10 @@ class DiffusionVL_Qwen2_5_Config(PretrainedConfig):
         use_sliding_window: bool = False,
         **kwargs,
     ):
+        # Remove text_config from kwargs to avoid GenerationConfig issues
+        # (text_config is only needed for train code, HF config uses flattened params)
+        kwargs.pop("text_config", None)
+
         # Text model configuration
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
@@ -180,7 +182,6 @@ class DiffusionVL_Qwen2_5_Config(PretrainedConfig):
         # BD3LM diffusion configuration
         self.enable_bd3lm = enable_bd3lm
         self.bd3lm_block_size = bd3lm_block_size
-        self.bd3lm_cross_attn = bd3lm_cross_attn
         self.bd3lm_antithetic_sampling = bd3lm_antithetic_sampling
         self.bd3lm_sampling_eps_min = bd3lm_sampling_eps_min
         self.bd3lm_sampling_eps_max = bd3lm_sampling_eps_max
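The constructor addition is the substantive fix here: a checkpoint saved by the training code can carry a nested text_config dict, and anything left in **kwargs is stored as an attribute by PretrainedConfig.__init__, where it can later interfere with GenerationConfig handling. A standalone toy (not the real class) showing what the guard prevents:

from transformers import PretrainedConfig

class ToyConfig(PretrainedConfig):
    model_type = "toy"

    def __init__(self, hidden_size=1024, **kwargs):
        kwargs.pop("text_config", None)  # same guard as in the commit
        self.hidden_size = hidden_size
        super().__init__(**kwargs)

# A config dict from the training code may still contain the nested text_config;
# with the pop in place it never becomes an attribute of the HF config object.
cfg = ToyConfig(hidden_size=2048, text_config={"hidden_size": 2048})
assert not hasattr(cfg, "text_config")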
modeling_diffusionvl_qwen2_5.py CHANGED
@@ -38,10 +38,6 @@ logger = logging.get_logger(__name__)
 IMAGE_TOKEN_INDEX = -200


-# ============================================================================
-# Image Processing Utilities (matching training code)
-# ============================================================================
-
 def select_best_resolution(original_size, possible_resolutions):
     """
     Selects the best resolution from a list of possible resolutions based on the original size.
@@ -118,10 +114,6 @@ def unpad_image(tensor, original_size):
     return unpadded_tensor


-# ============================================================================
-# Vision Encoder (SigLIP)
-# ============================================================================
-
 class SigLipVisionEmbeddings(nn.Module):
     """Patch embedding for SigLIP vision encoder."""

@@ -346,10 +338,6 @@ class DiffusionVL_Qwen2_5_VisionTower(nn.Module):
         return self.vision_tower(pixel_values, output_hidden_states=True)


-# ============================================================================
-# MM Projector (mlp2x_gelu - matches training code)
-# ============================================================================
-
 def build_mm_projector(config: DiffusionVL_Qwen2_5_Config) -> nn.Module:
     """
     Build MM projector matching training code's mlp2x_gelu structure.
@@ -366,10 +354,6 @@ def build_mm_projector(config: DiffusionVL_Qwen2_5_Config) -> nn.Module:
     )


-# ============================================================================
-# LLM Components (Qwen2.5 based)
-# ============================================================================
-
 class DiffusionVL_Qwen2_5_RMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         super().__init__()
@@ -589,10 +573,6 @@ class DiffusionVL_Qwen2_5_DecoderLayer(nn.Module):
         return hidden_states, attn_weights


-# ============================================================================
-# Main Model Classes
-# ============================================================================
-
 class DiffusionVL_Qwen2_5_PreTrainedModel(PreTrainedModel):
     config_class = DiffusionVL_Qwen2_5_Config
     base_model_prefix = "model"
@@ -985,7 +965,6 @@ class DiffusionVL_Qwen2_5_ForConditionalGeneration(DiffusionVL_Qwen2_5_PreTrainedModel):
         top_k: int = 0,
         top_p: float = 1.0,
         remasking_strategy: str = 'low_confidence_static',
-        use_kv_cache: bool = True,
         confidence_threshold: float = 0.85,
         **kwargs,
     ):
@@ -1027,9 +1006,9 @@ class DiffusionVL_Qwen2_5_ForConditionalGeneration(DiffusionVL_Qwen2_5_PreTrainedModel):
         prefill_blocks = prompt_len // block_size
         prefill_length = prefill_blocks * block_size

-        past_key_values = DynamicCache()
+        past_key_values = DynamicCache()

-        if
+        if prefill_length > 0:
             prefill_embeds = x_embeds[:, :prefill_length]
             prefill_mask = block_diffusion_mask[:, :, :prefill_length, :prefill_length]
             prefill_pos_ids = position_ids[:, :prefill_length]
@@ -1061,42 +1040,25 @@ class DiffusionVL_Qwen2_5_ForConditionalGeneration(DiffusionVL_Qwen2_5_PreTrainedModel):
             mask_embed_local = mask_embed.to(cur_block_embeds.device)
             is_mask = torch.all(torch.abs(cur_block_embeds - mask_embed_local) < 1e-5, dim=-1)
             if not is_mask.any():
-
-                _ = self.model(
-                    inputs_embeds=cur_block_embeds,
-                    attention_mask=cur_mask,
-                    position_ids=cur_pos_ids,
-                    past_key_values=past_key_values,
-                    use_cache=True,
-                    store_kv=True,
-                )
-                break
-
-            if use_kv_cache:
-                outputs = self.model(
-                    inputs_embeds=cur_block_embeds,
-                    attention_mask=cur_mask,
-                    position_ids=cur_pos_ids,
-                    past_key_values=past_key_values,
-                    use_cache=True,
-                    store_kv=
-                )
-                logits = self.lm_head(outputs.last_hidden_state).float()
-            else:
-                context_embeds = x_embeds[:, :block_end].clone()
-                context_embeds[:, block_start:block_end] = cur_block_embeds
-                context_mask = block_diffusion_mask[:, :, :block_end, :block_end]
-                context_pos_ids = position_ids[:, :block_end]
-
-                outputs = self.model(
-                    inputs_embeds=context_embeds,
-                    attention_mask=context_mask,
-                    position_ids=context_pos_ids,
-                    past_key_values=None,
-                    use_cache=False,
-                    store_kv=False,
-                )
-
+                _ = self.model(
+                    inputs_embeds=cur_block_embeds,
+                    attention_mask=cur_mask,
+                    position_ids=cur_pos_ids,
+                    past_key_values=past_key_values,
+                    use_cache=True,
+                    store_kv=True,
+                )
+                break
+
+            outputs = self.model(
+                inputs_embeds=cur_block_embeds,
+                attention_mask=cur_mask,
+                position_ids=cur_pos_ids,
+                past_key_values=past_key_values,
+                use_cache=True,
+                store_kv=False,
+            )
+            logits = self.lm_head(outputs.last_hidden_state).float()

             x0, x0_p = self._sample_with_temperature(logits, temperature, top_k, top_p)
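Net effect of the modeling changes: the use_kv_cache=False fallback, which re-ran the full context (context_embeds, no cache) at every denoising step, is gone, and generation always takes the cached path. Each block is denoised repeatedly with store_kv=False so probe passes never pollute the cache, then committed exactly once with store_kv=True when no mask tokens remain. A runnable toy of that control flow (StubModel and the unmasking rule are invented for illustration; only the store_kv pattern mirrors the diff):

import torch

class StubModel:
    """Stand-in for self.model: only tracks how much KV has been committed."""
    def __init__(self):
        self.cache_len = 0

    def __call__(self, block, store_kv):
        if store_kv:
            self.cache_len += block.numel()  # commit this block's KV to the cache
        return torch.randn(block.numel(), 8)  # fake hidden states

MASK = -1
model = StubModel()
blocks = [torch.full((4,), MASK), torch.full((4,), MASK)]  # two all-masked blocks

for block in blocks:
    while True:
        if (block != MASK).all():
            model(block, store_kv=True)   # block finished: commit KV, advance
            break
        _hidden = model(block, store_kv=False)  # probe against the frozen cache
        idx = (block == MASK).nonzero()[0]      # toy rule: unmask lowest index
        block[idx] = torch.randint(0, 10, (1,))

print("cached tokens:", model.cache_len)  # 8: each block committed exactly once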
processing_diffusionvl_qwen2_5.py CHANGED
@@ -36,11 +36,6 @@ from transformers import SiglipImageProcessor
 DEFAULT_IMAGE_TOKEN = "<image>"
 IMAGE_TOKEN_INDEX = -200

-
-# ============================================================================
-# Image Processing Utilities (matching training code mm_utils.py)
-# ============================================================================
-
 def select_best_resolution(original_size: Tuple[int, int], possible_resolutions: List[Tuple[int, int]]) -> Tuple[int, int]:
     """
     Selects the best resolution from a list of possible resolutions based on the original size.
@@ -264,10 +259,6 @@ def tokenizer_image_token(prompt: str, tokenizer, image_token_index: int = IMAGE_TOKEN_INDEX
     return input_ids


-# ============================================================================
-# Conversation Templates (matching training code)
-# ============================================================================
-
 class Conversation:
     """Simple conversation class matching LLaVA's conv_templates."""

@@ -312,10 +303,6 @@ CONV_QWEN_2_5 = Conversation(
 )


-# ============================================================================
-# Main Processor Class
-# ============================================================================
-
 class DiffusionVL_Qwen2_5_Processor(ProcessorMixin):
     """
     Processor for DiffusionVL-Qwen2.5 model.