Clean up inference config: remove training-only flags, change the default bd_size from 8 to 32, and set dtype to bfloat16

Browse files

Files changed (3)

config.json +1 -16
configuration.py +1 -13
modeling.py +162 -474

config.json (changed)

@@ -1,6 +1,4 @@
 {
-  "always_mask_im_end": true,
-  "anneal_block_size": true,
   "architectures": [
     "Fast_dVLMForConditionalGeneration"
   ],
@@ -11,14 +9,8 @@
     "AutoModelForCausalLM": "modeling.Fast_dVLMForConditionalGeneration"
   },
   "bd_size": 32,
-  "block_causal_no_dynamic": false,
-  "complementary_mask": true,
   "dtype": "bfloat16",
-  "enable_efficient_vision_embed": false,
-  "entropy_loss": false,
-  "entropy_loss_weight": 1.0,
   "eos_token_id": 151645,
-  "flexible_bd_size": false,
   "hidden_act": "silu",
   "hidden_size": 2048,
   "image_token_id": 151655,
@@ -55,12 +47,7 @@
       "AutoModelForCausalLM": "modeling.Fast_dVLMForConditionalGeneration"
     },
     "bd_size": 8,
-    "block_causal_no_dynamic": false,
     "bos_token_id": 151643,
-    "complementary_mask": true,
-    "dtype": "float32",
-    "entropy_loss": false,
-    "entropy_loss_weight": 1.0,
     "eos_token_id": 151645,
     "hidden_act": "silu",
     "hidden_size": 2048,
@@ -125,7 +112,6 @@
     "rope_theta": 1000000.0,
     "sliding_window": null,
     "tie_word_embeddings": true,
-    "use_block_causal_mask": false,
     "use_cache": true,
     "use_sliding_window": false,
     "video_token_id": null,
@@ -135,13 +121,12 @@
     "vocab_size": 151936
   },
   "transformers_version": "4.57.1",
-  "use_block_causal_mask": true,
   "use_cache": true,
   "use_sliding_window": false,
   "video_token_id": 151656,
   "vision_config": {
     "depth": 32,
-    "dtype": "float32",
     "fullatt_block_indexes": [
       7,
       15,

 {
   "architectures": [
     "Fast_dVLMForConditionalGeneration"
   ],
     "AutoModelForCausalLM": "modeling.Fast_dVLMForConditionalGeneration"
   },
   "bd_size": 32,
   "dtype": "bfloat16",
   "eos_token_id": 151645,
   "hidden_act": "silu",
   "hidden_size": 2048,
   "image_token_id": 151655,
       "AutoModelForCausalLM": "modeling.Fast_dVLMForConditionalGeneration"
     },
     "bd_size": 8,
     "bos_token_id": 151643,
     "eos_token_id": 151645,
     "hidden_act": "silu",
     "hidden_size": 2048,
     "rope_theta": 1000000.0,
     "sliding_window": null,
     "tie_word_embeddings": true,
     "use_cache": true,
     "use_sliding_window": false,
     "video_token_id": null,
     "vocab_size": 151936
   },
   "transformers_version": "4.57.1",
   "use_cache": true,
   "use_sliding_window": false,
   "video_token_id": 151656,
   "vision_config": {
     "depth": 32,
+    "dtype": "bfloat16",
     "fullatt_block_indexes": [
       7,
       15,

configuration.py (changed)

@@ -87,15 +87,10 @@ class Fast_dVLMTextConfig(PretrainedConfig):
         rope_scaling=None,
         image_token_id=None,
         video_token_id=None,
-        bd_size=8,
         self_spec_inference_mode=None,
         block_length=None,
-        use_block_causal_mask=False,
-        complementary_mask=True,
         minimum_noise_level=1e-3,
-        entropy_loss=False,
-        entropy_loss_weight=1.0,
-        block_causal_no_dynamic=False,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -122,12 +117,7 @@ class Fast_dVLMTextConfig(PretrainedConfig):
         self.rope_scaling = rope_scaling
         self.bd_size = bd_size
         self.layer_types = layer_types
-        self.use_block_causal_mask = use_block_causal_mask
-        self.complementary_mask = complementary_mask
         self.minimum_noise_level = minimum_noise_level
-        self.entropy_loss = entropy_loss
-        self.entropy_loss_weight = entropy_loss_weight
-        self.block_causal_no_dynamic = block_causal_no_dynamic
         self.self_spec_inference_mode = self_spec_inference_mode
         self.block_length = block_length
         if self.layer_types is None:
@@ -166,7 +156,6 @@ class Fast_dVLMConfig(PretrainedConfig):
         vision_config=None,
         image_token_id=151655,
         video_token_id=151656,
-        enable_efficient_vision_embed=False,
         **kwargs,
     ):
         if isinstance(vision_config, dict):
@@ -182,7 +171,6 @@ class Fast_dVLMConfig(PretrainedConfig):
         self.image_token_id = image_token_id
         self.video_token_id = video_token_id
-        self.enable_efficient_vision_embed = enable_efficient_vision_embed
         super().__init__(**kwargs)

         rope_scaling=None,
         image_token_id=None,
         video_token_id=None,
+        bd_size=32,
         self_spec_inference_mode=None,
         block_length=None,
         minimum_noise_level=1e-3,
         **kwargs,
     ):
         self.vocab_size = vocab_size
         self.rope_scaling = rope_scaling
         self.bd_size = bd_size
         self.layer_types = layer_types
         self.minimum_noise_level = minimum_noise_level
         self.self_spec_inference_mode = self_spec_inference_mode
         self.block_length = block_length
         if self.layer_types is None:
         vision_config=None,
         image_token_id=151655,
         video_token_id=151656,
         **kwargs,
     ):
         if isinstance(vision_config, dict):
         self.image_token_id = image_token_id
         self.video_token_id = video_token_id
         super().__init__(**kwargs)

modeling.py (changed)

@@ -23,94 +23,14 @@ from .configuration import Fast_dVLMConfig, Fast_dVLMTextConfig, Fast_dVLMVision
 from torch.nn.attention.flex_attention import flex_attention, create_block_mask
 from functools import partial
-import random
 import math
 logger = logging.get_logger(__name__)
-# @torch.compile(fullgraph=True, mode="max-autotune-no-cudagraphs")
-# @torch.compile()
 def fused_flex_attention(q, k, v, mask=None):
     return flex_attention(q, k, v, block_mask=mask, enable_gqa=True)
-def block_diff_mask(b, h, q_idx, kv_idx, block_size=None, n=None):
-    """
-    Constructs the specialized block diffusion attention mask for training
-    composed of three masks:
-    - **Block Diagonal Mask (M_BD)**: Self-attention within noised blocks
-    - **Offset Block Causal Mask (M_OBC)**: Cross-attention for conditional context
-    - **Block Causal Mask (M_BC)**: Attention to update x0
-    Args:
-        b, h: Batch and head indices (ignored for mask logic).
-        q_idx, kv_idx: Query and Key indices.
-        seq_len: Total sequence length.
-        block_size: Defines the block structure.
-    Returns:
-        A boolean attention mask.
-    """
-    # Indicate whether token belongs to xt or x0
-    x0_flag_q = (q_idx >= n)
-    x0_flag_kv = (kv_idx >= n)
-    # Compute block indices
-    block_q = torch.where(x0_flag_q == 1,
-                        (q_idx - n) // block_size,
-                        q_idx // block_size)
-    block_kv = torch.where(x0_flag_kv == 1,
-                        (kv_idx - n) // block_size,
-                        kv_idx // block_size)
-    # **1. Block Diagonal Mask (M_BD) **
-    block_diagonal = (block_q == block_kv) & (x0_flag_q == x0_flag_kv)
-    # **2. Offset Block-Causal Mask (M_OBC) **
-    offset_block_causal = (
-    (block_q > block_kv)
-    & (x0_flag_kv == 1)
-    & (x0_flag_q == 0)
-    )
-    # **3. Block-Causal Mask (M_BC) **
-    block_causal = (block_q >= block_kv) & (x0_flag_kv == 1) & (x0_flag_q == 1)
-    # **4. Combine Masks **
-    return block_diagonal | offset_block_causal | block_causal
-def block_causal_mask(b, h, q_idx, kv_idx, block_size=None, n=None):
-    # Indicate whether token belongs to xt or x0
-    x0_flag_q = (q_idx >= n)
-    x0_flag_kv = (kv_idx >= n)
-    # Compute block indices
-    block_q = torch.where(x0_flag_q == 1,
-                        (q_idx - n) // block_size,
-                        q_idx // block_size)
-    block_kv = torch.where(x0_flag_kv == 1,
-                        (kv_idx - n) // block_size,
-                        kv_idx // block_size)
-    # **1. Block Diagonal Mask (M_BD) **
-    block_diagonal = (block_q == block_kv) & (x0_flag_q == x0_flag_kv)
-    # **2. Offset Block-Causal Mask (M_OBC) **
-    offset_block_causal = (
-    (block_q > block_kv)
-    & (x0_flag_kv == 1)
-    & (x0_flag_q == 0)
-    )
-    # **3. Block-Causal Mask (M_BC) **
-    block_causal = (q_idx >= kv_idx) & (x0_flag_kv == 1) & (x0_flag_q == 1)
-    # **4. Combine Masks **
-    return block_diagonal | offset_block_causal | block_causal
 def hybrid_block_causal_mask_multiturn(b, h, q_idx, kv_idx, response_block_idx=None, turn_idx=None, n=None):
     """
     Multi-turn hybrid mask: Prompt uses causal, Response uses block causal.
@@ -145,29 +65,20 @@ def hybrid_block_causal_mask_multiturn(b, h, q_idx, kv_idx, response_block_idx=N
     is_prompt_q = (block_q < 0)
     is_prompt_kv = (block_kv < 0)
-    # x_t region rules:
-    # 1. Can see all previous turns: turn_q > turn_kv
-    # 2. Within same turn, prompt: causal (turn same + is prompt + pos satisfies causal)
-    # 3. Within same turn, response: sees all prompt in same turn + block causal for response
-    # xt_same_turn_prompt_causal = ~x0_flag_q & ~x0_flag_kv & (turn_q == turn_kv) & is_prompt_q & (pos_q >= pos_kv)
-    # xt_same_turn_response = ~x0_flag_q & ~x0_flag_kv & (turn_q == turn_kv) & ~is_prompt_q & (
-    #     ~is_prompt_kv
-    # )
-    block_diagonal = ~x0_flag_q & ~x0_flag_kv & (turn_q == turn_kv)
-    # **2. Offset Block-Causal Mask (M_OBC) **
     offset_block_causal = (
-        (turn_q > turn_kv)
         & (x0_flag_kv == 1)
         & (x0_flag_q == 0)
     )
-    # x_0 region: standard causal
     x0_causal = x0_flag_q & x0_flag_kv & (pos_q >= pos_kv)
-    return (block_diagonal |
-            offset_block_causal |
-            x0_causal)
 def eval_block_diff_mask(q_idx, kv_idx, block_size=None):
@@ -820,7 +731,6 @@ class Fast_dVLMAttention(nn.Module):
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
             if update_kv_cache:
                 key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
-            # elif len(past_key_values) > self.layer_idx:
             elif len(past_key_values) > self.layer_idx and past_key_values[self.layer_idx][0] is not None:
                 key_states = torch.cat((past_key_values[self.layer_idx][0], key_states), dim=-2)
                 value_states = torch.cat((past_key_values[self.layer_idx][1], value_states), dim=-2)
@@ -964,7 +874,12 @@ class Fast_dVLMTextModel(Fast_dVLMPreTrainedModel):
         # Initialize weights and apply final processing
         self.post_init()
     @auto_docstring
     def forward(
         self,
@@ -1036,34 +951,11 @@ class Fast_dVLMTextModel(Fast_dVLMPreTrainedModel):
             text_position_ids = position_ids[0]
             position_ids = position_ids[1:]
         else:
-            # If inputs are not packed (usual 3D positions), do not prepare mask from position_ids
             text_position_ids = None
-        # It may already have been prepared by e.g. `generate`
-        # if not isinstance(causal_mask_mapping := attention_mask, dict):
-        #     # Prepare mask arguments
-        #     mask_kwargs = {
-        #         "config": self.config,
-        #         "input_embeds": inputs_embeds,
-        #         "attention_mask": attention_mask,
-        #         "cache_position": cache_position,
-        #         "past_key_values": past_key_values,
-        #         "position_ids": text_position_ids,
-        #     }
-        #     # Create the masks
-        #     causal_mask_mapping = {
-        #         "full_attention": create_causal_mask(**mask_kwargs),
-        #     }
-        #     # The sliding window alternating layers are not always activated depending on the config
-        #     if self.has_sliding_layers:
-        #         causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs)
         hidden_states = inputs_embeds
-        # create position embeddings to be shared across the decoder layers
         position_embeddings = self.rotary_emb(hidden_states, position_ids)
-        # decoder layers
         all_hidden_states = () if output_hidden_states else None
         all_self_attns = () if output_attentions else None
@@ -1091,7 +983,6 @@ class Fast_dVLMTextModel(Fast_dVLMPreTrainedModel):
         hidden_states = self.norm(hidden_states)
-        # add hidden states from the last decoder layer
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
@@ -1121,7 +1012,6 @@ class Fast_dVLMModel(Fast_dVLMPreTrainedModel):
         self.visual = Fast_dVLMVisionTransformerPretrainedModel._from_config(config.vision_config)
         self.language_model = Fast_dVLMTextModel._from_config(config.text_config)
         self.rope_deltas = None  # cache rope_deltas here
-        self.use_block_causal_mask = config.use_block_causal_mask
         # Initialize weights and apply final processing
         self.post_init()
@@ -1307,13 +1197,6 @@ class Fast_dVLMModel(Fast_dVLMPreTrainedModel):
             mrope_position_deltas = torch.tensor(mrope_position_deltas).unsqueeze(1).to(device=input_ids.device)
             return position_ids, mrope_position_deltas
         else:
-            # if attention_mask is not None:
-            #     position_ids = attention_mask.long().cumsum(-1) - 1
-            #     position_ids.masked_fill_(attention_mask == 0, 1)
-            #     position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device)
-            #     max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0]
-            #     mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
-            # else:
             if self.training:
                 position_ids = (
                     torch.arange(input_ids.shape[1] // 2, device=input_ids.device)
@@ -1415,16 +1298,16 @@ class Fast_dVLMModel(Fast_dVLMPreTrainedModel):
         return special_image_mask, special_video_mask
-    def eval_mask(self, seqlen, block_size, cache_seq_len, update_kv_cache=False, use_block_causal_mask=False):
         q_indices = torch.arange(seqlen, device=self.device) + cache_seq_len
         k_indices = torch.arange(seqlen + cache_seq_len, device=self.device)
-        if use_block_causal_mask and update_kv_cache:
             mask = eval_causal_mask(q_indices[:, None], k_indices[None, :])
         else:
             mask = eval_block_diff_mask(
-                q_idx=q_indices[:, None],
-                kv_idx=k_indices[None, :],
-                block_size=block_size
             )
         return mask
@@ -1536,8 +1419,6 @@ class Fast_dVLMModel(Fast_dVLMPreTrainedModel):
                 position_ids = torch.arange(seq_length, device=inputs_embeds.device)
                 position_ids = position_ids.view(1, 1, -1).expand(3, batch_size, -1)
-                # if cache_position is not None:
-                #     delta = (cache_position[0] + self.rope_deltas).to(inputs_embeds.device)
                 if past_key_values is not None:
                     delta = (past_key_values.get_seq_length() + self.rope_deltas).to(inputs_embeds.device)
                 else:
@@ -1547,7 +1428,7 @@ class Fast_dVLMModel(Fast_dVLMPreTrainedModel):
         position_ids = position_ids.to(inputs_embeds.device)
         if not self.training:
-            attention_mask = self.eval_mask(inputs_embeds.shape[1], self.bd_size if bd_size is None else bd_size, 0 if past_key_values is None else past_key_values.get_seq_length(), update_kv_cache=update_kv_cache, use_block_causal_mask=self.use_block_causal_mask).to(inputs_embeds.device)
         outputs = self.language_model(
             input_ids=None,
@@ -1620,19 +1501,9 @@ class Fast_dVLMForConditionalGeneration(Fast_dVLMPreTrainedModel, GenerationMixi
         self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
         self.bd_size = config.bd_size
         self.model.bd_size = self.bd_size
-        self.complementary_mask = getattr(config, 'complementary_mask', False)
-        self.always_mask_im_end = getattr(config, 'always_mask_im_end', False)
-        self.flexible_bd_size = getattr(config, 'flexible_bd_size', False)
-        self.use_block_causal_mask = getattr(config, 'use_block_causal_mask', False)
-        self.anneal_block_size = getattr(config, 'anneal_block_size', False)
-        self.enable_efficient_vision_embed = getattr(config, 'enable_efficient_vision_embed', False)
         self.minimum_noise_level = getattr(config, 'minimum_noise_level', 0.0)
-        self.entropy_loss = getattr(config, 'entropy_loss', False)
-        self.entropy_loss_weight = getattr(config, 'entropy_loss_weight', 1.0)
-        self.block_causal_no_dynamic = getattr(config, 'block_causal_no_dynamic', False)
         self.im_end_token_id = 151645  # <|im_end|> token id
-        # self.max_context_length = 4096
         # Vision-to-text aligner (if vision output dim != text hidden dim)
         vision_out_dim = config.vision_config.out_hidden_size
         text_hidden = config.text_config.hidden_size
@@ -1675,30 +1546,6 @@ class Fast_dVLMForConditionalGeneration(Fast_dVLMPreTrainedModel, GenerationMixi
     def visual(self):
         return self.model.visual
-    def gen_mask(self, seqlen, block_size, B, H):
-        # ================== 修改开始 ==================
-        # flex_attention 要求闭包捕获的变量必须是 Tensor
-        # 将 int 转换为 Tensor，并放在对应的设备上
-        block_size_t = torch.tensor(block_size, device=self.device, dtype=torch.int32)
-        n_t = torch.tensor(seqlen, device=self.device, dtype=torch.int32)
-        mask = create_block_mask(
-            # 这里将原来的 block_size=block_size 改为传入 Tensor
-            partial(block_diff_mask, block_size=block_size_t, n=n_t),
-            B=B, H=H, Q_LEN=seqlen*2, KV_LEN=seqlen*2
-        )
-        # ================== 修改结束 ==================
-        return mask
-    def gen_block_causal_mask(self, seqlen, block_size, B, H):
-        block_size_t = torch.tensor(block_size, device=self.device, dtype=torch.int32)
-        n_t = torch.tensor(seqlen, device=self.device, dtype=torch.int32)
-        mask = create_block_mask(
-            partial(block_causal_mask, block_size=block_size_t, n=n_t),
-            B=B, H=H, Q_LEN=seqlen*2, KV_LEN=seqlen*2
-        )
-        return mask
     def compute_response_block_idx(self, labels, block_size):
         """
         Compute block index and turn index for each position.
@@ -1767,36 +1614,6 @@ class Fast_dVLMForConditionalGeneration(Fast_dVLMPreTrainedModel, GenerationMixi
         )
         return mask
-    def compute_entropy_loss(self, logits, labels, num_items_in_batch=None):
-        """Compute entropy loss with optional global normalization.
-        Args:
-            logits: Model logits
-            labels: Ground truth labels (-100 for ignored tokens)
-            num_items_in_batch: Global number of non-ignored tokens for normalization.
-                               If provided, uses sum/num_items_in_batch for global norm.
-                               If None, uses mean() for micro-batch norm.
-        """
-        non_ignore_mask = labels != -100
-        logits = logits[non_ignore_mask]
-        labels = labels[non_ignore_mask]
-        correct_mask = logits.argmax(dim=-1) == labels
-        compute_logits = logits[correct_mask]
-        if correct_mask.sum() == 0:
-            return torch.tensor(0.0, device=logits.device)
-        p = F.softmax(compute_logits, dim=-1)
-        log_p = F.log_softmax(compute_logits, dim=-1)
-        entropy = -torch.sum(p * log_p, dim=-1)
-        if num_items_in_batch is not None:
-            # Global normalization: use same denominator as cross entropy loss
-            return entropy.sum() / num_items_in_batch
-        else:
-            return entropy.mean()
     @can_return_tuple
     @auto_docstring
     def forward(
@@ -1839,34 +1656,22 @@ class Fast_dVLMForConditionalGeneration(Fast_dVLMPreTrainedModel, GenerationMixi
         eval_bd_size (`int`, *optional*):
             Block diffusion size to use during evaluation. Overrides the model default when set.
         """
-        # input_ids = torch.tensor([[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]]).to(input_ids.device, dtype=input_ids.dtype)
-        # labels = torch.tensor([[-100,-100,3,4,5,6,-100,-100,-100,-100,11,12,13,14,15]]).to(labels.device, dtype=labels.dtype)
-        # pixel_values = None
-        # pixel_values_videos = None
-        # self.bd_size = 2
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
         if self.training:
-            if self.anneal_block_size:
-                # Get update_ratio from kwargs (passed by trainer)
-                update_ratio = kwargs.get('update_ratio', 1.0)
-                # Compute possible bd_sizes: [2, 4, 8, ..., target_bd_size]
-                max_power = int(math.log2(self.bd_size))
-                possible_bd_sizes = [2**i for i in range(2, max_power + 1)]  # Start from 4
-                # sqrt mapping: larger block sizes get more training time
-                scaled_ratio = math.sqrt(update_ratio)
-                idx = min(int(scaled_ratio * len(possible_bd_sizes)), len(possible_bd_sizes) - 1)
-                bd_size = possible_bd_sizes[idx]
-            elif self.flexible_bd_size:
-                max_power = int(math.log2(self.bd_size))
-                possible_bd_sizes = [2**i for i in range(max_power + 1)]
-                bd_size = random.choice(possible_bd_sizes)
-            else:
-                bd_size = self.bd_size
-            if pixel_values is None and pixel_values_videos is None: # only train on text
                 batch_size, seq_len = input_ids.shape
                 original_labels = labels.clone()
@@ -1877,79 +1682,57 @@ class Fast_dVLMForConditionalGeneration(Fast_dVLMPreTrainedModel, GenerationMixi
                 response_mask = (labels != -100)  # [B, seq_len]
                 eps = self.minimum_noise_level
-                if self.use_block_causal_mask and not self.block_causal_no_dynamic:
-                    response_block_idx, turn_idx, n_blocks = self.compute_response_block_idx(labels, bd_size)
-                    # Sample t for each block: [n_blocks]
-                    # random sample t for each block from [self.minimum_noise_level, 1]
-                    t = torch.rand((n_blocks,), device=input_ids.device)
-                    p_mask_per_block = (1 - eps) * t + eps
-                    # Create mask_indices: [B, seq_len]
-                    mask_indices = torch.zeros_like(labels, dtype=torch.bool)
-                    for i in range(seq_len):
-                        block_i = response_block_idx[i].item()
-                        if block_i >= 0:  # response token
-                            mask_indices[:, i] = torch.rand((batch_size,), device=input_ids.device) < p_mask_per_block[block_i]
-                else:
-                    input_ids = input_ids.reshape(input_ids.shape[0] * input_ids.shape[1] // bd_size, bd_size)
-                    b, l = input_ids.shape
-                    t = torch.rand((b,), device=input_ids.device)
-                    p_mask = (1 - eps) * t + eps
-                    p_mask = p_mask[:, None].repeat(1, l)
-                    mask_indices = torch.rand((b, l), device=input_ids.device) < p_mask
-                    mask_indices = mask_indices.reshape(labels.shape) & response_mask
-                    input_ids = input_ids.reshape(labels.shape)
-                # Always mask <|im_end|> in response
-                if self.always_mask_im_end:
-                    im_end_mask = (input_ids == self.im_end_token_id) & response_mask
-                    mask_indices = mask_indices | im_end_mask
-                # Apply mask only to response
                 noisy_input_ids = input_ids.clone()
                 noisy_input_ids[mask_indices] = mask_id
-                # Update labels: only predict masked response tokens
                 labels = labels.clone()
                 labels[~mask_indices] = -100
-                # Concatenate [noisy | clean]
                 input_ids = torch.cat([noisy_input_ids, original_input_ids], dim=1)
-                # Complementary version
-                if self.complementary_mask:
-                    complementary_mask_indices = response_mask & ~mask_indices
-                    if self.always_mask_im_end:
-                        im_end_mask = (original_input_ids == self.im_end_token_id) & response_mask
-                        complementary_mask_indices = complementary_mask_indices | im_end_mask
-                    complementary_noisy_input_ids = original_input_ids.clone()
-                    complementary_noisy_input_ids[complementary_mask_indices] = mask_id
-                    complementary_labels = original_labels.clone()
-                    complementary_labels[~complementary_mask_indices] = -100
-                    complementary_input_ids = torch.cat([complementary_noisy_input_ids, original_input_ids], dim=1)
-                    input_ids = torch.cat([input_ids, complementary_input_ids], dim=0)
-                    labels = torch.cat([labels, complementary_labels], dim=0)
-                if self.use_block_causal_mask:
-                    if self.block_causal_no_dynamic:
-                        attention_mask = self.gen_block_causal_mask(seq_len, bd_size, input_ids.shape[0], self.config.num_attention_heads)
-                    else:
-                        attention_mask = self.gen_hybrid_block_causal_mask(seq_len, response_block_idx, turn_idx, input_ids.shape[0], self.config.num_attention_heads)
-                else:
-                    attention_mask = self.gen_mask(seq_len, bd_size, input_ids.shape[0], self.config.num_attention_heads)
-            else:  # 多模态 block diffusion
-                # Phase A: Embed + masked scatter vision
                 if inputs_embeds is None:
                     inputs_embeds = self.model.get_input_embeddings()(input_ids)
                 if pixel_values is not None:
                     image_embeds = self.model.get_image_features(pixel_values, image_grid_thw)
                     image_embeds = torch.cat(image_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
@@ -1959,7 +1742,7 @@ class Fast_dVLMForConditionalGeneration(Fast_dVLMPreTrainedModel, GenerationMixi
                         input_ids, inputs_embeds=inputs_embeds, image_features=image_embeds
                     )
                     inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
                 if pixel_values_videos is not None:
                     video_embeds = self.model.get_video_features(pixel_values_videos, video_grid_thw)
                     video_embeds = torch.cat(video_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
@@ -1969,8 +1752,8 @@ class Fast_dVLMForConditionalGeneration(Fast_dVLMPreTrainedModel, GenerationMixi
                         input_ids, inputs_embeds=inputs_embeds, video_features=video_embeds
                     )
                     inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
-                # Phase B: 生成 3D position_ids（在扩倍前，基于原长）
                 if position_ids is None:
                     position_ids, rope_deltas = self.model.get_rope_index(
                         input_ids=input_ids,
@@ -1979,189 +1762,107 @@ class Fast_dVLMForConditionalGeneration(Fast_dVLMPreTrainedModel, GenerationMixi
                         second_per_grid_ts=second_per_grid_ts,
                         attention_mask=attention_mask,
                     )
-                # Phase C: Block diffusion (保护 vision token 位置)
                 batch_size = input_ids.shape[0]
                 L = input_ids.shape[1]
                 seq_len = L
-                # if L > self.max_context_length:
-                #     L = self.max_context_length
-                #     input_ids = input_ids[:, :self.max_context_length]
-                #     labels = labels[:, :self.max_context_length]
-                #     position_ids = position_ids[:, :self.max_context_length]
-                #     attention_mask = attention_mask[:, :self.max_context_length]
-                #     inputs_embeds = inputs_embeds[:, :self.max_context_length]
                 hidden_size = inputs_embeds.shape[-1]
                 original_labels = labels.clone()
                 original_input_ids = input_ids.clone()
                 original_embeds = inputs_embeds.clone()
-                original_position_ids = position_ids.clone()  # 保存原长 position [3, B, L]
-                # 识别 vision tokens（不加噪声）
                 image_token_id = self.config.image_token_id
                 video_token_id = self.config.video_token_id
                 vision_start_token_id = self.config.vision_start_token_id
                 vision_token_mask = (input_ids == image_token_id) | (input_ids == video_token_id) | (input_ids == vision_start_token_id)
                 vision_mask_3d = vision_token_mask.unsqueeze(-1).expand(-1, -1, hidden_size)
-                # Block diffusion with multi-turn support
-                # Each response segment has independent blocks
                 response_block_idx, turn_idx, n_blocks = self.compute_response_block_idx(labels, bd_size)
-                # Compute response block index: -1 for prompt, >=0 for response
-                # Each response segment has independent blocks
-                response_mask = (labels != -100)  # [B, seq_len]
                 eps = self.minimum_noise_level
-                if self.use_block_causal_mask and not self.block_causal_no_dynamic:
-                    response_block_idx, turn_idx, n_blocks = self.compute_response_block_idx(labels, bd_size)
-                    # Sample t for each block: [n_blocks]
-                    # random sample t for each block from [self.minimum_noise_level, 1]
-                    t = torch.rand((n_blocks,), device=input_ids.device)
-                    p_mask_per_block = (1 - eps) * t + eps
-                    # Create mask_indices: [B, seq_len]
-                    mask_indices = torch.zeros_like(labels, dtype=torch.bool)
-                    for i in range(seq_len):
-                        block_i = response_block_idx[i].item()
-                        if block_i >= 0:  # response token
-                            mask_indices[:, i] = torch.rand((batch_size,), device=input_ids.device) < p_mask_per_block[block_i]
-                else:
-                    input_ids = input_ids.reshape(input_ids.shape[0] * input_ids.shape[1] // bd_size, bd_size)
-                    b, l = input_ids.shape
-                    t = torch.rand((b,), device=input_ids.device)
-                    p_mask = (1 - eps) * t + eps
-                    p_mask = p_mask[:, None].repeat(1, l)
-                    mask_indices = torch.rand((b, l), device=input_ids.device) < p_mask
-                    mask_indices = mask_indices.reshape(labels.shape) & response_mask
-                    input_ids = input_ids.reshape(labels.shape)
-                if self.always_mask_im_end:
-                    im_end_mask = (input_ids == self.im_end_token_id) & response_mask
-                    mask_indices = mask_indices | im_end_mask
                 noisy_input_ids = input_ids.clone()
                 noisy_input_ids[mask_indices] = mask_id
-                # Noisy embeds（保护 vision）
-                if self.enable_efficient_vision_embed:
-                    noisy_embeds = original_embeds.clone()
-                    text_mask_3d = mask_indices.unsqueeze(-1).expand(-1, -1, hidden_size)
-                    mask_embeds = self.model.language_model.embed_tokens(
-                        torch.full_like(input_ids, mask_id)
-                    )
-                    noisy_embeds = torch.where(text_mask_3d, mask_embeds, noisy_embeds)
-                else:
-                    noisy_embeds_raw = self.model.language_model.embed_tokens(noisy_input_ids)
-                    noisy_embeds = torch.where(vision_mask_3d, original_embeds, noisy_embeds_raw)
-                # 更新 labels
                 labels_noisy = labels.clone()
                 labels_noisy[~mask_indices] = -100
-                # 拼接 [noisy | clean]
                 input_ids_pair1 = torch.cat([noisy_input_ids, original_input_ids], dim=1)
                 embeds_pair1 = torch.cat([noisy_embeds, original_embeds], dim=1)
                 labels_pair1 = labels_noisy
-                position_ids_pair1 = original_position_ids  # [3, B, L]
-                input_ids = input_ids_pair1
-                inputs_embeds = embeds_pair1
-                labels = labels_pair1
-                position_ids = position_ids_pair1
-                # Complementary
-                if self.complementary_mask:
-                    complementary_mask_indices = response_mask & ~mask_indices
-                    if self.always_mask_im_end:
-                        im_end_mask = (original_input_ids == self.im_end_token_id) & response_mask
-                        complementary_mask_indices = complementary_mask_indices | im_end_mask
-                    complementary_noisy_input_ids = original_input_ids.clone()
-                    complementary_noisy_input_ids[complementary_mask_indices] = mask_id
-                    if self.enable_efficient_vision_embed:
-                        complementary_noisy_embeds = original_embeds.clone()
-                        text_mask_3d = complementary_mask_indices.unsqueeze(-1).expand(-1, -1, hidden_size)
-                        mask_embeds = self.model.language_model.embed_tokens(
-                            torch.full_like(original_input_ids, mask_id)
-                        )
-                        complementary_noisy_embeds = torch.where(text_mask_3d, mask_embeds, complementary_noisy_embeds)
-                    else:
-                        complementary_noisy_embeds_raw = self.model.language_model.embed_tokens(complementary_noisy_input_ids)
-                        complementary_noisy_embeds = torch.where(vision_mask_3d, original_embeds, complementary_noisy_embeds_raw)
-                    complementary_labels = original_labels.clone()
-                    complementary_labels[~complementary_mask_indices] = -100
-                    input_ids_pair2 = torch.cat([complementary_noisy_input_ids, original_input_ids], dim=1)
-                    embeds_pair2 = torch.cat([complementary_noisy_embeds, original_embeds], dim=1)
-                    labels_pair2 = complementary_labels
-                    position_ids_pair2 = original_position_ids
-                    # Batch 拼接
-                    input_ids = torch.cat([input_ids_pair1, input_ids_pair2], dim=0)
-                    inputs_embeds = torch.cat([embeds_pair1, embeds_pair2], dim=0)
-                    labels = torch.cat([labels_pair1, labels_pair2], dim=0)
-                    position_ids = torch.cat([position_ids_pair1, position_ids_pair2], dim=1)
-                if self.use_block_causal_mask:
-                    if self.block_causal_no_dynamic:
-                        attention_mask = self.gen_block_causal_mask(L, bd_size, input_ids.shape[0], self.config.num_attention_heads)
-                    else:
-                        attention_mask = self.gen_hybrid_block_causal_mask(L, response_block_idx, turn_idx, input_ids.shape[0], self.config.num_attention_heads)
-                else:
-                    attention_mask = self.gen_mask(L, bd_size, input_ids.shape[0], self.config.num_attention_heads)
-                # 清空 pixel_values（已替换）
-                pixel_values = None
-                pixel_values_videos = None
-            # Phase D: 调用内层（多模态时传 inputs_embeds，纯文本时传 input_ids）
-            if pixel_values is None and pixel_values_videos is None:
-                # 纯文本：传 input_ids（内层会 embed）
-                outputs = self.model(
-                    input_ids=input_ids,
-                    pixel_values=None,
-                    pixel_values_videos=None,
-                    image_grid_thw=None,
-                    video_grid_thw=None,
-                    position_ids=position_ids,
-                    attention_mask=attention_mask,
-                    past_key_values=past_key_values,
-                    inputs_embeds=inputs_embeds,
-                    use_cache=use_cache,
-                    output_attentions=output_attentions,
-                    output_hidden_states=output_hidden_states,
-                    return_dict=True,
-                    cache_position=cache_position,
-                    update_kv_cache=update_kv_cache,
-                    bd_size=bd_size,
-                    **kwargs,
-                )
-            else:
-                # 多模态：传 inputs_embeds（已 masked_scatter）
-                outputs = self.model.language_model(
-                    input_ids=None,
-                    position_ids=position_ids,
-                    attention_mask=attention_mask,
-                    past_key_values=past_key_values,
-                    inputs_embeds=inputs_embeds,
-                    use_cache=use_cache,
-                    output_attentions=output_attentions,
-                    output_hidden_states=output_hidden_states,
-                    return_dict=True,
-                    cache_position=cache_position,
-                    update_kv_cache=update_kv_cache,
-                    bd_size=bd_size,
-                    **kwargs,
-                )
         else:
             outputs = self.model(
@@ -2193,31 +1894,18 @@ class Fast_dVLMForConditionalGeneration(Fast_dVLMPreTrainedModel, GenerationMixi
             slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
             logits = self.lm_head(mdm_hidden_states[:, slice_indices, :])
-            if self.use_block_causal_mask:
-                new_kwargs = {
-                    'num_items_in_batch': 2*kwargs['num_items_in_batch'],
-                }
-            else:
-                new_kwargs = kwargs
             if labels is not None:
                 loss = self.loss_function(
                     logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **new_kwargs
                 ) * 0.5
-            if self.use_block_causal_mask:
-                if self.complementary_mask:
-                    causal_hidden_states = hidden_states[:hidden_states.shape[0]//2, hidden_states.shape[1]//2:, :]
-                else:
-                    causal_hidden_states = hidden_states[:, :hidden_states.shape[1]//2, :]
-                causal_logits = self.lm_head(causal_hidden_states[:, slice_indices, :])
-                loss += self.loss_function(
-                    logits=causal_logits, labels=original_labels, vocab_size=self.config.text_config.vocab_size, **new_kwargs
-                )
-            if self.entropy_loss:
-                # Use num_items_in_batch for global normalization (consistent with cross entropy)
-                num_items = kwargs.get('num_items_in_batch', None)
-                entropy_loss = self.compute_entropy_loss(logits, labels, num_items_in_batch=num_items)
-                loss += self.entropy_loss_weight * entropy_loss
         else:
             slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
             logits = self.lm_head(hidden_states[:, slice_indices, :])

 from torch.nn.attention.flex_attention import flex_attention, create_block_mask
 from functools import partial
 import math
 logger = logging.get_logger(__name__)
 def fused_flex_attention(q, k, v, mask=None):
     """Forward the arguments to ``flex_attention`` with GQA enabled.

     Args:
         q: Query tensor, passed through unchanged.
         k: Key tensor, passed through unchanged.
         v: Value tensor, passed through unchanged.
         mask: Value forwarded as ``block_mask`` (``None`` by default).

     Returns:
         Whatever ``flex_attention`` returns for these arguments.
     """
     return flex_attention(q, k, v, block_mask=mask, enable_gqa=True)
 def hybrid_block_causal_mask_multiturn(b, h, q_idx, kv_idx, response_block_idx=None, turn_idx=None, n=None):
     """
     Multi-turn hybrid mask: Prompt uses causal, Response uses block causal.
     is_prompt_q = (block_q < 0)
     is_prompt_kv = (block_kv < 0)
+    # Block diagonal: same turn, both in x_t region.
+    block_diagonal = ~x0_flag_q & ~x0_flag_kv & (turn_q == turn_kv)
+    # Offset block-causal: x_t can attend to x_0 of strictly earlier turns.
     offset_block_causal = (
+        (turn_q > turn_kv)
         & (x0_flag_kv == 1)
         & (x0_flag_q == 0)
     )
+    # x_0 region uses standard causal masking.
     x0_causal = x0_flag_q & x0_flag_kv & (pos_q >= pos_kv)
+    return block_diagonal | offset_block_causal | x0_causal
 def eval_block_diff_mask(q_idx, kv_idx, block_size=None):
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
             if update_kv_cache:
                 key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
             elif len(past_key_values) > self.layer_idx and past_key_values[self.layer_idx][0] is not None:
                 key_states = torch.cat((past_key_values[self.layer_idx][0], key_states), dim=-2)
                 value_states = torch.cat((past_key_values[self.layer_idx][1], value_states), dim=-2)
         # Initialize weights and apply final processing
         self.post_init()
+    def get_input_embeddings(self):
+        """Return the object stored in ``self.embed_tokens`` (the input embeddings)."""
+        return self.embed_tokens
+    def set_input_embeddings(self, value):
+        """Replace ``self.embed_tokens`` with ``value``."""
+        self.embed_tokens = value
     @auto_docstring
     def forward(
         self,
             text_position_ids = position_ids[0]
             position_ids = position_ids[1:]
         else:
             text_position_ids = None
         hidden_states = inputs_embeds
         position_embeddings = self.rotary_emb(hidden_states, position_ids)
         all_hidden_states = () if output_hidden_states else None
         all_self_attns = () if output_attentions else None
         hidden_states = self.norm(hidden_states)
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
         self.visual = Fast_dVLMVisionTransformerPretrainedModel._from_config(config.vision_config)
         self.language_model = Fast_dVLMTextModel._from_config(config.text_config)
         self.rope_deltas = None  # cache rope_deltas here
         # Initialize weights and apply final processing
         self.post_init()
             mrope_position_deltas = torch.tensor(mrope_position_deltas).unsqueeze(1).to(device=input_ids.device)
             return position_ids, mrope_position_deltas
         else:
             if self.training:
                 position_ids = (
                     torch.arange(input_ids.shape[1] // 2, device=input_ids.device)
         return special_image_mask, special_video_mask
+    def eval_mask(self, seqlen, block_size, cache_seq_len, update_kv_cache=False):
+        """Build the attention mask for ``seqlen`` new query positions.
+
+        Args:
+            seqlen: Number of query positions in the current call.
+            block_size: Block size forwarded to ``eval_block_diff_mask``.
+            cache_seq_len: Offset added to the query indices; the key indices
+                span ``seqlen + cache_seq_len`` positions.
+            update_kv_cache: When true the mask comes from ``eval_causal_mask``;
+                otherwise it comes from ``eval_block_diff_mask``.
+
+        Returns:
+            The value produced by the selected mask helper.
+            NOTE(review): presumably shaped (seqlen, seqlen + cache_seq_len)
+            via broadcasting of the index tensors -- confirm against the helpers.
+        """
+        # Query indices start at cache_seq_len and cover seqlen positions.
         q_indices = torch.arange(seqlen, device=self.device) + cache_seq_len
+        # Key indices run from 0 up to (but excluding) seqlen + cache_seq_len.
         k_indices = torch.arange(seqlen + cache_seq_len, device=self.device)
+        if update_kv_cache:
             mask = eval_causal_mask(q_indices[:, None], k_indices[None, :])
         else:
             mask = eval_block_diff_mask(
+                q_idx=q_indices[:, None],
+                kv_idx=k_indices[None, :],
+                block_size=block_size,
             )
         return mask
                 position_ids = torch.arange(seq_length, device=inputs_embeds.device)
                 position_ids = position_ids.view(1, 1, -1).expand(3, batch_size, -1)
                 if past_key_values is not None:
                     delta = (past_key_values.get_seq_length() + self.rope_deltas).to(inputs_embeds.device)
                 else:
         position_ids = position_ids.to(inputs_embeds.device)
         if not self.training:
+            attention_mask = self.eval_mask(inputs_embeds.shape[1], self.bd_size if bd_size is None else bd_size, 0 if past_key_values is None else past_key_values.get_seq_length(), update_kv_cache=update_kv_cache).to(inputs_embeds.device)
         outputs = self.language_model(
             input_ids=None,
         self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
         self.bd_size = config.bd_size
         self.model.bd_size = self.bd_size
         self.minimum_noise_level = getattr(config, 'minimum_noise_level', 0.0)
         self.im_end_token_id = 151645  # <|im_end|> token id
         # Vision-to-text aligner (if vision output dim != text hidden dim)
         vision_out_dim = config.vision_config.out_hidden_size
         text_hidden = config.text_config.hidden_size
     def visual(self):
         """Return the ``visual`` attribute of the wrapped ``self.model``."""
         return self.model.visual
     def compute_response_block_idx(self, labels, block_size):
         """
         Compute block index and turn index for each position.
         )
         return mask
     @can_return_tuple
     @auto_docstring
     def forward(
         eval_bd_size (`int`, *optional*):
             Block diffusion size to use during evaluation. Overrides the model default when set.
         """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
         if self.training:
+            # Anneal block size: pick from [4, 8, ..., target_bd_size] based on
+            # training progress. update_ratio is passed by the trainer (default 1.0
+            # corresponds to using the target block size).
+            update_ratio = kwargs.get('update_ratio', 1.0)
+            max_power = int(math.log2(self.bd_size))
+            possible_bd_sizes = [2**i for i in range(2, max_power + 1)]
+            scaled_ratio = math.sqrt(update_ratio)
+            idx = min(int(scaled_ratio * len(possible_bd_sizes)), len(possible_bd_sizes) - 1)
+            bd_size = possible_bd_sizes[idx]
+            if pixel_values is None and pixel_values_videos is None:  # text-only batch
                 batch_size, seq_len = input_ids.shape
                 original_labels = labels.clone()
                 response_mask = (labels != -100)  # [B, seq_len]
                 eps = self.minimum_noise_level
+                response_block_idx, turn_idx, n_blocks = self.compute_response_block_idx(labels, bd_size)
+                # Per-block noise level sampled from [minimum_noise_level, 1].
+                t = torch.rand((n_blocks,), device=input_ids.device)
+                p_mask_per_block = (1 - eps) * t + eps
+                # Build [B, seq_len] mask: prompt tokens stay clean, response tokens
+                # are masked block-wise according to p_mask_per_block.
+                mask_indices = torch.zeros_like(labels, dtype=torch.bool)
+                for i in range(seq_len):
+                    block_i = response_block_idx[i].item()
+                    if block_i >= 0:  # response token
+                        mask_indices[:, i] = torch.rand((batch_size,), device=input_ids.device) < p_mask_per_block[block_i]
+                # Always mask <|im_end|> tokens that fall inside the response.
+                im_end_mask = (input_ids == self.im_end_token_id) & response_mask
+                mask_indices = mask_indices | im_end_mask
                 noisy_input_ids = input_ids.clone()
                 noisy_input_ids[mask_indices] = mask_id
+                # Restrict the loss to masked response tokens only.
                 labels = labels.clone()
                 labels[~mask_indices] = -100
+                # Concatenate [noisy | clean] along the sequence dimension.
                 input_ids = torch.cat([noisy_input_ids, original_input_ids], dim=1)
+                # Complementary pair: mask the positions that were left clean above.
+                complementary_mask_indices = response_mask & ~mask_indices
+                im_end_mask = (original_input_ids == self.im_end_token_id) & response_mask
+                complementary_mask_indices = complementary_mask_indices | im_end_mask
+                complementary_noisy_input_ids = original_input_ids.clone()
+                complementary_noisy_input_ids[complementary_mask_indices] = mask_id
+                complementary_labels = original_labels.clone()
+                complementary_labels[~complementary_mask_indices] = -100
+                complementary_input_ids = torch.cat([complementary_noisy_input_ids, original_input_ids], dim=1)
+                input_ids = torch.cat([input_ids, complementary_input_ids], dim=0)
+                labels = torch.cat([labels, complementary_labels], dim=0)
+                attention_mask = self.gen_hybrid_block_causal_mask(seq_len, response_block_idx, turn_idx, input_ids.shape[0], self.config.num_attention_heads)
+            else:
+                # Multimodal block diffusion path.
+                # Phase A: embed input_ids and scatter vision features into placeholder positions.
                 if inputs_embeds is None:
                     inputs_embeds = self.model.get_input_embeddings()(input_ids)
                 if pixel_values is not None:
                     image_embeds = self.model.get_image_features(pixel_values, image_grid_thw)
                     image_embeds = torch.cat(image_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
                         input_ids, inputs_embeds=inputs_embeds, image_features=image_embeds
                     )
                     inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
                 if pixel_values_videos is not None:
                     video_embeds = self.model.get_video_features(pixel_values_videos, video_grid_thw)
                     video_embeds = torch.cat(video_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
                         input_ids, inputs_embeds=inputs_embeds, video_features=video_embeds
                     )
                     inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
+                # Phase B: build 3D position_ids on the original (pre-doubled) length.
                 if position_ids is None:
                     position_ids, rope_deltas = self.model.get_rope_index(
                         input_ids=input_ids,
                         second_per_grid_ts=second_per_grid_ts,
                         attention_mask=attention_mask,
                     )
+                # Phase C: block diffusion that preserves vision token positions.
                 batch_size = input_ids.shape[0]
                 L = input_ids.shape[1]
                 seq_len = L
                 hidden_size = inputs_embeds.shape[-1]
                 original_labels = labels.clone()
                 original_input_ids = input_ids.clone()
                 original_embeds = inputs_embeds.clone()
+                original_position_ids = position_ids.clone()
+                # Identify vision tokens so noise is never applied to them.
                 image_token_id = self.config.image_token_id
                 video_token_id = self.config.video_token_id
                 vision_start_token_id = self.config.vision_start_token_id
                 vision_token_mask = (input_ids == image_token_id) | (input_ids == video_token_id) | (input_ids == vision_start_token_id)
                 vision_mask_3d = vision_token_mask.unsqueeze(-1).expand(-1, -1, hidden_size)
+                # Block diffusion with multi-turn support: each response segment has its own blocks.
                 response_block_idx, turn_idx, n_blocks = self.compute_response_block_idx(labels, bd_size)
+                response_mask = (labels != -100)
                 eps = self.minimum_noise_level
+                t = torch.rand((n_blocks,), device=input_ids.device)
+                p_mask_per_block = (1 - eps) * t + eps
+                mask_indices = torch.zeros_like(labels, dtype=torch.bool)
+                for i in range(seq_len):
+                    block_i = response_block_idx[i].item()
+                    if block_i >= 0:
+                        mask_indices[:, i] = torch.rand((batch_size,), device=input_ids.device) < p_mask_per_block[block_i]
+                im_end_mask = (input_ids == self.im_end_token_id) & response_mask
+                mask_indices = mask_indices | im_end_mask
                 noisy_input_ids = input_ids.clone()
                 noisy_input_ids[mask_indices] = mask_id
+                # Build noisy embeddings while keeping vision embeddings intact.
+                noisy_embeds_raw = self.model.language_model.embed_tokens(noisy_input_ids)
+                noisy_embeds = torch.where(vision_mask_3d, original_embeds, noisy_embeds_raw)
                 labels_noisy = labels.clone()
                 labels_noisy[~mask_indices] = -100
+                # Concatenate [noisy | clean] along the sequence dimension.
                 input_ids_pair1 = torch.cat([noisy_input_ids, original_input_ids], dim=1)
                 embeds_pair1 = torch.cat([noisy_embeds, original_embeds], dim=1)
                 labels_pair1 = labels_noisy
+                position_ids_pair1 = original_position_ids
+                # Complementary pair: mask the positions that were left clean above.
+                complementary_mask_indices = response_mask & ~mask_indices
+                im_end_mask = (original_input_ids == self.im_end_token_id) & response_mask
+                complementary_mask_indices = complementary_mask_indices | im_end_mask
+                complementary_noisy_input_ids = original_input_ids.clone()
+                complementary_noisy_input_ids[complementary_mask_indices] = mask_id
+                complementary_noisy_embeds_raw = self.model.language_model.embed_tokens(complementary_noisy_input_ids)
+                complementary_noisy_embeds = torch.where(vision_mask_3d, original_embeds, complementary_noisy_embeds_raw)
+                complementary_labels = original_labels.clone()
+                complementary_labels[~complementary_mask_indices] = -100
+                input_ids_pair2 = torch.cat([complementary_noisy_input_ids, original_input_ids], dim=1)
+                embeds_pair2 = torch.cat([complementary_noisy_embeds, original_embeds], dim=1)
+                labels_pair2 = complementary_labels
+                position_ids_pair2 = original_position_ids
+                # Stack the complementary pair along the batch dimension.
+                input_ids = torch.cat([input_ids_pair1, input_ids_pair2], dim=0)
+                inputs_embeds = torch.cat([embeds_pair1, embeds_pair2], dim=0)
+                labels = torch.cat([labels_pair1, labels_pair2], dim=0)
+                position_ids = torch.cat([position_ids_pair1, position_ids_pair2], dim=1)
+                attention_mask = self.gen_hybrid_block_causal_mask(L, response_block_idx, turn_idx, input_ids.shape[0], self.config.num_attention_heads)
+            # Phase D: forward through the inner model. Vision features (if any)
+            # have already been scattered into inputs_embeds, so pixel_values are
+            # cleared to skip re-processing inside `Fast_dVLMModel`.
+            outputs = self.model(
+                input_ids=input_ids,
+                pixel_values=None,
+                pixel_values_videos=None,
+                image_grid_thw=None,
+                video_grid_thw=None,
+                position_ids=position_ids,
+                attention_mask=attention_mask,
+                past_key_values=past_key_values,
+                inputs_embeds=inputs_embeds,
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=True,
+                cache_position=cache_position,
+                update_kv_cache=update_kv_cache,
+                bd_size=bd_size,
+                **kwargs,
+            )
         else:
             outputs = self.model(
             slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
             logits = self.lm_head(mdm_hidden_states[:, slice_indices, :])
+            new_kwargs = {
+                'num_items_in_batch': 2 * kwargs['num_items_in_batch'],
+            }
             if labels is not None:
                 loss = self.loss_function(
                     logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **new_kwargs
                 ) * 0.5
+            causal_hidden_states = hidden_states[:hidden_states.shape[0]//2, hidden_states.shape[1]//2:, :]
+            causal_logits = self.lm_head(causal_hidden_states[:, slice_indices, :])
+            loss += self.loss_function(
+                logits=causal_logits, labels=original_labels, vocab_size=self.config.text_config.vocab_size, **new_kwargs
+            )
         else:
             slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
             logits = self.lm_head(hidden_states[:, slice_indices, :])