inclusionAI/LLaDA2.1-flash

@@ -28,9 +28,7 @@ from torch.nn import CrossEntropyLoss
 from transformers.activations import ACT2FN
 from transformers.cache_utils import Cache, DynamicCache
-from transformers.modeling_attn_mask_utils import (
-    _prepare_4d_causal_attention_mask_for_sdpa,
-)
 from transformers.modeling_outputs import (
     MoeModelOutputWithPast,
     MoeCausalLMOutputWithPast,
@@ -876,17 +874,11 @@ class LLaDA2MoeModel(LLaDA2MoePreTrainedModel):
                 device=inputs_embeds.device,
             )
             position_ids = position_ids.unsqueeze(0)
-        if attention_mask.size() == (batch_size, 1, seq_length, seq_length):
-            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
-                attention_mask,
-                (batch_size, seq_length),
-                inputs_embeds,
-                past_seen_tokens,
-            )
-        else:
-            raise ValueError(
-                f"LLaDA2.0 only support block attention mask with shape: {(batch_size, 1, seq_length, seq_length)}, the input attention with shape {attention_mask.size()=}!"
-            )
         # embed positions
         hidden_states = inputs_embeds

 from transformers.activations import ACT2FN
 from transformers.cache_utils import Cache, DynamicCache
+from transformers.masking_utils import create_bidirectional_mask
 from transformers.modeling_outputs import (
     MoeModelOutputWithPast,
     MoeCausalLMOutputWithPast,
                 device=inputs_embeds.device,
             )
             position_ids = position_ids.unsqueeze(0)
+        attention_mask = create_bidirectional_mask(
+            config=self.config,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+        )
         # embed positions
         hidden_states = inputs_embeds