kilianhaefeli committed on
Commit e5351ca · 1 Parent(s): c277c56
Files changed (2)
  1. modeling.py +34 -15
  2. modeling_f.py +949 -0
modeling.py CHANGED
@@ -479,11 +479,15 @@ class Fast_dLLM_QwenModel(Fast_dLLM_QwenPreTrainedModel):
479
  block_start_position, block_start_position + inputs_embeds.shape[1], device=inputs_embeds.device
480
  )
481
  else:
 
482
  cache_position = torch.arange(
483
  past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1] if not self.training else inputs_embeds.shape[1]//2, device=inputs_embeds.device
484
  )
485
 
486
  # --- keep the user/tokenizer padding mask BEFORE you overwrite attention_mask ---
 
 
 
487
  padding_mask_2d = attention_mask # shape [B, KV_LEN], 1=token, 0=pad
488
 
489
  # -------------------------
@@ -492,17 +496,21 @@ class Fast_dLLM_QwenModel(Fast_dLLM_QwenPreTrainedModel):
492
  if position_ids is None:
493
  if (padding_mask_2d is not None) and (not self.training):
494
  # full, per-sample positions over KV_LEN
 
495
  pos_full = padding_mask_2d.long().cumsum(-1) - 1 # pads => -1
496
  pos_full = pos_full.clamp_min(0) # pads => 0
497
 
 
498
  q_len = inputs_embeds.shape[1]
499
  kv_len = pos_full.shape[1]
 
500
  if kv_len < q_len:
501
  raise ValueError(f"attention_mask KV_LEN={kv_len} < input_len={q_len}. "
502
  "When using cache, pass the FULL mask (past+current).")
503
 
504
- q_start = kv_len - q_len # assumes current tokens are the last q_len positions
505
- position_ids = pos_full[:, q_start:]
 
506
  else:
507
  # no padding mask: same positions for all batch elements
508
  position_ids = cache_position.unsqueeze(0)
@@ -527,19 +535,23 @@ class Fast_dLLM_QwenModel(Fast_dLLM_QwenPreTrainedModel):
527
  attention_mask = structural[None, None, :, :] # [1,1,Q,KV]
528
  else:
529
  pad = padding_mask_2d.to(torch.bool) # [B, KV]
530
- B, kv_len = pad.shape
531
- q_len = inputs_embeds.shape[1]
532
- q_start = kv_len - q_len
533
 
534
  # Per-sample block ids computed from *non-pad* positions
535
- pos_full = pad.long().cumsum(-1) - 1
536
- pos_full = pos_full.clamp_min(0)
537
- block_full = pos_full // block_size # [B, KV]
 
538
 
539
- block_q = block_full[:, q_start:] # [B, Q]
540
- block_k = block_full # [B, KV]
 
 
 
541
 
542
- structural = block_q.unsqueeze(-1) >= block_k.unsqueeze(-2) # [B, Q, KV]
543
 
544
  # Mask keys AND queries (only valid tokens participate)
545
  key_ok = pad[:, None, None, :] # [B,1,1,KV]
@@ -630,7 +642,7 @@ class Fast_dLLM_QwenForCausalLM(Fast_dLLM_QwenPreTrainedModel, GenerationMixin):
630
  mask_id: Optional[int] = 151665,
631
  **kwargs
632
  ) -> CausalLMOutputWithPastAndBlockCache:
633
-
634
  if self.training:
635
  original_labels = labels.clone()
636
  original_input_ids = input_ids.clone()
@@ -727,11 +739,13 @@ class Fast_dLLM_QwenForCausalLM(Fast_dLLM_QwenPreTrainedModel, GenerationMixin):
727
  assert attention_mask is not None, "attention_mask must be provided for this generate() implementation."
728
 
729
  # pad the initial input_ids and attention_mask to be multiple of block_size
730
- if input_ids.shape[1] % block_size != 0:
731
  pad_len = block_size - (input_ids.shape[1] % block_size)
732
  input_ids = torch.cat([torch.full((input_ids.shape[0], pad_len), self.config.pad_token_id, device=input_ids.device), input_ids], dim=1)
733
  attention_mask = torch.cat([torch.zeros((attention_mask.shape[0], pad_len), device=attention_mask.device), attention_mask], dim=1)
734
 
 
 
735
  num_blocks = max_new_tokens // block_size
736
  device = input_ids.device
737
  batch_size = input_ids.size(0)
@@ -747,7 +761,9 @@ class Fast_dLLM_QwenForCausalLM(Fast_dLLM_QwenPreTrainedModel, GenerationMixin):
747
 
748
  # Handle prefix processing (Context Encoding)
749
  if input_ids.shape[1] >= block_size:
 
750
  output = self.forward(input_ids=input_ids[:, :(input_ids.shape[1] // block_size * block_size)], attention_mask=attention_mask[:, :(input_ids.shape[1] // block_size * block_size)], use_cache=True, update_past_key_values=True, block_size=block_size)
 
751
  logits, past_key_values = output.logits, output.past_key_values
752
  if input_ids.shape[1] % block_size == 0:
753
  next_token = logits[:, -1:, :].argmax(dim=-1)
@@ -780,13 +796,16 @@ class Fast_dLLM_QwenForCausalLM(Fast_dLLM_QwenPreTrainedModel, GenerationMixin):
780
  prompt_length = input_ids.shape[1]
781
 
782
  # Initialize x_init with mask_id with all mask tokens for the new block
783
- x_init = mask_id * torch.ones((input_ids.shape[0], block_size-prompt_length%block_size), device=self.device, dtype=torch.long)
784
 
785
  # Concatenate input_ids with x_init to form the new input_ids (we added a block-1 of masks to our current generation)
786
  x_init = torch.cat([input_ids, x_init], dim=1)
787
 
788
  # mask extension is extending the current mask by the number of new tokens we are generating in this block by adding ones.
 
789
  mask_extension = unfinished_sequences.unsqueeze(1).repeat(1, block_size - prompt_length % block_size).to(dtype=attention_mask.dtype)
 
 
790
  # mask is the current attention mask extended by the new tokens we are generating in this block by adding ones.
791
  curr_attention_mask = torch.cat([attention_mask, mask_extension], dim=1)
792
 
@@ -795,7 +814,7 @@ class Fast_dLLM_QwenForCausalLM(Fast_dLLM_QwenPreTrainedModel, GenerationMixin):
795
  while True:
796
  # mask_idx indicates where the mask tokens are in the current block
797
  mask_idx = (x_t[:, -block_size:] == mask_id)
798
- # TODOL assert that first element is always not a mask
799
 
800
  if mask_idx.sum() == 0:
801
  # If no mask tokens left in the current block, then we generate the next token autoregressively
 
479
  block_start_position, block_start_position + inputs_embeds.shape[1], device=inputs_embeds.device
480
  )
481
  else:
482
+ # from past_seen_tokens to past_seen_tokens + current_input_length (for us this is always the last blocks + the current block)
483
  cache_position = torch.arange(
484
  past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1] if not self.training else inputs_embeds.shape[1]//2, device=inputs_embeds.device
485
  )
486
 
487
  # --- keep the user/tokenizer padding mask BEFORE you overwrite attention_mask ---
488
+ # the kv mask covers however many tokens are in the sequence so far
489
+
490
+ # kv_len covers the previously processed blocks plus the current block.
491
  padding_mask_2d = attention_mask # shape [B, KV_LEN], 1=token, 0=pad
492
 
493
  # -------------------------
 
496
  if position_ids is None:
497
  if (padding_mask_2d is not None) and (not self.training):
498
  # full, per-sample positions over KV_LEN
499
+ # the first real token gets position 0, then positions arange upward over the non-pad tokens
500
  pos_full = padding_mask_2d.long().cumsum(-1) - 1 # pads => -1
501
  pos_full = pos_full.clamp_min(0) # pads => 0
502
 
503
+
504
  q_len = inputs_embeds.shape[1]
505
  kv_len = pos_full.shape[1]
506
+
507
  if kv_len < q_len:
508
  raise ValueError(f"attention_mask KV_LEN={kv_len} < input_len={q_len}. "
509
  "When using cache, pass the FULL mask (past+current).")
510
 
511
+ # position ids are the arange but only taking the last block of values!
512
+ q_start = kv_len - q_len # assumes current tokens are the last q_len positions (the query length is one block, which it always is).
513
+ position_ids = pos_full[:, q_start:] # TODO assert same as just taking last block
514
  else:
515
  # no padding mask: same positions for all batch elements
516
  position_ids = cache_position.unsqueeze(0)
 
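For reference, the left-padding position-id logic above can be exercised standalone; the mask values and q_len below are illustrative only, not taken from the model:

import torch

padding_mask_2d = torch.tensor([[0, 0, 1, 1, 1, 1],
                                [1, 1, 1, 1, 1, 1]])   # [B, KV], 1 = token, 0 = pad
q_len = 4                                               # current block length (illustrative)

pos_full = padding_mask_2d.long().cumsum(-1) - 1        # pads => -1, real tokens => 0, 1, 2, ...
pos_full = pos_full.clamp_min(0)                        # pads => 0
q_start = pos_full.shape[1] - q_len
position_ids = pos_full[:, q_start:]                    # positions of the last q_len tokens
print(position_ids)                                     # tensor([[0, 1, 2, 3], [2, 3, 4, 5]])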
535
  attention_mask = structural[None, None, :, :] # [1,1,Q,KV]
536
  else:
537
  pad = padding_mask_2d.to(torch.bool) # [B, KV]
538
+ B, kv_len = pad.shape # kv_len is the length of the full mask, i.e. the cached length plus the current block (here +32)
539
+ q_len = inputs_embeds.shape[1] # inputs_embeds covers the current block (32 tokens); TODO check
540
+ q_start = kv_len - q_len
541
 
542
  # Per-sample block ids computed from *non-pad* positions
543
+ # TODO fix!
544
+ # pos_full = pad.long().cumsum(-1) - 1 # again basically an arange over the non-pad tokens
545
+ # pos_full = pos_full.clamp_min(0)
546
+ # block_full = pos_full // block_size # [B, KV] # this can put block transitions in the wrong place, so tokens would attend incorrectly!
547
 
548
+ pos_full = torch.arange(0, kv_len, device=inputs_embeds.device)[None, ...]
549
+ block_full = pos_full // block_size # 0,0...,0,1...1,2...2,...
550
+
551
+ block_q = block_full[:, q_start:] # [B, Q] # the latest block; TODO check these are all the same value
552
+ block_k = block_full # [B, KV] # everything we attend to!
553
 
554
+ structural = block_q.unsqueeze(-1) >= block_k.unsqueeze(-2) # [B, Q, KV] # True when the query block index is >= the key block index, i.e. attend within the same block and to all earlier blocks
555
 
556
  # Mask keys AND queries (only valid tokens participate)
557
  key_ok = pad[:, None, None, :] # [B,1,1,KV]
 
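The new shared-arange block mask can likewise be checked in isolation; block_size, q_len and the padding pattern below are toy values:

import torch

block_size, q_len = 2, 2
pad = torch.tensor([[0, 1, 1, 1, 1, 1]], dtype=torch.bool)      # [B, KV], left padding
kv_len = pad.shape[1]
q_start = kv_len - q_len

pos_full = torch.arange(0, kv_len)[None, ...]                    # shared positions, as in the new code
block_full = pos_full // block_size                              # 0, 0, 1, 1, 2, 2
block_q = block_full[:, q_start:]                                # [B, Q] block id of the current block
block_k = block_full                                             # [B, KV]
structural = block_q.unsqueeze(-1) >= block_k.unsqueeze(-2)      # [B, Q, KV] block-causal pattern

key_ok = pad[:, None, None, :]                                   # keys must be real tokens
query_ok = pad[:, None, q_start:, None]                          # queries must be real tokens
attention_mask = structural[:, None, :, :] & key_ok & query_ok   # [B, 1, Q, KV]
print(attention_mask.shape)                                      # torch.Size([1, 1, 2, 6])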
642
  mask_id: Optional[int] = 151665,
643
  **kwargs
644
  ) -> CausalLMOutputWithPastAndBlockCache:
645
+
646
  if self.training:
647
  original_labels = labels.clone()
648
  original_input_ids = input_ids.clone()
 
739
  assert attention_mask is not None, "attention_mask must be provided for this generate() implementation."
740
 
741
  # pad the initial input_ids and attention_mask to be multiple of block_size
742
+ if False: # input_ids.shape[1] % block_size != 0:
743
  pad_len = block_size - (input_ids.shape[1] % block_size)
744
  input_ids = torch.cat([torch.full((input_ids.shape[0], pad_len), self.config.pad_token_id, device=input_ids.device), input_ids], dim=1)
745
  attention_mask = torch.cat([torch.zeros((attention_mask.shape[0], pad_len), device=attention_mask.device), attention_mask], dim=1)
746
 
747
+ # attention_mask length matches the (padded) prompt length
748
+
749
  num_blocks = max_new_tokens // block_size
750
  device = input_ids.device
751
  batch_size = input_ids.size(0)
 
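The (now disabled) left-padding branch above corresponds to this standalone sketch; block_size and pad_token_id are placeholder values:

import torch

block_size, pad_token_id = 4, 0                          # placeholder values
input_ids = torch.tensor([[11, 12, 13, 14, 15, 16]])     # length 6, not a multiple of block_size
attention_mask = torch.ones_like(input_ids)

if input_ids.shape[1] % block_size != 0:
    pad_len = block_size - (input_ids.shape[1] % block_size)
    input_ids = torch.cat(
        [torch.full((input_ids.shape[0], pad_len), pad_token_id), input_ids], dim=1)
    attention_mask = torch.cat(
        [torch.zeros((attention_mask.shape[0], pad_len), dtype=attention_mask.dtype), attention_mask], dim=1)

print(input_ids.shape[1], attention_mask.tolist())       # 8 [[0, 0, 1, 1, 1, 1, 1, 1]]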
761
 
762
  # Handle prefix processing (Context Encoding)
763
  if input_ids.shape[1] >= block_size:
764
+ # pass in all full blocks of the context (everything except the trailing partial block) and cache them.
765
  output = self.forward(input_ids=input_ids[:, :(input_ids.shape[1] // block_size * block_size)], attention_mask=attention_mask[:, :(input_ids.shape[1] // block_size * block_size)], use_cache=True, update_past_key_values=True, block_size=block_size)
766
+ # if we passed the whole prompt (no partial block left), extend it by one predicted token.
767
  logits, past_key_values = output.logits, output.past_key_values
768
  if input_ids.shape[1] % block_size == 0:
769
  next_token = logits[:, -1:, :].argmax(dim=-1)
 
796
  prompt_length = input_ids.shape[1]
797
 
798
  # Initialize x_init with mask_id with all mask tokens for the new block
799
+ x_init = mask_id * torch.ones((input_ids.shape[0], block_size-prompt_length%block_size), device=self.device, dtype=torch.long) # pad with however many mask tokens are needed to reach a multiple of block_size
800
 
801
  # Concatenate input_ids with x_init to form the new input_ids (we added a block-1 of masks to our current generation)
802
  x_init = torch.cat([input_ids, x_init], dim=1)
803
 
804
  # mask extension is extending the current mask by the number of new tokens we are generating in this block by adding ones.
805
+ # the mask now spans all tokens, including the appended mask tokens
806
  mask_extension = unfinished_sequences.unsqueeze(1).repeat(1, block_size - prompt_length % block_size).to(dtype=attention_mask.dtype)
807
+
808
+
809
  # mask is the current attention mask extended by the new tokens we are generating in this block by adding ones.
810
  curr_attention_mask = torch.cat([attention_mask, mask_extension], dim=1)
811
 
 
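A compact sketch of the per-block setup above: append mask tokens up to the next block boundary and extend the attention mask for unfinished sequences (mask_id and block_size here are placeholders):

import torch

mask_id, block_size = 151665, 4
input_ids = torch.tensor([[5, 6, 7, 8, 9, 10]])                     # prompt_length = 6
attention_mask = torch.ones_like(input_ids)
unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long)

prompt_length = input_ids.shape[1]
n_new = block_size - prompt_length % block_size                      # 2 mask tokens to reach the boundary
x_init = torch.cat(
    [input_ids, torch.full((input_ids.shape[0], n_new), mask_id)], dim=1)
mask_extension = unfinished_sequences.unsqueeze(1).repeat(1, n_new).to(dtype=attention_mask.dtype)
curr_attention_mask = torch.cat([attention_mask, mask_extension], dim=1)
print(x_init.shape[1], curr_attention_mask.shape[1])                 # 8 8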
814
  while True:
815
  # mask_idx indicates where the mask tokens are in the current block
816
  mask_idx = (x_t[:, -block_size:] == mask_id)
817
+ # TODO: assert that first element is always not a mask
818
 
819
  if mask_idx.sum() == 0:
820
  # If no mask tokens left in the current block, then we generate the next token autoregressively
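The body of this loop (unchanged here, visible in full in modeling_f.py below) unmasks positions whose sampled-token probability exceeds threshold, always keeping at least the most confident one; a toy sketch of that rule with made-up probabilities:

import torch

threshold = 0.9
mask_idx = torch.tensor([[True, True, False, True]])    # positions still masked in the current sub-block
x1_p = torch.tensor([[0.95, 0.40, 0.99, 0.20]])         # probability of each sampled token
x1_p = torch.where(mask_idx, x1_p, -torch.inf)          # only masked positions are candidates

unmask_idx = x1_p > threshold                            # unmask confident positions ...
max_prob_idx = x1_p.argmax(dim=-1)                       # ... and always at least the best one
unmask_idx[torch.arange(x1_p.shape[0]), max_prob_idx] = True
unmask_idx = unmask_idx & mask_idx
print(unmask_idx)                                        # tensor([[ True, False, False, False]])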
modeling_f.py ADDED
@@ -0,0 +1,949 @@
1
+ from typing import Callable, Optional, Union
2
+ from dataclasses import dataclass
3
+
4
+ import torch
5
+ from torch import nn
6
+ import torch.nn.functional as F
7
+ from functools import partial
8
+
9
+ from transformers.activations import ACT2FN
10
+ from transformers.cache_utils import Cache, DynamicCache
11
+ from transformers.generation import GenerationMixin
12
+ from transformers.integrations import use_kernel_forward_from_hub
13
+ from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
14
+ from transformers.modeling_layers import GradientCheckpointingLayer
15
+ from transformers.modeling_outputs import (
16
+ BaseModelOutputWithPast,
17
+ CausalLMOutputWithPast,
18
+ )
19
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
20
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
21
+ from transformers.processing_utils import Unpack
22
+ from transformers.utils import auto_docstring, can_return_tuple, logging
23
+ from .configuration import Fast_dLLM_QwenConfig
24
+ from torch.nn.attention.flex_attention import flex_attention, create_block_mask
25
+ from einops import rearrange, repeat
26
+
27
+ logger = logging.get_logger(__name__)
28
+
29
+
30
+ @dataclass
31
+ class CausalLMOutputWithPastAndBlockCache(CausalLMOutputWithPast):
32
+ block_past_key_values: Optional[Cache] = None
33
+
34
+ @dataclass
35
+ class BaseModelOutputWithPastAndBlockCache(BaseModelOutputWithPast):
36
+ block_past_key_values: Optional[Cache] = None
37
+
38
+
39
+ # @torch.compile(fullgraph=True, mode="max-autotune-no-cudagraphs")
40
+ def fused_flex_attention(q, k, v, mask=None):
41
+ return flex_attention(q, k, v, block_mask=mask, enable_gqa=True)
42
+
43
+ def block_diff_mask(b, h, q_idx, kv_idx, block_size=None, n=None):
44
+ """
45
+ Constructs the specialized block diffusion attention mask for training
46
+ composed of three masks:
47
+ - **Block Diagonal Mask (M_BD)**: Self-attention within noised blocks
48
+ - **Offset Block Causal Mask (M_OBC)**: Cross-attention for conditional context
49
+ - **Block Causal Mask (M_BC)**: Attention to update x0
50
+
51
+ Args:
52
+ b, h: Batch and head indices (ignored for mask logic).
53
+ q_idx, kv_idx: Query and Key indices.
54
+ n: Length of the noised sequence x_t; indices >= n belong to the clean copy x0.
55
+ block_size: Defines the block structure.
56
+
57
+ Returns:
58
+ A boolean attention mask.
59
+ """
60
+ # Indicate whether token belongs to xt or x0
61
+ x0_flag_q = (q_idx >= n)
62
+ x0_flag_kv = (kv_idx >= n)
63
+
64
+ # Compute block indices
65
+ block_q = torch.where(x0_flag_q == 1,
66
+ (q_idx - n) // block_size,
67
+ q_idx // block_size)
68
+ block_kv = torch.where(x0_flag_kv == 1,
69
+ (kv_idx - n) // block_size,
70
+ kv_idx // block_size)
71
+
72
+ # **1. Block Diagonal Mask (M_BD) **
73
+ block_diagonal = (block_q == block_kv) & (x0_flag_q == x0_flag_kv)
74
+
75
+ # **2. Offset Block-Causal Mask (M_OBC) **
76
+ offset_block_causal = (
77
+ (block_q > block_kv)
78
+ & (x0_flag_kv == 1)
79
+ & (x0_flag_q == 0)
80
+ )
81
+
82
+ # **3. Block-Causal Mask (M_BC) **
83
+ block_causal = (block_q >= block_kv) & (x0_flag_kv == 1) & (x0_flag_q == 1)
84
+
85
+ # **4. Combine Masks **
86
+ return block_diagonal | offset_block_causal | block_causal
87
+
88
+ def eval_block_diff_mask(q_idx, kv_idx, block_size=None):
89
+ # Compute block indices
90
+ block_q = q_idx // block_size
91
+ block_kv = kv_idx // block_size
92
+
93
+ return block_q >= block_kv
94
+
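# A toy check of the training mask predicate above (assumes torch and block_diff_mask as defined
# in this file): n = 4 noised tokens (x_t) followed by 4 clean tokens (x0), block_size = 2.
# gen_mask() further below does the real materialization via create_block_mask.
idx = torch.arange(8)
toy_mask = block_diff_mask(None, None, idx[:, None], idx[None, :], block_size=2, n=4)
print(toy_mask.int())   # 8x8 pattern combining the block-diagonal, offset block-causal and block-causal parts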
95
+ class Fast_dLLM_QwenMLP(nn.Module):
96
+ def __init__(self, config):
97
+ super().__init__()
98
+ self.config = config
99
+ self.hidden_size = config.hidden_size
100
+ self.intermediate_size = config.intermediate_size
101
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
102
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
103
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
104
+ self.act_fn = ACT2FN[config.hidden_act]
105
+
106
+ def forward(self, x):
107
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
108
+ return down_proj
109
+
110
+
111
+ def rotate_half(x):
112
+ """Rotates half the hidden dims of the input."""
113
+ x1 = x[..., : x.shape[-1] // 2]
114
+ x2 = x[..., x.shape[-1] // 2 :]
115
+ return torch.cat((-x2, x1), dim=-1)
116
+
117
+
118
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
119
+ """Applies Rotary Position Embedding to the query and key tensors.
120
+
121
+ Args:
122
+ q (`torch.Tensor`): The query tensor.
123
+ k (`torch.Tensor`): The key tensor.
124
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
125
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
126
+ position_ids (`torch.Tensor`, *optional*):
127
+ Deprecated and unused.
128
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
129
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
130
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
131
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
132
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
133
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
134
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
135
+ Returns:
136
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
137
+ """
138
+ cos = cos.unsqueeze(unsqueeze_dim)
139
+ sin = sin.unsqueeze(unsqueeze_dim)
140
+ q_embed = (q * cos) + (rotate_half(q) * sin)
141
+ k_embed = (k * cos) + (rotate_half(k) * sin)
142
+ return q_embed, k_embed
143
+
144
+
145
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
146
+ """
147
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
148
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
149
+ """
150
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
151
+ if n_rep == 1:
152
+ return hidden_states
153
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
154
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
155
+
156
+
157
+ class Fast_dLLM_QwenAttention(nn.Module):
158
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
159
+
160
+ def __init__(self, config: Fast_dLLM_QwenConfig, layer_idx: int):
161
+ super().__init__()
162
+ self.config = config
163
+ self.layer_idx = layer_idx
164
+ self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
165
+ self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
166
+ self.scaling = self.head_dim**-0.5
167
+ self.attention_dropout = config.attention_dropout
168
+ self.is_causal = True
169
+ self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=True)
170
+ self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True)
171
+ self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True)
172
+ self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
173
+ self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None
174
+
175
+ def forward(
176
+ self,
177
+ hidden_states: torch.Tensor,
178
+ position_embeddings: tuple[torch.Tensor, torch.Tensor],
179
+ attention_mask: Optional[torch.Tensor],
180
+ past_key_value: Optional[Cache] = None,
181
+ cache_position: Optional[torch.LongTensor] = None,
182
+ update_past_key_values: Optional[bool] = False,
183
+ block_past_key_values: Optional[Cache] = None,
184
+ replace_position: Optional[int] = None,
185
+ **kwargs: Unpack[FlashAttentionKwargs],
186
+ ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
187
+ input_shape = hidden_states.shape[:-1]
188
+ hidden_shape = (*input_shape, -1, self.head_dim)
189
+
190
+ query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
191
+ key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
192
+ value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
193
+
194
+ cos, sin = position_embeddings
195
+ # query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
196
+ if self.training:
197
+ #split q into two parts
198
+ q_1 = query_states[:,:,:query_states.shape[2]//2]
199
+ q_2 = query_states[:,:,query_states.shape[2]//2:]
200
+ #split k into two parts
201
+ k_1 = key_states[:,:,:key_states.shape[2]//2]
202
+ k_2 = key_states[:,:,key_states.shape[2]//2:]
203
+ q_1, k_1 = apply_rotary_pos_emb(q_1, k_1, cos, sin)
204
+ q_2, k_2 = apply_rotary_pos_emb(q_2, k_2, cos, sin)
205
+ query_states = torch.cat((q_1, q_2), dim=-2)
206
+ key_states = torch.cat((k_1, k_2), dim=-2)
207
+ else:
208
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
209
+
210
+ if block_past_key_values is not None:
211
+ if len(block_past_key_values) <= self.layer_idx:
212
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
213
+ key_states, value_states = block_past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
214
+ else:
215
+ block_cache_key_states = block_past_key_values[self.layer_idx][0]
216
+ block_cache_value_states = block_past_key_values[self.layer_idx][1]
217
+
218
+ block_cache_key_states[:, :, replace_position:replace_position+key_states.shape[2]] = key_states
219
+ block_cache_value_states[:, :, replace_position:replace_position+value_states.shape[2]] = value_states
220
+ key_states = block_cache_key_states
221
+ value_states = block_cache_value_states
222
+
223
+ if past_key_value is not None:
224
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
225
+ if update_past_key_values:
226
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
227
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
228
+ elif len(past_key_value) > self.layer_idx:
229
+ key_states = torch.cat((past_key_value[self.layer_idx][0], key_states), dim=-2)
230
+ value_states = torch.cat((past_key_value[self.layer_idx][1], value_states), dim=-2)
231
+
232
+ if self.training:
233
+ attn_output = fused_flex_attention(query_states, key_states, value_states, mask=attention_mask)
234
+ attn_output = attn_output.transpose(1, 2).contiguous()
235
+ else:
236
+ attention_interface = ALL_ATTENTION_FUNCTIONS["sdpa"]
237
+
238
+ attn_output, attn_weights = attention_interface(
239
+ self,
240
+ query_states,
241
+ key_states,
242
+ value_states,
243
+ attention_mask,
244
+ is_causal=False,
245
+ dropout=0.0 if not self.training else self.attention_dropout,
246
+ scaling=self.scaling,
247
+ sliding_window=self.sliding_window, # main diff with Llama
248
+ **kwargs,
249
+ )
250
+
251
+ attn_output = attn_output.reshape(*input_shape, -1).contiguous()
252
+ attn_output = self.o_proj(attn_output)
253
+ return attn_output
254
+
255
+ @use_kernel_forward_from_hub("RMSNorm")
256
+ class Fast_dLLM_QwenRMSNorm(nn.Module):
257
+ def __init__(self, hidden_size, eps=1e-6):
258
+ """
259
+ Fast_dLLM_QwenRMSNorm is equivalent to T5LayerNorm
260
+ """
261
+ super().__init__()
262
+ self.weight = nn.Parameter(torch.ones(hidden_size))
263
+ self.variance_epsilon = eps
264
+
265
+ def forward(self, hidden_states):
266
+ input_dtype = hidden_states.dtype
267
+ hidden_states = hidden_states.to(torch.float32)
268
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
269
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
270
+ return self.weight * hidden_states.to(input_dtype)
271
+
272
+ def extra_repr(self):
273
+ return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
274
+
275
+
276
+ class Fast_dLLM_QwenDecoderLayer(GradientCheckpointingLayer):
277
+ def __init__(self, config: Fast_dLLM_QwenConfig, layer_idx: int):
278
+ super().__init__()
279
+ self.hidden_size = config.hidden_size
280
+
281
+ self.self_attn = Fast_dLLM_QwenAttention(config=config, layer_idx=layer_idx)
282
+
283
+ self.mlp = Fast_dLLM_QwenMLP(config)
284
+ self.input_layernorm = Fast_dLLM_QwenRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
285
+ self.post_attention_layernorm = Fast_dLLM_QwenRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
286
+ self.attention_type = config.layer_types[layer_idx]
287
+
288
+ def forward(
289
+ self,
290
+ hidden_states: torch.Tensor,
291
+ attention_mask: Optional[torch.Tensor] = None,
292
+ position_ids: Optional[torch.LongTensor] = None,
293
+ past_key_value: Optional[Cache] = None,
294
+ use_cache: Optional[bool] = False,
295
+ cache_position: Optional[torch.LongTensor] = None,
296
+ position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
297
+ update_past_key_values: Optional[bool] = False,
298
+ use_block_cache: Optional[bool] = False,
299
+ block_past_key_values: Optional[Cache] = None,
300
+ replace_position: Optional[int] = None,
301
+ **kwargs
302
+ ) -> tuple[torch.Tensor]:
303
+ residual = hidden_states
304
+ hidden_states = self.input_layernorm(hidden_states)
305
+ # Self Attention
306
+ hidden_states = self.self_attn(
307
+ hidden_states=hidden_states,
308
+ attention_mask=attention_mask,
309
+ position_ids=position_ids,
310
+ past_key_value=past_key_value,
311
+ use_cache=use_cache,
312
+ cache_position=cache_position,
313
+ position_embeddings=position_embeddings,
314
+ update_past_key_values=update_past_key_values,
315
+ use_block_cache=use_block_cache,
316
+ block_past_key_values=block_past_key_values,
317
+ replace_position=replace_position,
318
+ **kwargs,
319
+ )
320
+ hidden_states = residual + hidden_states
321
+
322
+ # Fully Connected
323
+ residual = hidden_states
324
+ hidden_states = self.post_attention_layernorm(hidden_states)
325
+ hidden_states = self.mlp(hidden_states)
326
+ hidden_states = residual + hidden_states
327
+ return hidden_states
328
+
329
+
330
+
331
+ class Fast_dLLM_QwenPreTrainedModel(PreTrainedModel):
332
+ config_class = Fast_dLLM_QwenConfig
333
+ base_model_prefix = "model"
334
+ supports_gradient_checkpointing = True
335
+ _no_split_modules = ["Fast_dLLM_QwenDecoderLayer"]
336
+ _skip_keys_device_placement = ["past_key_values"]
337
+ _supports_flash_attn_2 = True
338
+ _supports_sdpa = True
339
+ _supports_flex_attn = True
340
+ _supports_cache_class = True
341
+ _supports_quantized_cache = True
342
+ _supports_static_cache = True
343
+ _supports_attention_backend = True
344
+ _can_record_outputs = {
345
+ "hidden_states": Fast_dLLM_QwenDecoderLayer,
346
+ "attentions": Fast_dLLM_QwenAttention,
347
+ }
348
+
349
+ def _init_weights(self, module):
350
+ std = self.config.initializer_range
351
+ if isinstance(module, nn.Linear):
352
+ module.weight.data.normal_(mean=0.0, std=std)
353
+ if module.bias is not None:
354
+ module.bias.data.zero_()
355
+ elif isinstance(module, nn.Embedding):
356
+ module.weight.data.normal_(mean=0.0, std=std)
357
+ if module.padding_idx is not None:
358
+ module.weight.data[module.padding_idx].zero_()
359
+ elif isinstance(module, Fast_dLLM_QwenRMSNorm):
360
+ module.weight.data.fill_(1.0)
361
+
362
+
363
+ class Fast_dLLM_QwenRotaryEmbedding(nn.Module):
364
+ def __init__(self, config: Fast_dLLM_QwenConfig, device=None):
365
+ super().__init__()
366
+ # BC: "rope_type" was originally "type"
367
+ if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
368
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
369
+ else:
370
+ self.rope_type = "default"
371
+ self.max_seq_len_cached = config.max_position_embeddings
372
+ self.original_max_seq_len = config.max_position_embeddings
373
+
374
+ self.config = config
375
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
376
+
377
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
378
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
379
+ self.original_inv_freq = self.inv_freq
380
+
381
+ @torch.no_grad()
382
+ @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
383
+ def forward(self, x, position_ids):
384
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
385
+ position_ids_expanded = position_ids[:, None, :].float()
386
+
387
+ device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
388
+ with torch.autocast(device_type=device_type, enabled=False): # Force float32
389
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
390
+ emb = torch.cat((freqs, freqs), dim=-1)
391
+ cos = emb.cos() * self.attention_scaling
392
+ sin = emb.sin() * self.attention_scaling
393
+
394
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
395
+
396
+
397
+
398
+ class Fast_dLLM_QwenModel(Fast_dLLM_QwenPreTrainedModel):
399
+ def __init__(self, config: Fast_dLLM_QwenConfig):
400
+ super().__init__(config)
401
+ self.padding_idx = config.pad_token_id
402
+ self.vocab_size = config.vocab_size
403
+ self.bd_size = config.bd_size
404
+
405
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
406
+ self.layers = nn.ModuleList(
407
+ [Fast_dLLM_QwenDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
408
+ )
409
+ self.norm = Fast_dLLM_QwenRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
410
+ self.rotary_emb = Fast_dLLM_QwenRotaryEmbedding(config=config)
411
+ self.gradient_checkpointing = True
412
+
413
+ # Initialize weights and apply final processing
414
+ self.post_init()
415
+
416
+ def get_input_embeddings(self):
417
+ return self.embed_tokens
418
+
419
+ def set_input_embeddings(self, value):
420
+ self.embed_tokens = value
421
+
422
+
423
+ def eval_mask(self, seqlen, block_size, cache_seq_len):
424
+ q_indices = torch.arange(seqlen) + cache_seq_len
425
+ k_indices = torch.arange(seqlen + cache_seq_len)
426
+ mask = eval_block_diff_mask(
427
+ q_idx=q_indices[:, None],
428
+ kv_idx=k_indices[None, :],
429
+ block_size=block_size
430
+ )
431
+ return mask
432
+
433
+ def gen_mask(self, seqlen, block_size, B, H):
434
+ mask = create_block_mask(
435
+ partial(block_diff_mask, block_size=block_size, n=seqlen),
436
+ B=B, H=H, Q_LEN=seqlen*2, KV_LEN=seqlen*2)
437
+
438
+ return mask
439
+
440
+ def forward(
441
+ self,
442
+ input_ids: Optional[torch.LongTensor] = None,
443
+ labels: Optional[torch.LongTensor] = None,
444
+ attention_mask: Optional[torch.Tensor] = None,
445
+ position_ids: Optional[torch.LongTensor] = None,
446
+ past_key_values: Optional[Cache] = None,
447
+ inputs_embeds: Optional[torch.FloatTensor] = None,
448
+ use_cache: Optional[bool] = None,
449
+ cache_position: Optional[torch.LongTensor] = None,
450
+ update_past_key_values: Optional[bool] = False,
451
+ block_size: Optional[int] = 32,
452
+ use_block_cache: Optional[bool] = False,
453
+ block_past_key_values: Optional[Cache] = None,
454
+ replace_position: Optional[int] = None,
455
+ **kwargs
456
+ ) -> BaseModelOutputWithPast:
457
+ if (input_ids is None) ^ (inputs_embeds is not None):
458
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
459
+
460
+ if inputs_embeds is None:
461
+ inputs_embeds = self.embed_tokens(input_ids)
462
+
463
+ if use_cache and past_key_values is None:
464
+ past_key_values = DynamicCache()
465
+
466
+ if use_block_cache and block_past_key_values is None:
467
+ block_past_key_values = DynamicCache()
468
+
469
+ if cache_position is None:
470
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
471
+ if self.training:
472
+ cache_position = torch.arange(
473
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1]//2, device=inputs_embeds.device
474
+ )
475
+ else:
476
+ if use_block_cache:
477
+ block_start_position = past_seen_tokens+replace_position if replace_position is not None else past_seen_tokens
478
+ cache_position = torch.arange(
479
+ block_start_position, block_start_position + inputs_embeds.shape[1], device=inputs_embeds.device
480
+ )
481
+ else:
482
+ cache_position = torch.arange(
483
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1] if not self.training else inputs_embeds.shape[1]//2, device=inputs_embeds.device
484
+ )
485
+
486
+ # --- keep the user/tokenizer padding mask BEFORE you overwrite attention_mask ---
487
+ padding_mask_2d = attention_mask # shape [B, KV_LEN], 1=token, 0=pad
488
+
489
+ # -------------------------
490
+ # Position ids (left padding)
491
+ # -------------------------
492
+ if position_ids is None:
493
+ if (padding_mask_2d is not None) and (not self.training):
494
+ # full, per-sample positions over KV_LEN
495
+ pos_full = padding_mask_2d.long().cumsum(-1) - 1 # pads => -1
496
+ pos_full = pos_full.clamp_min(0) # pads => 0
497
+
498
+ q_len = inputs_embeds.shape[1]
499
+ kv_len = pos_full.shape[1]
500
+ if kv_len < q_len:
501
+ raise ValueError(f"attention_mask KV_LEN={kv_len} < input_len={q_len}. "
502
+ "When using cache, pass the FULL mask (past+current).")
503
+
504
+ q_start = kv_len - q_len # assumes current tokens are the last q_len positions
505
+ position_ids = pos_full[:, q_start:]
506
+ else:
507
+ # no padding mask: same positions for all batch elements
508
+ position_ids = cache_position.unsqueeze(0)
509
+
510
+ # -------------------------
511
+ # Attention mask (block-causal + padding), per sample
512
+ # -------------------------
513
+ if self.training:
514
+ attention_mask = self.gen_mask(labels.shape[1], self.bd_size, labels.shape[0], self.config.num_attention_heads).to(device=inputs_embeds.device)
515
+ else:
516
+ if use_block_cache and block_past_key_values.get_seq_length() != 0:
517
+ attention_mask = None
518
+ else:
519
+ # attention_mask = self.eval_mask(input_ids.shape[1], block_size, past_key_values.get_seq_length() if past_key_values is not None else 0).to(device=inputs_embeds.device)
520
+ if padding_mask_2d is None:
521
+ # fallback: original behavior (no padding)
522
+ structural = self.eval_mask(
523
+ seqlen=input_ids.shape[1],
524
+ block_size=block_size,
525
+ cache_seq_len=past_key_values.get_seq_length() if past_key_values is not None else 0,
526
+ ).to(device=inputs_embeds.device)
527
+ attention_mask = structural[None, None, :, :] # [1,1,Q,KV]
528
+ else:
529
+ pad = padding_mask_2d.to(torch.bool) # [B, KV]
530
+ B, kv_len = pad.shape
531
+ q_len = inputs_embeds.shape[1]
532
+ q_start = kv_len - q_len
533
+
534
+ # Per-sample block ids computed from *non-pad* positions
535
+ pos_full = pad.long().cumsum(-1) - 1
536
+ pos_full = pos_full.clamp_min(0)
537
+ block_full = pos_full // block_size # [B, KV]
538
+
539
+ block_q = block_full[:, q_start:] # [B, Q]
540
+ block_k = block_full # [B, KV]
541
+
542
+ structural = block_q.unsqueeze(-1) >= block_k.unsqueeze(-2) # [B, Q, KV]
543
+
544
+ # Mask keys AND queries (only valid tokens participate)
545
+ key_ok = pad[:, None, None, :] # [B,1,1,KV]
546
+ query_ok = pad[:, None, q_start:, None] # [B,1,Q,1]
547
+
548
+ attention_mask = structural[:, None, :, :] & key_ok & query_ok # [B,1,Q,KV]
549
+
550
+ hidden_states = inputs_embeds
551
+
552
+ # create position embeddings to be shared across the decoder layers
553
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
554
+
555
+ for decoder_layer in self.layers[: self.config.num_hidden_layers]:
556
+ hidden_states = decoder_layer(
557
+ hidden_states,
558
+ attention_mask=attention_mask,
559
+ position_ids=position_ids,
560
+ past_key_value=past_key_values,
561
+ use_cache=use_cache,
562
+ cache_position=cache_position,
563
+ position_embeddings=position_embeddings,
564
+ update_past_key_values=update_past_key_values,
565
+ use_block_cache=use_block_cache,
566
+ block_past_key_values=block_past_key_values,
567
+ replace_position=replace_position,
568
+ **kwargs,
569
+ )
570
+
571
+ hidden_states = self.norm(hidden_states)
572
+ return BaseModelOutputWithPastAndBlockCache(
573
+ last_hidden_state=hidden_states,
574
+ past_key_values=past_key_values if use_cache else None,
575
+ block_past_key_values=block_past_key_values if use_block_cache else None,
576
+ )
577
+
578
+
579
+ class Fast_dLLM_QwenForCausalLM(Fast_dLLM_QwenPreTrainedModel, GenerationMixin):
580
+ _tied_weights_keys = ["lm_head.weight"]
581
+ _tp_plan = {"lm_head": "colwise_rep"}
582
+ _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
583
+
584
+ def __init__(self, config):
585
+ super().__init__(config)
586
+ self.model = Fast_dLLM_QwenModel(config)
587
+ self.vocab_size = config.vocab_size
588
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
589
+
590
+ # Initialize weights and apply final processing
591
+ self.post_init()
592
+
593
+ self.generate_statistics = {}
594
+
595
+ def get_input_embeddings(self):
596
+ return self.model.embed_tokens
597
+
598
+ def set_input_embeddings(self, value):
599
+ self.model.embed_tokens = value
600
+
601
+ def get_output_embeddings(self):
602
+ return self.lm_head
603
+
604
+ def set_output_embeddings(self, new_embeddings):
605
+ self.lm_head = new_embeddings
606
+
607
+ def set_decoder(self, decoder):
608
+ self.model = decoder
609
+
610
+ def get_decoder(self):
611
+ return self.model
612
+
613
+ @can_return_tuple
614
+ def forward(
615
+ self,
616
+ input_ids: Optional[torch.LongTensor] = None,
617
+ attention_mask: Optional[torch.Tensor] = None,
618
+ position_ids: Optional[torch.LongTensor] = None,
619
+ past_key_values: Optional[Cache] = None,
620
+ inputs_embeds: Optional[torch.FloatTensor] = None,
621
+ labels: Optional[torch.LongTensor] = None,
622
+ use_cache: Optional[bool] = None,
623
+ cache_position: Optional[torch.LongTensor] = None,
624
+ logits_to_keep: Union[int, torch.Tensor] = 0,
625
+ update_past_key_values: Optional[bool] = False,
626
+ block_size: Optional[int] = 32,
627
+ use_block_cache: Optional[bool] = False,
628
+ block_past_key_values: Optional[Cache] = None,
629
+ replace_position: Optional[int] = None,
630
+ mask_id: Optional[int] = 151665,
631
+ **kwargs
632
+ ) -> CausalLMOutputWithPastAndBlockCache:
633
+
634
+ if self.training:
635
+ original_labels = labels.clone()
636
+ original_input_ids = input_ids.clone()
637
+
638
+ noisy_input_ids = input_ids.clone()
639
+
640
+ input_ids = input_ids.reshape(input_ids.shape[0] * input_ids.shape[1] // self.model.bd_size, self.model.bd_size)
641
+ b, l = input_ids.shape
642
+ t = torch.rand((b,), device=input_ids.device)
643
+ eps=1e-3
644
+ p_mask = (1 - eps) * t + eps
645
+ p_mask = p_mask[:, None].repeat(1, l)
646
+
647
+ mask_indices = torch.rand((b, l), device=input_ids.device) < p_mask
648
+ x_t = torch.where(mask_indices, mask_id, input_ids).reshape(labels.shape)
649
+ noisy_input_ids[labels != -100] = x_t[labels != -100]
650
+ mask = (noisy_input_ids != mask_id)
651
+ labels[mask] = -100
652
+ input_ids = torch.cat([noisy_input_ids, input_ids.reshape(labels.shape)], dim=1)
653
+
654
+ complementary_noisy_input_ids = original_input_ids.clone()
655
+ complementary_labels = original_labels.clone()
656
+
657
+ complementary_input_ids = original_input_ids.reshape(original_input_ids.shape[0] * original_input_ids.shape[1] // self.model.bd_size, self.model.bd_size)
658
+
659
+ complementary_mask_indices = ~mask_indices
660
+ complementary_x_t = torch.where(complementary_mask_indices, mask_id, complementary_input_ids).reshape(labels.shape)
661
+ complementary_noisy_input_ids[complementary_labels != -100] = complementary_x_t[complementary_labels != -100]
662
+ complementary_mask = (complementary_noisy_input_ids != mask_id)
663
+ complementary_labels[complementary_mask] = -100
664
+ complementary_input_ids = torch.cat([complementary_noisy_input_ids, complementary_input_ids.reshape(complementary_labels.shape)], dim=1)
665
+
666
+ input_ids = torch.cat([input_ids, complementary_input_ids], dim=0)
667
+ labels = torch.cat([labels, complementary_labels], dim=0)
668
+
669
+ outputs: BaseModelOutputWithPastAndBlockCache = self.model(
670
+ input_ids=input_ids,
671
+ labels=labels,
672
+ attention_mask=attention_mask,
673
+ position_ids=position_ids,
674
+ past_key_values=past_key_values,
675
+ inputs_embeds=inputs_embeds,
676
+ use_cache=use_cache,
677
+ cache_position=cache_position,
678
+ update_past_key_values=update_past_key_values,
679
+ block_size=block_size,
680
+ use_block_cache=use_block_cache,
681
+ block_past_key_values=block_past_key_values,
682
+ replace_position=replace_position,
683
+ **kwargs,
684
+ )
685
+
686
+ hidden_states = outputs.last_hidden_state
687
+ if self.training:
688
+ hidden_states = hidden_states[:, :hidden_states.shape[1]//2, :]
689
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
690
+ slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
691
+ logits = self.lm_head(hidden_states[:, slice_indices, :])
692
+
693
+ loss = None
694
+ if labels is not None:
695
+ loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
696
+
697
+ return CausalLMOutputWithPastAndBlockCache(
698
+ loss=loss,
699
+ logits=logits,
700
+ past_key_values=outputs.past_key_values,
701
+ hidden_states=outputs.hidden_states,
702
+ attentions=outputs.attentions,
703
+ block_past_key_values=outputs.block_past_key_values,
704
+ )
705
+
706
+ @torch.no_grad()
707
+ def generate(
708
+ self,
709
+ input_ids,
710
+ attention_mask=None, # --- ADDED ARGUMENT ---
711
+ max_new_tokens=20, # Added default value for safety
712
+ mask_id=151665,
713
+ threshold=1,
714
+ small_block_size=8,
715
+ block_size=32,
716
+ stop_token=151645,
717
+ stopping_criteria=None,
718
+ top_p=0.95,
719
+ temperature=0,
720
+ use_block_cache=False,
721
+ log_lengths=False,
722
+ log_steps=False,
723
+ **kwargs
724
+ ):
725
+ if use_block_cache:
726
+ raise ValueError("use_block_cache=True is not supported in this generate() implementation.")
727
+ assert attention_mask is not None, "attention_mask must be provided for this generate() implementation."
728
+
729
+ # pad the initial input_ids and attention_mask to be multiple of block_size
730
+ if input_ids.shape[1] % block_size != 0:
731
+ pad_len = block_size - (input_ids.shape[1] % block_size)
732
+ input_ids = torch.cat([torch.full((input_ids.shape[0], pad_len), self.config.pad_token_id, device=input_ids.device), input_ids], dim=1)
733
+ attention_mask = torch.cat([torch.zeros((attention_mask.shape[0], pad_len), device=attention_mask.device), attention_mask], dim=1)
734
+
735
+ num_blocks = max_new_tokens // block_size
736
+ device = input_ids.device
737
+ batch_size = input_ids.size(0)
738
+ original_input_length = input_ids.shape[1]
739
+
740
+ # Track which sequences in the batch are still active
741
+ unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=device)
742
+ # Keep track of how many NFE each sequence uses and how many tokens are generated
743
+ iterations = torch.zeros((batch_size,), device=device)
744
+ n_generated_tokens = torch.zeros((batch_size,), device=device)
745
+ # Keep track if each sequence is finished
746
+ finished = torch.zeros((batch_size,), dtype=torch.bool, device=device)
747
+
748
+ # Handle prefix processing (Context Encoding)
749
+ if input_ids.shape[1] >= block_size:
750
+ output = self.forward(input_ids=input_ids[:, :(input_ids.shape[1] // block_size * block_size)], attention_mask=attention_mask[:, :(input_ids.shape[1] // block_size * block_size)], use_cache=True, update_past_key_values=True, block_size=block_size)
751
+ logits, past_key_values = output.logits, output.past_key_values
752
+ if input_ids.shape[1] % block_size == 0:
753
+ next_token = logits[:, -1:, :].argmax(dim=-1)
754
+ input_ids = torch.cat([input_ids, next_token], dim=1)
755
+
756
+ n_generated_tokens += (~finished).long()
757
+ iterations += (~finished).long()
758
+
759
+ # Update finished status
760
+ unfinished_sequences = unfinished_sequences & (next_token.squeeze(-1) != stop_token).long()
761
+ finished |= (next_token.squeeze(-1) == stop_token)
762
+
763
+ # Append to mask: If unfinished, append 1. If finished, append 0.
764
+ new_mask_col = unfinished_sequences.unsqueeze(1).to(dtype=attention_mask.dtype)
765
+ attention_mask = torch.cat([attention_mask, new_mask_col], dim=1)
766
+ else:
767
+ past_key_values = None
768
+
769
+ num_small_blocks = block_size // small_block_size
770
+
771
+ for block_idx in range(num_blocks):
772
+ new_tokens = input_ids[:, original_input_length:]
773
+ has_stop_now = (new_tokens == stop_token).any(dim=1) # check whether any generated token is a stop token
774
+ finished |= has_stop_now # whenever that is true we halt the sequence generation forever
775
+
776
+ if finished.all(): # whenever that is true we halt the sequence generation forever
777
+ break
778
+
779
+ # Length of current prompt
780
+ prompt_length = input_ids.shape[1]
781
+
782
+ # Initialize x_init with mask_id with all mask tokens for the new block
783
+ x_init = mask_id * torch.ones((input_ids.shape[0], block_size-prompt_length%block_size), device=self.device, dtype=torch.long)
784
+
785
+ # Concatenate input_ids with x_init to form the new input_ids (we added a block-1 of masks to our current generation)
786
+ x_init = torch.cat([input_ids, x_init], dim=1)
787
+
788
+ # mask extension is extending the current mask by the number of new tokens we are generating in this block by adding ones.
789
+ mask_extension = unfinished_sequences.unsqueeze(1).repeat(1, block_size - prompt_length % block_size).to(dtype=attention_mask.dtype)
790
+ # mask is the current attention mask extended by the new tokens we are generating in this block by adding ones.
791
+ curr_attention_mask = torch.cat([attention_mask, mask_extension], dim=1)
792
+
793
+ x_t = x_init.clone()
794
+ block_past_key_values = None
795
+ while True:
796
+ # mask_idx indicates where the mask tokens are in the current block
797
+ mask_idx = (x_t[:, -block_size:] == mask_id)
798
+ # TODO: assert that the first element is never a mask
799
+
800
+ if mask_idx.sum() == 0:
801
+ # If no mask tokens left in the current block, then we generate the next token autoregressively
802
+ output = self.forward(input_ids=x_t[:, -block_size:], attention_mask=curr_attention_mask, use_cache=True, past_key_values=past_key_values, update_past_key_values=True, block_size=block_size)
803
+ logits, past_key_values = output.logits, output.past_key_values
804
+ next_token = logits[:, -1:, :].argmax(dim=-1)
805
+ x_t = torch.cat([x_t, next_token], dim=1)
806
+
807
+ # generating one extra token means the mask needs to be extended by one more position 1 if not finished and 0 else
808
+ curr_attention_mask = torch.cat([curr_attention_mask, unfinished_sequences.unsqueeze(1).to(curr_attention_mask.dtype)], dim=1)
809
+
810
+ # add 1 to iterations for each unfinished sequence
811
+ iterations += (~finished).long()
812
+ n_generated_tokens += (~finished).long()
813
+
814
+ # TODO: we don't update the finished status here because we only care about tokens generated in the masked positions
815
+ break
816
+ for small_block_idx in range(num_small_blocks):
817
+ small_block_start_idx = small_block_idx * small_block_size
818
+ small_block_end_idx = small_block_start_idx + small_block_size
819
+
820
+ start = -block_size + small_block_start_idx
821
+ end = None if block_size == small_block_end_idx else -block_size + small_block_end_idx
822
+ while True:
823
+ mask_idx = (x_t[:, -block_size:] == mask_id)
824
+ if mask_idx[:, start:end].sum() == 0:
825
+ break # loop until all tokens are generated in this sub-block
826
+ # is this batch invariant? If one sequence finishes we still loop until all sequences are finished
827
+ if use_block_cache:
828
+ assert False, "use_block_cache=True is not supported in this generate() implementation."
829
+ if block_past_key_values is None or (x_t[:, -block_size+small_block_start_idx] == mask_id).any():
830
+ output = self.forward(input_ids=x_t[:, -block_size:], use_cache=True, past_key_values=past_key_values, update_past_key_values=False, use_block_cache=True)
831
+ logits, block_past_key_values = output.logits, output.block_past_key_values
832
+ logits = torch.cat([logits[:, :1, :], logits[:, :-1, :]], dim=1)
833
+ logits = logits[:, start:end]
834
+ else:
835
+ logits = self.forward(input_ids=x_t[:,start:end], use_cache=True, past_key_values=past_key_values, update_past_key_values=False, use_block_cache=True, block_past_key_values=block_past_key_values, replace_position=small_block_start_idx).logits
836
+ logits = torch.cat([logits[:, :1, :], logits[:, :-1, :]], dim=1)
837
+ else:
838
+ # input ids are the most recent block_size tokens; the attention mask needs to cover the past plus the current tokens
839
+ logits = self.forward(input_ids=x_t[:, -block_size:], attention_mask=curr_attention_mask, use_cache=True, past_key_values=past_key_values, update_past_key_values=False,block_size=block_size,).logits
840
+ # the logits to be sampled from are the most recent 32 tokens
841
+ # shift the logits by one for the autoregressive-style conversion; prepending anything at the start is valid because the first token of the block is never a mask
842
+ logits = torch.cat([logits[:, :1, :], logits[:, :-1, :]], dim=1) # TODO maybe prepend nan or sth
843
+ logits = logits[:, start:end]
844
+
845
+ x_1, p_1t = self.sample_with_top_p(logits, top_p=top_p, temperature=temperature)
846
+ # Select tokens with probability greater than threshold from p_1t
847
+ x1_p = torch.squeeze(torch.gather(p_1t, dim=-1, index=torch.unsqueeze(x_1, -1)), -1)
848
+ x1_p = torch.where(mask_idx[:, start:end], x1_p, -torch.inf)
849
+
850
+ unmask_idx = (x1_p > threshold)
851
+ # Ensure at least one token is unmasked in the current small block
852
+ max_prob_idx = x1_p.argmax(dim=-1)
853
+ unmask_idx[torch.arange(x_1.shape[0]), max_prob_idx] = True
854
+ unmask_idx = unmask_idx & mask_idx[:, start:end]
855
+
856
+ # Add 1 to iterations if the sequence is not stopped AND at least one token is generated in this iteration
857
+ # aka if not finished and unmask id has some True value
858
+ iterations += (~finished & unmask_idx.any(dim=1)).long()
859
+
860
+ # Count number of generated tokens in this iteration if not stopped
861
+ n_generated_iter = torch.where(finished, 0, unmask_idx.sum(dim=1)) # if not finished then count generated tokens
862
+ n_generated_tokens += n_generated_iter
863
+
864
+ # Only update the positions where unmask_idx is True AND the sequence is not finished (TODO check this)
865
+ x_t[:, start:end][unmask_idx] = x_1[unmask_idx]
866
+
867
+ # new_tokens = input_ids[:, original_input_length:]
868
+
869
+ # check if any newly generated token is stop token
870
+ # has_stop_now = (new_tokens == stop_token).any(dim=1)
871
+ # finished |= has_stop_now # TODO confirm if that is true here.
872
+
873
+ input_ids = x_t
874
+ attention_mask = curr_attention_mask
875
+
876
+ if log_lengths:
877
+ if self.generate_statistics.get("generation_lengths", None) is None:
878
+ self.generate_statistics["generation_lengths"] = []
879
+ self.generate_statistics["generation_lengths"].extend(n_generated_tokens.cpu().tolist())
880
+
881
+ if log_steps:
882
+ if self.generate_statistics.get("generation_steps", None) is None:
883
+ self.generate_statistics["generation_steps"] = []
884
+ self.generate_statistics["generation_steps"].extend(iterations.cpu().tolist())
885
+
886
+ # Final truncation: keep everything up to the *latest* first stop_token
887
+ new_tokens = input_ids[:, original_input_length:]
888
+ has_stop = (new_tokens == stop_token)
889
+
890
+ gen = input_ids[:, original_input_length:] # (B, T)
891
+
892
+ T = gen.size(1)
893
+
894
+ if T > 0:
895
+ device = input_ids.device
896
+ B = input_ids.size(0)
897
+
898
+ idx = torch.arange(T, device=device).unsqueeze(0).expand(B, T)
899
+ stop_mask = gen.eq(stop_token)
900
+
901
+ first_stop = torch.where(stop_mask, idx, torch.full_like(idx, T)).min(dim=1).values
902
+ has_stop = first_stop < T
903
+ keep = torch.where(has_stop, first_stop + 1, torch.full_like(first_stop, T))
904
+
905
+ pad_id = self.config.pad_token_id if getattr(self.config, "pad_token_id", None) is not None else stop_token
906
+ after = idx >= keep.unsqueeze(1)
907
+ gen = gen.clone()
908
+ gen[after] = pad_id
909
+
910
+ input_ids = torch.cat([input_ids[:, :original_input_length], gen], dim=1)
911
+
912
+ return input_ids
913
+
914
+ def sample_with_top_p(self, logits, top_p=0.95, temperature=1.0):
915
+ # Calculate probabilities
916
+ if temperature > 0:
917
+ scaled_logits = logits / temperature
918
+ else:
919
+ p_1t = torch.softmax(logits, dim=-1)
920
+ x_1 = p_1t.argmax(dim=-1)
921
+ return x_1, p_1t
922
+ probs = torch.softmax(scaled_logits, dim=-1) # [B, seq_len, vocab_size]
923
+
924
+ sorted_probs, sorted_indices = torch.sort(probs, dim=-1, descending=True) # [B, seq_len, sorted(vocab_size)]
925
+ cumulative_probs = torch.cumsum(sorted_probs, dim=-1) # [B, seq_len, cumsum(sorted(vocab_size))]
926
+
927
+ sorted_indices_to_remove = cumulative_probs > top_p # [B, seq_len, bool(sorted(vocab_size))]
928
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() # clone the tensor to avoid in-place operation error
929
+ sorted_indices_to_remove[..., 0] = 0 # always keep at least one token
930
+
931
+ indices_to_remove = torch.zeros_like(probs, dtype=torch.bool).scatter_(
932
+ dim=-1, index=sorted_indices, src=sorted_indices_to_remove
933
+ ) # [B, seq_len, vocab_size]: take a zeros array and
934
+ # set True at the indices where sorted_indices_to_remove is True
935
+ # we index using the sorted indices in order to put the values back to their original position
936
+
937
+ # prev: probs[indices_to_remove] = 0, indices_to_remove is of the same shape as probs
938
+ # and therefore this operation simply zeroes out the removed entries
939
+ probs = probs.masked_fill(indices_to_remove, 0.0)
940
+
941
+ probs_sum = probs.sum(dim=-1, keepdim=True).clamp_min(1e-12)
942
+ p_1t = probs / probs_sum
943
+
944
+ vocab = p_1t.shape[-1]
945
+ flat = p_1t.reshape(-1, vocab)
946
+ samples = torch.multinomial(flat, num_samples=1).squeeze(-1)
947
+ x_1 = samples.view(*p_1t.shape[:-1])
948
+
949
+ return x_1, p_1t
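
As a sanity check of the nucleus filtering implemented above, a hand-built distribution with top_p = 0.5 keeps only the two most likely tokens and renormalizes them (toy values only):

import torch

probs = torch.tensor([[[0.4, 0.3, 0.2, 0.1]]])                       # [B=1, seq_len=1, vocab=4]
sorted_probs, sorted_indices = torch.sort(probs, dim=-1, descending=True)
cumulative = torch.cumsum(sorted_probs, dim=-1)
remove = cumulative > 0.5
remove[..., 1:] = remove[..., :-1].clone()                            # shift so the boundary token is kept
remove[..., 0] = 0                                                    # always keep at least one token
indices_to_remove = torch.zeros_like(probs, dtype=torch.bool).scatter_(-1, sorted_indices, remove)
kept = probs.masked_fill(indices_to_remove, 0.0)
print(kept / kept.sum(-1, keepdim=True))   # tensor([[[0.5714, 0.4286, 0.0000, 0.0000]]])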