YongganFu committed (verified)
Commit 5fbddec · 1 Parent(s): 2aae7ad

Upload FastSLMForCausalLM

config.json CHANGED
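The two hunks below record a re-serialization of the checkpoint with a newer transformers release: the precision key moves from `torch_dtype` to `dtype`, and `transformers_version` is bumped from 4.48.2 to 4.56.2. A minimal loading sketch, assuming the files live at a hypothetical local path `./FastSLM` (the real repo id is not part of this diff) and an installed transformers release recent enough to read the renamed key:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# trust_remote_code is required: FastSLMForCausalLM ships as custom code
# (modeling_fast_slm.py, delta_net.py) next to the weights.
tokenizer = AutoTokenizer.from_pretrained("./FastSLM", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "./FastSLM",                 # hypothetical local path; substitute the real repo id
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,  # matches the "dtype": "bfloat16" entry below
)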
@@ -13,6 +13,7 @@
   "bos_token_id": 1,
   "calc_logits_for_entire_prompt": false,
   "d_conv": 4,
+  "dtype": "bfloat16",
   "eos_token_id": 2,
   "ffn_expand_ratio": 3,
   "global_attn_idx": [],
@@ -127,8 +128,7 @@
   "router_aux_loss_coef": 0.001,
   "sliding_window": null,
   "tie_word_embeddings": true,
-  "torch_dtype": "bfloat16",
-  "transformers_version": "4.48.2",
+  "transformers_version": "4.56.2",
   "use_cache": false,
   "use_mamba_kernels": true,
   "v_head_dim": -1,
delta_net.py CHANGED
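The second hunk below targets the newer `transformers.cache_utils.Cache` constructor by passing `layers=[0]` where the old code called `super().__init__()` with no arguments. A small compatibility sketch, under the assumption that the `layers` keyword simply does not exist in older transformers releases; as written it would sit inside the same `__init__` and replace the single added line:

# Compatibility sketch (assumption: older transformers Cache.__init__ accepts
# no arguments, while the pinned newer release takes a `layers` list as below).
try:
    super().__init__(layers=[0])   # newer constructor signature
except TypeError:
    super().__init__()             # fall back to the legacy no-argument form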
@@ -10,7 +10,6 @@ import torch.nn as nn
 from einops import rearrange
 from torch.nn import functional as F
 
-import fla
 from fla.modules import FusedRMSNormSwishGate, RMSNorm, ShortConvolution
 from fla.ops.delta_rule import chunk_delta_rule, fused_recurrent_delta_rule
 
@@ -331,7 +330,7 @@ class Cache(transformers.cache_utils.Cache):
         self,
         seen_tokens: int = 0
     ) -> Cache:
-        super().__init__()
+        super().__init__(layers=[0])
 
         self.states: List[Dict[str, Any]] = []
 
model.safetensors.index.json CHANGED
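The hunk below adds a `total_parameters` field to the shard-index metadata; it is consistent with the existing `total_size`, since bfloat16 weights take 2 bytes per parameter: 2,750,017,056 × 2 = 5,500,034,112 bytes. A one-line sanity check (sketch):

assert 2_750_017_056 * 2 == 5_500_034_112  # bf16 weights: 2 bytes per parameter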
@@ -1,5 +1,6 @@
 {
   "metadata": {
+    "total_parameters": 2750017056,
     "total_size": 5500034112
   },
   "weight_map": {
modeling_fast_slm.py CHANGED
@@ -46,6 +46,12 @@ from transformers.modeling_outputs import (
     SequenceClassifierOutputWithPast,
 )
 from transformers.modeling_utils import PreTrainedModel
+
+try:
+    from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
+except ImportError:
+    pass
+
 from transformers.pytorch_utils import is_torch_greater_or_equal_than_1_13
 from transformers.utils import (
     add_start_docstrings,
@@ -280,7 +286,6 @@ class HybridMambaAttentionDynamicCache(DynamicCache):
     def __init__(self, config, batch_size, dtype=torch.float16, device=None, layer_type=None):
         self.dtype = dtype
         # self.layers_block_type = config.layers_block_type
-        self.has_previous_state = False
         intermediate_size = config.mamba_expand * config.hidden_size
         ssm_state_size = config.mamba_d_state
         conv_kernel_size = config.mamba_d_conv
@@ -804,6 +809,75 @@ class FastSLMFlashAttention2(FastSLMAttention):
         )
 
 
+
+class FastSLMSDPAAttention(nn.Module):
+
+    def __init__(self, config, layer_idx: int, reuse_kv=False):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=False
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=False
+        )
+
+        self.sliding_window = self.config.sliding_window if self.layer_idx not in self.config.global_attn_idx else None
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        # position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_value: Optional[Cache] = None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        # cos, sin = position_embeddings
+        # query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_value is not None:
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx)  # , cache_kwargs)
+
+        attention_interface = ALL_ATTENTION_FUNCTIONS['flash_attention_2']
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            sliding_window=self.sliding_window,  # diff with Llama
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+
+        return attn_output, attn_weights, past_key_value, (key_states, value_states)
+
+
 class FastSLMFused_MHA(FastSLMAttention):
     """
     FastSLM flash attention module. This module inherits from `FastSLMAttention` as the weights of the module stays
@@ -938,9 +1012,6 @@ class FastSLMFused_MHA(FastSLMAttention):
         v_dim = query_states.shape[-2] * value_states.shape[-1]
         attn_output = attn_output.reshape(bsz, q_len, v_dim).contiguous()
 
-        if past_key_value is not None:
-            past_key_value.has_previous_state = True
-
         attn_output = self.o_proj(attn_output)
 
         if not output_attentions:
@@ -952,6 +1023,7 @@ class FastSLMFused_MHA(FastSLMAttention):
 JAMBA_ATTENTION_CLASSES = {
     "flash_attention_2": FastSLMFlashAttention2,
     "fused_mha": FastSLMFused_MHA,
+    "sdpa": FastSLMSDPAAttention,
 }
 
 class FastSLMMLP(nn.Module):
@@ -1633,6 +1705,8 @@ class FastSLMModel(FastSLMPreTrainedModel):
         # Initialize weights and apply final processing
         self.post_init()
 
+        self.has_previous_state = False
+
 
     def get_input_embeddings(self):
         return self.embed_tokens
@@ -1684,21 +1758,13 @@ class FastSLMModel(FastSLMPreTrainedModel):
             )
             use_cache = False
 
-        past_key_values_length = 0
-        if use_cache:
-            if past_key_values is not None:
-                past_key_values_length = past_key_values.get_usable_length(seq_length, 0)
-            else:
-                use_cache = False
-
         if position_ids is None:
             device = input_ids.device if input_ids is not None else inputs_embeds.device
-            position_ids = torch.arange(
-                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+            position_ids = torch.arange(0, seq_length, dtype=torch.long, device=device
             )
             position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
         else:
-            if self.config.num_memory_tokens > 0 and past_key_values is not None and not past_key_values.has_previous_state:
+            if self.config.num_memory_tokens > 0 and past_key_values is not None and not self.has_previous_state:
                 position_ids = position_ids.view(-1, seq_length + self.config.num_memory_tokens).long()
             else:
                 position_ids = position_ids.view(-1, seq_length).long()
@@ -1708,7 +1774,7 @@ class FastSLMModel(FastSLMPreTrainedModel):
 
         ori_b, ori_n = inputs_embeds.shape[0], inputs_embeds.shape[1]
 
-        if self.config.num_memory_tokens > 0 and (past_key_values is None or not past_key_values.has_previous_state):
+        if self.config.num_memory_tokens > 0 and (past_key_values is None or not self.has_previous_state):
            mem = repeat(self.memory_tokens, 'n d -> b n d', b = inputs_embeds.shape[0]) # prepend the memory to every segment of m by repeating the memory tokens
            inputs_embeds, mem_packed_shape = pack((mem, inputs_embeds), 'b * d')
 
@@ -1718,6 +1784,7 @@ class FastSLMModel(FastSLMPreTrainedModel):
         if attention_mask is not None and attention_mask.shape[1] < inputs_embeds.shape[1]:
             assert attention_mask.shape[1] + self.config.num_memory_tokens == inputs_embeds.shape[1]
             attention_mask = torch.cat([torch.ones(inputs_embeds.shape[0], self.config.num_memory_tokens, device=attention_mask.device), attention_mask], dim=1)
+
 
         if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache:
             is_padding_right = attention_mask[:, -1].sum().item() != batch_size
@@ -1784,21 +1851,12 @@ class FastSLMModel(FastSLMPreTrainedModel):
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
 
-        if self.config.num_memory_tokens > 0 and (past_key_values is None or not past_key_values.has_previous_state):
+        if self.config.num_memory_tokens > 0 and (past_key_values is None or not self.has_previous_state):
            mem, hidden_states = unpack(hidden_states, mem_packed_shape, 'b * d')
            hidden_states = hidden_states[:, :ori_n, :]
 
-        if past_key_values and not past_key_values.has_previous_state:
-            for layer_idx_ in range(len(self.layers)):
-                if past_key_values.get_seq_length(layer_idx_) > 0:
-                    past_key_values.has_previous_state = True
-                    break
-
-        if mamba_inference_params is not None and mamba_inference_params.seqlen_offset > 0:
-            past_key_values.has_previous_state = True
-
-        if fla_past_key_values is not None and len(fla_past_key_values.states) > 0:
-            past_key_values.has_previous_state = True
+        if past_key_values is not None and not self.has_previous_state:
+            self.has_previous_state = True
 
         next_cache = None
         if use_cache:
@@ -2011,7 +2069,7 @@ class FastSLMForCausalLM(FastSLMPreTrainedModel):
         static_logits = torch.zeros((batch_size, self.config.vocab_size), device=device)
 
         # Set up for graph capture
-        past_key_values.has_previous_state = True
+        self.model.has_previous_state = True
         if mamba_inference_params is not None:
             mamba_inference_params.seqlen_offset = 1
 
@@ -2055,7 +2113,7 @@ class FastSLMForCausalLM(FastSLMPreTrainedModel):
             if hasattr(module, 'reset_kv_cache'):
                 module.reset_kv_cache()
 
-        past_key_values.has_previous_state = False
+        self.model.has_previous_state = False
 
         # Return generation state
         generation_state = {
@@ -2134,7 +2192,7 @@ class FastSLMForCausalLM(FastSLMPreTrainedModel):
            if hasattr(module, 'reset_kv_cache'):
                module.reset_kv_cache()
 
-        past_key_values.has_previous_state = False
+        self.model.has_previous_state = False
 
         # Prefill phase - process input sequence
         position_ids = torch.arange(
@@ -2215,6 +2273,48 @@ class FastSLMForCausalLM(FastSLMPreTrainedModel):
         return generated_ids
 
 
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        output_router_logits=False,
+        **kwargs,
+    ):
+        if self.config.num_memory_tokens > 0:
+            attention_mask = torch.cat([torch.ones(input_ids.shape[0], self.config.num_memory_tokens, device=attention_mask.device), attention_mask], dim=1)
+
+        past_key_values = None  # Disable cache for now
+
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            position_ids = position_ids[:, -input_ids.shape[1]:]
+
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None:
+            if input_ids.shape[1] == 0:
+                model_inputs = {"inputs_embeds": inputs_embeds}
+            else:
+                inputs_embeds_new = self.model.embed_tokens(input_ids)
+                model_inputs = {"inputs_embeds": torch.cat([inputs_embeds, inputs_embeds_new], dim=1)}
+        else:
+            model_inputs = {"input_ids": input_ids}
+
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "attention_mask": attention_mask,
+            }
+        )
+        return model_inputs
+
+
 def sample_token(logits, temperature=1.0, top_k=0, top_p=0.9):
     """
     Sample a token from logits with temperature, top-k, and top-p filtering.
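Taken together, the modeling changes above add a new attention class registered as "sdpa" in JAMBA_ATTENTION_CLASSES (its forward still dispatches through ALL_ATTENTION_FUNCTIONS['flash_attention_2']), move the has_previous_state flag from the cache object onto FastSLMModel, and add a prepare_inputs_for_generation override, the hook that GenerationMixin.generate() calls before each forward pass to widen the attention mask by config.num_memory_tokens, rebuild position_ids, and (for now) drop past_key_values. A short end-to-end generation sketch under the same assumptions as the loading example in the config.json section (hypothetical ./FastSLM path and prompt):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("./FastSLM", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "./FastSLM",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="sdpa",  # assumption: resolves to the new "sdpa" entry in JAMBA_ATTENTION_CLASSES
).eval()

inputs = tok("Hello, FastSLM!", return_tensors="pt")
with torch.no_grad():
    # use_cache=False mirrors the config ("use_cache": false) and the cache
    # being forced to None in prepare_inputs_for_generation above.
    out = model.generate(**inputs, max_new_tokens=32, use_cache=False)
print(tok.decode(out[0], skip_special_tokens=True))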