YongganFu commited on
Commit
3357a8e
·
verified ·
1 Parent(s): bf1039a

Upload FastSLMForCausalLM

Browse files
config.json CHANGED
@@ -1,49 +1,28 @@
1
  {
2
  "architectures": [
3
- "JambaForCausalLM"
4
  ],
5
  "attention_dropout": 0.0,
6
  "attn_hidden_size": -1,
7
- "attn_implementation": "flash_attention_2",
8
- "attn_implementation_new": "flash_attention_2",
9
- "attn_layer_offset": 4,
10
- "attn_layer_period": 8,
11
- "attn_reuse_every_i_layer": -1,
12
  "auto_map": {
13
- "AutoConfig": "configuration_jamba.JambaConfig",
14
- "AutoModelForCausalLM": "modeling_jamba.JambaForCausalLM"
15
  },
16
  "bos_token_id": 1,
17
  "calc_logits_for_entire_prompt": false,
18
- "compact_gating": false,
19
- "compute_attn_mat": false,
20
  "d_conv": 4,
21
- "dense_public_ffn_structure": false,
22
- "double_v_dim": false,
23
- "enable_mod": false,
24
  "eos_token_id": 2,
25
- "expert_layer_offset": 1,
26
- "expert_layer_period": 2,
27
  "ffn_expand_ratio": 3,
28
- "ffn_reuse_every_i_layer": -1,
29
- "ffn_sharing_config": null,
30
- "fully_parallel_jamba": false,
31
- "fused_multihead_config": null,
32
  "global_attn_idx": [],
33
- "gradient_checkpoint_layer": null,
34
- "hash_grid_config": null,
35
- "hash_grid_config_mlp": null,
36
  "hidden_act": "silu",
37
  "hidden_size": 3072,
38
- "hybrid_block_indices": [],
39
  "hybrid_decoder_layer": "mamba",
40
  "initializer_range": 0.02,
41
  "intermediate_size": 0,
42
  "kq_head_dim": -1,
43
  "kq_norm": "none",
44
- "kv_reuse_every_i_layer": -1,
45
- "kv_reuse_group": null,
46
- "kv_weight_reuse": false,
47
  "layer_type": [
48
  "m",
49
  "a",
@@ -120,89 +99,38 @@
120
  "m2",
121
  "f"
122
  ],
123
- "layerwise_memory_token": false,
124
- "local_expand_ratio": 1,
125
- "local_global_dual_branch": false,
126
- "local_global_dual_branch_merge_op": "mean",
127
- "lookback_mode": "",
128
- "macro_arch": "",
129
  "mamba2_headdim": 64,
130
- "mamba_attnaug_config": null,
131
  "mamba_conv_bias": true,
132
  "mamba_d_conv": 4,
133
  "mamba_d_state": 128,
134
  "mamba_dt_rank": 192,
135
  "mamba_expand": 2,
136
  "mamba_inner_layernorms": true,
137
- "mamba_latent_size": null,
138
- "mamba_multihead_config": null,
139
  "mamba_proj_bias": false,
140
- "mamba_reuse_every_i_layer": -1,
141
- "max_position_embeddings": 22528,
142
- "memory_tokens_interspersed_every": 0,
143
  "mlp_hidden_act": "silu",
144
- "mod_topk": 2,
145
  "model_type": "jamba",
146
- "moe_config": null,
147
- "nGPT_config": {
148
- "extra_grad": false,
149
- "gate_scaling": false,
150
- "init_norm": false,
151
- "learned_scaling": false,
152
- "norm_bc": false,
153
- "norm_gating": false,
154
- "norm_ssm_input": false,
155
- "post_norm": false,
156
- "qk_norm": false,
157
- "weight_norm": true
158
- },
159
- "nGPT_mode": null,
160
  "new_seq_length": 2048,
161
- "no_dt_bias": false,
162
  "num_attention_heads": 24,
163
- "num_attn_per_ffn": 3,
164
  "num_experts": 1,
165
  "num_experts_per_tok": 1,
166
- "num_ffn": 1,
167
  "num_hidden_layers": 36,
168
  "num_key_value_heads": 6,
169
- "num_mamba": 1,
170
  "num_memory_tokens": 256,
171
  "orig_max_position_embeddings": 4096,
172
- "other_args": null,
173
  "output_router_logits": false,
174
  "pad_token_id": 0,
175
- "public_ffn_structure": false,
176
- "pure_linear_attn": false,
177
- "reduce_attn_ratio": 0.5,
178
- "reduce_method": "mean",
179
- "repeat_ffn": null,
180
  "rms_norm_eps": 1e-06,
181
  "rope": true,
182
  "rope_theta": 10000.0,
183
  "rope_type": "ntk",
184
  "router_aux_loss_coef": 0.001,
185
- "save_input_output": false,
186
- "self_attn_type": null,
187
- "seq_length": 1024,
188
- "sequential_jamba": false,
189
- "share_kv": false,
190
- "shared_module_attn": "",
191
- "shared_module_mamba": "",
192
  "sliding_window": null,
193
- "sliding_window_size": null,
194
- "supernet_config": null,
195
- "swa_full_head": false,
196
  "tie_word_embeddings": true,
197
  "torch_dtype": "bfloat16",
198
- "transformers_version": "4.45.0",
199
  "use_cache": false,
200
- "use_mamba2": false,
201
  "use_mamba_kernels": true,
202
- "use_nGPT": true,
203
- "use_nemotron5": false,
204
  "v_head_dim": -1,
205
- "visual_attn": false,
206
- "visual_entropy": false,
207
  "vocab_size": 131072
208
  }
 
1
  {
2
  "architectures": [
3
+ "FastSLMForCausalLM"
4
  ],
5
  "attention_dropout": 0.0,
6
  "attn_hidden_size": -1,
7
+ "attn_implementation": "fused_mha",
8
+ "attn_implementation_new": "fused_mha",
 
 
 
9
  "auto_map": {
10
+ "AutoConfig": "configuration_fast_slm.FastSLMConfig",
11
+ "AutoModelForCausalLM": "modeling_fast_slm.FastSLMForCausalLM"
12
  },
13
  "bos_token_id": 1,
14
  "calc_logits_for_entire_prompt": false,
 
 
15
  "d_conv": 4,
 
 
 
16
  "eos_token_id": 2,
 
 
17
  "ffn_expand_ratio": 3,
 
 
 
 
18
  "global_attn_idx": [],
 
 
 
19
  "hidden_act": "silu",
20
  "hidden_size": 3072,
 
21
  "hybrid_decoder_layer": "mamba",
22
  "initializer_range": 0.02,
23
  "intermediate_size": 0,
24
  "kq_head_dim": -1,
25
  "kq_norm": "none",
 
 
 
26
  "layer_type": [
27
  "m",
28
  "a",
 
99
  "m2",
100
  "f"
101
  ],
 
 
 
 
 
 
102
  "mamba2_headdim": 64,
 
103
  "mamba_conv_bias": true,
104
  "mamba_d_conv": 4,
105
  "mamba_d_state": 128,
106
  "mamba_dt_rank": 192,
107
  "mamba_expand": 2,
108
  "mamba_inner_layernorms": true,
 
 
109
  "mamba_proj_bias": false,
110
+ "max_position_embeddings": 29000,
 
 
111
  "mlp_hidden_act": "silu",
 
112
  "model_type": "jamba",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  "new_seq_length": 2048,
 
114
  "num_attention_heads": 24,
 
115
  "num_experts": 1,
116
  "num_experts_per_tok": 1,
 
117
  "num_hidden_layers": 36,
118
  "num_key_value_heads": 6,
 
119
  "num_memory_tokens": 256,
120
  "orig_max_position_embeddings": 4096,
 
121
  "output_router_logits": false,
122
  "pad_token_id": 0,
 
 
 
 
 
123
  "rms_norm_eps": 1e-06,
124
  "rope": true,
125
  "rope_theta": 10000.0,
126
  "rope_type": "ntk",
127
  "router_aux_loss_coef": 0.001,
 
 
 
 
 
 
 
128
  "sliding_window": null,
 
 
 
129
  "tie_word_embeddings": true,
130
  "torch_dtype": "bfloat16",
131
+ "transformers_version": "4.48.2",
132
  "use_cache": false,
 
133
  "use_mamba_kernels": true,
 
 
134
  "v_head_dim": -1,
 
 
135
  "vocab_size": 131072
136
  }
configuration_fast_slm.py ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 AI21 Labs Ltd. and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ Jamba model configuration"""
16
+ import math
17
+
18
+ from transformers.configuration_utils import PretrainedConfig
19
+ from transformers.utils import logging
20
+
21
+
22
+ logger = logging.get_logger(__name__)
23
+
24
+
25
+ class FastSLMConfig(PretrainedConfig):
26
+ r"""
27
+ This is the configuration class to store the configuration of a [`JambaModel`]. It is used to instantiate a
28
+ Jamba model according to the specified arguments, defining the model architecture. Instantiating a configuration
29
+ with the defaults will yield a similar configuration to that of the jamba-small architecture.
30
+
31
+ [ai21labs/jamba-small](https://huggingface.co/ai21labs/Jamba-v0.1)
32
+
33
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
34
+ documentation from [`PretrainedConfig`] for more information.
35
+
36
+
37
+ Args:
38
+ vocab_size (`int`, *optional*, defaults to 65536):
39
+ Vocabulary size of the Jamba model. Defines the number of different tokens that can be represented by the
40
+ `inputs_ids` passed when calling [`JambaModel`]
41
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
42
+ Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the
43
+ model has a output word embedding layer.
44
+ hidden_size (`int`, *optional*, defaults to 4096):
45
+ Dimension of the hidden representations.
46
+ intermediate_size (`int`, *optional*, defaults to 14336):
47
+ Dimension of the MLP representations.
48
+ num_hidden_layers (`int`, *optional*, defaults to 32):
49
+ Number of hidden layers in the Transformer encoder.
50
+ num_attention_heads (`int`, *optional*, defaults to 32):
51
+ Number of attention heads for each attention layer in the Transformer encoder.
52
+ num_key_value_heads (`int`, *optional*, defaults to 8):
53
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
54
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
55
+ `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
56
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
57
+ by meanpooling all the original heads within that group. For more details checkout [this
58
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
59
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
60
+ The non-linear activation function (function or string) in the decoder.
61
+ initializer_range (`float`, *optional*, defaults to 0.02):
62
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
63
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
64
+ The epsilon used by the rms normalization layers.
65
+ use_cache (`bool`, *optional*, defaults to `True`):
66
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
67
+ relevant if `config.is_decoder=True`.
68
+ calc_logits_for_entire_prompt (`bool`, *optional*, defaults to `False`):
69
+ Whether or not to calculate logits for entire prompt during generation. If `False`, only the logits of the
70
+ last prompt token will be calculated, which are the only logits needed for generation. For long sequences,
71
+ the logits for the entire sequence may use a lot of memory so setting `calc_logits_for_entire_prompt=False`
72
+ will reduce memory footprint significantly.
73
+ Note: some generation features may not be available if this is set to `False`.
74
+ output_router_logits (`bool`, *optional*, defaults to `False`):
75
+ Whether or not the router logits should be returned by the model. Enabling this will also
76
+ allow the model to output the auxiliary loss. See [here]() for more details
77
+ router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
78
+ The aux loss factor for the total loss.
79
+ pad_token_id (`int`, *optional*, defaults to 0):
80
+ The id of the padding token.
81
+ bos_token_id (`int`, *optional*, defaults to 1):
82
+ The id of the "beginning-of-sequence" token.
83
+ eos_token_id (`int`, *optional*, defaults to 2):
84
+ The id of the "end-of-sequence" token.
85
+ sliding_window (`int`, *optional*):
86
+ Sliding window attention window size. If not specified, will default to `None`.
87
+ n_ctx (`int`, *optional*, defaults to 262144):
88
+ This value doesn't have any real effect. The maximum sequence length that this model is intended to be
89
+ used with. It can be used with longer sequences, but performance may degrade.
90
+ attention_dropout (`float`, *optional*, defaults to 0.0):
91
+ The dropout ratio for the attention probabilities.
92
+ num_experts_per_tok (`int`, *optional*, defaults to 2):
93
+ The number of experts to root per-token, can be also interpreted as the `top-p` routing
94
+ parameter
95
+ num_experts (`int`, *optional*, defaults to 16):
96
+ Number of experts per Sparse MLP layer.
97
+ use_mamba_kernels (`bool`, *optional*, defaults to `True`):
98
+ Flag indicating whether or not to use the fast mamba kernels. These are available only if `mamba-ssm` and
99
+ `causal-conv1d` are installed, and the mamba modules are running on a CUDA device. Raises ValueError if
100
+ `True` and kernels are not available
101
+ mamba_d_state (`int`, *optional*, defaults to 16):
102
+ The dimension the mamba state space latents
103
+ mamba_d_conv (`int`, *optional*, defaults to 4):
104
+ The size of the mamba convolution kernel
105
+ mamba_expand (`int`, *optional*, defaults to 2):
106
+ Expanding factor (relative to hidden_size) used to determine the mamba intermediate size
107
+ mamba_dt_rank (`Union[int,str]`, *optional*, defaults to `"auto"`):
108
+ Rank of the the mamba discretization projection matrix. `"auto"` means that it will default to `math.ceil(self.hidden_size / 16)`
109
+ mamba_conv_bias (`bool`, *optional*, defaults to `True`):
110
+ Flag indicating whether or not to use bias in the convolution layer of the mamba mixer block.
111
+ mamba_proj_bias (`bool`, *optional*, defaults to `False`):
112
+ Flag indicating whether or not to use bias in the input and output projections (["in_proj", "out_proj"]) of the mamba mixer block
113
+ mamba_inner_layernorms (`bool`, *optional*, defaults to `True`):
114
+ Flag indicating whether or not to apply layernorms to internal mamba activations
115
+
116
+ """
117
+
118
+ model_type = "jamba"
119
+ keys_to_ignore_at_inference = ["past_key_values"]
120
+
121
+ def __init__(
122
+ self,
123
+ vocab_size=65536,
124
+ tie_word_embeddings=False,
125
+ hidden_size=4096,
126
+ intermediate_size=14336,
127
+ num_hidden_layers=32,
128
+ num_attention_heads=32,
129
+ num_key_value_heads=8,
130
+ hidden_act="silu",
131
+ initializer_range=0.02,
132
+ rms_norm_eps=1e-6,
133
+ use_cache=True,
134
+ calc_logits_for_entire_prompt=False,
135
+ output_router_logits=False,
136
+ router_aux_loss_coef=0.001,
137
+ pad_token_id=0,
138
+ bos_token_id=1,
139
+ eos_token_id=2,
140
+ sliding_window=None,
141
+ max_position_embeddings=262144,
142
+ orig_max_position_embeddings=None,
143
+ attention_dropout=0.0,
144
+ num_experts_per_tok=2,
145
+ num_experts=16,
146
+ use_mamba_kernels=True,
147
+ mamba_d_state=16,
148
+ mamba_d_conv=4,
149
+ mamba_expand=2,
150
+ mamba_dt_rank="auto",
151
+ mamba_conv_bias=True,
152
+ mamba_proj_bias=False,
153
+ mamba_inner_layernorms=True,
154
+
155
+ hybrid_decoder_layer='mamba',
156
+
157
+ global_attn_idx=None,
158
+
159
+ attn_implementation_new='flash_attention_2',
160
+
161
+ mamba2_headdim=64,
162
+
163
+ rope_type=None,
164
+
165
+ layer_types=None,
166
+
167
+ ffn_expand_ratio=None,
168
+
169
+ d_conv=4,
170
+
171
+ **kwargs,
172
+ ):
173
+ self.vocab_size = vocab_size
174
+ self.tie_word_embeddings = tie_word_embeddings
175
+ self.hidden_size = hidden_size
176
+ self.intermediate_size = intermediate_size
177
+ self.num_hidden_layers = num_hidden_layers
178
+ self.num_attention_heads = num_attention_heads
179
+ self.sliding_window = sliding_window
180
+ self.max_position_embeddings = max_position_embeddings
181
+ self.orig_max_position_embeddings = orig_max_position_embeddings
182
+ self.attention_dropout = attention_dropout
183
+
184
+ # for backward compatibility
185
+ if num_key_value_heads is None:
186
+ num_key_value_heads = num_attention_heads
187
+
188
+ self.num_key_value_heads = num_key_value_heads
189
+ self.hidden_act = hidden_act
190
+ self.initializer_range = initializer_range
191
+ self.rms_norm_eps = rms_norm_eps
192
+
193
+ self.use_cache = use_cache
194
+ self.calc_logits_for_entire_prompt = calc_logits_for_entire_prompt
195
+ self.output_router_logits = output_router_logits
196
+ self.router_aux_loss_coef = router_aux_loss_coef
197
+
198
+ self.num_experts_per_tok = num_experts_per_tok
199
+ self.num_experts = num_experts
200
+
201
+ self.use_mamba_kernels = use_mamba_kernels
202
+ self.mamba_d_state = mamba_d_state
203
+ self.mamba_d_conv = mamba_d_conv
204
+ self.mamba_expand = mamba_expand
205
+ self.mamba_dt_rank = math.ceil(self.hidden_size / 16) if mamba_dt_rank == "auto" else mamba_dt_rank
206
+ self.mamba_conv_bias = mamba_conv_bias
207
+ self.mamba_proj_bias = mamba_proj_bias
208
+ self.mamba_inner_layernorms = mamba_inner_layernorms
209
+
210
+ # added by Xin
211
+ self.kq_norm = kwargs.pop("kq_norm", None)
212
+ self.rope = kwargs.pop("rope", False)
213
+ self.rope_theta = kwargs.pop("rope_theta", 10000.0)
214
+ self.num_memory_tokens = kwargs.pop("num_memory_tokens", 0)
215
+ self.attn_hidden_size = kwargs.pop("attn_hidden_size", -1)
216
+ self.kq_head_dim = kwargs.pop("kq_head_dim", -1)
217
+ self.v_head_dim = kwargs.pop("v_head_dim", -1)
218
+
219
+ #! adhoc change
220
+ self.new_seq_length = 2048
221
+
222
+ self.hybrid_decoder_layer = hybrid_decoder_layer
223
+
224
+ self.global_attn_idx = global_attn_idx
225
+
226
+ self.attn_implementation_new = attn_implementation_new
227
+
228
+ self.mamba2_headdim = mamba2_headdim
229
+
230
+ self.rope_type = rope_type
231
+
232
+ self.layer_types = layer_types
233
+
234
+ self.ffn_expand_ratio = ffn_expand_ratio
235
+
236
+ self.d_conv = d_conv
237
+
238
+ self.mlp_hidden_act = kwargs.pop("mlp_hidden_act", "silu")
239
+
240
+ super().__init__(
241
+ pad_token_id=pad_token_id,
242
+ bos_token_id=bos_token_id,
243
+ eos_token_id=eos_token_id,
244
+ tie_word_embeddings=tie_word_embeddings,
245
+ **kwargs,
246
+ )
delta_net.py CHANGED
@@ -10,9 +10,15 @@ import torch.nn as nn
10
  from einops import rearrange
11
  from torch.nn import functional as F
12
 
 
13
  from fla.modules import FusedRMSNormSwishGate, RMSNorm, ShortConvolution
14
  from fla.ops.delta_rule import chunk_delta_rule, fused_recurrent_delta_rule
15
 
 
 
 
 
 
16
  if TYPE_CHECKING:
17
  from transformers.processing_utils import Unpack
18
 
@@ -97,12 +103,6 @@ class DeltaNet(nn.Module):
97
 
98
  assert self.qk_activation in ['silu', 'relu', 'elu', 'identity']
99
  assert self.qk_norm in ['l2', 'sum']
100
-
101
- self.config = config
102
- if self.config is not None and self.config.use_nGPT and 'extra_grad' in self.config.nGPT_config and self.config.nGPT_config['extra_grad']:
103
- self.weight_norm = True
104
- else:
105
- self.weight_norm = False
106
 
107
  if d_model is not None:
108
  hidden_size = d_model
@@ -199,7 +199,7 @@ class DeltaNet(nn.Module):
199
  last_state = None
200
  if past_key_values is not None and len(past_key_values) > self.layer_idx:
201
  last_state = past_key_values[self.layer_idx]
202
-
203
  if self.use_short_conv:
204
  conv_state_q, conv_state_k, conv_state_v = None, None, None
205
  if last_state is not None:
@@ -208,9 +208,7 @@ class DeltaNet(nn.Module):
208
  position_ids = kwargs.get('position_ids', None)
209
 
210
  q = self.q_proj(hidden_states)
211
- if self.weight_norm:
212
- q = q / self.q_proj.weight.norm(p=2, dim=1)
213
-
214
  q, conv_state_q = self.q_conv1d(x=q,
215
  mask=conv_mask,
216
  cache=conv_state_q,
@@ -218,8 +216,7 @@ class DeltaNet(nn.Module):
218
  seq_idx=position_ids)
219
 
220
  k = self.k_proj(hidden_states)
221
- if self.weight_norm:
222
- k = k / self.k_proj.weight.norm(p=2, dim=1)
223
  k, conv_state_k = self.k_conv1d(x=k,
224
  mask=conv_mask,
225
  cache=conv_state_k,
@@ -227,8 +224,7 @@ class DeltaNet(nn.Module):
227
  seq_idx=position_ids)
228
 
229
  v = self.v_proj(hidden_states)
230
- if self.weight_norm:
231
- v = v / self.v_proj.weight.norm(p=2, dim=1)
232
  v, conv_state_v = self.v_conv1d(x=v,
233
  mask=conv_mask,
234
  cache=conv_state_v,
@@ -239,11 +235,6 @@ class DeltaNet(nn.Module):
239
  k = self.k_proj(hidden_states)
240
  v = self.v_proj(hidden_states)
241
 
242
- if self.weight_norm:
243
- q = q / self.q_proj.weight.norm(p=2, dim=1)
244
- k = k / self.k_proj.weight.norm(p=2, dim=1)
245
- v = v / self.v_proj.weight.norm(p=2, dim=1)
246
-
247
  if self.qk_activation == 'silu':
248
  q, k = self.silu(q), self.silu(k)
249
 
@@ -267,10 +258,6 @@ class DeltaNet(nn.Module):
267
 
268
  if self.use_beta:
269
  beta = self.b_proj(hidden_states)
270
-
271
- if self.weight_norm:
272
- beta = beta / self.b_proj.weight.norm(p=2, dim=1)
273
-
274
  beta = beta.sigmoid()
275
  else:
276
  beta = q.new_ones(q.shape[0], q.shape[1], q.shape[2])
@@ -283,6 +270,7 @@ class DeltaNet(nn.Module):
283
  beta = beta.mul(attention_mask[:, -beta.shape[-2]:, None])
284
 
285
  recurrent_state = last_state['recurrent_state'] if last_state is not None else None
 
286
  cu_seqlens = kwargs.get('cu_seqlens', None)
287
  if mode == 'fused_recurrent':
288
  o, recurrent_state = fused_recurrent_delta_rule(
@@ -327,7 +315,161 @@ class DeltaNet(nn.Module):
327
  o = rearrange(o, 'b t h d -> b t (h d)')
328
  o = self.o_proj(o)
329
 
330
- if self.weight_norm:
331
- o = o / self.o_proj.weight.norm(p=2, dim=0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
332
 
333
- return o, None, past_key_values
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  from einops import rearrange
11
  from torch.nn import functional as F
12
 
13
+ import fla
14
  from fla.modules import FusedRMSNormSwishGate, RMSNorm, ShortConvolution
15
  from fla.ops.delta_rule import chunk_delta_rule, fused_recurrent_delta_rule
16
 
17
+ from typing import Any, Dict, List, Optional, Tuple
18
+
19
+ import torch
20
+ import transformers
21
+
22
  if TYPE_CHECKING:
23
  from transformers.processing_utils import Unpack
24
 
 
103
 
104
  assert self.qk_activation in ['silu', 'relu', 'elu', 'identity']
105
  assert self.qk_norm in ['l2', 'sum']
 
 
 
 
 
 
106
 
107
  if d_model is not None:
108
  hidden_size = d_model
 
199
  last_state = None
200
  if past_key_values is not None and len(past_key_values) > self.layer_idx:
201
  last_state = past_key_values[self.layer_idx]
202
+
203
  if self.use_short_conv:
204
  conv_state_q, conv_state_k, conv_state_v = None, None, None
205
  if last_state is not None:
 
208
  position_ids = kwargs.get('position_ids', None)
209
 
210
  q = self.q_proj(hidden_states)
211
+
 
 
212
  q, conv_state_q = self.q_conv1d(x=q,
213
  mask=conv_mask,
214
  cache=conv_state_q,
 
216
  seq_idx=position_ids)
217
 
218
  k = self.k_proj(hidden_states)
219
+
 
220
  k, conv_state_k = self.k_conv1d(x=k,
221
  mask=conv_mask,
222
  cache=conv_state_k,
 
224
  seq_idx=position_ids)
225
 
226
  v = self.v_proj(hidden_states)
227
+
 
228
  v, conv_state_v = self.v_conv1d(x=v,
229
  mask=conv_mask,
230
  cache=conv_state_v,
 
235
  k = self.k_proj(hidden_states)
236
  v = self.v_proj(hidden_states)
237
 
 
 
 
 
 
238
  if self.qk_activation == 'silu':
239
  q, k = self.silu(q), self.silu(k)
240
 
 
258
 
259
  if self.use_beta:
260
  beta = self.b_proj(hidden_states)
 
 
 
 
261
  beta = beta.sigmoid()
262
  else:
263
  beta = q.new_ones(q.shape[0], q.shape[1], q.shape[2])
 
270
  beta = beta.mul(attention_mask[:, -beta.shape[-2]:, None])
271
 
272
  recurrent_state = last_state['recurrent_state'] if last_state is not None else None
273
+
274
  cu_seqlens = kwargs.get('cu_seqlens', None)
275
  if mode == 'fused_recurrent':
276
  o, recurrent_state = fused_recurrent_delta_rule(
 
315
  o = rearrange(o, 'b t h d -> b t (h d)')
316
  o = self.o_proj(o)
317
 
318
+ return o, None, past_key_values
319
+
320
+
321
+ class Cache(transformers.cache_utils.Cache):
322
+ """
323
+ A cache used for storing hidden states produced by flash linear attention models.
324
+
325
+ It stores the states of each layer as the tensor of shape `[batch_size, key_dim, value_dim]`.
326
+ """
327
+
328
+ is_compileable = True
329
+
330
+ def __init__(
331
+ self,
332
+ seen_tokens: int = 0
333
+ ) -> Cache:
334
+ super().__init__()
335
+
336
+ self.states: List[Dict[str, Any]] = []
337
+
338
+ self._seen_tokens = seen_tokens # Used in `generate` to keep tally of how many tokens the cache has seen
339
+
340
+ def __getitem__(self, layer_idx: int) -> Dict[str, Any]:
341
+ if layer_idx < len(self):
342
+ return self.states[layer_idx]
343
+ else:
344
+ raise KeyError(f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}")
345
+
346
+ def __iter__(self):
347
+ for state in self.states:
348
+ yield state
349
+
350
+ def __len__(self):
351
+ return len(self.states)
352
+
353
+ def reset(self):
354
+ for state in self.states:
355
+ for key in state:
356
+ if state[key] is not None:
357
+ if type(state[key]) == tuple:
358
+ for subkey in state[key]:
359
+ subkey.zero_()
360
+ else:
361
+ state[key].zero_()
362
+ self._seen_tokens = 0
363
+
364
 
365
+ def update(
366
+ self,
367
+ recurrent_state: Optional[Tuple[torch.Tensor]] = None,
368
+ attn_state: Optional[Tuple[torch.Tensor]] = None,
369
+ conv_state: Optional[Tuple[torch.Tensor]] = None,
370
+ ffn_state: Optional[Tuple[torch.Tensor]] = None,
371
+ layer_idx: int = 0,
372
+ offset: Optional[int] = 1,
373
+ cache_kwargs: Optional[Dict[str, Any]] = None,
374
+ ) -> Dict[str, Any]:
375
+ """
376
+ Args:
377
+ recurrent_state (`torch.Tensor`):
378
+ The new recurrent state to cache.
379
+ attn_state (`Tuple[torch.Tensor]`):
380
+ The new attention key/value states to cache.
381
+ conv_state (`Tuple[torch.Tensor]`):
382
+ The new convolution state to cache.
383
+ ffn_state (`Tuple[torch.Tensor]`):
384
+ The new feed-forward state to cache.
385
+ layer_idx (`int`, defaults to 0):
386
+ The index of the layer to cache the states for.
387
+ offset (`int`, defaults to 1):
388
+ The number of new tokens being processed.
389
+ cache_kwargs (`Dict[str, Any]`):
390
+ Additional arguments for the cache subclass.
391
+
392
+ Return:
393
+ Dictionary of the updated state.
394
+ """
395
+
396
+ if cache_kwargs is None:
397
+ cache_kwargs = {}
398
+ if attn_state is not None:
399
+ input_size = attn_state[0].shape[1]
400
+ window_size = cache_kwargs.get('window_size', None)
401
+ if not (isinstance(attn_state, Tuple) or isinstance(attn_state, List)):
402
+ raise ValueError("`attn_state` must be a tuple of tensors for key/value states")
403
+ if len(self.states) <= layer_idx:
404
+ # update the number of seen tokens
405
+ if layer_idx == 0:
406
+ self._seen_tokens += offset
407
+ if attn_state is not None:
408
+ if window_size is not None and input_size > window_size:
409
+ attn_state = [state[:, -window_size:].contiguous() for state in attn_state]
410
+ state = dict(
411
+ recurrent_state=recurrent_state,
412
+ attn_state=attn_state,
413
+ conv_state=conv_state,
414
+ ffn_state=ffn_state
415
+ )
416
+ self.states.append(state)
417
+ else:
418
+ # update the number of seen tokens
419
+ if layer_idx == len(self.states) - 1:
420
+ self._seen_tokens += offset
421
+ state = self.states[layer_idx]
422
+ if recurrent_state is not None:
423
+ state['recurrent_state'].copy_(recurrent_state)
424
+ if attn_state is not None:
425
+ if window_size is not None and state['attn_state'][0].shape[1] == window_size:
426
+ for i, (old_state, new_state) in enumerate(zip(state['attn_state'], attn_state)):
427
+ # DO NOT allocate new memory if the cache is full
428
+ # roll the key/value states to the left by `input_size`
429
+ old_state = old_state.roll(-input_size, 1)
430
+ # replace the last `input_size` tokens with the new key/value states
431
+ old_state[:, -input_size:] = new_state
432
+ state['attn_state'][i].copy_(old_state)
433
+ else:
434
+ attn_state = [
435
+ torch.cat([old_state, new_state], 1)
436
+ for old_state, new_state in zip(state['attn_state'], attn_state)
437
+ ]
438
+ state['attn_state'].copy_(attn_state)
439
+ if conv_state is not None:
440
+ conv_state_q, conv_state_k, conv_state_v = state['conv_state']
441
+ conv_state_q.copy_(conv_state[0])
442
+ conv_state_k.copy_(conv_state[1])
443
+ conv_state_v.copy_(conv_state[2])
444
+ if ffn_state is not None:
445
+ state['ffn_state'].copy_(ffn_state)
446
+
447
+ return state
448
+
449
+ def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
450
+ """Returns the sequence length of the cached states. A layer index can be optionally passed."""
451
+ if len(self.states) <= layer_idx:
452
+ return 0
453
+ return self._seen_tokens
454
+
455
+ def get_max_length(self) -> Optional[int]:
456
+ """Returns the maximum sequence length of the cached states. Cache does not have a maximum length."""
457
+ return None
458
+
459
+ def to_legacy_cache(self) -> Tuple:
460
+ return tuple(self.states)
461
+
462
+ @classmethod
463
+ @torch.compiler.disable
464
+ def from_legacy_cache(
465
+ cls,
466
+ past_key_values: Optional[Tuple] = None,
467
+ seen_tokens: int = 0
468
+ ) -> Cache:
469
+ """Converts a cache in the legacy cache format into an equivalent `Cache`."""
470
+
471
+ cache = cls(seen_tokens)
472
+ if isinstance(past_key_values, list):
473
+ for layer_idx in range(len(past_key_values)):
474
+ cache.states.append(past_key_values[layer_idx])
475
+ return cache
fused_mha_with_cache.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from typing import Optional, Tuple
3
+
4
+ from .triton_attention import (
5
+ fused_mha_with_paged_cache, fused_mha_with_cache
6
+ )
7
+
8
+ dtype_int = torch.int32
9
+
10
+ def fused_mha_interface(
11
+ query_states: torch.Tensor, # [batch, q_len, heads, head_dim]
12
+ key_states: torch.Tensor, # [batch, kv_len, heads, head_dim]
13
+ value_states: torch.Tensor, # [batch, kv_len, heads, head_dim]
14
+ k_cache: torch.Tensor, # [MAX_BATCH_SIZE, MAX_SEQ_LEN, N_HEADS, D_HEAD] or [num_pages, page_size, n, d] for paged attn
15
+ v_cache: torch.Tensor, # [MAX_BATCH_SIZE, MAX_SEQ_LEN, N_HEADS, D_HEAD]
16
+ position_ids: torch.Tensor=None,
17
+ page_table: torch.Tensor=None, # [b, max_num_pages_per_seq] # loc of the block page in the cache.
18
+ max_seq_len = None,
19
+ ) -> torch.Tensor:
20
+ """
21
+ Replacement for _flash_attention_forward(...) that uses
22
+ Triton’s fused_mha_with_paged_cache under the hood.
23
+ Returns: [batch, q_len, heads*head_dim]
24
+ """
25
+ # unpack shapes
26
+ b, ql, n_heads, head_dim = query_states.shape
27
+ _, kvl, n_kv_heads, _ = key_states.shape
28
+
29
+ q = query_states.reshape(b, ql, n_heads * head_dim)
30
+ k = key_states.reshape(b, kvl, n_kv_heads * head_dim)
31
+ v = value_states.reshape(b, kvl, n_kv_heads * head_dim)
32
+
33
+ if position_ids is not None:
34
+ if ql == 1: # Generate phase - single token
35
+ input_pos = position_ids[:, -1] # Use the last position for each sequence
36
+ else: # Context phase - multiple tokens
37
+ input_pos = position_ids[:, 0] # Use the starting position for each sequence
38
+ else:
39
+ # Fallback: assume starting from 0 for all sequences
40
+ input_pos = torch.zeros(b, device=q.device, dtype=torch.int32)
41
+
42
+ freqs_cis = None
43
+
44
+ if page_table is None:
45
+ y = torch.ops.attention.fused_mha_with_cache(
46
+ q, k, v,
47
+ input_pos,
48
+ k_cache, v_cache,
49
+ freqs_cis,
50
+ )
51
+
52
+
53
+ else:
54
+ batch_size = b
55
+
56
+ # cache_loc: identity mapping [0, 1, ..., b-1]
57
+ cache_loc = torch.arange(batch_size, device=q.device, dtype=dtype_int)
58
+
59
+ # input_positions: assume pure context (all start from 0)
60
+ input_positions = torch.zeros(batch_size, device=q.device, dtype=dtype_int)
61
+
62
+ # seq_len: each sequence length is kvl
63
+ seq_len = torch.full((batch_size,), kvl, device=q.device, dtype=dtype_int)
64
+
65
+ # seq_start: flattened starting index for each sequence
66
+ seq_start = (seq_len.cumsum(0) - seq_len).to(dtype=dtype_int)
67
+
68
+ assert max_seq_len is not None, "max_seq_len must be provided when using paged attention."
69
+
70
+ y = torch.ops.attention.fused_mha_with_paged_cache(
71
+ q, k, v,
72
+ input_positions, cache_loc,
73
+ seq_len, seq_start,
74
+ page_table, max_seq_len,
75
+ k_cache, v_cache,
76
+ freqs_cis,
77
+ )
78
+
79
+ y = y.view(b, ql, n_heads, head_dim)
80
+
81
+ return y
82
+
83
+
84
+
85
+ def main():
86
+ #––– Test hyperparameters –––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
87
+ batch_size = 1
88
+ q_len = 1
89
+ kv_len = 1
90
+ num_heads = 16
91
+ n_kv_heads = 16
92
+ head_dim = 128
93
+
94
+ max_batch_size = 1
95
+ max_seq_len = 1024
96
+
97
+ page_size = 256
98
+
99
+ device = "cuda"
100
+
101
+ #––– Random query, key, value tensors –––––––––––––––––––––––––––––––––––––––––––––––––––
102
+ query_states = torch.randn(batch_size, q_len, num_heads, head_dim, device=device)
103
+ key_states = torch.randn(batch_size, kv_len, num_heads, head_dim, device=device)
104
+ value_states = torch.randn(batch_size, kv_len, num_heads, head_dim, device=device)
105
+
106
+ k_cache = torch.randn(max_batch_size, max_seq_len, num_heads, head_dim, device=device)
107
+ v_cache = torch.randn(max_batch_size, max_seq_len, num_heads, head_dim, device=device)
108
+
109
+ attn_out = fused_mha_interface(
110
+ query_states,
111
+ key_states,
112
+ value_states,
113
+ k_cache=k_cache,
114
+ v_cache=v_cache,
115
+ )
116
+
117
+ expected_shape = (batch_size, q_len, num_heads, head_dim)
118
+ print(f"[test] output shape: {attn_out.shape} (expected {expected_shape})")
119
+
120
+ if attn_out.shape == expected_shape:
121
+ print("[test] ✅ Success: output tensor has correct shape.")
122
+ else:
123
+ print("[test] ❌ Failure: shape mismatch.")
124
+
125
+ if __name__ == "__main__":
126
+ main()
mamba2.py CHANGED
@@ -18,9 +18,6 @@ try:
18
  except ImportError:
19
  causal_conv1d_varlen_states = None
20
 
21
- import sys
22
- # sys.path.insert(0, '/lustre/fsw/portfolios/nvr/users/yongganf/TLM/')
23
-
24
  from mamba_ssm.ops.triton.selective_state_update import selective_state_update
25
  from mamba_ssm.ops.triton.layernorm_gated import RMSNorm as RMSNormGated
26
 
@@ -124,13 +121,10 @@ class Mamba2(nn.Module):
124
  # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
125
  inv_dt = dt + torch.log(-torch.expm1(-dt))
126
 
127
- if config.no_dt_bias:
128
- self.dt_bias = None
129
- else:
130
- self.dt_bias = nn.Parameter(inv_dt)
131
- # Just to be explicit. Without this we already don't put wd on dt_bias because of the check
132
- # name.endswith("bias") in param_grouping.py
133
- self.dt_bias._no_weight_decay = True
134
 
135
  assert A_init_range[0] > 0 and A_init_range[1] >= A_init_range[0]
136
  A = torch.empty(self.nheads, dtype=torch.float32, device=device).uniform_(*A_init_range)
@@ -154,39 +148,6 @@ class Mamba2(nn.Module):
154
  process_group=self.process_group, sequence_parallel=self.sequence_parallel,
155
  **factory_kwargs)
156
 
157
- self.mamba_multihead_config = config.mamba_multihead_config
158
- if self.mamba_multihead_config is not None:
159
- assert self.mamba_multihead_config['alpha_mode'] == 'sparsity' or self.mamba_multihead_config['alpha_mode'] == 'cummax'
160
-
161
- if self.mamba_multihead_config['alpha_mode'] == 'cummax':
162
- self.learned_dt_scale = nn.Parameter(torch.ones(1, device=device))
163
-
164
- if self.mamba_multihead_config['alpha_mode'] == 'sparsity':
165
- if 'use_learned_thres' in self.mamba_multihead_config and self.mamba_multihead_config['use_learned_thres']:
166
- self.learned_thres = nn.Parameter(torch.zeros(self.nheads, device=device))
167
- self.smooth_factor = self.mamba_multihead_config['smooth_factor']
168
- self.detach_dt = self.mamba_multihead_config['detach_dt']
169
-
170
- if 'use_cummax' in self.mamba_multihead_config and self.mamba_multihead_config['use_cummax']:
171
- self.use_cummax = True
172
- self.cummax_lower_bound = self.mamba_multihead_config['cummax_lower_bound']
173
- else:
174
- self.use_cummax = False
175
-
176
- else:
177
- self.learned_thres = None
178
- self.smooth_factor = None
179
- self.detach_dt = None
180
-
181
- self.sparsity_split = self.mamba_multihead_config['sparsity_split']
182
- self.sparsity_ratio = self.mamba_multihead_config['sparsity_ratio']
183
-
184
- if self.config.layerwise_memory_token:
185
- assert self.config.num_memory_tokens > 0
186
- self.memory_tokens = nn.Parameter(torch.randn(self.config.num_memory_tokens, self.config.hidden_size))
187
- else:
188
- self.memory_tokens = None
189
-
190
 
191
  def forward(self, hidden_states, attention_mask=None, past_key_value=None, seqlen=None, seq_idx=None, cu_seqlens=None, inference_params=None):
192
  """
@@ -198,11 +159,6 @@ class Mamba2(nn.Module):
198
  """
199
  # assert past_key_value is None, "Not implemented yet!!!"
200
 
201
- if self.memory_tokens is not None:
202
- hidden_states = hidden_states[:,self.config.num_memory_tokens:,...]
203
- mem = repeat(self.memory_tokens, 'n d -> b n d', b = hidden_states.shape[0]) # prepend the memory to every segment of m by repeating the memory tokens
204
- hidden_states, mem_packed_shape = pack((mem, hidden_states), 'b * d')
205
-
206
  seqlen_og = seqlen
207
  if seqlen is None:
208
  batch, seqlen, dim = hidden_states.shape
@@ -211,19 +167,18 @@ class Mamba2(nn.Module):
211
  batch = batch_seqlen // seqlen
212
 
213
  conv_state, ssm_state = None, None
 
214
  if inference_params is not None:
215
  inference_batch = cu_seqlens.shape[0] - 1 if cu_seqlens is not None else batch
216
  conv_state, ssm_state = self._get_states_from_cache(inference_params, inference_batch)
 
217
  if inference_params.seqlen_offset > 0:
218
  # The states are updated inplace
219
  out, _, _ = self.step(hidden_states, conv_state, ssm_state)
220
- return out
221
 
222
  zxbcdt = self.in_proj(hidden_states) # (B, L, d_in_proj) or (B * L, d_in_proj)
223
 
224
- if self.config.use_nGPT and 'extra_grad' in self.config.nGPT_config and self.config.nGPT_config['extra_grad']:
225
- zxbcdt = zxbcdt / self.in_proj.weight.norm(p=2, dim=1)
226
-
227
  if seqlen_og is not None:
228
  zxbcdt = rearrange(zxbcdt, "(b l) d -> b l d", l=seqlen)
229
  # If the model is loaded in fp16, without the .float() here, A might be -inf
@@ -261,6 +216,7 @@ class Mamba2(nn.Module):
261
  [d_mlp, d_mlp, self.d_ssm, self.d_ssm + 2 * self.ngroups * self.d_state, self.nheads],
262
  dim=-1
263
  )
 
264
  if conv_state is not None:
265
  if cu_seqlens is None:
266
  # If we just take xBC[:, :, -self.d_conv :], it will error if seqlen < self.d_conv
@@ -288,27 +244,9 @@ class Mamba2(nn.Module):
288
  activation=self.activation,
289
  # seq_idx=seq_idx,
290
  ).transpose(1, 2)
 
291
  x, B, C = torch.split(xBC, [self.d_ssm, self.ngroups * self.d_state, self.ngroups * self.d_state], dim=-1)
292
 
293
- no_dt_bias = False
294
- if self.mamba_multihead_config is not None and self.mamba_multihead_config['alpha_mode'] == 'cummax': ### todo: implement this in the fused kernel
295
- dt = dt + self.dt_bias
296
- dt = torch.nn.functional.softmax(dt, dim=-1)
297
- dt = torch.cumsum(dt, dim=-1)
298
- dt = dt * self.learned_dt_scale
299
-
300
- no_dt_bias = True
301
-
302
- if self.mamba_multihead_config is not None and self.mamba_multihead_config['alpha_mode'] == 'sparsity':
303
- dt = dt + self.dt_bias
304
-
305
- if self.learned_thres is not None:
306
- dt = self.sparsify_learned_thres(dt)
307
- else:
308
- dt = self.split_and_sparsify(dt, self.sparsity_split, self.sparsity_ratio)
309
-
310
- no_dt_bias = True
311
-
312
 
313
  y = mamba_chunk_scan_combined(
314
  rearrange(x, "b l (h p) -> b l h p", p=self.headdim),
@@ -317,9 +255,10 @@ class Mamba2(nn.Module):
317
  rearrange(B, "b l (g n) -> b l g n", g=self.ngroups),
318
  rearrange(C, "b l (g n) -> b l g n", g=self.ngroups),
319
  chunk_size=self.chunk_size,
320
- D=rearrange(self.D, "(h p) -> h p", p=self.headdim) if self.D_has_hdim else self.D,
 
321
  z=rearrange(z, "b l (h p) -> b l h p", p=self.headdim) if not self.rmsnorm else None,
322
- dt_bias=self.dt_bias if not no_dt_bias else None,
323
  dt_softplus=True,
324
  seq_idx=seq_idx,
325
  cu_seqlens=cu_seqlens,
@@ -336,186 +275,153 @@ class Mamba2(nn.Module):
336
  ssm_state.copy_(varlen_states)
337
  y = rearrange(y, "b l h p -> b l (h p)")
338
  if self.rmsnorm:
339
- y = self.norm(y, z)
 
 
 
340
  if d_mlp > 0:
341
  y = torch.cat([F.silu(z0) * x0, y], dim=-1)
342
  if seqlen_og is not None:
343
  y = rearrange(y, "b l d -> (b l) d")
344
 
345
- if self.config.use_nGPT and 'extra_grad' in self.config.nGPT_config and self.config.nGPT_config['extra_grad']:
346
- y = y / self.out_proj.weight.norm(p=2, dim=0)
347
-
348
  out = self.out_proj(y)
349
-
350
  return out, past_key_value
351
 
352
 
353
- def sparsify_learned_thres(self, dt):
354
- """
355
- Args:
356
- dt: Tensor of shape [bs, seq_len, nheads]
357
- Returns:
358
- pruned_dt: Pruned tensor with the same shape as dt
359
- """
360
- # Compute sigmoid scores
361
 
362
- if self.use_cummax:
363
- learned_thres = torch.nn.functional.softmax(self.learned_thres, dim=-1)
364
- learned_thres = torch.cumsum(learned_thres, dim=-1) - self.cummax_lower_bound ## keep the dt_normalized larger than 1 - self.cummax_lower_bound
365
-
366
- dt_normalized = (dt - dt.min(dim=-1, keepdim=True)[0]) / (dt.max(dim=-1, keepdim=True)[0] - dt.min(dim=-1, keepdim=True)[0])
367
-
368
- scores = torch.sigmoid((dt_normalized.detach() - self.learned_thres) / self.smooth_factor)
369
-
370
  else:
371
- if self.detach_dt:
372
- scores = torch.sigmoid((dt.detach() - self.learned_thres) / self.smooth_factor)
373
- else:
374
- scores = torch.sigmoid((dt - self.learned_thres) / self.smooth_factor)
375
-
376
- # Generate binary mask for pruning (forward pass)
377
- mask = (scores >= 0.5).float()
378
-
379
- # Apply mask in the forward pass and backward using sigmoid
380
- pruned_dt = (dt * mask - dt * scores).detach() + dt * scores
381
-
382
- # print(pruned_dt.mean())
383
-
384
- return pruned_dt
385
-
386
-
387
- def split_and_sparsify(self, dt, sparsity_split, sparsity_ratio):
388
- """
389
- dt: a torch.Tensor of shape [bs, seq_len, dim]
390
- sparsity_split: list of ratios (e.g., [0.4, 0.3, 0.3]) that sum to 1
391
- and define how to split dt along the last dimension
392
- sparsity_ratio: list of ratios (e.g., [0.2, 0.5, 0.3]) that sum to 1
393
- and define how many time steps (along seq_len) to keep
394
- """
395
- bs, seq_len, dim = dt.shape
396
-
397
- assert sum(sparsity_split) == 1
398
-
399
- # Compute the exact split sizes (watching out for integer rounding)
400
- split_sizes = [int(r * dim) for r in sparsity_split]
401
- # Fix potential off-by-one rounding in the last split
402
- split_sizes[-1] = dim - sum(split_sizes[:-1])
403
-
404
- # Split the original tensor along the last dimension
405
- splitted_tensors = torch.split(dt, split_sizes, dim=-1)
406
-
407
- results = []
408
- for i, sub_tensor in enumerate(splitted_tensors):
409
- # sub_tensor has shape [bs, seq_len, split_dim_i]
410
- k = int(sparsity_ratio[i] * seq_len)
411
-
412
- ### Strategy 1: keep at least one token
413
- k = max(k, 1)
414
-
415
- ### Strategy 2: the #tokens is the same as training
416
- # if self.config.orig_max_position_embeddings is not None:
417
- # k = int(self.config.orig_max_position_embeddings * self.sparsity_ratio[i])
418
- # else:
419
- # assert self.config.max_position_embeddings is not None
420
- # k = int(self.config.max_position_embeddings * self.sparsity_ratio[i])
421
-
422
- # k = min(seq_len, k)
423
-
424
- # print(self.config.max_position_embeddings, sparsity_ratio[i], seq_len, k)
425
-
426
- # 1) Average over the feature dimension (the last dim),
427
- # resulting in shape [bs, seq_len]
428
- averaged_values = sub_tensor.mean(dim=-1)
429
-
430
- # 2) Get top-k indices (along seq_len = dim=1)
431
- topk_values, _ = torch.topk(averaged_values, k=k, dim=1)
432
- # The smallest value among the top-k per batch element
433
- threshold = topk_values[:, -1].unsqueeze(-1) # shape [bs, 1]
434
-
435
- # 3) Create a mask of shape [bs, seq_len] => True if >= threshold
436
- averaged_mask = (averaged_values >= threshold)
437
-
438
- # 4) Expand that mask back to [bs, seq_len, split_dim_i]
439
- mask_3d = averaged_mask.unsqueeze(-1).expand_as(sub_tensor)
440
-
441
- # 5) Zero out everything that is not in top-k
442
- sparsified_sub = sub_tensor * mask_3d
443
 
444
- # print((sparsified_sub == 0).float().mean().item())
445
- # input()
446
-
447
- results.append(sparsified_sub)
448
-
449
- # Concatenate the results back along the last dimension
450
- output = torch.cat(results, dim=-1)
451
- return output
452
-
453
- def step(self, hidden_states, conv_state, ssm_state):
454
- dtype = hidden_states.dtype
455
- assert hidden_states.shape[1] == 1, "Only support decoding with 1 token at a time for now"
456
- zxbcdt = self.in_proj(hidden_states.squeeze(1)) # (B 2D)
457
  d_mlp = (zxbcdt.shape[-1] - 2 * self.d_ssm - 2 * self.ngroups * self.d_state - self.nheads) // 2
458
- z0, x0, z, xBC, dt = torch.split(
459
- zxbcdt,
460
- [d_mlp, d_mlp, self.d_ssm, self.d_ssm + 2 * self.ngroups * self.d_state, self.nheads],
461
- dim=-1
462
- )
463
-
464
- # Conv step
465
- if causal_conv1d_update is None:
466
- conv_state.copy_(torch.roll(conv_state, shifts=-1, dims=-1)) # Update state (B D W)
467
- conv_state[:, :, -1] = xBC
468
- xBC = torch.sum(conv_state * rearrange(self.conv1d.weight, "d 1 w -> d w"), dim=-1) # (B D)
469
- if self.conv1d.bias is not None:
470
- xBC = xBC + self.conv1d.bias
471
- xBC = self.act(xBC).to(dtype=dtype)
472
  else:
473
- xBC = causal_conv1d_update(
474
- xBC,
475
- conv_state,
476
- rearrange(self.conv1d.weight, "d 1 w -> d w"),
477
- self.conv1d.bias,
478
- self.activation,
479
  )
480
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
481
  x, B, C = torch.split(xBC, [self.d_ssm, self.ngroups * self.d_state, self.ngroups * self.d_state], dim=-1)
482
  A = -torch.exp(self.A_log.float()) # (nheads,)
483
 
484
- # SSM step
485
- if selective_state_update is None:
486
- assert self.ngroups == 1, "Only support ngroups=1 for this inference code path"
487
- # Discretize A and B
488
- dt = F.softplus(dt + self.dt_bias.to(dtype=dt.dtype)) # (batch, nheads)
489
- dA = torch.exp(dt * A) # (batch, nheads)
490
- x = rearrange(x, "b (h p) -> b h p", p=self.headdim)
491
- dBx = torch.einsum("bh,bn,bhp->bhpn", dt, B, x)
492
- ssm_state.copy_(ssm_state * rearrange(dA, "b h -> b h 1 1") + dBx)
493
- y = torch.einsum("bhpn,bn->bhp", ssm_state.to(dtype), C)
494
- y = y + rearrange(self.D.to(dtype), "h -> h 1") * x
495
- y = rearrange(y, "b h p -> b (h p)")
496
- if not self.rmsnorm:
497
- y = y * self.act(z) # (B D)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
498
  else:
499
- A = repeat(A, "h -> h p n", p=self.headdim, n=self.d_state).to(dtype=torch.float32)
500
- dt = repeat(dt, "b h -> b h p", p=self.headdim)
501
- dt_bias = repeat(self.dt_bias, "h -> h p", p=self.headdim)
502
- D = repeat(self.D, "h -> h p", p=self.headdim)
503
- B = rearrange(B, "b (g n) -> b g n", g=self.ngroups)
504
- C = rearrange(C, "b (g n) -> b g n", g=self.ngroups)
505
- x_reshaped = rearrange(x, "b (h p) -> b h p", p=self.headdim)
506
- if not self.rmsnorm:
507
- z = rearrange(z, "b (h p) -> b h p", p=self.headdim)
508
- y = selective_state_update(
509
- ssm_state, x_reshaped, dt, A, B, C, D, z=z if not self.rmsnorm else None,
510
- dt_bias=dt_bias, dt_softplus=True
 
 
 
 
511
  )
512
- y = rearrange(y, "b h p -> b (h p)")
 
 
 
 
513
  if self.rmsnorm:
514
  y = self.norm(y, z)
515
  if d_mlp > 0:
516
  y = torch.cat([F.silu(z0) * x0, y], dim=-1)
517
  out = self.out_proj(y)
518
- return out.unsqueeze(1), conv_state, ssm_state
 
 
 
 
 
 
519
 
520
  def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
521
  device = self.out_proj.weight.device
@@ -555,873 +461,4 @@ class Mamba2(nn.Module):
555
  if initialize_states:
556
  conv_state.zero_()
557
  ssm_state.zero_()
558
- return conv_state, ssm_state
559
-
560
-
561
- class Mamba2_Fused(nn.Module):
562
- def __init__(
563
- self,
564
- config,
565
- layer_idx=None, # Absorb kwarg for general module
566
- reuse_kv=False,
567
- conv_init=None,
568
- d_ssm=None, # If not None, we only apply SSM on this many dimensions, the rest uses gated MLP
569
- ngroups=1,
570
- A_init_range=(1, 16),
571
- D_has_hdim=False,
572
- rmsnorm=True,
573
- norm_before_gate=False,
574
- dt_min=0.001,
575
- dt_max=0.1,
576
- dt_init_floor=1e-4,
577
- dt_limit=(0.0, float("inf")),
578
- bias=False,
579
- conv_bias=True,
580
- # Fused kernel and sharding options
581
- chunk_size=256,
582
- use_mem_eff_path=False, # True,
583
- process_group=None,
584
- sequence_parallel=True,
585
- device=None,
586
- dtype=None,
587
- ):
588
- factory_kwargs = {"device": device, "dtype": dtype}
589
- super().__init__()
590
-
591
- self.config = config
592
- self.d_model = config.hidden_size
593
- self.d_state = config.mamba_d_state
594
- self.d_conv = config.mamba_d_conv
595
-
596
- self.conv_init = conv_init
597
- self.expand = config.mamba_expand
598
- self.process_group = process_group
599
- self.sequence_parallel = sequence_parallel
600
- self.world_size = 1 if process_group is None else process_group.size()
601
- self.local_rank = 0 if process_group is None else process_group.rank()
602
- self.d_inner = (self.expand * self.d_model) // self.world_size
603
- assert self.d_inner * self.world_size == self.expand * self.d_model
604
- self.headdim = config.mamba2_headdim
605
- self.d_ssm = self.d_inner if d_ssm is None else d_ssm // self.world_size
606
- assert ngroups % self.world_size == 0
607
- self.ngroups = ngroups // self.world_size
608
- assert self.d_ssm % self.headdim == 0
609
- self.nheads = self.d_ssm // self.headdim
610
- self.D_has_hdim = D_has_hdim
611
- self.rmsnorm = rmsnorm
612
- self.norm_before_gate = norm_before_gate
613
- self.dt_limit = dt_limit
614
- self.activation = "silu"
615
- self.chunk_size = chunk_size
616
- self.use_mem_eff_path = use_mem_eff_path
617
- self.layer_idx = layer_idx
618
-
619
- assert (self.d_model * self.expand / self.headdim) % 8 == 0
620
-
621
- self.fused_multihead_config = config.fused_multihead_config
622
- assert self.fused_multihead_config['expand_v'], "Only implemented Hymba for Mamba"
623
-
624
- self.reuse_kv = reuse_kv
625
-
626
- self.hidden_size = config.hidden_size
627
- self.attn_hidden_size = config.hidden_size
628
- self.num_attention_heads = config.num_attention_heads
629
- self.num_key_value_heads = config.num_key_value_heads
630
-
631
- self.k_hidden_size = int(self.num_key_value_heads/self.num_attention_heads * self.attn_hidden_size)
632
- self.v_hidden_size = int(self.num_key_value_heads/self.num_attention_heads * self.attn_hidden_size * self.expand) if self.fused_multihead_config['expand_v'] else int(self.num_key_value_heads/self.num_attention_heads * self.attn_hidden_size)
633
-
634
- if self.fused_multihead_config['expand_v']:
635
- config.v_head_dim = self.d_inner // self.num_attention_heads
636
-
637
- self.self_attn = config.attn_op(config, layer_idx, attn_only_wo_proj=True, reuse_kv=reuse_kv)
638
-
639
- if self.reuse_kv: # Order: [q, z, x, B, C, dt]
640
- d_in_proj = self.attn_hidden_size + 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads
641
- else: # Order: [q, k, v, z, x, B, C, dt]
642
- d_in_proj = self.attn_hidden_size + self.k_hidden_size + self.v_hidden_size + 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads
643
-
644
- if self.process_group is None:
645
- self.in_proj = nn.Linear(self.d_model, d_in_proj, bias=bias, **factory_kwargs)
646
- else:
647
- self.in_proj = ColumnParallelLinear(self.d_model, d_in_proj * self.world_size, bias=bias,
648
- process_group=self.process_group, sequence_parallel=self.sequence_parallel,
649
- **factory_kwargs)
650
-
651
- self.pre_avg_layernorm1 = JambaRMSNorm(self.d_inner, eps=config.rms_norm_eps)
652
- self.pre_avg_layernorm2 = JambaRMSNorm(self.d_inner, eps=config.rms_norm_eps)
653
-
654
- conv_dim = self.d_ssm + 2 * self.ngroups * self.d_state
655
- self.conv1d = nn.Conv1d(
656
- in_channels=conv_dim,
657
- out_channels=conv_dim,
658
- bias=conv_bias,
659
- kernel_size=self.d_conv,
660
- groups=conv_dim,
661
- padding=self.d_conv - 1,
662
- **factory_kwargs,
663
- )
664
- if self.conv_init is not None:
665
- nn.init.uniform_(self.conv1d.weight, -self.conv_init, self.conv_init)
666
-
667
- self.act = nn.SiLU()
668
-
669
- # Initialize log dt bias
670
- dt = torch.exp(
671
- torch.rand(self.nheads, **factory_kwargs) * (math.log(dt_max) - math.log(dt_min))
672
- + math.log(dt_min)
673
- )
674
- dt = torch.clamp(dt, min=dt_init_floor)
675
- # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
676
- inv_dt = dt + torch.log(-torch.expm1(-dt))
677
- self.dt_bias = nn.Parameter(inv_dt)
678
- # Just to be explicit. Without this we already don't put wd on dt_bias because of the check
679
- # name.endswith("bias") in param_grouping.py
680
- self.dt_bias._no_weight_decay = True
681
-
682
- assert A_init_range[0] > 0 and A_init_range[1] >= A_init_range[0]
683
- A = torch.empty(self.nheads, dtype=torch.float32, device=device).uniform_(*A_init_range)
684
- A_log = torch.log(A).to(dtype=dtype)
685
- self.A_log = nn.Parameter(A_log)
686
- self.A_log._no_weight_decay = True
687
-
688
- # D "skip" parameter
689
- self.D = nn.Parameter(torch.ones(self.d_ssm if self.D_has_hdim else self.nheads, device=device))
690
- self.D._no_weight_decay = True
691
-
692
- if self.rmsnorm:
693
- assert RMSNormGated is not None
694
- self.norm = RMSNormGated(self.d_ssm, eps=1e-5, norm_before_gate=self.norm_before_gate,
695
- group_size=self.d_ssm // ngroups, **factory_kwargs)
696
-
697
- if self.process_group is None:
698
- self.out_proj = nn.Linear(self.d_inner, self.d_model, bias=bias, **factory_kwargs)
699
- else:
700
- self.out_proj = RowParallelLinear(self.d_inner * self.world_size, self.d_model, bias=bias,
701
- process_group=self.process_group, sequence_parallel=self.sequence_parallel,
702
- **factory_kwargs)
703
-
704
- def forward(self, hidden_states, attention_mask=None, past_key_value=None, position_ids=None, kv_last_layer=None, use_cache=False, use_swa=False, seqlen=None, seq_idx=None, cu_seqlens=None, inference_params=None):
705
- """
706
- hidden_states: (batch, seqlen, hidden_dim) if seqlen=None.
707
- If seqlen is not None, hidden_states is (batch * seqlen, hidden_dim). This is so that when we
708
- split hidden_states during sequence parallel, we split the batch * seqlen dimension
709
- (in case batch is small).
710
- Returns: same shape as u
711
- """
712
- # assert past_key_value is None, "Not implemented yet!!!"
713
-
714
- seqlen_og = seqlen
715
- if seqlen is None:
716
- batch, seqlen, dim = hidden_states.shape
717
- else:
718
- batch_seqlen, dim = hidden_states.shape
719
- batch = batch_seqlen // seqlen
720
-
721
- conv_state, ssm_state = None, None
722
- if inference_params is not None:
723
- inference_batch = cu_seqlens.shape[0] - 1 if cu_seqlens is not None else batch
724
- conv_state, ssm_state = self._get_states_from_cache(inference_params, inference_batch)
725
- if inference_params.seqlen_offset > 0:
726
- # The states are updated inplace
727
- out, _, _ = self.step(hidden_states, conv_state, ssm_state)
728
- return out
729
-
730
- zxbcdt = self.in_proj(hidden_states) # (B, L, d_in_proj) or (B * L, d_in_proj)
731
-
732
- if self.reuse_kv:
733
- query_states, zxbcdt = zxbcdt.tensor_split((self.attn_hidden_size,), dim=-1)
734
- # query_states = query_states.transpose(1,2)
735
- else:
736
- query_states, key_states, value_states, zxbcdt = zxbcdt.tensor_split((self.attn_hidden_size, self.attn_hidden_size + self.k_hidden_size, self.attn_hidden_size + self.k_hidden_size + self.v_hidden_size), dim=-1)
737
-
738
- # query_states = query_states.transpose(1,2)
739
- # key_states = key_states.transpose(1,2)
740
- # value_states = value_states.transpose(1,2)
741
-
742
- if self.reuse_kv:
743
- assert kv_last_layer is not None
744
- attn_outputs, attn_key_value = self.self_attn(attention_mask=attention_mask, position_ids=position_ids, query_states=query_states, kv_last_layer=kv_last_layer, use_swa=use_swa, use_cache=use_cache, past_key_value=past_key_value)
745
- else:
746
- if 'use_linear_attn' in self.fused_multihead_config and self.fused_multihead_config['use_linear_attn'] and self.linear_attn_op == 'gla':
747
- attn_outputs, _, attn_key_value = self.self_attn(hidden_states=value_states, position_ids=position_ids, attention_mask=attention_mask, Q=query_states, K=key_states, V=value_states, past_key_value=past_key_value)
748
- else:
749
- attn_outputs, attn_key_value = self.self_attn(attention_mask=attention_mask, position_ids=position_ids, query_states=query_states, key_states=key_states, value_states=value_states, use_swa=use_swa, use_cache=use_cache, past_key_value=past_key_value)
750
-
751
-
752
- if seqlen_og is not None:
753
- zxbcdt = rearrange(zxbcdt, "(b l) d -> b l d", l=seqlen)
754
- # If the model is loaded in fp16, without the .float() here, A might be -inf
755
- A = -torch.exp(self.A_log.float()) # (nheads) or (d_inner, d_state)
756
- dt_limit_kwargs = {} if self.dt_limit == (0.0, float("inf")) else dict(dt_limit=self.dt_limit)
757
- if self.use_mem_eff_path and inference_params is None:
758
- out = mamba_split_conv1d_scan_combined(
759
- zxbcdt,
760
- rearrange(self.conv1d.weight, "d 1 w -> d w"),
761
- self.conv1d.bias,
762
- self.dt_bias,
763
- A,
764
- D=rearrange(self.D, "(h p) -> h p", p=self.headdim) if self.D_has_hdim else self.D,
765
- chunk_size=self.chunk_size,
766
- seq_idx=seq_idx,
767
- activation=self.activation,
768
- rmsnorm_weight=self.norm.weight if self.rmsnorm else None,
769
- rmsnorm_eps=self.norm.eps if self.rmsnorm else 1e-6,
770
- outproj_weight=self.out_proj.weight,
771
- outproj_bias=self.out_proj.bias,
772
- headdim=None if self.D_has_hdim else self.headdim,
773
- ngroups=self.ngroups,
774
- norm_before_gate=self.norm_before_gate,
775
- **dt_limit_kwargs,
776
- )
777
- if seqlen_og is not None:
778
- out = rearrange(out, "b l d -> (b l) d")
779
- if self.process_group is not None:
780
- reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce
781
- out = reduce_fn(out, self.process_group)
782
- else:
783
- d_mlp = (zxbcdt.shape[-1] - 2 * self.d_ssm - 2 * self.ngroups * self.d_state - self.nheads) // 2
784
-
785
- z0, x0, z, xBC, dt = torch.split(
786
- zxbcdt,
787
- [d_mlp, d_mlp, self.d_ssm, self.d_ssm + 2 * self.ngroups * self.d_state, self.nheads],
788
- dim=-1
789
- )
790
- if conv_state is not None:
791
- if cu_seqlens is None:
792
- # If we just take xBC[:, :, -self.d_conv :], it will error if seqlen < self.d_conv
793
- # Instead F.pad will pad with zeros if seqlen < self.d_conv, and truncate otherwise.
794
- xBC_t = rearrange(xBC, "b l d -> b d l")
795
- conv_state.copy_(F.pad(xBC_t, (self.d_conv - xBC_t.shape[-1], 0))) # Update state (B D W)
796
- else:
797
- assert causal_conv1d_varlen_states is not None, "varlen inference requires causal_conv1d package"
798
- assert batch == 1, "varlen inference only supports batch dimension 1"
799
- conv_varlen_states = causal_conv1d_varlen_states(
800
- xBC.squeeze(0), cu_seqlens, state_len=conv_state.shape[-1]
801
- )
802
- conv_state.copy_(conv_varlen_states)
803
- assert self.activation in ["silu", "swish"]
804
- if causal_conv1d_fn is None or self.activation not in ["silu", "swish"]:
805
- assert seq_idx is None, "varlen conv1d requires the causal_conv1d package"
806
- xBC = self.act(
807
- self.conv1d(xBC.transpose(1, 2)).transpose(1, 2)[:, -(self.dconv - 1):]
808
- ) # (B, L, self.d_ssm + 2 * ngroups * d_state)
809
- else:
810
- xBC = causal_conv1d_fn(
811
- xBC.transpose(1, 2),
812
- rearrange(self.conv1d.weight, "d 1 w -> d w"),
813
- bias=self.conv1d.bias,
814
- activation=self.activation,
815
- # seq_idx=seq_idx,
816
- ).transpose(1, 2)
817
- x, B, C = torch.split(xBC, [self.d_ssm, self.ngroups * self.d_state, self.ngroups * self.d_state], dim=-1)
818
-
819
- y = mamba_chunk_scan_combined(
820
- rearrange(x, "b l (h p) -> b l h p", p=self.headdim),
821
- dt,
822
- A,
823
- rearrange(B, "b l (g n) -> b l g n", g=self.ngroups),
824
- rearrange(C, "b l (g n) -> b l g n", g=self.ngroups),
825
- chunk_size=self.chunk_size,
826
- D=rearrange(self.D, "(h p) -> h p", p=self.headdim) if self.D_has_hdim else self.D,
827
- z=rearrange(z, "b l (h p) -> b l h p", p=self.headdim) if not self.rmsnorm else None,
828
- dt_bias=self.dt_bias,
829
- dt_softplus=True,
830
- seq_idx=seq_idx,
831
- cu_seqlens=cu_seqlens,
832
- **dt_limit_kwargs,
833
- return_final_states=ssm_state is not None,
834
- return_varlen_states=cu_seqlens is not None and inference_params is not None,
835
- )
836
- if ssm_state is not None:
837
- y, last_state, *rest = y
838
- if cu_seqlens is None:
839
- ssm_state.copy_(last_state)
840
- else:
841
- varlen_states = rest[0]
842
- ssm_state.copy_(varlen_states)
843
- y = rearrange(y, "b l h p -> b l (h p)")
844
- if self.rmsnorm:
845
- y = self.norm(y, z)
846
- if d_mlp > 0:
847
- y = torch.cat([F.silu(z0) * x0, y], dim=-1)
848
- if seqlen_og is not None:
849
- y = rearrange(y, "b l d -> (b l) d")
850
-
851
- scan_outputs = y
852
- if 'repeat_v' in self.fused_multihead_config and self.fused_multihead_config['repeat_v']:
853
- num_repeat = scan_outputs.shape[-1] // attn_outputs.shape[-1]
854
- attn_outputs = attn_outputs.repeat(1, 1, num_repeat)
855
-
856
- hidden_states = (self.pre_avg_layernorm1(attn_outputs) + self.pre_avg_layernorm2(scan_outputs)) / 2
857
- out = self.out_proj(hidden_states)
858
-
859
- return out, attn_key_value, past_key_value
860
-
861
-
862
- def step(self, hidden_states, conv_state, ssm_state):
863
- dtype = hidden_states.dtype
864
- assert hidden_states.shape[1] == 1, "Only support decoding with 1 token at a time for now"
865
- zxbcdt = self.in_proj(hidden_states.squeeze(1)) # (B 2D)
866
- d_mlp = (zxbcdt.shape[-1] - 2 * self.d_ssm - 2 * self.ngroups * self.d_state - self.nheads) // 2
867
- z0, x0, z, xBC, dt = torch.split(
868
- zxbcdt,
869
- [d_mlp, d_mlp, self.d_ssm, self.d_ssm + 2 * self.ngroups * self.d_state, self.nheads],
870
- dim=-1
871
- )
872
-
873
- # Conv step
874
- if causal_conv1d_update is None:
875
- conv_state.copy_(torch.roll(conv_state, shifts=-1, dims=-1)) # Update state (B D W)
876
- conv_state[:, :, -1] = xBC
877
- xBC = torch.sum(conv_state * rearrange(self.conv1d.weight, "d 1 w -> d w"), dim=-1) # (B D)
878
- if self.conv1d.bias is not None:
879
- xBC = xBC + self.conv1d.bias
880
- xBC = self.act(xBC).to(dtype=dtype)
881
- else:
882
- xBC = causal_conv1d_update(
883
- xBC,
884
- conv_state,
885
- rearrange(self.conv1d.weight, "d 1 w -> d w"),
886
- self.conv1d.bias,
887
- self.activation,
888
- )
889
-
890
- x, B, C = torch.split(xBC, [self.d_ssm, self.ngroups * self.d_state, self.ngroups * self.d_state], dim=-1)
891
- A = -torch.exp(self.A_log.float()) # (nheads,)
892
-
893
- # SSM step
894
- if selective_state_update is None:
895
- assert self.ngroups == 1, "Only support ngroups=1 for this inference code path"
896
- # Discretize A and B
897
- dt = F.softplus(dt + self.dt_bias.to(dtype=dt.dtype)) # (batch, nheads)
898
- dA = torch.exp(dt * A) # (batch, nheads)
899
- x = rearrange(x, "b (h p) -> b h p", p=self.headdim)
900
- dBx = torch.einsum("bh,bn,bhp->bhpn", dt, B, x)
901
- ssm_state.copy_(ssm_state * rearrange(dA, "b h -> b h 1 1") + dBx)
902
- y = torch.einsum("bhpn,bn->bhp", ssm_state.to(dtype), C)
903
- y = y + rearrange(self.D.to(dtype), "h -> h 1") * x
904
- y = rearrange(y, "b h p -> b (h p)")
905
- if not self.rmsnorm:
906
- y = y * self.act(z) # (B D)
907
- else:
908
- A = repeat(A, "h -> h p n", p=self.headdim, n=self.d_state).to(dtype=torch.float32)
909
- dt = repeat(dt, "b h -> b h p", p=self.headdim)
910
- dt_bias = repeat(self.dt_bias, "h -> h p", p=self.headdim)
911
- D = repeat(self.D, "h -> h p", p=self.headdim)
912
- B = rearrange(B, "b (g n) -> b g n", g=self.ngroups)
913
- C = rearrange(C, "b (g n) -> b g n", g=self.ngroups)
914
- x_reshaped = rearrange(x, "b (h p) -> b h p", p=self.headdim)
915
- if not self.rmsnorm:
916
- z = rearrange(z, "b (h p) -> b h p", p=self.headdim)
917
- y = selective_state_update(
918
- ssm_state, x_reshaped, dt, A, B, C, D, z=z if not self.rmsnorm else None,
919
- dt_bias=dt_bias, dt_softplus=True
920
- )
921
- y = rearrange(y, "b h p -> b (h p)")
922
- if self.rmsnorm:
923
- y = self.norm(y, z)
924
- if d_mlp > 0:
925
- y = torch.cat([F.silu(z0) * x0, y], dim=-1)
926
- out = self.out_proj(y)
927
-
928
- print(out)
929
- input()
930
- return out.unsqueeze(1), conv_state, ssm_state
931
-
932
- def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
933
- device = self.out_proj.weight.device
934
- conv_dtype = self.conv1d.weight.dtype if dtype is None else dtype
935
- conv_state = torch.zeros(
936
- batch_size, self.d_conv, self.conv1d.weight.shape[0], device=device, dtype=conv_dtype
937
- ).transpose(1, 2)
938
- ssm_dtype = self.in_proj.weight.dtype if dtype is None else dtype
939
- ssm_state = torch.zeros(
940
- batch_size, self.nheads, self.headdim, self.d_state, device=device, dtype=ssm_dtype
941
- )
942
- return conv_state, ssm_state
943
-
944
- def _get_states_from_cache(self, inference_params, batch_size, initialize_states=False):
945
- assert self.layer_idx is not None
946
- if self.layer_idx not in inference_params.key_value_memory_dict:
947
- batch_shape = (batch_size,)
948
- conv_state = torch.zeros(
949
- batch_size,
950
- self.d_conv,
951
- self.conv1d.weight.shape[0],
952
- device=self.conv1d.weight.device,
953
- dtype=self.conv1d.weight.dtype,
954
- ).transpose(1, 2)
955
- ssm_state = torch.zeros(
956
- batch_size,
957
- self.nheads,
958
- self.headdim,
959
- self.d_state,
960
- device=self.in_proj.weight.device,
961
- dtype=self.in_proj.weight.dtype,
962
- )
963
- inference_params.key_value_memory_dict[self.layer_idx] = (conv_state, ssm_state)
964
- else:
965
- conv_state, ssm_state = inference_params.key_value_memory_dict[self.layer_idx]
966
- # TODO: What if batch size changes between generation, and we reuse the same states?
967
- if initialize_states:
968
- conv_state.zero_()
969
- ssm_state.zero_()
970
- return conv_state, ssm_state
971
-
972
-
973
- class Mamba2_Multihead(nn.Module):
974
- def __init__(
975
- self,
976
- config,
977
- conv_init=None,
978
- headdim=64,
979
- d_ssm=None, # If not None, we only apply SSM on this many dimensions, the rest uses gated MLP
980
- ngroups=1,
981
- A_init_range=(1, 16),
982
- D_has_hdim=False,
983
- rmsnorm=True,
984
- norm_before_gate=False,
985
- dt_min=0.001,
986
- dt_max=0.1,
987
- dt_init_floor=1e-4,
988
- dt_limit=(0.0, float("inf")),
989
- bias=False,
990
- conv_bias=True,
991
- # Fused kernel and sharding options
992
- chunk_size=256,
993
- use_mem_eff_path=False, # True,
994
- layer_idx=None, # Absorb kwarg for general module
995
- process_group=None,
996
- sequence_parallel=True,
997
- device=None,
998
- dtype=None,
999
- ):
1000
- factory_kwargs = {"device": device, "dtype": dtype}
1001
- super().__init__()
1002
-
1003
- self.config = config
1004
- self.d_model = config.hidden_size
1005
- self.d_state = config.mamba_d_state
1006
- self.d_conv = config.mamba_d_conv
1007
-
1008
- self.conv_init = conv_init
1009
- self.expand = config.mamba_expand
1010
- self.process_group = process_group
1011
- self.sequence_parallel = sequence_parallel
1012
- self.world_size = 1 if process_group is None else process_group.size()
1013
- self.local_rank = 0 if process_group is None else process_group.rank()
1014
- self.d_inner = (self.expand * self.d_model) // self.world_size
1015
- assert self.d_inner * self.world_size == self.expand * self.d_model
1016
- self.headdim = config.mamba2_headdim
1017
- self.d_ssm = self.d_inner if d_ssm is None else d_ssm // self.world_size
1018
- assert ngroups % self.world_size == 0
1019
- self.ngroups = ngroups // self.world_size
1020
- assert self.d_ssm % self.headdim == 0
1021
- self.nheads = self.d_ssm // self.headdim
1022
- self.D_has_hdim = D_has_hdim
1023
- self.rmsnorm = rmsnorm
1024
- self.norm_before_gate = norm_before_gate
1025
- self.dt_limit = dt_limit
1026
- self.activation = "silu"
1027
- self.chunk_size = chunk_size
1028
- self.use_mem_eff_path = use_mem_eff_path
1029
- self.layer_idx = layer_idx
1030
-
1031
- assert (self.d_model * self.expand / self.headdim) % 8 == 0
1032
-
1033
- self.mamba_multihead_config = config.mamba_multihead_config
1034
- self.share_ratio = self.mamba_multihead_config['share_ratio']
1035
-
1036
- self.reuse_ssm = self.mamba_multihead_config['reuse_ssm']
1037
- self.num_ssm_param = 1 if self.reuse_ssm else self.share_ratio
1038
-
1039
- if self.reuse_ssm:
1040
- if self.mamba_multihead_config['alpha_mode'] == 'learnable':
1041
- self.alpha = nn.Parameter(torch.ones(self.share_ratio))
1042
- elif self.mamba_multihead_config['alpha_mode'] == 'manual':
1043
- manual_alpha_base = self.mamba_multihead_config['manual_alpha_base']
1044
- self.alpha = [1 / manual_alpha_base ** k for k in range(self.share_ratio)]
1045
- else:
1046
- raise ValueError(f"No such alpha_mode: {self.mamba_multihead_config['alpha_mode']}")
1047
-
1048
- # Order: [z, x, B, C, dt]
1049
- d_in_proj = 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads * self.num_ssm_param
1050
- if self.process_group is None:
1051
- self.in_proj = nn.Linear(self.d_model, d_in_proj, bias=bias, **factory_kwargs)
1052
- else:
1053
- self.in_proj = ColumnParallelLinear(self.d_model, d_in_proj * self.world_size, bias=bias,
1054
- process_group=self.process_group, sequence_parallel=self.sequence_parallel,
1055
- **factory_kwargs)
1056
-
1057
- conv_dim = self.d_ssm + 2 * self.ngroups * self.d_state
1058
- self.conv1d = nn.Conv1d(
1059
- in_channels=conv_dim,
1060
- out_channels=conv_dim,
1061
- bias=conv_bias,
1062
- kernel_size=self.d_conv,
1063
- groups=conv_dim,
1064
- padding=self.d_conv - 1,
1065
- **factory_kwargs,
1066
- )
1067
- if self.conv_init is not None:
1068
- nn.init.uniform_(self.conv1d.weight, -self.conv_init, self.conv_init)
1069
-
1070
- self.act = nn.SiLU()
1071
-
1072
- # Initialize log dt bias
1073
- dt = torch.exp(
1074
- torch.rand(self.nheads, **factory_kwargs) * (math.log(dt_max) - math.log(dt_min))
1075
- + math.log(dt_min)
1076
- )
1077
- dt = torch.clamp(dt, min=dt_init_floor)
1078
- # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
1079
- inv_dt = dt + torch.log(-torch.expm1(-dt))
1080
- self.dt_bias = nn.ParameterList([nn.Parameter(inv_dt) for _ in range(self.num_ssm_param)])
1081
- # Just to be explicit. Without this we already don't put wd on dt_bias because of the check
1082
- # name.endswith("bias") in param_grouping.py
1083
- self.dt_bias._no_weight_decay = True
1084
-
1085
- assert A_init_range[0] > 0 and A_init_range[1] >= A_init_range[0]
1086
- A = torch.empty(self.nheads, dtype=torch.float32, device=device).uniform_(*A_init_range)
1087
- A_log = torch.log(A).to(dtype=dtype)
1088
- self.A_log = nn.ParameterList([nn.Parameter(A_log) for _ in range(self.num_ssm_param)])
1089
- self.A_log._no_weight_decay = True
1090
-
1091
- # D "skip" parameter
1092
- self.D = nn.ParameterList([nn.Parameter(torch.ones(self.d_ssm if self.D_has_hdim else self.nheads, device=device)) for _ in range(self.num_ssm_param)])
1093
- self.D._no_weight_decay = True
1094
-
1095
- if self.rmsnorm:
1096
- assert RMSNormGated is not None
1097
- self.norm = RMSNormGated(self.d_ssm, eps=1e-5, norm_before_gate=self.norm_before_gate,
1098
- group_size=self.d_ssm // ngroups, **factory_kwargs)
1099
-
1100
- if self.process_group is None:
1101
- self.out_proj = nn.Linear(self.d_inner, self.d_model, bias=bias, **factory_kwargs)
1102
- else:
1103
- self.out_proj = RowParallelLinear(self.d_inner * self.world_size, self.d_model, bias=bias,
1104
- process_group=self.process_group, sequence_parallel=self.sequence_parallel,
1105
- **factory_kwargs)
1106
-
1107
-
1108
- if self.mamba_multihead_config['merge_op'] == 'norm':
1109
- self.multihead_layernorm = nn.ModuleList([JambaRMSNorm(self.d_ssm, eps=config.rms_norm_eps) for _ in range(self.share_ratio)])
1110
- elif self.mamba_multihead_config['merge_op'] == 'scalar_gate':
1111
- self.multi_head_selection_layer = nn.Linear(self.d_ssm, self.share_ratio)
1112
- elif self.mamba_multihead_config['merge_op'] == 'concat':
1113
- assert self.d_ssm % self.share_ratio == 0
1114
- self.multihead_layernorm = nn.ModuleList([JambaRMSNorm(self.d_ssm, eps=config.rms_norm_eps) for _ in range(self.share_ratio)])
1115
- self.reduction_layer = nn.Linear(self.d_ssm, self.d_ssm//self.share_ratio)
1116
-
1117
-
1118
- def forward(self, hidden_states, attention_mask=None, past_key_value=None, seqlen=None, seq_idx=None, cu_seqlens=None, inference_params=None):
1119
- """
1120
- hidden_states: (batch, seqlen, hidden_dim) if seqlen=None.
1121
- If seqlen is not None, hidden_states is (batch * seqlen, hidden_dim). This is so that when we
1122
- split hidden_states during sequence parallel, we split the batch * seqlen dimension
1123
- (in case batch is small).
1124
- Returns: same shape as u
1125
- """
1126
- assert past_key_value is None, "Not implemented yet!!!"
1127
-
1128
- seqlen_og = seqlen
1129
- if seqlen is None:
1130
- batch, seqlen, dim = hidden_states.shape
1131
- else:
1132
- batch_seqlen, dim = hidden_states.shape
1133
- batch = batch_seqlen // seqlen
1134
-
1135
- conv_state, ssm_state = None, None
1136
- if inference_params is not None:
1137
- inference_batch = cu_seqlens.shape[0] - 1 if cu_seqlens is not None else batch
1138
- conv_state, ssm_state = self._get_states_from_cache(inference_params, inference_batch)
1139
- if inference_params.seqlen_offset > 0:
1140
- # The states are updated inplace
1141
- out, _, _ = self.step(hidden_states, conv_state, ssm_state)
1142
- return out
1143
-
1144
- zxbcdt = self.in_proj(hidden_states) # (B, L, d_in_proj) or (B * L, d_in_proj)
1145
- if seqlen_og is not None:
1146
- zxbcdt = rearrange(zxbcdt, "(b l) d -> b l d", l=seqlen)
1147
-
1148
- dt_limit_kwargs = {} if self.dt_limit == (0.0, float("inf")) else dict(dt_limit=self.dt_limit)
1149
- if self.use_mem_eff_path and inference_params is None:
1150
- # If the model is loaded in fp16, without the .float() here, A might be -inf
1151
- A = -torch.exp(self.A_log.float()) # (nheads) or (d_inner, d_state)
1152
-
1153
- out = mamba_split_conv1d_scan_combined(
1154
- zxbcdt,
1155
- rearrange(self.conv1d.weight, "d 1 w -> d w"),
1156
- self.conv1d.bias,
1157
- self.dt_bias,
1158
- A,
1159
- D=rearrange(self.D, "(h p) -> h p", p=self.headdim) if self.D_has_hdim else self.D,
1160
- chunk_size=self.chunk_size,
1161
- seq_idx=seq_idx,
1162
- activation=self.activation,
1163
- rmsnorm_weight=self.norm.weight if self.rmsnorm else None,
1164
- rmsnorm_eps=self.norm.eps if self.rmsnorm else 1e-6,
1165
- outproj_weight=self.out_proj.weight,
1166
- outproj_bias=self.out_proj.bias,
1167
- headdim=None if self.D_has_hdim else self.headdim,
1168
- ngroups=self.ngroups,
1169
- norm_before_gate=self.norm_before_gate,
1170
- **dt_limit_kwargs,
1171
- )
1172
- if seqlen_og is not None:
1173
- out = rearrange(out, "b l d -> (b l) d")
1174
- if self.process_group is not None:
1175
- reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce
1176
- out = reduce_fn(out, self.process_group)
1177
- else:
1178
- d_mlp = (zxbcdt.shape[-1] - 2 * self.d_ssm - 2 * self.ngroups * self.d_state - self.nheads * self.num_ssm_param) // 2
1179
- z0, x0, z, xBC, dt = torch.split(
1180
- zxbcdt,
1181
- [d_mlp, d_mlp, self.d_ssm, self.d_ssm + 2 * self.ngroups * self.d_state, self.nheads * self.num_ssm_param],
1182
- dim=-1
1183
- )
1184
-
1185
- if conv_state is not None:
1186
- if cu_seqlens is None:
1187
- # If we just take xBC[:, :, -self.d_conv :], it will error if seqlen < self.d_conv
1188
- # Instead F.pad will pad with zeros if seqlen < self.d_conv, and truncate otherwise.
1189
- xBC_t = rearrange(xBC, "b l d -> b d l")
1190
- conv_state.copy_(F.pad(xBC_t, (self.d_conv - xBC_t.shape[-1], 0))) # Update state (B D W)
1191
- else:
1192
- assert causal_conv1d_varlen_states is not None, "varlen inference requires causal_conv1d package"
1193
- assert batch == 1, "varlen inference only supports batch dimension 1"
1194
- conv_varlen_states = causal_conv1d_varlen_states(
1195
- xBC.squeeze(0), cu_seqlens, state_len=conv_state.shape[-1]
1196
- )
1197
- conv_state.copy_(conv_varlen_states)
1198
- assert self.activation in ["silu", "swish"]
1199
- if causal_conv1d_fn is None or self.activation not in ["silu", "swish"]:
1200
- assert seq_idx is None, "varlen conv1d requires the causal_conv1d package"
1201
- xBC = self.act(
1202
- self.conv1d(xBC.transpose(1, 2)).transpose(1, 2)[:, -(self.dconv - 1):]
1203
- ) # (B, L, self.d_ssm + 2 * ngroups * d_state)
1204
- else:
1205
- xBC = causal_conv1d_fn(
1206
- xBC.transpose(1, 2),
1207
- rearrange(self.conv1d.weight, "d 1 w -> d w"),
1208
- bias=self.conv1d.bias,
1209
- activation=self.activation,
1210
- seq_idx=seq_idx,
1211
- ).transpose(1, 2)
1212
- x, B, C = torch.split(xBC, [self.d_ssm, self.ngroups * self.d_state, self.ngroups * self.d_state], dim=-1)
1213
-
1214
- x = rearrange(x, "b l (h p) -> b l h p", p=self.headdim)
1215
- B = rearrange(B, "b l (g n) -> b l g n", g=self.ngroups)
1216
- C = rearrange(C, "b l (g n) -> b l g n", g=self.ngroups)
1217
-
1218
- outputs_list = []
1219
- dt_list = dt
1220
- for i in range(self.num_ssm_param):
1221
- dt = dt_list[..., self.nheads*i:self.nheads*(i+1)]
1222
- A = -torch.exp(self.A_log[i].float()) # (nheads) or (d_inner, d_state)
1223
- D = rearrange(self.D[i], "(h p) -> h p", p=self.headdim) if self.D_has_hdim else self.D[i]
1224
- dt_bias = self.dt_bias[i]
1225
-
1226
- if self.reuse_ssm:
1227
- #### duplicate heads with different decays
1228
- if self.mamba_multihead_config['alpha_mode'] == 'learnable':
1229
- decay = self.alpha # [share_ratio]
1230
- elif self.mamba_multihead_config['alpha_mode'] == 'manual':
1231
- decay = torch.tensor(self.alpha).to(dt) # [share_ratio]
1232
-
1233
- dt = dt.repeat(1, 1, self.share_ratio) # [bs, seq_len, self.nheads * share_ratio]
1234
- decay = decay.view(-1, 1).repeat(1, self.nheads).view(-1) # [self.nheads * share_ratio]
1235
- dt = dt * decay # [bs, seq_len, nheads * share_ratio]
1236
-
1237
- dt_bias = dt_bias.repeat(self.share_ratio) * decay # [nheads * share_ratio]
1238
-
1239
- x = x.repeat(1,1,self.share_ratio,1) # [bs, seq_len, nheads * share_ratio, head_dim]
1240
- D = D.repeat(self.share_ratio,1) if self.D_has_hdim else D.repeat(self.share_ratio) # [nheads * share_ratio]
1241
- A = A.repeat(self.share_ratio) # [nheads * share_ratio]
1242
-
1243
- y = mamba_chunk_scan_combined(
1244
- x,
1245
- dt,
1246
- A,
1247
- B,
1248
- C,
1249
- chunk_size=self.chunk_size,
1250
- D=D,
1251
- z=rearrange(z, "b l (h p) -> b l h p", p=self.headdim).repeat(1,1,self.share_ratio,1) if not self.rmsnorm else None,
1252
- dt_bias=dt_bias,
1253
- dt_softplus=True,
1254
- seq_idx=seq_idx,
1255
- cu_seqlens=cu_seqlens,
1256
- **dt_limit_kwargs,
1257
- return_final_states=ssm_state is not None,
1258
- return_varlen_states=cu_seqlens is not None and inference_params is not None,
1259
- )
1260
- if ssm_state is not None:
1261
- y, last_state, *rest = y
1262
- if cu_seqlens is None:
1263
- ssm_state.copy_(last_state)
1264
- else:
1265
- varlen_states = rest[0]
1266
- ssm_state.copy_(varlen_states)
1267
-
1268
- outputs_list.append(y)
1269
-
1270
- if len(outputs_list) > 1:
1271
- y = torch.cat(outputs_list, dim=2)
1272
-
1273
- #### merge heads
1274
- num_repeat = y.shape[2] // self.nheads
1275
- head_outputs = torch.chunk(y, num_repeat, dim=2)
1276
- head_outputs = [rearrange(item, "b l h p -> b l (h p)") for item in head_outputs]
1277
-
1278
- if self.mamba_multihead_config['merge_op'] == 'norm':
1279
- y = sum([self.multihead_layernorm[k](item) for k, item in enumerate(head_outputs)])
1280
-
1281
- elif self.mamba_multihead_config['merge_op'] == 'concat':
1282
- head_outputs = [self.reduction_layer(self.multihead_layernorm[k](item)) for k, item in enumerate(head_outputs)]
1283
- y = torch.cat(head_outputs, dim=-1)
1284
- else:
1285
- raise ValueError(f"No such merge_op: {self.mamba_multihead_config['merge_op']}")
1286
-
1287
- if self.rmsnorm:
1288
- y = self.norm(y, z)
1289
- if d_mlp > 0:
1290
- y = torch.cat([F.silu(z0) * x0, y], dim=-1)
1291
- if seqlen_og is not None:
1292
- y = rearrange(y, "b l d -> (b l) d")
1293
- out = self.out_proj(y)
1294
- return out, past_key_value
1295
-
1296
- def step(self, hidden_states, conv_state, ssm_state):
1297
- dtype = hidden_states.dtype
1298
- assert hidden_states.shape[1] == 1, "Only support decoding with 1 token at a time for now"
1299
- zxbcdt = self.in_proj(hidden_states.squeeze(1)) # (B 2D)
1300
- d_mlp = (zxbcdt.shape[-1] - 2 * self.d_ssm - 2 * self.ngroups * self.d_state - self.nheads) // 2
1301
- z0, x0, z, xBC, dt = torch.split(
1302
- zxbcdt,
1303
- [d_mlp, d_mlp, self.d_ssm, self.d_ssm + 2 * self.ngroups * self.d_state, self.nheads],
1304
- dim=-1
1305
- )
1306
-
1307
- # Conv step
1308
- if causal_conv1d_update is None:
1309
- conv_state.copy_(torch.roll(conv_state, shifts=-1, dims=-1)) # Update state (B D W)
1310
- conv_state[:, :, -1] = xBC
1311
- xBC = torch.sum(conv_state * rearrange(self.conv1d.weight, "d 1 w -> d w"), dim=-1) # (B D)
1312
- if self.conv1d.bias is not None:
1313
- xBC = xBC + self.conv1d.bias
1314
- xBC = self.act(xBC).to(dtype=dtype)
1315
- else:
1316
- xBC = causal_conv1d_update(
1317
- xBC,
1318
- conv_state,
1319
- rearrange(self.conv1d.weight, "d 1 w -> d w"),
1320
- self.conv1d.bias,
1321
- self.activation,
1322
- )
1323
-
1324
- x, B, C = torch.split(xBC, [self.d_ssm, self.ngroups * self.d_state, self.ngroups * self.d_state], dim=-1)
1325
- A = -torch.exp(self.A_log.float()) # (nheads,)
1326
-
1327
- # SSM step
1328
- if selective_state_update is None:
1329
- assert self.ngroups == 1, "Only support ngroups=1 for this inference code path"
1330
- # Discretize A and B
1331
- dt = F.softplus(dt + self.dt_bias.to(dtype=dt.dtype)) # (batch, nheads)
1332
- dA = torch.exp(dt * A) # (batch, nheads)
1333
- x = rearrange(x, "b (h p) -> b h p", p=self.headdim)
1334
- dBx = torch.einsum("bh,bn,bhp->bhpn", dt, B, x)
1335
- ssm_state.copy_(ssm_state * rearrange(dA, "b h -> b h 1 1") + dBx)
1336
- y = torch.einsum("bhpn,bn->bhp", ssm_state.to(dtype), C)
1337
- y = y + rearrange(self.D.to(dtype), "h -> h 1") * x
1338
- y = rearrange(y, "b h p -> b (h p)")
1339
- if not self.rmsnorm:
1340
- y = y * self.act(z) # (B D)
1341
- else:
1342
- A = repeat(A, "h -> h p n", p=self.headdim, n=self.d_state).to(dtype=torch.float32)
1343
- dt = repeat(dt, "b h -> b h p", p=self.headdim)
1344
- dt_bias = repeat(self.dt_bias, "h -> h p", p=self.headdim)
1345
- D = repeat(self.D, "h -> h p", p=self.headdim)
1346
- B = rearrange(B, "b (g n) -> b g n", g=self.ngroups)
1347
- C = rearrange(C, "b (g n) -> b g n", g=self.ngroups)
1348
- x_reshaped = rearrange(x, "b (h p) -> b h p", p=self.headdim)
1349
- if not self.rmsnorm:
1350
- z = rearrange(z, "b (h p) -> b h p", p=self.headdim)
1351
- y = selective_state_update(
1352
- ssm_state, x_reshaped, dt, A, B, C, D, z=z if not self.rmsnorm else None,
1353
- dt_bias=dt_bias, dt_softplus=True
1354
- )
1355
- y = rearrange(y, "b h p -> b (h p)")
1356
- if self.rmsnorm:
1357
- y = self.norm(y, z)
1358
- if d_mlp > 0:
1359
- y = torch.cat([F.silu(z0) * x0, y], dim=-1)
1360
- out = self.out_proj(y)
1361
- return out.unsqueeze(1), conv_state, ssm_state
1362
-
1363
- def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
1364
- device = self.out_proj.weight.device
1365
- conv_dtype = self.conv1d.weight.dtype if dtype is None else dtype
1366
- conv_state = torch.zeros(
1367
- batch_size, self.d_conv, self.conv1d.weight.shape[0], device=device, dtype=conv_dtype
1368
- ).transpose(1, 2)
1369
- ssm_dtype = self.in_proj.weight.dtype if dtype is None else dtype
1370
- ssm_state = torch.zeros(
1371
- batch_size, self.nheads, self.headdim, self.d_state, device=device, dtype=ssm_dtype
1372
- )
1373
- return conv_state, ssm_state
1374
-
1375
- def _get_states_from_cache(self, inference_params, batch_size, initialize_states=False):
1376
- assert self.layer_idx is not None
1377
- if self.layer_idx not in inference_params.key_value_memory_dict:
1378
- batch_shape = (batch_size,)
1379
- conv_state = torch.zeros(
1380
- batch_size,
1381
- self.d_conv,
1382
- self.conv1d.weight.shape[0],
1383
- device=self.conv1d.weight.device,
1384
- dtype=self.conv1d.weight.dtype,
1385
- ).transpose(1, 2)
1386
- ssm_state = torch.zeros(
1387
- batch_size,
1388
- self.nheads,
1389
- self.headdim,
1390
- self.d_state,
1391
- device=self.in_proj.weight.device,
1392
- dtype=self.in_proj.weight.dtype,
1393
- )
1394
- inference_params.key_value_memory_dict[self.layer_idx] = (conv_state, ssm_state)
1395
- else:
1396
- conv_state, ssm_state = inference_params.key_value_memory_dict[self.layer_idx]
1397
- # TODO: What if batch size changes between generation, and we reuse the same states?
1398
- if initialize_states:
1399
- conv_state.zero_()
1400
- ssm_state.zero_()
1401
- return conv_state, ssm_state
1402
-
1403
-
1404
-
1405
-
1406
- # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Jamba
1407
- class JambaRMSNorm(nn.Module):
1408
- def __init__(self, hidden_size, eps=1e-6):
1409
- """
1410
- JambaRMSNorm is equivalent to T5LayerNorm
1411
- """
1412
- super().__init__()
1413
- self.weight = nn.Parameter(torch.ones(hidden_size))
1414
- self.variance_epsilon = eps
1415
-
1416
- def forward(self, hidden_states):
1417
- input_dtype = hidden_states.dtype
1418
- hidden_states = hidden_states.to(torch.float32)
1419
- variance = hidden_states.pow(2).mean(-1, keepdim=True)
1420
- hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
1421
- return self.weight * hidden_states.to(input_dtype)
1422
-
1423
-
1424
-
1425
-
1426
-
1427
-
 
18
  except ImportError:
19
  causal_conv1d_varlen_states = None
20
 
 
 
 
21
  from mamba_ssm.ops.triton.selective_state_update import selective_state_update
22
  from mamba_ssm.ops.triton.layernorm_gated import RMSNorm as RMSNormGated
23
 
 
121
  # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
122
  inv_dt = dt + torch.log(-torch.expm1(-dt))
123
 
124
+ self.dt_bias = nn.Parameter(inv_dt)
125
+ # Just to be explicit. Without this we already don't put wd on dt_bias because of the check
126
+ # name.endswith("bias") in param_grouping.py
127
+ self.dt_bias._no_weight_decay = True
 
 
 
128
 
129
  assert A_init_range[0] > 0 and A_init_range[1] >= A_init_range[0]
130
  A = torch.empty(self.nheads, dtype=torch.float32, device=device).uniform_(*A_init_range)
 
148
  process_group=self.process_group, sequence_parallel=self.sequence_parallel,
149
  **factory_kwargs)
150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
 
152
  def forward(self, hidden_states, attention_mask=None, past_key_value=None, seqlen=None, seq_idx=None, cu_seqlens=None, inference_params=None):
153
  """
 
159
  """
160
  # assert past_key_value is None, "Not implemented yet!!!"
161
 
 
 
 
 
 
162
  seqlen_og = seqlen
163
  if seqlen is None:
164
  batch, seqlen, dim = hidden_states.shape
 
167
  batch = batch_seqlen // seqlen
168
 
169
  conv_state, ssm_state = None, None
170
+
171
  if inference_params is not None:
172
  inference_batch = cu_seqlens.shape[0] - 1 if cu_seqlens is not None else batch
173
  conv_state, ssm_state = self._get_states_from_cache(inference_params, inference_batch)
174
+
175
  if inference_params.seqlen_offset > 0:
176
  # The states are updated inplace
177
  out, _, _ = self.step(hidden_states, conv_state, ssm_state)
178
+ return out, past_key_value
179
 
180
  zxbcdt = self.in_proj(hidden_states) # (B, L, d_in_proj) or (B * L, d_in_proj)
181
 
 
 
 
182
  if seqlen_og is not None:
183
  zxbcdt = rearrange(zxbcdt, "(b l) d -> b l d", l=seqlen)
184
  # If the model is loaded in fp16, without the .float() here, A might be -inf
 
216
  [d_mlp, d_mlp, self.d_ssm, self.d_ssm + 2 * self.ngroups * self.d_state, self.nheads],
217
  dim=-1
218
  )
219
+
220
  if conv_state is not None:
221
  if cu_seqlens is None:
222
  # If we just take xBC[:, :, -self.d_conv :], it will error if seqlen < self.d_conv
 
244
  activation=self.activation,
245
  # seq_idx=seq_idx,
246
  ).transpose(1, 2)
247
+
248
  x, B, C = torch.split(xBC, [self.d_ssm, self.ngroups * self.d_state, self.ngroups * self.d_state], dim=-1)
249
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
 
251
  y = mamba_chunk_scan_combined(
252
  rearrange(x, "b l (h p) -> b l h p", p=self.headdim),
 
255
  rearrange(B, "b l (g n) -> b l g n", g=self.ngroups),
256
  rearrange(C, "b l (g n) -> b l g n", g=self.ngroups),
257
  chunk_size=self.chunk_size,
258
+ # D=rearrange(self.D, "(h p) -> h p", p=self.headdim) if self.D_has_hdim else self.D,
259
+ D=self.D,
260
  z=rearrange(z, "b l (h p) -> b l h p", p=self.headdim) if not self.rmsnorm else None,
261
+ dt_bias=self.dt_bias,
262
  dt_softplus=True,
263
  seq_idx=seq_idx,
264
  cu_seqlens=cu_seqlens,
 
275
  ssm_state.copy_(varlen_states)
276
  y = rearrange(y, "b l h p -> b l (h p)")
277
  if self.rmsnorm:
278
+ y_full = y
279
+ z_full = z
280
+
281
+ y = self.norm(y_full, z_full)
282
  if d_mlp > 0:
283
  y = torch.cat([F.silu(z0) * x0, y], dim=-1)
284
  if seqlen_og is not None:
285
  y = rearrange(y, "b l d -> (b l) d")
286
 
 
 
 
287
  out = self.out_proj(y)
288
+
289
  return out, past_key_value
290
 
291
 
292
+ def step(self, hidden_states, conv_state, ssm_state):
293
+ dtype = hidden_states.dtype
294
+ # Remove single token limitation - now supports hidden_states.shape[1] > 1
295
+ batch_size, seq_len, _ = hidden_states.shape
 
 
 
 
296
 
297
+ if seq_len == 1:
298
+ # Single token case - keep existing optimized path
299
+ zxbcdt = self.in_proj(hidden_states.squeeze(1)) # (B 2D)
 
 
 
 
 
300
  else:
301
+ # Multi-token case - process without squeezing
302
+ zxbcdt = self.in_proj(hidden_states) # (B L 2D)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
303
 
 
 
 
 
 
 
 
 
 
 
 
 
 
304
  d_mlp = (zxbcdt.shape[-1] - 2 * self.d_ssm - 2 * self.ngroups * self.d_state - self.nheads) // 2
305
+
306
+ if seq_len == 1:
307
+ z0, x0, z, xBC, dt = torch.split(
308
+ zxbcdt,
309
+ [d_mlp, d_mlp, self.d_ssm, self.d_ssm + 2 * self.ngroups * self.d_state, self.nheads],
310
+ dim=-1
311
+ )
 
 
 
 
 
 
 
312
  else:
313
+ z0, x0, z, xBC, dt = torch.split(
314
+ zxbcdt,
315
+ [d_mlp, d_mlp, self.d_ssm, self.d_ssm + 2 * self.ngroups * self.d_state, self.nheads],
316
+ dim=-1
 
 
317
  )
318
 
319
+ # Conv step - handle both single and multi-token cases
320
+ if seq_len == 1:
321
+ # Single token optimized path
322
+ if causal_conv1d_update is None:
323
+ conv_state.copy_(torch.roll(conv_state, shifts=-1, dims=-1)) # Update state (B D W)
324
+ conv_state[:, :, -1] = xBC
325
+ xBC = torch.sum(conv_state * rearrange(self.conv1d.weight, "d 1 w -> d w"), dim=-1) # (B D)
326
+ if self.conv1d.bias is not None:
327
+ xBC = xBC + self.conv1d.bias
328
+ xBC = self.act(xBC).to(dtype=dtype)
329
+ else:
330
+ xBC = causal_conv1d_update(
331
+ xBC,
332
+ conv_state,
333
+ rearrange(self.conv1d.weight, "d 1 w -> d w"),
334
+ self.conv1d.bias,
335
+ self.activation,
336
+ )
337
+ else:
338
+ # Multi-token case - update conv_state and process sequence
339
+ # Update conv_state with the new sequence
340
+ xBC_t = rearrange(xBC, "b l d -> b d l")
341
+ conv_state.copy_(F.pad(xBC_t, (self.d_conv - xBC_t.shape[-1], 0))) # Update state (B D W)
342
+
343
+ # Process convolution for the full sequence
344
+ if causal_conv1d_fn is None or self.activation not in ["silu", "swish"]:
345
+ xBC = self.act(
346
+ self.conv1d(xBC.transpose(1, 2)).transpose(1, 2)[:, -(self.d_conv - 1):]
347
+ ) # (B, L, self.d_ssm + 2 * ngroups * d_state)
348
+ else:
349
+ xBC = causal_conv1d_fn(
350
+ xBC.transpose(1, 2),
351
+ rearrange(self.conv1d.weight, "d 1 w -> d w"),
352
+ bias=self.conv1d.bias,
353
+ activation=self.activation,
354
+ ).transpose(1, 2)
355
+
356
  x, B, C = torch.split(xBC, [self.d_ssm, self.ngroups * self.d_state, self.ngroups * self.d_state], dim=-1)
357
  A = -torch.exp(self.A_log.float()) # (nheads,)
358
 
359
+ # SSM step - handle both single and multi-token cases
360
+ if seq_len == 1:
361
+ # Single token optimized path
362
+ if selective_state_update is None:
363
+ assert self.ngroups == 1, "Only support ngroups=1 for this inference code path"
364
+ # Discretize A and B
365
+ dt = F.softplus(dt + self.dt_bias.to(dtype=dt.dtype)) # (batch, nheads)
366
+ dA = torch.exp(dt * A) # (batch, nheads)
367
+ x = rearrange(x, "b (h p) -> b h p", p=self.headdim)
368
+ dBx = torch.einsum("bh,bn,bhp->bhpn", dt, B, x)
369
+ ssm_state.copy_(ssm_state * rearrange(dA, "b h -> b h 1 1") + dBx)
370
+ y = torch.einsum("bhpn,bn->bhp", ssm_state.to(dtype), C)
371
+ y = y + rearrange(self.D.to(dtype), "h -> h 1") * x
372
+ y = rearrange(y, "b h p -> b (h p)")
373
+ if not self.rmsnorm:
374
+ y = y * self.act(z) # (B D)
375
+ else:
376
+ A = repeat(A, "h -> h p n", p=self.headdim, n=self.d_state).to(dtype=torch.float32)
377
+ dt = repeat(dt, "b h -> b h p", p=self.headdim)
378
+ dt_bias = repeat(self.dt_bias, "h -> h p", p=self.headdim)
379
+ D = repeat(self.D, "h -> h p", p=self.headdim)
380
+ B = rearrange(B, "b (g n) -> b g n", g=self.ngroups)
381
+ C = rearrange(C, "b (g n) -> b g n", g=self.ngroups)
382
+ x_reshaped = rearrange(x, "b (h p) -> b h p", p=self.headdim)
383
+ if not self.rmsnorm:
384
+ z = rearrange(z, "b (h p) -> b h p", p=self.headdim)
385
+ y = selective_state_update(
386
+ ssm_state, x_reshaped, dt, A, B, C, D, z=z if not self.rmsnorm else None,
387
+ dt_bias=dt_bias, dt_softplus=True
388
+ )
389
+ y = rearrange(y, "b h p -> b (h p)")
390
  else:
391
+ # Multi-token case - use mamba_chunk_scan_combined similar to forward method
392
+ dt_limit_kwargs = {} if self.dt_limit == (0.0, float("inf")) else dict(dt_limit=self.dt_limit)
393
+
394
+ y = mamba_chunk_scan_combined(
395
+ rearrange(x, "b l (h p) -> b l h p", p=self.headdim),
396
+ dt,
397
+ A,
398
+ rearrange(B, "b l (g n) -> b l g n", g=self.ngroups),
399
+ rearrange(C, "b l (g n) -> b l g n", g=self.ngroups),
400
+ chunk_size=self.chunk_size,
401
+ D=rearrange(self.D, "(h p) -> h p", p=self.headdim) if self.D_has_hdim else self.D,
402
+ z=rearrange(z, "b l (h p) -> b l h p", p=self.headdim) if not self.rmsnorm else None,
403
+ dt_bias=self.dt_bias,
404
+ dt_softplus=True,
405
+ **dt_limit_kwargs,
406
+ return_final_states=True,
407
  )
408
+ # Extract final state and update ssm_state
409
+ y, final_ssm_state = y
410
+ ssm_state.copy_(final_ssm_state)
411
+ y = rearrange(y, "b l h p -> b l (h p)")
412
+
413
  if self.rmsnorm:
414
  y = self.norm(y, z)
415
  if d_mlp > 0:
416
  y = torch.cat([F.silu(z0) * x0, y], dim=-1)
417
  out = self.out_proj(y)
418
+
419
+ # Ensure output shape consistency
420
+ if seq_len == 1 and out.dim() == 2:
421
+ out = out.unsqueeze(1) # (B, 1, D)
422
+
423
+ return out, conv_state, ssm_state
424
+
425
 
426
  def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
427
  device = self.out_proj.weight.device
 
461
  if initialize_states:
462
  conv_state.zero_()
463
  ssm_state.zero_()
464
+ return conv_state, ssm_state
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5e8a0875ed4decf5cbbf676868cbba137f3248a5a592a85597f31614080a25c6
3
  size 4987939472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b9b0f9876dd16860790782a1f166be5173253c2ea303c4e3ab40b0b8218af2f
3
  size 4987939472
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9f361fe0361ab0e101d95f5161ee1a724501ae664e0d27c28496d9288b71ebc3
3
  size 512102640
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:915482f874991ab97f5e2afbb2064d38f829ef6a76907ddf337a495e14bba382
3
  size 512102640
modeling_fast_slm.py ADDED
The diff for this file is too large to render. See raw diff
 
triton_attention.py ADDED
@@ -0,0 +1,2714 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Custom ops for MHA/XQA attention."""
2
+
3
+ import math
4
+ from dataclasses import astuple
5
+ from typing import List, Optional
6
+
7
+ import torch
8
+ import torch.nn.functional as F
9
+ import triton
10
+
11
+ from triton import language as tl
12
+
13
+ from abc import ABC, abstractmethod
14
+ from dataclasses import dataclass, field, fields
15
+ from typing import Dict, List, Literal, Optional, Protocol, Sequence, Tuple, Type, Union
16
+
17
+ import torch
18
+ from torch.export import Dim
19
+
20
+
21
+ @triton.jit
22
+ def update_kv_cache(
23
+ k_ptr, # [B*S, N, D]
24
+ v_ptr, # [B*S, N, D]
25
+ seq_len_ptr, # [b] # length of each sequence in a batch
26
+ seq_start_indices_ptr, # [b] # start indices of a sequence in flattened q/k/v.
27
+ k_cache_ptr, # [MAX_BATCH_SIZE, MAX_SEQ_LEN, N_HEADS, D_HEAD]
28
+ v_cache_ptr, # [MAX_BATCH_SIZE, MAX_SEQ_LEN, N_HEADS, D_HEAD]
29
+ input_pos_ptr, # Specifies the sequence index in the caches at which to write the provided kv
30
+ cache_loc_ptr, # Specifies the batch index for each of the input sequences
31
+ MAX_SEQ_LENGTH: tl.constexpr,
32
+ N_KV_HEADS: tl.constexpr,
33
+ Q_D_HEAD: tl.constexpr,
34
+ V_D_HEAD: tl.constexpr,
35
+ SEQ_BLOCK: tl.constexpr,
36
+ GENERATE_ONLY: tl.constexpr,
37
+ ):
38
+ batch_id = tl.program_id(axis=0)
39
+ head_id = tl.program_id(axis=1)
40
+ seq_block_id = tl.program_id(axis=2)
41
+
42
+ # Each program is responsible for a block of tokens in a single batch.
43
+ if GENERATE_ONLY:
44
+ seq_start_index = batch_id
45
+ seq_len: tl.constexpr = 1
46
+ else:
47
+ seq_start_index = tl.load(seq_start_indices_ptr + batch_id)
48
+ seq_len = tl.load(seq_len_ptr + batch_id)
49
+
50
+ # cache is [bsnd]
51
+ # cache_loc_ptr stores the batch index for the sequences provided to the kernel.
52
+ cache_loc = tl.load(cache_loc_ptr + batch_id)
53
+
54
+ kv_position = tl.load(input_pos_ptr + batch_id)
55
+
56
+ K_D_HEAD: tl.constexpr = Q_D_HEAD
57
+ k_cache_batch_offset = cache_loc * N_KV_HEADS * MAX_SEQ_LENGTH * K_D_HEAD
58
+ v_cache_batch_offset = cache_loc * N_KV_HEADS * MAX_SEQ_LENGTH * V_D_HEAD
59
+
60
+ k_dhead_offsets = tl.arange(0, triton.next_power_of_2(K_D_HEAD))
61
+ k_dhead_mask = k_dhead_offsets < K_D_HEAD
62
+
63
+ v_dhead_offsets = tl.arange(0, triton.next_power_of_2(V_D_HEAD))
64
+ v_dhead_mask = v_dhead_offsets < V_D_HEAD
65
+
66
+ seq_offsets = seq_block_id * SEQ_BLOCK + tl.arange(0, SEQ_BLOCK)
67
+ seq_mask = seq_offsets < seq_len
68
+
69
+ k_load_mask = seq_mask[:, None] * k_dhead_mask[None, :]
70
+ v_load_mask = seq_mask[:, None] * v_dhead_mask[None, :]
71
+
72
+ k_batch_offset = seq_start_index * N_KV_HEADS * K_D_HEAD
73
+ v_batch_offset = seq_start_index * N_KV_HEADS * V_D_HEAD
74
+ # Write back to kv-caches
75
+ ks = tl.load(
76
+ k_ptr
77
+ + k_batch_offset
78
+ + seq_offsets[:, None] * N_KV_HEADS * K_D_HEAD
79
+ + head_id * K_D_HEAD
80
+ + k_dhead_offsets[None, :],
81
+ mask=k_load_mask,
82
+ )
83
+ vs = tl.load(
84
+ v_ptr
85
+ + v_batch_offset
86
+ + seq_offsets[:, None] * N_KV_HEADS * V_D_HEAD
87
+ + head_id * V_D_HEAD
88
+ + v_dhead_offsets[None, :],
89
+ mask=v_load_mask,
90
+ )
91
+
92
+ kv_writeback_seq_offsets = seq_offsets + kv_position
93
+
94
+ k_cache_offset = (
95
+ k_cache_batch_offset
96
+ + kv_writeback_seq_offsets[:, None] * K_D_HEAD * N_KV_HEADS
97
+ + head_id * K_D_HEAD
98
+ + k_dhead_offsets[None, :]
99
+ )
100
+
101
+ v_cache_offset = (
102
+ v_cache_batch_offset
103
+ + kv_writeback_seq_offsets[:, None] * V_D_HEAD * N_KV_HEADS
104
+ + head_id * V_D_HEAD
105
+ + v_dhead_offsets[None, :]
106
+ )
107
+ tl.store(k_cache_ptr + k_cache_offset, ks, k_load_mask)
108
+ tl.store(v_cache_ptr + v_cache_offset, vs, v_load_mask)
109
+
110
+
111
+ @triton.jit
112
+ def gqa_attention_kv_stage1(
113
+ q_ptr, # [Batch, 1, N_HEADS, D_HEAD]
114
+ k_cache_ptr, # [MAX_BATCH_SIZE, MAX_SEQ_LEN, N_HEADS, D_HEAD]
115
+ v_cache_ptr, # [MAX_BATCH_SIZE, MAX_SEQ_LEN, N_HEADS, D_HEAD]
116
+ cache_loc_ptr, # [Batch] # Specifies the batch index for each of the generate tokens.
117
+ input_pos_ptr, # [Batch]
118
+ output_values_ptr, # [Batch, N_HEADS, num_blocks, D_HEAD]
119
+ output_logsumexp_ptr, # [Batch, N_HEADS, num_blocks]
120
+ num_blocks,
121
+ MAX_SEQ_LEN: tl.constexpr, # Maximum supported sequence length
122
+ N_HEADS: tl.constexpr, # Number of heads
123
+ N_KV_HEADS: tl.constexpr, # Number of KV heads.
124
+ Q_D_HEAD: tl.constexpr, # Dimension of each query head.
125
+ V_D_HEAD: tl.constexpr, # Dimension of each key/value head
126
+ SEQ_BLOCK_SIZE: tl.constexpr, # Block size used for tiling the sequence dim.
127
+ HEAD_BLOCK_SIZE: tl.constexpr, # pad to 16 if HEAD_RATIO is < 16 to invoke tensor cores.
128
+ ):
129
+ """Attention kernel to be used for generate-only batches.
130
+
131
+ Specialized for GQA.
132
+
133
+ Assumes that kv caches have been updated.
134
+
135
+ Supports non-power-of-2 D_HEAD
136
+
137
+ Uses flash decoding.
138
+ KV-cache layout is assumed to be [Batch,Seq, Head, Dim]
139
+ 1. Fetch the K-cache from 0 to input_pos
140
+ 2. Fetch the V-cache from 0 to input_pos
141
+ 3. A = Q*K^T [1,D_HEAD] * [1,seq_len,D_HEAD] -> [1, seq_len]
142
+ 4. S = softmax(A)
143
+ 5. O = S*V [1, seq_len] * [1, seq_len, D_HEAD] -> [1, D_HEAD]
144
+ """
145
+ # Assume KV-cache layout: [Batch, Seq, Head, Dim]
146
+ # A program is responsible for 1 batch, 1 head and a block of sequences.
147
+ batch_id = tl.program_id(axis=0)
148
+ kv_head_id = tl.program_id(axis=1)
149
+ seq_block_id = tl.program_id(axis=2)
150
+
151
+ kv_position = tl.load(input_pos_ptr + batch_id)
152
+ kv_batch_id = tl.load(cache_loc_ptr + batch_id)
153
+ K_D_HEAD: tl.constexpr = Q_D_HEAD
154
+ batch_offset = kv_batch_id * N_KV_HEADS * MAX_SEQ_LEN
155
+
156
+ # Offsets for the block of sequences this program processes.
157
+ seq_start_pos = seq_block_id * SEQ_BLOCK_SIZE
158
+
159
+ # The number of Q heads that map to each KV head.
160
+ HEAD_RATIO: tl.constexpr = N_HEADS // N_KV_HEADS # This needs to be a power-of-2
161
+ if seq_start_pos > kv_position:
162
+ return
163
+ seq_offsets = seq_start_pos + tl.arange(0, SEQ_BLOCK_SIZE)
164
+ seq_mask = seq_offsets <= kv_position
165
+
166
+ # Need to pad the head dim to 16 if HEAD_RATIO is < 16 so that tensor cores can be invoked
167
+ #
168
+ head_offsets = kv_head_id * HEAD_RATIO + tl.arange(0, HEAD_BLOCK_SIZE)
169
+ head_mask = head_offsets < (kv_head_id * HEAD_RATIO + HEAD_RATIO)
170
+ # Assuming D_HEAD is a power of 2
171
+ q_dhead_offsets = tl.arange(0, triton.next_power_of_2(Q_D_HEAD))
172
+ q_dhead_mask = q_dhead_offsets < Q_D_HEAD
173
+
174
+ v_dhead_offsets = tl.arange(0, triton.next_power_of_2(V_D_HEAD))
175
+ v_dhead_mask = v_dhead_offsets < V_D_HEAD
176
+
177
+ sm_scale: tl.constexpr = 1.0 / (Q_D_HEAD**0.5)
178
+
179
+ # Program loads the entire Q for the head assigned to it.
180
+ # [NUM_HEADS, Q_D_HEAD]
181
+ q_batch_offset = batch_id * N_HEADS * Q_D_HEAD
182
+ q_head_offsets = head_offsets * Q_D_HEAD
183
+
184
+ # Q layout : BSND
185
+ q = tl.load(
186
+ q_ptr + q_batch_offset + q_head_offsets[:, None] + q_dhead_offsets[None, :],
187
+ mask=head_mask[:, None] * q_dhead_mask[None, :],
188
+ other=0.0,
189
+ )
190
+
191
+ # [BSND]
192
+ k_block_offsets = (
193
+ batch_offset * K_D_HEAD
194
+ + seq_offsets[:, None] * K_D_HEAD * N_KV_HEADS
195
+ + kv_head_id * K_D_HEAD
196
+ + q_dhead_offsets[None, :]
197
+ )
198
+ k_mask = seq_mask[:, None] * q_dhead_mask[None, :] # K and Q share the same head dim
199
+ k = tl.load(k_cache_ptr + k_block_offsets, mask=k_mask, other=0.0)
200
+
201
+ v_block_offsets = (
202
+ batch_offset * V_D_HEAD
203
+ + seq_offsets[:, None] * V_D_HEAD * N_KV_HEADS
204
+ + kv_head_id * V_D_HEAD
205
+ + v_dhead_offsets[None, :]
206
+ )
207
+ v_mask = seq_mask[:, None] * v_dhead_mask[None, :]
208
+
209
+ # [seq_block, V_D_HEAD]
210
+ v = tl.load(v_cache_ptr + v_block_offsets, mask=v_mask, other=0.0)
211
+
212
+ # Note: check the output precision of the sum.
213
+ # compute q*K^T
214
+ # [NUM_HEADS, Q_D_HEAD] * [seq_block, Q_D_HEAD], sum along axis 1
215
+ attn = tl.dot(q, k.trans()) # [N, seq_block]
216
+ attn = attn.to(tl.float32)
217
+ attn *= sm_scale
218
+ max_attn = tl.max(attn, axis=1) # [N, 1]
219
+ # Set to -inf attn values where mask is not set. This forces exp(attn) to 0.
220
+ attn = tl.where(head_mask[:, None] * seq_mask[None, :], attn, float("-inf"))
221
+ exp_attn = tl.exp(attn - max_attn[:, None])
222
+
223
+ sumexp = tl.sum(exp_attn, axis=1) # [N, 1]
224
+
225
+ # [NUM_HEADS, seq_len] * [seq_len, V_D_HEAD], sum along axis 0
226
+ output = tl.dot(exp_attn.to(v.dtype), v)
227
+
228
+ output = output / sumexp[:, None] # [N, D_HEAD]
229
+
230
+ # We store the log-sum-exp after removing the max.
231
+ logsumexp = tl.log(sumexp) + max_attn
232
+ # when seq_mask is all false, max_attn will be -inf and sumexp is zero
233
+
234
+ tl.store(
235
+ output_values_ptr
236
+ + batch_id * N_HEADS * V_D_HEAD * num_blocks
237
+ + head_offsets[:, None] * V_D_HEAD * num_blocks
238
+ + seq_block_id * V_D_HEAD
239
+ + v_dhead_offsets[None, :],
240
+ output,
241
+ mask=head_mask[:, None] * v_dhead_mask[None, :],
242
+ )
243
+ tl.store(
244
+ output_logsumexp_ptr
245
+ + batch_id * N_HEADS * num_blocks
246
+ + head_offsets * num_blocks
247
+ + seq_block_id,
248
+ logsumexp,
249
+ mask=head_mask,
250
+ )
251
+
252
+
253
+ @triton.jit
254
+ def attention_kv_stage1(
255
+ q_ptr, # [Batch, 1, N_HEADS, D_HEAD]
256
+ k_cache_ptr, # [MAX_BATCH_SIZE, MAX_SEQ_LEN, N_HEADS, D_HEAD]
257
+ v_cache_ptr, # [MAX_BATCH_SIZE, MAX_SEQ_LEN, N_HEADS, D_HEAD]
258
+ cache_loc_ptr, # [Batch] # Specifies the batch index for each of the generate tokens.
259
+ input_pos_ptr, # [Batch]
260
+ output_values_ptr, # [Batch, N_HEADS, num_blocks, D_HEAD]
261
+ output_logsumexp_ptr, # [Batch, N_HEADS, num_blocks]
262
+ num_blocks,
263
+ MAX_SEQ_LEN: tl.constexpr, # Maximum supported sequence length
264
+ N_HEADS: tl.constexpr, # Number of heads
265
+ N_KV_HEADS: tl.constexpr, # Number of KV heads.
266
+ D_HEAD: tl.constexpr, # Dimension of each head.
267
+ SEQ_BLOCK_SIZE: tl.constexpr, # Block size used for tiling the sequence dim.
268
+ ):
269
+ """Attention kernel to be used for generate-only batches.
270
+
271
+ Assumes that kv caches have been updated.
272
+
273
+ Uses flash decoding.
274
+ KV-cache layout is assumed to be [Batch,Seq, Head, Dim]
275
+ 1. Fetch the K-cache from 0 to input_pos
276
+ 2. Fetch the V-cache from 0 to input_pos
277
+ 3. A = Q*K^T [1,D_HEAD] * [1,seq_len,D_HEAD] -> [1, seq_len]
278
+ 4. S = softmax(A)
279
+ 5. O = S*V [1, seq_len] * [1, seq_len, D_HEAD] -> [1, D_HEAD]
280
+ """
281
+ # Assume KV-cache layout: [Batch, Seq, Head, Dim]
282
+ # A program is responsible for 1 batch, 1 head and a block of sequences.
283
+ batch_id = tl.program_id(axis=0)
284
+ head_id = tl.program_id(axis=1)
285
+ seq_block_id = tl.program_id(axis=2)
286
+ epsilon: tl.constexpr = 1e-38 # float32 smallest positive number
287
+
288
+ kv_position = tl.load(input_pos_ptr + batch_id)
289
+ kv_batch_id = tl.load(cache_loc_ptr + batch_id)
290
+ kv_batch_offset = kv_batch_id * N_KV_HEADS * MAX_SEQ_LEN * D_HEAD
291
+ # Offsets for the block of sequences this program processes.
292
+ seq_start_pos = seq_block_id * SEQ_BLOCK_SIZE
293
+
294
+ if seq_start_pos > kv_position:
295
+ return
296
+ seq_offsets = seq_start_pos + tl.arange(0, SEQ_BLOCK_SIZE)
297
+ seq_mask = seq_offsets <= kv_position
298
+ # Assuming D_HEAD is a power of 2
299
+ dhead_offsets = tl.arange(0, triton.next_power_of_2(D_HEAD))
300
+ dhead_mask = dhead_offsets < D_HEAD
301
+
302
+ HEAD_RATIO: tl.constexpr = N_HEADS // N_KV_HEADS
303
+ kv_head_offset = (head_id // HEAD_RATIO) * D_HEAD
304
+
305
+ sm_scale: tl.constexpr = 1.0 / (D_HEAD**0.5)
306
+
307
+ # Program loads the entire Q for the head assigned to it.
308
+ # [D_HEAD]
309
+ q_batch_offset = batch_id * N_HEADS * D_HEAD
310
+ q_head_offset = head_id * D_HEAD
311
+ q = tl.load(q_ptr + q_batch_offset + q_head_offset + dhead_offsets, mask=dhead_mask)
312
+
313
+ kv_block_offsets = (
314
+ kv_batch_offset
315
+ + seq_offsets[:, None] * D_HEAD * N_KV_HEADS
316
+ + kv_head_offset
317
+ + dhead_offsets[None, :]
318
+ ) # [BSND]
319
+ kv_mask = seq_mask[:, None] * dhead_mask[None, :]
320
+
321
+ # [seq_block, D_HEAD]
322
+ k = tl.load(k_cache_ptr + kv_block_offsets, mask=kv_mask, other=0.0)
323
+ v = tl.load(v_cache_ptr + kv_block_offsets, mask=kv_mask, other=0.0)
324
+
325
+ # Note: check the output precision of the sum.
326
+ # compute q*K^T
327
+ # [D_HEAD] * [seq_block, D_HEAD], sum along axis 1
328
+ attn = tl.sum(q[None, :].to(tl.float32) * k.to(tl.float32), axis=1) # [seq_block]
329
+
330
+ attn *= sm_scale
331
+ max_attn = tl.max(attn)
332
+ # Set to -inf attn values where mask is not set. This forces exp(attn) to 0.
333
+ attn = tl.where(seq_mask, attn, float("-inf"))
334
+ exp_attn = tl.exp(attn - max_attn)
335
+ exp_attn = tl.where(exp_attn == 0, epsilon, exp_attn)
336
+ sumexp = tl.sum(exp_attn, axis=0) # scalar.
337
+
338
+ # [seq_len] * [seq_len, D_HEAD], sum along axis 0
339
+ output = tl.sum(exp_attn[:, None] * v, axis=0) # [D_HEAD]
340
+
341
+ output = output / sumexp
342
+
343
+ # We store the log-sum-exp after removing the max.
344
+ logsumexp = tl.log(sumexp) + max_attn
345
+ # when seq_mask is all false, max_attn will be -inf and sumexp is zero
346
+
347
+ tl.store(
348
+ output_values_ptr
349
+ + batch_id * N_HEADS * D_HEAD * num_blocks
350
+ + head_id * D_HEAD * num_blocks
351
+ + seq_block_id * D_HEAD
352
+ + dhead_offsets,
353
+ output,
354
+ mask=dhead_mask,
355
+ )
356
+ tl.store(
357
+ output_logsumexp_ptr
358
+ + batch_id * N_HEADS * num_blocks
359
+ + head_id * num_blocks
360
+ + seq_block_id,
361
+ logsumexp,
362
+ )
363
+
364
+
365
+ @triton.jit
366
+ def attention_kv_stage2(
367
+ values_ptr, # [Batch, N_HEADS, num_blocks, D_HEAD]
368
+ logsumexp_ptr, # [Batch, N_HEADS, num_blocks]
369
+ output_ptr, # [Batch, N_HEADS, D_HEAD]
370
+ input_pos_ptr,
371
+ NUM_BLOCKS: tl.constexpr,
372
+ N_HEADS: tl.constexpr,
373
+ D_HEAD: tl.constexpr,
374
+ SEQ_BLOCK_SIZE: tl.constexpr, # Nearest power of 2 for num_blocks
375
+ ):
376
+ # There are batch * N_HEADS programs
377
+ batch_id = tl.program_id(axis=0)
378
+ head_id = tl.program_id(axis=1)
379
+
380
+ dhead_offsets = tl.arange(0, triton.next_power_of_2(D_HEAD))
381
+ dhead_mask = dhead_offsets < D_HEAD
382
+
383
+ kv_position = tl.load(input_pos_ptr + batch_id)
384
+ block_id = kv_position // SEQ_BLOCK_SIZE + 1
385
+
386
+ NUM_BLOCKS_POW2: tl.constexpr = triton.next_power_of_2(NUM_BLOCKS)
387
+ block_offsets = tl.arange(0, NUM_BLOCKS_POW2)
388
+
389
+ block_mask = block_offsets < block_id
390
+ logsumexp = tl.load(
391
+ logsumexp_ptr + batch_id * N_HEADS * NUM_BLOCKS + head_id * NUM_BLOCKS + block_offsets,
392
+ mask=block_mask,
393
+ other=float("-inf"),
394
+ )
395
+ max_logsumexp = tl.max(logsumexp)
396
+ sumexp = tl.exp(logsumexp - max_logsumexp) # [NUM_BLOCKS_POW2]
397
+
398
+ aggregate_sumexp = tl.sum(sumexp, axis=0)
399
+
400
+ values_offsets = block_offsets[:, None] * D_HEAD + dhead_offsets[None, :]
401
+ values_mask = block_mask[:, None] * dhead_mask[None, :]
402
+
403
+ values = tl.load(
404
+ values_ptr
405
+ + batch_id * N_HEADS * D_HEAD * NUM_BLOCKS
406
+ + head_id * D_HEAD * NUM_BLOCKS
407
+ + values_offsets,
408
+ mask=values_mask,
409
+ other=0.0,
410
+ ) # [BLOCK_SIZE, D_HEAD]
411
+ values *= sumexp[:, None]
412
+ values /= aggregate_sumexp
413
+
414
+ output = tl.sum(values, axis=0) # [DHEAD]
415
+
416
+ tl.store(
417
+ output_ptr + batch_id * N_HEADS * D_HEAD + head_id * D_HEAD + dhead_offsets,
418
+ output,
419
+ mask=dhead_mask,
420
+ )
421
+
422
+
423
+ @triton.jit
424
+ def context_attention_kv(
425
+ q_ptr, # [bsnd]
426
+ k_ptr, # [bsnd]
427
+ v_ptr, # [bsnd]
428
+ k_cache_ptr, # [bsnd]
429
+ v_cache_ptr, # [bsnd]
430
+ seq_len,
431
+ o_ptr,
432
+ softmax_scale,
433
+ N_HEADS: tl.constexpr, # Number of heads
434
+ N_KV_HEADS: tl.constexpr, # Number of KV heads.
435
+ Q_D_HEAD: tl.constexpr, # Dimension of each query head.
436
+ V_D_HEAD: tl.constexpr, # Dimension of each value head.
437
+ SEQ_BLOCK: tl.constexpr,
438
+ MAX_SEQ_LENGTH: tl.constexpr,
439
+ ):
440
+ """Kernel for context phase.
441
+
442
+ Assuming:
443
+ 1. Self-attention [seqlen(Q) == seqlen(K)]
444
+ 2. Causal attention
445
+ 3. QKV layout: [bsnd]
446
+ """
447
+ batch_id = tl.program_id(axis=0)
448
+ head_id = tl.program_id(axis=1)
449
+ seq_block_id = tl.program_id(axis=2)
450
+
451
+ HEAD_RATIO: tl.constexpr = N_HEADS // N_KV_HEADS
452
+ K_D_HEAD: tl.constexpr = Q_D_HEAD
453
+
454
+ q_dhead_offsets = tl.arange(0, triton.next_power_of_2(Q_D_HEAD))
455
+ q_dhead_mask = q_dhead_offsets < Q_D_HEAD
456
+
457
+ v_dhead_offsets = tl.arange(0, triton.next_power_of_2(V_D_HEAD))
458
+ v_dhead_mask = v_dhead_offsets < V_D_HEAD
459
+
460
+ seq_offsets = seq_block_id * SEQ_BLOCK + tl.arange(0, SEQ_BLOCK)
461
+ seq_mask = seq_offsets < seq_len
462
+
463
+ q_load_mask = seq_mask[:, None] * q_dhead_mask[None, :]
464
+
465
+ q_batch_offset = batch_id * seq_len * N_HEADS
466
+ kv_batch_offset = batch_id * seq_len * N_KV_HEADS
467
+
468
+ k_head_offset = (head_id // HEAD_RATIO) * K_D_HEAD
469
+ v_head_offset = (head_id // HEAD_RATIO) * V_D_HEAD
470
+
471
+ # Q will stay in SRAM
472
+ q = tl.load(
473
+ q_ptr
474
+ + q_batch_offset * Q_D_HEAD
475
+ + seq_offsets[:, None] * N_HEADS * Q_D_HEAD
476
+ + head_id * Q_D_HEAD
477
+ + q_dhead_offsets[None, :],
478
+ mask=q_load_mask,
479
+ )
480
+ acc = tl.zeros([SEQ_BLOCK, triton.next_power_of_2(V_D_HEAD)], dtype=tl.float32)
481
+ lse_i = tl.zeros([SEQ_BLOCK], dtype=tl.float32) - float("inf")
482
+ m_i = tl.zeros([SEQ_BLOCK], dtype=tl.float32) - float("inf")
483
+
484
+ for s in range(0, seq_block_id + 1, 1):
485
+ kv_seq_offsets = s * SEQ_BLOCK + tl.arange(0, SEQ_BLOCK)
486
+ kv_seq_mask = kv_seq_offsets < seq_len
487
+ k_load_mask = kv_seq_mask[:, None] * q_dhead_mask[None, :]
488
+
489
+ k = tl.load(
490
+ k_ptr
491
+ + kv_batch_offset * K_D_HEAD
492
+ + kv_seq_offsets[:, None] * N_KV_HEADS * K_D_HEAD
493
+ + k_head_offset
494
+ + q_dhead_offsets[None, :],
495
+ mask=k_load_mask,
496
+ )
497
+ qk = tl.zeros([SEQ_BLOCK, SEQ_BLOCK], dtype=tl.float32)
498
+ qk += tl.dot(q, k.trans())
499
+ # causal mask
500
+ qk = tl.where(seq_offsets[:, None] >= kv_seq_offsets[None, :], qk, float("-inf"))
501
+ qk *= softmax_scale
502
+ # rowmax
503
+ m_ij = tl.maximum(tl.max(qk, 1), lse_i)
504
+ p = tl.exp(qk - m_ij[:, None]) # [S,S]
505
+ v = tl.load(
506
+ v_ptr
507
+ + kv_batch_offset * V_D_HEAD
508
+ + kv_seq_offsets[:, None] * N_KV_HEADS * V_D_HEAD
509
+ + v_head_offset
510
+ + v_dhead_offsets[None, :],
511
+ mask=kv_seq_mask[:, None] * v_dhead_mask[None, :],
512
+ )
513
+
514
+ l_ij = tl.sum(p, 1)
515
+ acc_scale = tl.exp(m_i - m_ij)
516
+ acc = acc * acc_scale[:, None]
517
+ p = p.to(v.dtype)
518
+ acc += tl.dot(p, v)
519
+ m_i = m_ij
520
+ l_i_new = tl.exp(lse_i - m_ij) + l_ij
521
+ lse_i = m_ij + tl.log(l_i_new)
522
+
523
+ o_scale = tl.exp(m_i - lse_i)
524
+
525
+ acc = acc * o_scale[:, None]
526
+
527
+ tl.store(
528
+ o_ptr
529
+ + batch_id * seq_len * N_HEADS * V_D_HEAD
530
+ + seq_offsets[:, None] * N_HEADS * V_D_HEAD
531
+ + head_id * V_D_HEAD
532
+ + v_dhead_offsets[None, :],
533
+ acc,
534
+ mask=seq_mask[:, None] * v_dhead_mask[None, :],
535
+ )
536
+
537
+ # Write back to kv-caches
538
+
539
+ ks = tl.load(
540
+ k_ptr
541
+ + kv_batch_offset * K_D_HEAD
542
+ + seq_offsets[:, None] * N_KV_HEADS * K_D_HEAD
543
+ + k_head_offset
544
+ + q_dhead_offsets[None, :],
545
+ mask=seq_mask[:, None] * q_dhead_mask[None, :],
546
+ )
547
+ vs = tl.load(
548
+ v_ptr
549
+ + kv_batch_offset * V_D_HEAD
550
+ + seq_offsets[:, None] * N_KV_HEADS * V_D_HEAD
551
+ + v_head_offset
552
+ + v_dhead_offsets[None, :],
553
+ mask=seq_mask[:, None] * v_dhead_mask[None, :],
554
+ )
555
+ # cache is [bsnd]
556
+ k_cache_offset = (
557
+ batch_id * N_KV_HEADS * MAX_SEQ_LENGTH * K_D_HEAD
558
+ + seq_offsets[:, None] * K_D_HEAD * N_KV_HEADS
559
+ + k_head_offset
560
+ + q_dhead_offsets[None, :]
561
+ )
562
+
563
+ v_cache_offset = (
564
+ batch_id * N_KV_HEADS * MAX_SEQ_LENGTH * V_D_HEAD
565
+ + seq_offsets[:, None] * V_D_HEAD * N_KV_HEADS
566
+ + v_head_offset
567
+ + v_dhead_offsets[None, :]
568
+ )
569
+ tl.store(k_cache_ptr + k_cache_offset, ks, seq_mask[:, None] * q_dhead_mask[None, :])
570
+ tl.store(v_cache_ptr + v_cache_offset, vs, seq_mask[:, None] * v_dhead_mask[None, :])
571
+
572
+
573
+ @triton.jit
574
+ def context_attention_kv_flattened(
575
+ q_ptr, # [b*s,nd]
576
+ seq_len_ptr, # [b] # length of each sequence in a batch
577
+ seq_start_indices_ptr, # [b] # start indices of a sequence in flattened q/k/v.
578
+ k_cache_ptr, # [bsnd]
579
+ v_cache_ptr, # [bsnd]
580
+ input_pos_ptr, # [b] # specifies the location in the sequence where kv must be written back.
581
+ cache_loc_ptr, # [b] # location of the sequence in the cache.
582
+ o_ptr,
583
+ softmax_scale: tl.constexpr,
584
+ N_HEADS: tl.constexpr, # Number of heads
585
+ N_KV_HEADS: tl.constexpr, # Number of KV heads.
586
+ Q_D_HEAD: tl.constexpr, # Dimension of each query head.
587
+ V_D_HEAD: tl.constexpr, # Dimension of each value head.
588
+ SEQ_BLOCK: tl.constexpr,
589
+ MAX_SEQ_LENGTH: tl.constexpr,
590
+ ):
591
+ """Kernel for context phase.
592
+
593
+ Assumes that kv caches have been updated.
594
+ Assuming QKV layout: [b*s,n,d]
595
+ """
596
+ batch_id = tl.program_id(axis=0)
597
+ head_id = tl.program_id(axis=1)
598
+ seq_block_id = tl.program_id(axis=2)
599
+
600
+ # Each program is responsible for a block of tokens in a single batch.
601
+ seq_start_index = tl.load(seq_start_indices_ptr + batch_id)
602
+ seq_len = tl.load(seq_len_ptr + batch_id)
603
+ K_D_HEAD: tl.constexpr = Q_D_HEAD
604
+ HEAD_RATIO: tl.constexpr = N_HEADS // N_KV_HEADS
605
+
606
+ # cache is [bsnd]
607
+ # cache_loc_ptr stores the batch index for the sequences provided to the kernel.
608
+ cache_loc = tl.load(cache_loc_ptr + batch_id)
609
+
610
+ cache_batch_offset = cache_loc * N_KV_HEADS * MAX_SEQ_LENGTH
611
+ cache_head_offset = head_id // HEAD_RATIO
612
+
613
+ q_dhead_offsets = tl.arange(0, triton.next_power_of_2(Q_D_HEAD))
614
+ q_dhead_mask = q_dhead_offsets < Q_D_HEAD
615
+
616
+ v_dhead_offsets = tl.arange(0, triton.next_power_of_2(V_D_HEAD))
617
+ v_dhead_mask = v_dhead_offsets < V_D_HEAD
618
+
619
+ seq_offsets = seq_block_id * SEQ_BLOCK + tl.arange(0, SEQ_BLOCK)
620
+ seq_mask = seq_offsets < seq_len
621
+
622
+ # Q will stay in SRAM
623
+ q = tl.load(
624
+ q_ptr
625
+ + seq_start_index * N_HEADS * Q_D_HEAD
626
+ + seq_offsets[:, None] * N_HEADS * Q_D_HEAD
627
+ + head_id * Q_D_HEAD
628
+ + q_dhead_offsets[None, :],
629
+ mask=seq_mask[:, None] * q_dhead_mask[None, :],
630
+ )
631
+
632
+ acc = tl.zeros([SEQ_BLOCK, triton.next_power_of_2(V_D_HEAD)], dtype=tl.float32)
633
+ lse_i = tl.zeros([SEQ_BLOCK], dtype=tl.float32) - float("inf")
634
+ m_i = tl.zeros([SEQ_BLOCK], dtype=tl.float32) - float("inf")
635
+
636
+ # Loop over the entire KV-history
637
+ # input_pos_ptr stores the location at which kv must be written back for the given batch.
638
+ kv_position = tl.load(input_pos_ptr + batch_id)
639
+ num_blocks = (kv_position + seq_len + SEQ_BLOCK - 1) // SEQ_BLOCK
640
+ for s in range(0, num_blocks + 1, 1):
641
+ kv_seq_offsets = s * SEQ_BLOCK + tl.arange(0, SEQ_BLOCK)
642
+ kv_seq_mask = kv_seq_offsets < (kv_position + seq_len)
643
+
644
+ k = tl.load(
645
+ k_cache_ptr
646
+ + cache_batch_offset * K_D_HEAD
647
+ + kv_seq_offsets[:, None] * K_D_HEAD * N_KV_HEADS
648
+ + cache_head_offset * K_D_HEAD
649
+ + q_dhead_offsets[None, :],
650
+ mask=kv_seq_mask[:, None] * q_dhead_mask[None, :],
651
+ )
652
+ qk = tl.zeros([SEQ_BLOCK, SEQ_BLOCK], dtype=tl.float32)
653
+ qk += tl.dot(q, k.trans())
654
+ qk = tl.where(
655
+ (seq_offsets[:, None] + kv_position) >= kv_seq_offsets[None, :], qk, float("-inf")
656
+ )
657
+ qk *= softmax_scale
658
+ # rowmax
659
+ m_ij = tl.maximum(tl.max(qk, 1), lse_i)
660
+ p = tl.exp(qk - m_ij[:, None])
661
+ v = tl.load(
662
+ v_cache_ptr
663
+ + cache_batch_offset * V_D_HEAD
664
+ + kv_seq_offsets[:, None] * V_D_HEAD * N_KV_HEADS
665
+ + cache_head_offset * V_D_HEAD
666
+ + v_dhead_offsets[None, :],
667
+ mask=kv_seq_mask[:, None] * v_dhead_mask[None, :],
668
+ )
669
+
670
+ l_ij = tl.sum(p, 1)
671
+ acc_scale = tl.exp(m_i - m_ij)
672
+ acc = acc * acc_scale[:, None]
673
+ p = p.to(v.dtype)
674
+ acc += tl.dot(p, v)
675
+ m_i = m_ij
676
+ l_i_new = tl.exp(lse_i - m_ij) + l_ij
677
+ lse_i = m_ij + tl.log(l_i_new)
678
+
679
+ o_scale = tl.exp(m_i - lse_i)
680
+
681
+ acc = acc * o_scale[:, None]
682
+
683
+ tl.store(
684
+ o_ptr
685
+ + seq_start_index * N_HEADS * V_D_HEAD
686
+ + seq_offsets[:, None] * N_HEADS * V_D_HEAD
687
+ + head_id * V_D_HEAD
688
+ + v_dhead_offsets[None, :],
689
+ acc,
690
+ mask=seq_mask[:, None] * v_dhead_mask[None, :],
691
+ )
692
+
693
+
694
@triton.jit
def update_kv_cache_rope_fusion(
    q_ptr,  # [B*S, N, D] flattened queries (pre-rope)
    k_ptr,  # [B*S, N_KV, D] flattened keys (pre-rope)
    v_ptr,  # [B*S, N_KV, D] flattened values
    seq_len_ptr,  # [b] # length of each sequence in a batch
    seq_start_indices_ptr,  # [b] # start indices of a sequence in flattened q/k/v.
    q_rope_ptr,  # [B*S, N, D], roped q result
    k_cache_ptr,  # [MAX_BATCH_SIZE, MAX_SEQ_LEN, N_KV_HEADS, D_HEAD] — NOTE(review): original comment said N_HEADS, but offsets below stride by N_KV_HEADS; confirm layout
    v_cache_ptr,  # [MAX_BATCH_SIZE, MAX_SEQ_LEN, N_KV_HEADS, D_HEAD] — same note as k_cache
    input_pos_ptr,  # Specifies the sequence index in the caches at which to write the provided kv
    cache_loc_ptr,  # Specifies the batch index for each of the input sequences
    f_ptr,  # [MAX_SEQ_LEN, D_HEAD//2, 2] # interleaved (cos, sin) frequencies for rope embedding.
    MAX_SEQ_LENGTH: tl.constexpr,
    N_HEADS: tl.constexpr,
    N_KV_HEADS: tl.constexpr,
    D_HEAD: tl.constexpr,
    SEQ_BLOCK: tl.constexpr,
    HEAD_BLOCK_SIZE: tl.constexpr,  # pad to 16 if HEAD_RATIO is < 16 to invoke tensor cores.
    GENERATE_ONLY: tl.constexpr,
):
    """Fuse q and k rope with the update_kv_cache kernel.

    The input is interleaved as [2, D//2] in the D_HEAD dim.
    - Writes q_rope with the post-rope-embedding q values.
    - Writes k_cache with the post-rope-embedding k values.
    - Writes v_cache with v unchanged.
    For rope computation, q and k are loaded and stored as pairs of [D//2] halves.

    Grid: (batch, kv_head, seq_block). Each program handles one kv head and a
    SEQ_BLOCK-sized slice of tokens for one sequence; the HEAD_RATIO query heads
    mapped to that kv head are processed together (GQA).
    """
    batch_id = tl.program_id(axis=0)
    kv_head_id = tl.program_id(axis=1)
    seq_block_id = tl.program_id(axis=2)

    # Each program is responsible for a block of tokens in a single batch.
    # In generate-only mode every sequence has exactly one token, so the flattened
    # start index is just the batch index and seq_len is a compile-time 1.
    if GENERATE_ONLY:
        seq_start_index = batch_id
        seq_len: tl.constexpr = 1
    else:
        seq_start_index = tl.load(seq_start_indices_ptr + batch_id)
        seq_len = tl.load(seq_len_ptr + batch_id)

    # cache is [bsnd]
    # cache_loc_ptr stores the batch index for the sequences provided to the kernel.
    cache_loc = tl.load(cache_loc_ptr + batch_id)

    # Sequence position in the cache at which this sequence's new tokens are written.
    kv_position = tl.load(input_pos_ptr + batch_id)

    cache_batch_offset = cache_loc * N_KV_HEADS * MAX_SEQ_LENGTH * D_HEAD
    cache_head_offset = kv_head_id * D_HEAD

    # Assuming D_HEAD is a power of 2
    dhead_offsets = tl.arange(0, D_HEAD)
    dhead_mask = dhead_offsets < D_HEAD

    seq_offsets = seq_block_id * SEQ_BLOCK + tl.arange(0, SEQ_BLOCK)
    seq_mask = seq_offsets < seq_len

    load_mask = seq_mask[:, None] * dhead_mask[None, :]

    HEAD_RATIO: tl.constexpr = N_HEADS // N_KV_HEADS  # This needs to be a power-of-2
    # Query heads served by this kv head; HEAD_BLOCK_SIZE may be padded above
    # HEAD_RATIO (see parameter comment), so mask off the padding.
    q_head_offsets = kv_head_id * HEAD_RATIO + tl.arange(0, HEAD_BLOCK_SIZE)
    q_head_mask = q_head_offsets < (kv_head_id * HEAD_RATIO + HEAD_RATIO)

    q_batch_offset = seq_start_index * N_HEADS * D_HEAD

    kv_batch_offset = seq_start_index * N_KV_HEADS * D_HEAD
    kv_head_offset = cache_head_offset

    D2: tl.constexpr = D_HEAD // 2
    # input is interleaved as [2, D//2] in dim [D_HEAD]: first half pairs with second half.
    d2_offsets = tl.arange(0, D2)
    dhead_offsets1 = d2_offsets
    dhead_offsets2 = d2_offsets + D2
    d2_mask = dhead_offsets2 < D_HEAD
    d2_load_mask = seq_mask[:, None] * d2_mask[None, :]

    # offsets of [bsn]: 3-D offsets [token, q_head, dim-half] for the query halves.
    q_offsets_base = (
        q_batch_offset
        + seq_offsets[:, None, None] * N_HEADS * D_HEAD
        + q_head_offsets[None, :, None] * D_HEAD
    )
    q_offsets1 = q_offsets_base + dhead_offsets1[None, None, :]
    q_offsets2 = q_offsets_base + dhead_offsets2[None, None, :]
    q_mask = d2_load_mask[:, None, :] * q_head_mask[None, :, None]

    # Compute rope in fp32 for accuracy regardless of the storage dtype.
    q1 = tl.load(q_ptr + q_offsets1, mask=q_mask).to(tl.float32)
    q2 = tl.load(q_ptr + q_offsets2, mask=q_mask).to(tl.float32)

    k_offsets_base = kv_batch_offset + seq_offsets[:, None] * N_KV_HEADS * D_HEAD + kv_head_offset
    k_offsets1 = k_offsets_base + dhead_offsets1[None, :]
    k_offsets2 = k_offsets_base + dhead_offsets2[None, :]

    k1 = tl.load(k_ptr + k_offsets1, mask=d2_load_mask).to(tl.float32)
    k2 = tl.load(k_ptr + k_offsets2, mask=d2_load_mask).to(tl.float32)

    # -----------------------------------
    # torch version sin/cos
    # cos and sin values are interleaved in the frequencies tensor, so stride by 2.
    # Frequencies are indexed by absolute cache position (kv_position + local offset).
    f_offsets = seq_offsets[:, None] * D2 + d2_offsets[None, :]
    cos_ref = tl.load(f_ptr + kv_position * D_HEAD + f_offsets * 2, mask=d2_load_mask).to(
        dtype=tl.float32
    )
    sin_ref = tl.load(f_ptr + kv_position * D_HEAD + f_offsets * 2 + 1, mask=d2_load_mask).to(
        dtype=tl.float32
    )

    # Standard rotary rotation applied to the two query halves.
    qs1 = cos_ref[:, None, :] * q1 - sin_ref[:, None, :] * q2
    qs2 = sin_ref[:, None, :] * q1 + cos_ref[:, None, :] * q2

    tl.store(q_rope_ptr + q_offsets1, qs1, mask=q_mask)
    tl.store(q_rope_ptr + q_offsets2, qs2, mask=q_mask)

    # Same rotation for the key halves (no extra head dim: one kv head per program).
    ks1 = cos_ref * k1 - sin_ref * k2
    ks2 = sin_ref * k1 + cos_ref * k2

    # Write back to kv-caches
    vs = tl.load(
        v_ptr
        + kv_batch_offset
        + seq_offsets[:, None] * N_KV_HEADS * D_HEAD
        + kv_head_offset
        + dhead_offsets[None, :],
        mask=load_mask,
    )

    # Shift local token offsets to absolute positions inside the cache.
    kv_writeback_seq_offsets = seq_offsets + kv_position

    cache_offset_base = (
        cache_batch_offset
        + kv_writeback_seq_offsets[:, None] * D_HEAD * N_KV_HEADS
        + cache_head_offset
    )

    k_cache_offset1 = cache_offset_base + dhead_offsets1[None, :]
    k_cache_offset2 = cache_offset_base + dhead_offsets2[None, :]
    tl.store(k_cache_ptr + k_cache_offset1, ks1, mask=d2_load_mask)
    tl.store(k_cache_ptr + k_cache_offset2, ks2, mask=d2_load_mask)

    v_cache_offset = cache_offset_base + dhead_offsets[None, :]
    tl.store(v_cache_ptr + v_cache_offset, vs, load_mask)
835
+
836
+
837
+
838
+ """
839
+ Kernels based on paged KV Cache.
840
+ Parameter infos:
841
+ tensors:
842
+ - q: [b*s, n, d], flattened queries.
843
+ - k/v: [b*s, n, d], flattened key/value.
844
+ - seq_len: [b], length of each sequence in the batch.
845
+ `seq_len` can be 1 (generate) or larger (context).
846
+ - seq_start: [b], start index of each sequence in b*s dim of q/k/v.
847
+ - k_cache/v_cache: [num_pages, PAGE_SIZE, n, d], paged KV Cache.
848
+ New-coming k/v is split into small group of PAGE_SIZE, and then
849
+ mapped to non-contiguous memory in KV Cache.
850
+ - page_table: [b, max_num_pages_per_seq], mapping logic of each sequence.
851
+ - cache_loc: [b], mapping logic of `batch_id` in q/k/v to index in `page_table`.
852
+ - cache_len: [b], existing cached k/v length of each sequence.
853
+
854
+ constexpr:
855
+ - N_HEADS/N_KV_HEADS: shape of dim [n] in q or k/v.
856
+ - D_HEAD: shape of dim [d] in q/k/v.
857
+ Assuming power of 2.
858
+ - SEQ_BLOCK: block size to split dim [s].
859
+ Assuming power of 2.
860
+ Split k/v in update kernel and split q in context/generate kernel.
861
+ - MAX_SEQ_LENGTH: seq_len <= MAX_SEQ_LENGTH.
862
+ - PAGE_SIZE: shape of each kv cache page,
863
+ Assuming power of 2 and SEQ_BLOCK % PAGE_SIZE = 0.
864
+ - PAGE_TABLE_STRIDE: stride of dim [b] in `page_table`.
865
+
866
+ KV Cache access logic in update kernel:
867
+ 1. batch_id i access k[seq_start[i] : seq_start[i] + seq_len[i]]
868
+ and can be split into pages [a:b] in the sequence.
869
+ 2. Look up cache_len[i] to find if the sequence has cached k/v.
870
+ 3. Look up page_table[cache_loc[i], cache_len[i] + a : cache_len[i] + b]
871
+ to get the corresponding pages in the k_cache, with result [c:d].
872
+ 4. Then update k_cache[c:d] with the k value.
873
+
874
+ """
875
+
876
+
877
@triton.jit
def update_paged_kv_cache(
    k_ptr,  # [B*S, N, D]
    v_ptr,  # [B*S, N, D]
    seq_len_ptr,  # [b] # length of each sequence in a batch
    seq_start_indices_ptr,  # [b] # start indices of a sequence in flattened q/k/v.
    k_cache_ptr,  # [num_pages, page_size, n, d]
    v_cache_ptr,  # [num_pages, page_size, n, d]
    cache_loc_ptr,  # [b] # index of the sequence in the page table.
    cache_len_ptr,  # [b] # length of the sequence already in kv cache.
    page_table_ptr,  # [b, max_num_pages_per_seq] # loc of the block page in the cache.
    N_KV_HEADS: tl.constexpr,  # Number of KV heads.
    D_HEAD: tl.constexpr,  # Dimension of each head.
    SEQ_BLOCK: tl.constexpr,
    MAX_SEQ_LENGTH: tl.constexpr,
    PAGE_SIZE: tl.constexpr,
    PAGE_TABLE_STRIDE: tl.constexpr,
    GENERATE_ONLY: tl.constexpr,
):
    """Append the incoming k/v tokens of each sequence into the paged KV cache.

    Grid: (batch, kv_head, seq_block). Each program copies one SEQ_BLOCK slice of
    tokens for one kv head into the physical pages resolved via `page_table`.
    See the module-level note above for the page lookup logic.
    """
    batch_id = tl.program_id(axis=0)
    head_id = tl.program_id(axis=1)
    seq_block_id = tl.program_id(axis=2)

    # Each program is responsible for a block of tokens in a single batch.
    # Generate-only batches have exactly one token per sequence.
    if GENERATE_ONLY:
        seq_start_index = batch_id
        seq_len: tl.constexpr = 1
    else:
        seq_start_index = tl.load(seq_start_indices_ptr + batch_id)
        seq_len = tl.load(seq_len_ptr + batch_id)

    cache_len = tl.load(cache_len_ptr + batch_id)

    # cache is [num_pages, page_size, n, d]
    # cache_loc_ptr stores the batch index for the sequences provided to the kernel.
    cache_loc = tl.load(cache_loc_ptr + batch_id)
    cache_head_offset = head_id * D_HEAD

    # Assuming D_HEAD is a power of 2
    dhead_offsets = tl.arange(0, D_HEAD)
    dhead_mask = dhead_offsets < D_HEAD

    seq_offsets = seq_block_id * SEQ_BLOCK + tl.arange(0, SEQ_BLOCK)
    seq_mask = seq_offsets < seq_len

    load_mask = seq_mask[:, None] * dhead_mask[None, :]

    kv_batch_offset = seq_start_index * N_KV_HEADS * D_HEAD
    kv_head_offset = cache_head_offset

    # Stage the incoming k/v tiles before the paged writeback.
    ks = tl.load(
        k_ptr
        + kv_batch_offset
        + seq_offsets[:, None] * N_KV_HEADS * D_HEAD
        + kv_head_offset
        + dhead_offsets[None, :],
        mask=load_mask,
    )
    vs = tl.load(
        v_ptr
        + kv_batch_offset
        + seq_offsets[:, None] * N_KV_HEADS * D_HEAD
        + kv_head_offset
        + dhead_offsets[None, :],
        mask=load_mask,
    )

    # assuming SEQ_BLOCK can be divided by PAGE_SIZE and PAGE_SIZE is a power of 2.
    SEQ_BLOCK_PAGE: tl.constexpr = SEQ_BLOCK // PAGE_SIZE
    MAX_NUM_PAGES: tl.constexpr = (MAX_SEQ_LENGTH + PAGE_SIZE - 1) // PAGE_SIZE
    # cache_len // PAGE_SIZE means history pages
    # if decode sequence, then seq_len = 1 and only seq_block_id = 0 works,
    kv_pages = seq_block_id * SEQ_BLOCK_PAGE + tl.arange(0, SEQ_BLOCK_PAGE) + cache_len // PAGE_SIZE
    cache_pages = tl.load(
        page_table_ptr + cache_loc * PAGE_TABLE_STRIDE + kv_pages, mask=kv_pages < MAX_NUM_PAGES
    )

    page_offsets = tl.arange(0, PAGE_SIZE)
    # shape [SEQ_BLOCK], means [cache_pages, page_offsets]
    cache_seq_offset = tl.reshape(
        cache_pages[:, None] * PAGE_SIZE + page_offsets[None, :], [SEQ_BLOCK]
    )
    # write offset inside the page
    # NOTE(review): this adds the same in-page shift to every token, which is only
    # correct when the write does not straddle a page boundary (e.g. context phase
    # with cache_len == 0, or decode with seq_len == 1) — confirm callers guarantee this.
    cache_seq_offset += cache_len % PAGE_SIZE

    cache_offsets = (
        cache_seq_offset[:, None] * N_KV_HEADS * D_HEAD + kv_head_offset + dhead_offsets[None, :]
    )
    tl.store(k_cache_ptr + cache_offsets, ks, load_mask)
    tl.store(v_cache_ptr + cache_offsets, vs, load_mask)
968
+
969
+
970
# Two-stage (flash-decoding) generate attention: stage 1 computes a partial
# softmax-attention output plus its log-sum-exp per sequence block; a second
# stage (not in this kernel) combines the per-block partials using the
# logsumexp values to produce the final output.
@triton.jit
def attention_kv_paged_stage1(
    q_ptr,  # [Batch, 1, N_HEADS, D_HEAD]
    k_cache_ptr,  # [NUM_PAGES, PAGE_SIZE, N_HEADS, D_HEAD]
    v_cache_ptr,  # [NUM_PAGES, PAGE_SIZE, N_HEADS, D_HEAD]
    cache_loc_ptr,  # [Batch] # Specifies the batch index for each of the generate tokens.
    page_table_ptr,  # [Batch, num_pages_per_seq]
    cache_len_ptr,  # [Batch] # Number of tokens in kv cache.
    output_values_ptr,  # [Batch, N_HEADS, num_blocks, D_HEAD] partial outputs
    output_logsumexp_ptr,  # [Batch, N_HEADS, num_blocks] per-block logsumexp
    num_blocks,
    MAX_SEQ_LEN: tl.constexpr,  # Maximum supported sequence length
    N_HEADS: tl.constexpr,  # Number of heads
    N_KV_HEADS: tl.constexpr,  # Number of KV heads.
    D_HEAD: tl.constexpr,  # Dimension of each head.
    # Block size used for tiling the sequence dim.
    SEQ_BLOCK_SIZE: tl.constexpr,
    PAGE_SIZE: tl.constexpr,
    PAGE_TABLE_STRIDE: tl.constexpr,
):
    """Attention kernel to be used during the generate phase.

    Uses flash decoding.
    1. Fetch the K-cache pages covering this sequence block
    2. Fetch the V-cache pages covering this sequence block
    3. A = Q*K^T [1,D_HEAD] * [1,seq_len,D_HEAD] -> [1, seq_len]
    4. S = softmax(A)
    5. O = S*V [1, seq_len] * [1, seq_len, D_HEAD] -> [1, D_HEAD]
    Partial O and logsumexp per block are written out for stage-2 reduction.
    """
    # A program is responsible for 1 batch, 1 head and a block of sequences.
    batch_id = tl.program_id(axis=0)
    head_id = tl.program_id(axis=1)
    seq_block_id = tl.program_id(axis=2)

    SEQ_BLOCK_PAGE: tl.constexpr = SEQ_BLOCK_SIZE // PAGE_SIZE
    MAX_NUM_PAGES: tl.constexpr = MAX_SEQ_LEN // PAGE_SIZE

    cache_loc = tl.load(cache_loc_ptr + batch_id)
    seq_len = tl.load(cache_len_ptr + batch_id)
    # Offsets for the block of sequences this program processes.
    seq_start_pos = seq_block_id * SEQ_BLOCK_SIZE

    # Blocks entirely beyond the cached length have nothing to do.
    if seq_start_pos > seq_len:
        return
    seq_offsets = seq_start_pos + tl.arange(0, SEQ_BLOCK_SIZE)
    # `<=` (not `<`): position seq_len itself is attended as well — presumably the
    # just-written current token; NOTE(review) confirm against the cache-update caller.
    seq_mask = seq_offsets <= seq_len
    # Assuming D_HEAD is a power of 2
    dhead_offsets = tl.arange(0, D_HEAD)
    dhead_mask = dhead_offsets < D_HEAD

    # GQA: HEAD_RATIO query heads share one kv head.
    HEAD_RATIO: tl.constexpr = N_HEADS // N_KV_HEADS
    cache_head_offset = (head_id // HEAD_RATIO) * D_HEAD

    sm_scale: tl.constexpr = 1 / (D_HEAD**0.5)

    # Program loads the entire Q for the head assigned to it.
    # [D_HEAD]
    q_batch_offset = batch_id * N_HEADS * D_HEAD
    q_head_offset = head_id * D_HEAD
    q = tl.load(q_ptr + q_batch_offset + q_head_offset + dhead_offsets)

    kv_mask = seq_mask[:, None] * dhead_mask[None, :]

    # Resolve the physical pages holding this logical sequence block.
    kv_pages = seq_block_id * SEQ_BLOCK_PAGE + tl.arange(0, SEQ_BLOCK_PAGE)
    cache_pages = tl.load(
        page_table_ptr + cache_loc * PAGE_TABLE_STRIDE + kv_pages, mask=kv_pages < MAX_NUM_PAGES
    )

    page_offsets = tl.arange(0, PAGE_SIZE)
    # shape [SEQ_BLOCK], means [cache_pages, page_offsets]
    # token offsets in the paged kv cache
    cache_seq_offset = tl.reshape(
        cache_pages[:, None] * PAGE_SIZE + page_offsets[None, :], [SEQ_BLOCK_SIZE]
    )

    cache_offsets = (
        cache_seq_offset[:, None] * N_KV_HEADS * D_HEAD + cache_head_offset + dhead_offsets[None, :]
    )

    k = tl.load(k_cache_ptr + cache_offsets, mask=kv_mask)
    v = tl.load(v_cache_ptr + cache_offsets, mask=kv_mask)

    # Note: check the output precision of the sum.
    # compute q*K^T
    # [D_HEAD] * [seq_block, D_HEAD], sum along axis 1
    attn = tl.sum(q[None, :] * k, axis=1)  # [seq_block]
    attn = attn.to(tl.float32)
    attn *= sm_scale
    max_attn = tl.max(attn)
    # Set to -inf attn values where mask is not set. This forces exp(attn) to 0.
    attn = tl.where(seq_mask, attn, float("-inf"))
    exp_attn = tl.exp(attn - max_attn)

    sumexp = tl.sum(exp_attn, axis=0)  # scalar.

    # [seq_len] * [seq_len, D_HEAD], sum along axis 0
    output = tl.sum(exp_attn[:, None] * v, axis=0)  # [D_HEAD]

    # Per-block normalized partial output; stage 2 re-weights via logsumexp.
    output = output / sumexp

    # We store the log-sum-exp after removing the max.
    logsumexp = tl.log(sumexp) + max_attn
    # when seq_mask is all false, max_attn will be -inf and sumexp is zero

    tl.store(
        output_values_ptr
        + batch_id * N_HEADS * D_HEAD * num_blocks
        + head_id * D_HEAD * num_blocks
        + seq_block_id * D_HEAD
        + dhead_offsets,
        output,
    )
    tl.store(
        output_logsumexp_ptr
        + batch_id * N_HEADS * num_blocks
        + head_id * num_blocks
        + seq_block_id,
        logsumexp,
    )
1092
+
1093
+
1094
@triton.jit
def context_attention_kv_paged(
    q_ptr,  # [b*s,nd]
    seq_len_ptr,  # [b] # length of each sequence in a batch
    seq_start_ptr,  # [b] # start indices of a sequence in flattened q/k/v.
    k_cache_ptr,  # [num_pages, page_size, n, d]
    v_cache_ptr,  # [num_pages, page_size, n, d]
    cache_loc_ptr,  # [b] # index of the sequence in the page table.
    cache_len_ptr,  # [Batch] # Number of tokens in kv cache.
    page_table_ptr,  # [b, max_num_pages_per_seq] # loc of the block page in the cache.
    softmax_scale,
    o_ptr,  # [b*s, n, d] attention output, same layout as q
    N_HEADS: tl.constexpr,  # Number of heads
    N_KV_HEADS: tl.constexpr,  # Number of KV heads.
    D_HEAD: tl.constexpr,  # Dimension of each head.
    SEQ_BLOCK: tl.constexpr,
    MAX_SEQ_LENGTH: tl.constexpr,
    PAGE_SIZE: tl.constexpr,
    PAGE_TABLE_STRIDE: tl.constexpr,
):
    """Flash-attention kernel for the context phase over a paged KV cache.

    NOTE(review): an earlier docstring claimed "Fuses rope", but no rope math
    appears in this kernel — k/v are read straight from the cache.

    Assuming:
    1. Self-attention [seqlen(Q) == seqlen(K)]
    2. Causal attention (queries may also attend to `cache_len` earlier cached tokens)
    3. QKV layout: [b*s,n,d]
    Uses the standard online-softmax recurrence (running max m_i, running
    logsumexp lse_i, rescaled accumulator acc).
    """
    batch_id = tl.program_id(axis=0)
    head_id = tl.program_id(axis=1)
    seq_block_id = tl.program_id(axis=2)

    # Each program is responsible for a block of tokens in a single batch.
    seq_start_index = tl.load(seq_start_ptr + batch_id)
    seq_len = tl.load(seq_len_ptr + batch_id)

    HEAD_RATIO: tl.constexpr = N_HEADS // N_KV_HEADS  # GQA group size

    # assuming SEQ_BLOCK can be divided by PAGE_SIZE and PAGE_SIZE is a power of 2.
    SEQ_BLOCK_PAGE: tl.constexpr = SEQ_BLOCK // PAGE_SIZE
    MAX_NUM_PAGES: tl.constexpr = (MAX_SEQ_LENGTH + PAGE_SIZE - 1) // PAGE_SIZE

    # cache is [num_pages, page_size, n, d]
    # cache_loc_ptr stores the batch index for the sequences provided to the kernel.
    cache_loc = tl.load(cache_loc_ptr + batch_id)
    table_batch_offset = cache_loc * PAGE_TABLE_STRIDE

    # Assuming D_HEAD is a power of 2
    dhead_offsets = tl.arange(0, D_HEAD)
    dhead_mask = dhead_offsets < D_HEAD

    seq_offsets = tl.arange(0, SEQ_BLOCK)
    q_seq_offsets = seq_block_id * SEQ_BLOCK + seq_offsets
    seq_mask = q_seq_offsets < seq_len

    load_mask = seq_mask[:, None] * dhead_mask[None, :]

    q_batch_offset = seq_start_index * N_HEADS * D_HEAD
    q_head_offset = head_id * D_HEAD
    cache_head_offset = (head_id // HEAD_RATIO) * D_HEAD

    # Q will stay in SRAM
    q = tl.load(
        q_ptr
        + q_batch_offset
        + q_seq_offsets[:, None] * N_HEADS * D_HEAD
        + q_head_offset
        + dhead_offsets[None, :],
        mask=load_mask,
    )
    # Online-softmax state: accumulator, running logsumexp, running max.
    acc = tl.zeros([SEQ_BLOCK, D_HEAD], dtype=tl.float32)
    lse_i = tl.zeros([SEQ_BLOCK], dtype=tl.float32) - float("inf")
    m_i = tl.zeros([SEQ_BLOCK], dtype=tl.float32) - float("inf")

    cache_len = tl.load(cache_len_ptr + batch_id)
    total_len = cache_len + seq_len
    num_blocks = (total_len + SEQ_BLOCK - 1) // SEQ_BLOCK
    # NOTE(review): iterates num_blocks + 1 times; the final iteration is fully
    # masked out (kv_seq_mask all false) so it only costs time — confirm intentional.
    for s in range(0, num_blocks + 1, 1):
        kv_pages = s * SEQ_BLOCK_PAGE + tl.arange(0, SEQ_BLOCK_PAGE)
        cache_pages = tl.load(
            page_table_ptr + table_batch_offset + kv_pages, mask=kv_pages < MAX_NUM_PAGES
        )

        page_offsets = tl.arange(0, PAGE_SIZE)
        # shape [SEQ_BLOCK], means [cache_pages, page_offsets]
        # physical token offsets in the paged kv cache
        cache_seq_offset = tl.reshape(
            cache_pages[:, None] * PAGE_SIZE + page_offsets[None, :], [SEQ_BLOCK]
        )
        cache_offsets = (
            cache_seq_offset[:, None] * N_KV_HEADS * D_HEAD
            + cache_head_offset
            + dhead_offsets[None, :]
        )

        # logical kv tokens offsets
        kv_seq_offsets = s * SEQ_BLOCK + seq_offsets
        kv_seq_mask = kv_seq_offsets < total_len
        kv_load_mask = kv_seq_mask[:, None] * dhead_mask[None, :]

        k = tl.load(k_cache_ptr + cache_offsets, mask=kv_load_mask)
        qk = tl.zeros([SEQ_BLOCK, SEQ_BLOCK], dtype=tl.float32)
        qk += tl.dot(q, k.trans())
        # causal mask, need to use kv_seq_offsets; queries are shifted by cache_len
        # since they sit after the cached prefix in the logical sequence.
        qk = tl.where(
            (q_seq_offsets[:, None] + cache_len) >= kv_seq_offsets[None, :], qk, float("-inf")
        )

        qk *= softmax_scale
        # rowmax
        m_ij = tl.maximum(tl.max(qk, 1), lse_i)
        p = tl.exp(qk - m_ij[:, None])
        v = tl.load(v_cache_ptr + cache_offsets, mask=kv_load_mask)

        # Online-softmax update: rescale the accumulator to the new max,
        # fold in this block's contribution, then update the running logsumexp.
        l_ij = tl.sum(p, 1)
        acc_scale = tl.exp(m_i - m_ij)
        acc = acc * acc_scale[:, None]
        p = p.to(v.dtype)
        acc += tl.dot(p, v)
        m_i = m_ij
        l_i_new = tl.exp(lse_i - m_ij) + l_ij
        lse_i = m_ij + tl.log(l_i_new)

    # Final normalization of the accumulated (unnormalized) output.
    o_scale = tl.exp(m_i - lse_i)

    acc = acc * o_scale[:, None]

    tl.store(
        o_ptr
        + q_batch_offset
        + q_seq_offsets[:, None] * N_HEADS * D_HEAD
        + q_head_offset
        + dhead_offsets[None, :],
        acc,
        mask=load_mask,
    )
1230
+
1231
+
1232
+
1233
@dataclass
class PositionalEmbeddingConfig:
    """Configuration of the positional embedding applied to q/k.

    Only rotary embeddings ("rope") or no positional embedding (None) are
    supported; construction validates the combination of fields.
    """

    mode: Optional[Literal["rope"]] = None
    rope_theta: float = 10000.0
    rope_scale: float = 1.0

    def __post_init__(self):
        # Validate eagerly so misconfiguration surfaces at construction time.
        assert self.mode in (None, "rope"), f"Invalid mode: {self.mode}."
        if self.mode != "rope":
            return
        assert self.rope_theta > 0, f"Invalid rope theta: {self.rope_theta}."
1245
+
1246
+
1247
@dataclass
class CacheConfig:
    """Describes how the KV cache should be configured."""

    # Storage dtype of the cache tensors; None means "use the model dtype".
    # NOTE(review): the None-default semantics are resolved by the consumer — confirm.
    dtype: Optional[torch.dtype] = None
1252
+
1253
+
1254
@dataclass
class AttentionInfo:
    """Describes one attention op's configuration.

    Collected by the kvcache transformation and handed to the
    AttentionDescriptor methods so the attention op knows how it is set up.
    """

    # Number of query heads.
    num_heads: int
    # Number of key/value heads (GQA when smaller than num_heads).
    num_kv_heads: int
    # Embedding size of each head.
    head_dim: int
    dtype: torch.dtype

    cache_config: CacheConfig
    pos_embd_config: PositionalEmbeddingConfig
    # Embedding size of the decoupled q/k slice that carries rope information.
    # When rope_dim != 0, that slice is the last part of the tensor: [-rope_dim:].
    rope_dim: Optional[int] = 0
1272
+
1273
+
1274
+ @dataclass
1275
+ class SequenceInfo:
1276
+ """A dataclass to hold information about how the sequence is laid out and stored in cache.
1277
+
1278
+ We assume the sequence + cache is laid out in the following way:
1279
+
1280
+ - input_ids: [id_0, ..., id_{s_total-1}]
1281
+ flattened sequence of [b, 1] or [1, s_total]. We use [b, 1] to denote generate-only batches.
1282
+ - seq_len: [s_0, s_1, ..., s_{b-1}] such that s_total = sum(s_i)
1283
+ Describes how long each sequence is. For example,
1284
+ input_ids[:s_0] will correspond to sequence 0 in the batch and input_ids[s_0:s_1] will
1285
+ correspond to sequence 1 in the batch.
1286
+ - input_pos: [pos_0, ..., pos_{b-1}]
1287
+ Corresponds to the total number of tokens that has been already been cached for each sequence
1288
+ in the batch.
1289
+ - cache_loc: [c0, ...., c_{np-1}] where np is total number of pages allocated to describe all
1290
+ sequences in the batch.
1291
+ - pages_per_seq: [ps_0, ps_1, ..., ps_{b-1}] where ps_i is the number of pages allocated for
1292
+ sequence i. Note that, for example, cache_loc[p_0:p_1] will correspond to the pages associated
1293
+ with sequence 1 in the batch.
1294
+
1295
+ Here are a couple of notes to emphasize this notation:
1296
+
1297
+ - The total number of allocated token space for sequence i is given by ps_i * page_size. This is
1298
+ the total number of tokens that can be cached for each sequence.
1299
+
1300
+ - NOTE: It must hold that pos_i + s_i <= ps_i * page_size for all i in [0, b-1]. Moreover, it is
1301
+ the responsibility of the cache manager and/or runtime to ensure sufficient page allocation
1302
+ for each sequence.
1303
+
1304
+ """
1305
+
1306
+ ## USE TO INITIALIZE DATA CLASS ###############################################################
1307
+ # max_seq_len corresponds the maximum number of tokens in any sequence. It includes the tokens in the
1308
+ # input sequence and the tokens generated by the model.
1309
+ max_seq_len: int = 1
1310
+ # max_batch_size corresponds to the maximum number of sequences (or requests) that the model can process.
1311
+ max_batch_size: int = 1
1312
+ # page_size is the granularity with which the cache pages are allocated for a paged kv cache.
1313
+ # For an unpaged cache, the page size should be set to max_seq_len.
1314
+ # Also note that two sequences in a batch can not share a page.
1315
+ page_size: int = 0
1316
+ # max_num_tokens is the maximum number of tokens that the model can process across all sequences in the batch.
1317
+ # If a batch is composed of context-only requests of input sequence length ISL,
1318
+ # then the maximum number of sequences possible in the batch is min (max_batch_size, max_num_tokens // ISL).
1319
+ # Similarly, if a batch is composed of generate-only requests,
1320
+ # then the maximum number of sequences possible in the batch is min (max_batch_size, max_num_tokens).
1321
+ max_num_tokens: int = 0
1322
+
1323
+ ## [UPDATE WITH CARE] TENSOR FIELDS THAT WILL BE PASSED TO PREPARE_METADATA OP #################
1324
+ # input_ids MUST ALWAYS BE THE FIRST FIELD
1325
+ input_ids: torch.Tensor = field(default_factory=lambda: torch.zeros(1, 1, dtype=torch.int))
1326
+ seq_len: torch.Tensor = field(default_factory=lambda: torch.ones(1, dtype=torch.int))
1327
+ input_pos: torch.Tensor = field(default_factory=lambda: torch.zeros(1, dtype=torch.int))
1328
+ cache_loc: torch.Tensor = field(default_factory=lambda: torch.arange(1, dtype=torch.int))
1329
+ pages_per_seq: torch.Tensor = field(default_factory=lambda: torch.ones(1, dtype=torch.int))
1330
+ ################################################################################################
1331
+
1332
+ ## PRIVATE FIELDS ##############################################################################
1333
+ _sequence_lengths: List[int] = field(default_factory=list)
1334
+ _num_pages: int = 1
1335
+
1336
+ def __post_init__(self):
1337
+ if self.page_size < 1:
1338
+ self.page_size = self.max_seq_len
1339
+ if self.max_num_tokens < 1:
1340
+ self.max_num_tokens = self.max_batch_size * self.max_seq_len
1341
+ # if the provided max_num_tokens is less than the max_batch_size * max_seq_len,
1342
+ # we use the provided max_num_tokens to calculate the number of pages
1343
+ total_tokens = min(self.max_num_tokens, self.max_batch_size * self.max_seq_len)
1344
+ self._num_pages = (total_tokens) // self.page_size + (total_tokens % self.page_size > 0)
1345
+ self.input_ids = torch.ones(self.max_batch_size, 1, dtype=torch.int)
1346
+ self.seq_len = torch.empty(self.max_batch_size, dtype=torch.int)
1347
+ self.input_pos = torch.empty_like(self.seq_len)
1348
+ self.cache_loc = torch.empty(self.num_pages, dtype=torch.int)
1349
+ self.pages_per_seq = torch.empty_like(self.seq_len)
1350
+
1351
+ # dynamic shape descriptors for tensor args
1352
+ self._dynamic_shapes: Optional[Tuple[Dict[str, Dim]]] = None
1353
+
1354
+ # keep a list-like object of sequence lengths for simplicity as well
1355
+ self._sequence_lengths = [0] * self.max_batch_size
1356
+
1357
+ # call reset once to initialize the tensors
1358
+ self.reset()
1359
+
1360
+ @property
1361
+ def device(self) -> torch.device:
1362
+ return self.input_pos.device
1363
+
1364
+ @property
1365
+ def args(self) -> List[torch.Tensor]:
1366
+ args = []
1367
+ for f in fields(self):
1368
+ val = getattr(self, f.name)
1369
+ if isinstance(val, torch.Tensor):
1370
+ args.append(val)
1371
+ return args
1372
+
1373
+ @property
1374
+ def extra_arg_names(self) -> List[str]:
1375
+ """Return extra arg names for the prepare_metadata op beyond input_ids."""
1376
+ return [f.name for f in fields(self) if isinstance(getattr(self, f.name), torch.Tensor)][1:]
1377
+
1378
+ @property
1379
+ def dynamic_shapes(self) -> Tuple[Dict[str, Dim]]:
1380
+ """Return dynamic shapes of sequence info tensors.
1381
+
1382
+ NOTE: will be lazily initialized since the Dim object is not picklable for multi-processing.
1383
+ """
1384
+ if self._dynamic_shapes is None:
1385
+ dynamic_shapes = ({},)
1386
+ if self.max_batch_size > 1:
1387
+ dynamic_shapes[0][0] = Dim("batch_size", max=self.max_batch_size)
1388
+ dynamic_shapes[0][1] = Dim("seq_len", max=self.max_seq_len)
1389
+ dynamic_shapes += ({},) * len(self.extra_arg_names)
1390
+ self._dynamic_shapes = dynamic_shapes
1391
+ return self._dynamic_shapes
1392
+
1393
+ @property
1394
+ def num_sequences(self) -> int:
1395
+ return len(self._sequence_lengths)
1396
+
1397
+ @property
1398
+ def sequence_lengths(self) -> List[int]:
1399
+ return self._sequence_lengths
1400
+
1401
+ @property
1402
+ def input_positions(self) -> List[int]:
1403
+ return self.input_pos[: self.num_sequences].tolist()
1404
+
1405
+ @property
1406
+ def is_generate(self) -> bool:
1407
+ return all(sl == 1 for sl in self.sequence_lengths)
1408
+
1409
+ @property
1410
+ def num_pages(self) -> int:
1411
+ return self._num_pages
1412
+
1413
+ @num_pages.setter
1414
+ def num_pages(self, value):
1415
+ self._num_pages = value
1416
+ # update the cache_loc tensor
1417
+ self.cache_loc.resize_(value)
1418
+
1419
+ @property
1420
+ def is_paged(self) -> bool:
1421
+ return self.page_size < self.max_seq_len
1422
+
1423
+ @property
1424
+ def page_assignments(self) -> List[List[int]]:
1425
+ """Return the page assignments for each sequence."""
1426
+ pages_per_seq = self.pages_per_seq[: self.num_sequences].tolist()
1427
+ return [
1428
+ c_loc_one_seq.tolist()
1429
+ for c_loc_one_seq in torch.split(self.cache_loc[: sum(pages_per_seq)], pages_per_seq)
1430
+ ]
1431
+
1432
+ @classmethod
1433
+ def _get_sanitized_seq_len(cls, input_ids: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor:
1434
+ """Sanitize sequence lengths.
1435
+
1436
+ We want to cover the following scenarios with this function:
1437
+
1438
+ 1. Pre-fill:
1439
+ input_ids: [1, s_total, ...]
1440
+ seq_len: [s_0, s_1, ..., s_{b-1}, 0, 0, ..., 0]
1441
+ ---> returns [s_0, s_1, ..., s_{b-1}]
1442
+ 2. Decode:
1443
+ input_ids: [b, 1, ...]
1444
+ seq_len: [1, 1, ..., 1, 0, 0, ..., ..., ..., ..., 0]
1445
+ |---- b ----|--- (max_batch_size - b) ---|
1446
+ --> returns [1,] * b
1447
+ 3. Decode in Cudagraph:
1448
+ input_ids: [b_cudagraph, 1, ...]
1449
+ seq_len: [1, 1, ..., 1, 0, 0, ..., ..., ..., ..., 0]
1450
+ |---- b ----|--- (max_batch_size - b) ---|
1451
+
1452
+ --> returns [1,] * b_cudagraph
1453
+ Here b <= b_cudagraph. We want to make sure that the seq_len is one-padded to
1454
+ b_cudagraph.
1455
+
1456
+ # TODO: I could see one possible issue with this approach in the future.
1457
+ # If we have b < b_cudagraph we now one-pad. However, we don't pad the cache location
1458
+ # information. What could happen is that the for the padded sequences the cache location
1459
+ # tensors point to allocated pages. This could lead to a situation where we write into
1460
+ # allocated cache pages polluting the cache of other sequences. Now this is not an issue
1461
+ # if we write the dummy sequences into unallocated cache pages... One fix could be to
1462
+ # pad not only the seq len but also pad the cache locations by just repeating the last
1463
+ # valid cache location in the batch. This would ensure that the dummy sequences just
1464
+ # repeats valid computation...
1465
+ """
1466
+ _, s = input_ids.shape[:2]
1467
+ num_seq = cls._get_sanitized_num_sequences(input_ids, seq_len)
1468
+ if s > 1:
1469
+ return seq_len[:num_seq].detach().clone()
1470
+ else:
1471
+ return torch.ones(num_seq, dtype=seq_len.dtype, device=seq_len.device)
1472
+
1473
+ @staticmethod
1474
+ def _get_sanitized_num_sequences(input_ids: torch.Tensor, seq_len: torch.Tensor) -> int:
1475
+ """Get number of sequences.
1476
+
1477
+ We makes sure that this function is compatible with both torch graph capture and cudagraph.
1478
+ Both can be a bit temparamental when trying to extract the number of sequences from a tensor
1479
+ with max_batch_size or max_batch_size*max_seq_len.
1480
+ """
1481
+ b, s = input_ids.shape[:2]
1482
+ if s > 1:
1483
+ num_seq = torch.sum(seq_len > 0)
1484
+ assert seq_len[num_seq:].sum() == 0, "seq_len should be zero-padded"
1485
+ else:
1486
+ num_seq = b
1487
+ return num_seq
1488
+
1489
+ def to(self, *args, **kwargs) -> None:
1490
+ for f in fields(self):
1491
+ val = getattr(self, f.name)
1492
+ if isinstance(val, torch.Tensor):
1493
+ setattr(self, f.name, val.to(*args, **kwargs))
1494
+
1495
    def sync(self, other: "SequenceInfo") -> None:
        """Copy sequence state from ``other`` into this SequenceInfo.

        ``input_ids`` is replaced wholesale (its shape may differ between
        batches); other tensor fields are copied in-place into their leading
        slots; non-tensor fields must already match.
        """
        for f in fields(self):
            val = getattr(self, f.name)
            val_other = getattr(other, f.name)
            if f.name == "input_ids":
                # shape can change between syncs -> replace rather than copy in-place
                setattr(self, f.name, val_other.to(self.device))
            elif f.name == "_sequence_lengths":
                # plain python list; assigned by reference
                self._sequence_lengths = val_other
            elif isinstance(val, torch.Tensor):
                # partial in-place copy: other's tensors may be shorter (fewer
                # active sequences); the tail keeps its previous values.
                val[: len(val_other)] = val_other.to(self.device)
            else:
                assert val == val_other, f"Field {f.name} mismatch: {val} != {val_other}."
1507
+
1508
    def reset(self) -> None:
        """Reset the sequence information.

        After reset the sequence information should correspond to a "generate-only" batch of
        sequences (b, s==1) without cache history.
        """
        # set a dummy sequence corresponding to a generate-only batch
        self.nest_sequences(torch.zeros(self.max_batch_size, 1, dtype=torch.int))

        # reset cache information: positions back to zero, identity page mapping,
        # and one page per sequence
        self.input_pos.zero_()
        self.cache_loc[:] = torch.arange(self.num_pages, dtype=torch.int, device=self.device)
        self.pages_per_seq.fill_(1)
1521
+
1522
    def _set_example_sequence(self) -> None:
        """Set an example sequence for export purposes."""
        self.reset()
        # small all-ones batch, clamped so it fits any configuration
        input_ids = torch.ones(
            min(2, self.max_batch_size),
            min(4, self.max_seq_len),
            dtype=torch.int,
            device=self.device,
        )
        self.nest_sequences(input_ids)
        # keep the plain [b, s] layout rather than the flattened view produced by
        # nest_sequences -- presumably export expects a standard batched tensor.
        self.input_ids = input_ids
1533
+
1534
    def _set_max_num_tokens_sample(self) -> None:
        """Set an example sequence with max_num_tokens."""
        self.reset()
        # distribute the token budget evenly over the batch
        seq_len = self.max_num_tokens // self.max_batch_size
        input_ids = torch.ones(
            self.max_batch_size,
            seq_len,
            dtype=torch.int,
            device=self.device,
        )
        # NOTE(review): assumes seq_len is a multiple of page_size; otherwise this
        # integer division under-counts pages -- confirm upstream invariants.
        self.pages_per_seq.fill_(seq_len // self.page_size)
        self.nest_sequences(input_ids)
1546
+
1547
+ def _set_generate_only_batch(self) -> None:
1548
+ """Set an example sequence for generate-only batch."""
1549
+ self.reset()
1550
+ self.nest_sequences([[1]] * self.max_batch_size)
1551
+
1552
    def nest_sequences(self, input_ids: Sequence[Sequence[int]]) -> None:
        """Create and store a flattened list of input_ids from the provided list of sequences.

        This i/f will also update any relevant sequence information.

        Args:
            input_ids: one token-id sequence per entry; entries may be python
                lists or 1D tensors (tensors are detached, not copied).
        """
        # set new sequence lengths; zero-pad the unused tail of self.seq_len
        seq_lens = [len(ids) for ids in input_ids]
        self.seq_len.zero_()
        self.seq_len[: len(seq_lens)].copy_(torch.tensor(seq_lens), non_blocking=True)

        # set new input_ids as new tensor from flattened input_ids
        ids_tnsr_list = [
            lst.detach() if isinstance(lst, torch.Tensor) else torch.tensor(lst, dtype=torch.int)
            for lst in input_ids
        ]
        self.input_ids = torch.cat(ids_tnsr_list, dim=0).to(self.device)

        # set derivative properties
        self._sequence_lengths = seq_lens

        # use [b,1] shape to indicate generate-only batch, otherwise use [1,total_len]
        if self.is_generate:
            self.input_ids = self.input_ids.view(-1, 1, *self.input_ids.shape[1:])
        else:
            self.input_ids = self.input_ids.view(1, -1, *self.input_ids.shape[1:])
1577
+
1578
+ def unnest_sequences(self, t_nested: torch.Tensor) -> List[torch.Tensor]:
1579
+ t_squeezed = t_nested.squeeze(1) if self.is_generate else t_nested.squeeze(0)
1580
+ return list(torch.split(t_squeezed, self.sequence_lengths))
1581
+
1582
    def update_pos(self, seq_len: Union[torch.Tensor, List[int], int], reset: bool = False) -> None:
        """Update the starting position for each sequence in the cache.

        Args:
            seq_len: per-sequence increment (tensor/list) or a scalar applied to
                the whole batch.
            reset: if ``True``, ``input_pos`` is overwritten with ``seq_len``
                instead of being incremented.
        """
        if not isinstance(seq_len, torch.Tensor):
            seq_len = torch.tensor(seq_len, dtype=torch.int)
        # a scalar (0-dim) seq_len applies to all max_batch_size slots
        bs = len(seq_len) if seq_len.dim() > 0 else self.max_batch_size

        if reset:
            self.input_pos[:bs] = seq_len.to(self.device)
        else:
            self.input_pos[:bs] += seq_len.to(self.device)
1595
+
1596
    def assign_cache_loc(self, page_assignments: Sequence[Sequence[int]]) -> None:
        """Set the cache location and pages_per_seq tensors from page assignments.

        Args:
            page_assignments: one list of page indices per sequence.
        """
        # flatten [[pages of seq 0], [pages of seq 1], ...] into one index list
        cache_loc_flat = torch.tensor(
            [p_idx for pages in page_assignments for p_idx in pages], dtype=torch.int
        )
        self.cache_loc[: len(cache_loc_flat)].copy_(cache_loc_flat, non_blocking=True)

        pages_per_seq = torch.tensor([len(p) for p in page_assignments], dtype=torch.int)
        self.pages_per_seq[: len(pages_per_seq)].copy_(pages_per_seq, non_blocking=True)
1605
+
1606
+
1607
# Scalar argument types that may be baked into the exported graph as constants.
Constant = Union[int, float, str, None]
1608
+
1609
+
1610
class MHACallable(Protocol):
    """Structural type of a cached attention op: positional q/k/v, metadata,
    caches, buffers, and constants; returns the attention output tensor."""

    def __call__(
        self,
        *qkv_metadata_and_caches: Union[torch.Tensor, Constant],
    ) -> torch.Tensor: ...
1615
+
1616
+
1617
class PrepareMetadataCallable(Protocol):
    """Structural type of the op converting standardized sequence info into the
    metadata tensors consumed by a specific attention backend."""

    def __call__(
        self,
        input_ids: torch.Tensor,
        seq_len: torch.Tensor,
        input_pos: torch.Tensor,
        cache_loc: torch.Tensor,
        pages_per_seq: torch.Tensor,
        page_size: int,
    ) -> List[torch.Tensor]: ...
1627
+
1628
+
1629
class GetCacheCallable(Protocol):
    """Factory that allocates a cache tensor for a given SequenceInfo."""

    def __call__(self, sequence_info: SequenceInfo) -> torch.Tensor: ...
1631
+
1632
+
1633
class GetBufferCallable(GetCacheCallable):
    """Alias type for global-buffer factories (same signature as cache factories)."""

    pass
1635
+
1636
+
1637
class GetAttentionInfo(Protocol):
    """Zero-argument callback returning the AttentionInfo of the attention node
    currently being processed (used by cache/buffer initializers)."""

    # BUG FIX: ``__call__`` previously omitted ``self``, making the protocol's
    # declared signature invalid for structural checks. The forward reference is
    # quoted since AttentionInfo is defined elsewhere in this module.
    def __call__(self) -> "AttentionInfo": ...
1639
+
1640
+
1641
# Mapping from cache/buffer argument name (as used in the attention op
# signature) to the callback that allocates it.
CacheInitializerDict = Dict[str, GetCacheCallable]
BufferInitializerDict = Dict[str, GetBufferCallable]
1643
+
1644
+
1645
class AttentionDescriptor(ABC):
    """An interface to define a functional attention operator.

    The main logic is contained with the actual attention op as well as the prepare_metadata op. The
    prepare_metadata op is responsible for converting the standardized sequence info into metadata
    specific to the attention op.
    """

    @classmethod
    @abstractmethod
    def is_paged(cls) -> bool:
        """Return if the attention op is paged or not."""

    @classmethod
    def get_attention_op(cls) -> Tuple[MHACallable, int]:
        """Get the attention op and the number of arguments corresponding to qkv.

        The attention_op should follow the below signature:

        ```
        def attention_op(
            *qkv, # list of tensors corresponding to Q, K, V as in original op
            *metadata, # global info about the sequences as returned by the prepare_metadata op
            *caches, # contains layer-specific caches per provided cache initializers
            *buffers, # global buffers used by the attention op as provided by buffer initializers
            *constants, # basic arguments (int, float, str, None) added as CONSTANTS in the graph
        ) -> torch.Tensor: ...
        ```

        **Note that the attention op should be a valid torch custom op, which comes with
        restrictions on the supported types in the signature.**

        **Note that the `qkv` tuple should be consistent across both the cached attention
        op and the op that it is replacing.**

        """
        raise NotImplementedError

    @classmethod
    @abstractmethod
    def get_prepare_metadata_op(cls) -> Tuple[PrepareMetadataCallable, int]:
        """Get the prepare_metadata op.

        The prepare_metadata op should follow the below signature:

        ```
        def prepare_metadata(
            input_ids: torch.Tensor,
            seq_len: torch.Tensor,
            input_pos: torch.Tensor,
            cache_loc: torch.Tensor,
        ) -> List[torch.Tensor]: ...
        ```
        The metadata should contain all necessary global information required for the underlying
        attention op to process the input sequence and the returned list of tensors will be passed
        on to each invocation of the attention op in the graph.

        prepare_metadata is called once at the beginning of the forward pass.

        **Note that the prepare_metadata op should be a valid torch custom op, which comes with
        restrictions on the supported types in the signature.**
        """
        # BUG FIX: this previously did ``return NotImplementedError``, handing the
        # exception *class* to any subclass that forgot to override -- raise instead.
        raise NotImplementedError

    @classmethod
    @abstractmethod
    def get_cache_initializers(cls, get_info: GetAttentionInfo) -> CacheInitializerDict:
        """Provide a dictionary of function pointers that can be used to initialize the caches.

        The key corresponds to the argument name used in the attention op signature. The function
        key doesn't need to be unique across multiple attention nodes in the graph. The key used to
        describe the cache in the graph will be patched with the attention node index to ensure
        uniqueness.

        ``get_cache_initializers`` will be called *once* after the model initialization and before
        the initial forward pass for each attention op detected in the graph. The caches will be
        managed by the global CacheManager and passed back to the attention op during the forward
        pass.

        If the cache initializer requires information about the attention op, the ``get_info``
        function can be called **inside** the cache initializer to retrieve the necessary
        information.
        """
        raise NotImplementedError

    @classmethod
    def get_global_buffer_initializers(cls, get_info: GetAttentionInfo) -> BufferInitializerDict:
        """Provide a dictionary of function pointers that can be used to initialize buffers.

        The key corresponds to the buffer name used in the graph module and will **not**
        be patched unlike a cache key. Hence, it is a **global** key that is shared across all
        attention ops in the model much like a regular buffer in an nn.Module. That means if this
        i/f is called for multiple attention ops, the same buffer will be shared across all of them
        if this function provides the same key multiple times.

        Buffers are initialized *once* after the model initialization and before the initial
        forward pass for each attention op detected in the graph. The buffer will be managed by the
        global CacheManager and passed back to the attention op during the forward pass.

        If the buffer initializer requires information about the attention op, the ``get_info``
        function can be called **inside** the buffer initializer to retrieve the necessary
        information.
        """
        return {}

    @classmethod
    def get_constants(cls, attention_info: AttentionInfo) -> List[Constant]:
        """Provide a list of constant arguments to be passed to the attention op.

        The constant arguments are passed to the attention op as additional arguments after the
        caches and buffers. The constants are expected to be of type int, float, str, or None.
        """
        return []
1758
+
1759
+
1760
class AttentionRegistry:
    """A simple registry to look up different attention implementations."""

    # NOTE(review): ``register`` actually returns a decorator, not a descriptor
    # class; its annotation mirrors the original interface -- confirm intent.
    _attention_registry: Dict[str, Type["AttentionDescriptor"]] = {}

    @classmethod
    def register(cls, kernel_source: str) -> Type["AttentionDescriptor"]:
        """Return a class decorator registering its target under ``kernel_source``."""

        def decorator(descriptor_cls: Type["AttentionDescriptor"]):
            # fail fast on accidental double-registration
            assert kernel_source not in cls._attention_registry, (
                f"Attention source {kernel_source} already registered."
            )
            cls._attention_registry[kernel_source] = descriptor_cls
            return descriptor_cls

        return decorator

    @classmethod
    def get(cls, kernel_source: str) -> Type["AttentionDescriptor"]:
        """Look up a registered descriptor; asserts that it exists."""
        assert cls.has(kernel_source), f"Attention source {kernel_source} not registered."
        return cls._attention_registry[kernel_source]

    @classmethod
    def has(cls, kernel_source: str) -> bool:
        """Check whether ``kernel_source`` has a registered descriptor."""
        return kernel_source in cls._attention_registry
1784
+
1785
+
1786
+
1787
+ @torch.library.custom_op("attention::scaled_dot_product_attention", mutates_args=())
1788
+ def scaled_dot_product_attention(
1789
+ query: torch.Tensor,
1790
+ key: torch.Tensor,
1791
+ value: torch.Tensor,
1792
+ attn_mask: Optional[torch.Tensor] = None,
1793
+ dropout_p: float = 0.0,
1794
+ is_causal: bool = False,
1795
+ scale: Optional[float] = None,
1796
+ ) -> torch.Tensor:
1797
+ """A carbon copy of torch.nn.functional.scaled_dot_product_attention as custom op.
1798
+
1799
+ Using this custom op instead of using the functional directly ensures consistent representation
1800
+ of the vanilla sdpa in a graph.
1801
+ """
1802
+ return F.scaled_dot_product_attention(
1803
+ query,
1804
+ key,
1805
+ value,
1806
+ attn_mask=attn_mask,
1807
+ dropout_p=dropout_p,
1808
+ is_causal=is_causal,
1809
+ scale=scale,
1810
+ )
1811
+
1812
+
1813
@scaled_dot_product_attention.register_fake
def scaled_dot_product_attention_fake(
    query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None
):
    """Fake (meta) implementation of scaled_dot_product_attention.

    Only shape/dtype propagation matters for tracing: sdpa output always has the
    query's shape, so an empty tensor like ``query`` suffices.
    """
    return torch.empty_like(query)
1819
+
1820
+
1821
def _generate_mha(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    k_cache: torch.Tensor,
    v_cache: torch.Tensor,
    cache_locs: torch.Tensor,
    input_pos: torch.Tensor,
    out: torch.Tensor,
):
    """Decode-phase attention (one new token per sequence) against the KV cache.

    Appends the new k/v token at ``input_pos`` into the cache rows selected by
    ``cache_locs``, then runs a two-stage split-K attention (per-block partials
    + logsumexp, then a reduction) writing into ``out`` in-place.

    NOTE(review): assumes q is laid out [b, 1, n_heads, d_head] (bsnd) as
    produced by the callers in this file -- confirm for any new call site.
    """
    b, (n_heads, q_d_head) = q.shape[0], q.shape[-2:]
    max_seq_len, n_kv_heads = k_cache.shape[1:3]
    v_d_head = v.shape[-1]
    device = q.device

    # head block covers one full GQA group (n_heads // n_kv_heads query heads)
    HEAD_BLOCK_SIZE = max(16, triton.next_power_of_2(n_heads // n_kv_heads))
    SEQ_BLOCK_SIZE = 256
    num_blocks = (max_seq_len + SEQ_BLOCK_SIZE - 1) // SEQ_BLOCK_SIZE

    # stage-1 scratch: per-block partial values and logsumexp (logsumexp starts
    # at -inf so empty blocks drop out of the stage-2 reduction)
    stage1_output_values = torch.empty(
        b, n_heads, num_blocks, v_d_head, device=device, dtype=torch.float32
    )
    stage1_output_logsumexp = torch.empty(
        b, n_heads, num_blocks, device=device, dtype=torch.float32
    ) - float("inf")

    # write the single new k/v token per sequence into the cache
    (
        update_kv_cache[(b, n_kv_heads, 1)](
            k,
            v,
            None,
            None,
            k_cache,
            v_cache,
            input_pos,
            cache_locs,
            max_seq_len,
            n_kv_heads,
            q_d_head,
            v_d_head,
            1,
            GENERATE_ONLY=True,
        ),
    )

    # stage 1: each program attends q against one SEQ_BLOCK_SIZE slice of the cache
    gqa_attention_kv_stage1[
        (
            b,
            n_kv_heads,
            num_blocks,
        )
    ](
        q,
        k_cache,
        v_cache,
        cache_locs,
        input_pos,
        stage1_output_values,
        stage1_output_logsumexp,
        num_blocks,
        max_seq_len,
        n_heads,
        n_kv_heads,
        q_d_head,
        v_d_head,
        SEQ_BLOCK_SIZE,
        HEAD_BLOCK_SIZE,
    )
    # stage 2: combine the per-block partials via their logsumexp weights
    attention_kv_stage2[(b, n_heads, 1)](
        stage1_output_values,
        stage1_output_logsumexp,
        out,
        input_pos,
        num_blocks,
        n_heads,
        v_d_head,
        SEQ_BLOCK_SIZE,
    )
1899
+
1900
+
1901
def _context_mha(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    k_cache: torch.Tensor,
    v_cache: torch.Tensor,
    out: torch.Tensor,
):
    """Context-phase (prefill) attention for a regular [b, s, n, d] batch.

    Launches one fused kernel that both fills the KV cache and computes
    attention, writing the result into ``out`` in-place.
    """
    b, s, n_heads, q_d_head = q.shape
    max_seq_len, n_kv_heads = k_cache.shape[1:3]
    v_d_head = v.shape[-1]

    SEQ_BLOCK = 128
    # standard 1/sqrt(d) attention scaling
    softmax_scale = 1.0 / math.sqrt(q_d_head)
    # one program per (batch, head, seq-block)
    grid = (b, n_heads, (s + SEQ_BLOCK - 1) // SEQ_BLOCK)
    context_attention_kv[grid](
        q,
        k,
        v,
        k_cache,
        v_cache,
        s,
        out,
        softmax_scale,
        n_heads,
        n_kv_heads,
        q_d_head,
        v_d_head,
        SEQ_BLOCK,
        max_seq_len,
        num_stages=2,
    )
1933
+
1934
+
1935
@torch.library.custom_op("attention::fused_mha_with_cache", mutates_args=())
def fused_mha_with_cache(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    input_pos: torch.Tensor,
    k_cache: torch.Tensor,
    v_cache: torch.Tensor,
    freqs_cis: Optional[torch.Tensor],
) -> torch.Tensor:
    """Fused MHA with cache that takes raw input from q, k, v GEMMs.

    q/k/v arrive as [b, s, n*h_d]; rope (if ``freqs_cis`` is given) and cache
    update happen inside. Returns [b, s, n*h_d].
    """
    # b, s info
    b, s = q.shape[:2]
    head_dim = k_cache.shape[-1]

    # reshapes with num_heads and head_dim
    q = q.view(b, s, -1, head_dim)
    k = k.view(b, s, -1, head_dim)
    v = v.view(b, s, -1, head_dim)

    # rope embedding
    if freqs_cis is not None:
        q = torch.ops.rope.apply_rope_with_input_pos(q, freqs_cis, input_pos, "bsnd")
        k = torch.ops.rope.apply_rope_with_input_pos(k, freqs_cis, input_pos, "bsnd")

    # attention (assumed layout is bsnd)
    y = torch.empty_like(q)
    if s > 1:
        # context phase
        _context_mha(q, k, v, k_cache, v_cache, y)
    else:
        # generate phase; identity mapping: cache row i belongs to batch entry i
        cache_locs = torch.arange(0, b, device=q.device, dtype=torch.int32)
        _generate_mha(q, k, v, k_cache, v_cache, cache_locs, input_pos, y)

    return y.view(b, s, -1)  # [b,s,n*h_d]
1971
+
1972
+
1973
@fused_mha_with_cache.register_fake
def fused_mha_fake(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    input_pos: torch.Tensor,
    k_cache: torch.Tensor,
    v_cache: torch.Tensor,
    freqs_cis: torch.Tensor,
):
    """Fake (meta) implementation: the op's output has q's [b, s, n*h_d] shape."""
    return torch.empty_like(q.contiguous())
1984
+
1985
+
1986
def _flattened_context_mha(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    input_pos: torch.Tensor,
    cache_loc: torch.Tensor,
    k_cache: torch.Tensor,
    v_cache: torch.Tensor,
    seq_len: torch.Tensor,
    seq_start: torch.Tensor,
    out: torch.Tensor,
) -> None:
    """Mixed context+generate attention on flattened [s_total, n, d] inputs.

    Per-sequence boundaries come from ``seq_len``/``seq_start``; k/v are first
    written into the cache rows chosen by ``cache_loc`` at offset ``input_pos``,
    then attention is computed into ``out`` in-place.
    """
    # NOTE: s_total == sum(seq_len)
    s_total, n_heads, q_d_head = q.shape
    max_cache_seq_len, n_kv_heads = k_cache.shape[1:3]
    v_d_head = v.shape[-1]
    BATCH_SIZE: int = len(input_pos)
    SEQ_BLOCK = 32
    # fill the cache with the new tokens of every sequence
    (
        update_kv_cache[(BATCH_SIZE, n_kv_heads, (max(seq_len) + SEQ_BLOCK - 1) // SEQ_BLOCK)](
            k,
            v,
            seq_len,
            seq_start,
            k_cache,
            v_cache,
            input_pos,
            cache_loc,
            max_cache_seq_len,
            n_kv_heads,
            q_d_head,
            v_d_head,
            32,
            GENERATE_ONLY=False,
        ),
    )
    # TODO: use input_pos to get the correct cache locations
    softmax_scale = 1.0 / math.sqrt(q_d_head)
    grid = (BATCH_SIZE, n_heads, (max(seq_len) + SEQ_BLOCK - 1) // SEQ_BLOCK)
    context_attention_kv_flattened[grid](
        q,
        seq_len,
        seq_start,
        k_cache,
        v_cache,
        input_pos,
        cache_loc,
        out,
        softmax_scale,
        n_heads,
        n_kv_heads,
        q_d_head,
        v_d_head,
        SEQ_BLOCK,
        max_cache_seq_len,
        num_stages=2,
    )
2043
+
2044
+
2045
@torch.library.custom_op("attention::fused_flattened_mha_with_cache", mutates_args=())
def fused_flattened_mha_with_cache(
    # Q, K, V
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    # METADATA
    seq_len: torch.Tensor,
    input_pos: torch.Tensor,
    cache_loc: torch.Tensor,
    seq_start: torch.Tensor,
    # CACHES
    k_cache: torch.Tensor,
    v_cache: torch.Tensor,
    # BUFFERS
    freqs_cis: torch.Tensor,
    # CONSTANTS
    # <none>
) -> torch.Tensor:
    """Flattened & fused MHA with cache that takes raw input from q, k, v GEMMs.

    NOTE: this op can also handle seq_len==0, which might be useful for CUDAGRAPH.
    """
    # b, s info
    # NOTE: b, s are just the shapes of the input tensor q; not necessarily the number of sequences.
    # Generally speaking, we expect one of two cases here:
    # 1. b > 0, s==1: this indicates a generate-only batch of tokens.
    # 2. b==1, s > 0: this indicates a mixed context+generate phase. The actual number of sequences
    #    and number of tokens per sequence are encoded in seq_len and seq_start.
    head_dim = k_cache.shape[-1]
    b, s, d = q.shape

    # reshapes with num_heads and head_dim
    if s == 1:
        bs_view = (b, s)
    else:
        bs_view = (b * s,)
    q = q.view(*bs_view, q.shape[2] // head_dim, head_dim)
    k = k.view(*bs_view, k.shape[2] // head_dim, head_dim)
    v = v.view(*bs_view, v.shape[2] // head_dim, head_dim)

    # rope embedding for generate-only or mixed
    if freqs_cis is not None and freqs_cis.numel() > 0:
        if s == 1:
            rope_args = (freqs_cis, input_pos, "bsnd")
            fn_rope = torch.ops.rope.apply_rope_with_input_pos
        else:
            rope_args = (freqs_cis, input_pos, seq_len, seq_start)
            fn_rope = torch.ops.rope.apply_rope_on_flattened_inputs
        q = fn_rope(q, *rope_args)
        k = fn_rope(k, *rope_args)

    # run attention
    y = torch.empty_like(q)
    if s == 1:
        # generate-only phase
        _generate_mha(q, k, v, k_cache, v_cache, cache_loc, input_pos, y)
    else:
        # mixed context + generate phase
        _flattened_context_mha(
            q,
            k,
            v,
            input_pos,
            cache_loc,
            k_cache,
            v_cache,
            seq_len,
            seq_start,
            y,
        )

    return y.view(b, s, d)  # [b,s,n*h_d]
2118
+
2119
+
2120
@fused_flattened_mha_with_cache.register_fake
def fused_flattened_mha_fake(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    seq_len: torch.Tensor,
    input_pos: torch.Tensor,
    cache_loc: torch.Tensor,
    seq_start: torch.Tensor,
    k_cache: torch.Tensor,
    v_cache: torch.Tensor,
    freqs_cis: torch.Tensor,
):
    """Fake (meta) implementation: the op's output has q's [b, s, n*h_d] shape."""
    return torch.empty_like(q.contiguous())
2134
+
2135
+
2136
def _generate_mha_rope_fusion(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    freqs_cis: torch.Tensor,
    k_cache: torch.Tensor,
    v_cache: torch.Tensor,
    cache_locs: torch.Tensor,
    input_pos: torch.Tensor,
    out: torch.Tensor,
):
    """Decode-phase attention with rope fused into the kernels.

    The cache-update kernel applies rope to both q (into ``q_rope``) and k while
    appending k/v at ``input_pos``; the usual two-stage split-K attention then
    runs on the roped tensors, writing into ``out`` in-place.

    NOTE(review): assumes q and v share the same head dim (d_head is used for
    both) -- confirm for models with differing k/v head dims.
    """
    b, (n_heads, d_head) = q.shape[0], q.shape[-2:]
    max_seq_len, n_kv_heads = k_cache.shape[1:3]
    device = q.device

    SEQ_BLOCK_SIZE = 64
    num_blocks = (max_seq_len + SEQ_BLOCK_SIZE - 1) // SEQ_BLOCK_SIZE
    # stage-1 scratch: per-block partial values and logsumexp (-inf so empty
    # blocks drop out of the stage-2 reduction)
    stage1_output_values = torch.empty(
        b, n_heads, num_blocks, d_head, device=device, dtype=torch.float32
    )
    stage1_output_logsumexp = torch.empty(
        b, n_heads, num_blocks, device=device, dtype=torch.float32
    ) - float("inf")
    q_rope = torch.empty_like(q)
    # head block covers one full GQA group; computed once (the original
    # recomputed the identical value a second time below -- removed).
    HEAD_BLOCK_SIZE = max(16, triton.next_power_of_2(n_heads // n_kv_heads))

    # rope q/k and append the new k/v token per sequence into the cache
    (
        update_kv_cache_rope_fusion[(b, n_kv_heads, 1)](
            q,
            k,
            v,
            None,
            None,
            q_rope,
            k_cache,
            v_cache,
            input_pos,
            cache_locs,
            freqs_cis,
            max_seq_len,
            n_heads,
            n_kv_heads,
            d_head,
            1,
            HEAD_BLOCK_SIZE,
            GENERATE_ONLY=True,
        ),
    )

    # stage 1: attend roped q against each SEQ_BLOCK_SIZE slice of the cache
    gqa_attention_kv_stage1[
        (
            b,
            n_kv_heads,
            num_blocks,
        )
    ](
        q_rope,
        k_cache,
        v_cache,
        cache_locs,
        input_pos,
        stage1_output_values,
        stage1_output_logsumexp,
        num_blocks,
        max_seq_len,
        n_heads,
        n_kv_heads,
        d_head,
        d_head,
        SEQ_BLOCK_SIZE,
        HEAD_BLOCK_SIZE,
    )
    # stage 2: combine the per-block partials via their logsumexp weights
    attention_kv_stage2[(b, n_heads, 1)](
        stage1_output_values,
        stage1_output_logsumexp,
        out,
        input_pos,
        num_blocks,
        n_heads,
        d_head,
        SEQ_BLOCK_SIZE,
    )
2219
+
2220
+
2221
def _flattened_context_mha_rope_fusion(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    freqs_cis: torch.Tensor,
    input_pos: torch.Tensor,
    cache_loc: torch.Tensor,
    k_cache: torch.Tensor,
    v_cache: torch.Tensor,
    seq_len: torch.Tensor,
    seq_start: torch.Tensor,
    out: torch.Tensor,
) -> None:
    """Mixed context+generate attention on flattened inputs with rope fused in.

    The cache-update kernel ropes q (into ``q_rope``) and k while writing k/v
    into the cache; flattened context attention then runs on the roped q,
    writing into ``out`` in-place.
    """
    # NOTE: s_total == sum(seq_len)
    s_total, n_heads, d_head = q.shape
    max_cache_seq_len, n_kv_heads = k_cache.shape[1:3]
    BATCH_SIZE: int = len(input_pos)
    SEQ_BLOCK = 32
    q_rope = torch.empty_like(q)
    # head block covers one full GQA group (n_heads // n_kv_heads query heads)
    HEAD_BLOCK_SIZE = max(16, triton.next_power_of_2(n_heads // n_kv_heads))
    (
        update_kv_cache_rope_fusion[
            (BATCH_SIZE, n_kv_heads, (max(seq_len) + SEQ_BLOCK - 1) // SEQ_BLOCK)
        ](
            q,
            k,
            v,
            seq_len,
            seq_start,
            q_rope,
            k_cache,
            v_cache,
            input_pos,
            cache_loc,
            freqs_cis,
            max_cache_seq_len,
            n_heads,
            n_kv_heads,
            d_head,
            32,
            HEAD_BLOCK_SIZE,
            GENERATE_ONLY=False,
        ),
    )
    # TODO: use input_pos to get the correct cache locations
    softmax_scale = 1.0 / math.sqrt(d_head)
    grid = (BATCH_SIZE, n_heads, (max(seq_len) + SEQ_BLOCK - 1) // SEQ_BLOCK)
    context_attention_kv_flattened[grid](
        q_rope,
        seq_len,
        seq_start,
        k_cache,
        v_cache,
        input_pos,
        cache_loc,
        out,
        softmax_scale,
        n_heads,
        n_kv_heads,
        d_head,
        d_head,
        SEQ_BLOCK,
        max_cache_seq_len,
        num_stages=2,
    )
2286
+
2287
+
2288
@torch.library.custom_op("attention::fused_flattened_mha_with_cache_rope_fusion", mutates_args=())
def fused_flattened_mha_with_cache_rope_fusion(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    input_pos: torch.Tensor,
    cache_loc: torch.Tensor,
    seq_len: torch.Tensor,
    seq_start: torch.Tensor,
    k_cache: torch.Tensor,
    v_cache: torch.Tensor,
    freqs_cis: Optional[torch.Tensor],
) -> torch.Tensor:
    """Flattened & fused MHA with cache that takes raw input from q, k, v GEMMs.

    Fuses k rope in update_kv_cache and q rope in attention.
    NOTE: this op can also handle seq_len==0, which might be useful for CUDAGRAPH.

    Note that this op's metadata order (input_pos, cache_loc, seq_len, seq_start)
    differs from fused_flattened_mha_with_cache's (seq_len, input_pos, cache_loc,
    seq_start).
    """
    # this op only handles requests with rope embedding; otherwise fall back.
    if freqs_cis is None:
        # BUG FIX: the fallback previously forwarded the metadata tensors in
        # *this* op's order (input_pos, cache_loc, seq_len, seq_start), silently
        # swapping them against fused_flattened_mha_with_cache's signature
        # (seq_len, input_pos, cache_loc, seq_start). Reordered to match.
        return fused_flattened_mha_with_cache(
            q,
            k,
            v,
            seq_len,
            input_pos,
            cache_loc,
            seq_start,
            k_cache,
            v_cache,
            freqs_cis,
        )

    # b, s info
    # NOTE: b, s are just the shapes of the input tensor q; not necessarily the number of sequences.
    # Generally speaking, we expect one of two cases here:
    # 1. b > 0, s==1: this indicates a generate-only batch of tokens.
    # 2. b==1, s > 0: this indicates a mixed context+generate phase. The actual number of sequences
    #    and number of tokens per sequence are encoded in seq_len and seq_start.
    b, s, d = q.shape
    head_dim = k_cache.shape[-1]

    # reshapes with num_heads and head_dim
    if s == 1:
        bs_view = (b, s)
    else:
        bs_view = (b * s,)
    q = q.view(*bs_view, q.shape[2] // head_dim, head_dim)
    k = k.view(*bs_view, k.shape[2] // head_dim, head_dim)
    v = v.view(*bs_view, v.shape[2] // head_dim, head_dim)

    # run attention (rope is applied inside the fused kernels)
    y = torch.empty_like(q)
    if s == 1:
        # generate-only phase
        _generate_mha_rope_fusion(q, k, v, freqs_cis, k_cache, v_cache, cache_loc, input_pos, y)
    else:
        # mixed context + generate phase
        _flattened_context_mha_rope_fusion(
            q,
            k,
            v,
            freqs_cis,
            input_pos,
            cache_loc,
            k_cache,
            v_cache,
            seq_len,
            seq_start,
            y,
        )

    return y.view(b, s, d)  # [b,s,n*h_d]
2361
+
2362
+
2363
@fused_flattened_mha_with_cache_rope_fusion.register_fake
def fused_flattened_mha_with_cache_rope_fusion_fake(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    input_pos: torch.Tensor,
    cache_loc: torch.Tensor,
    seq_len: torch.Tensor,
    seq_start: torch.Tensor,
    k_cache: torch.Tensor,
    v_cache: torch.Tensor,
    freqs_cis: torch.Tensor,
):
    """Fake (meta) implementation: the op's output has q's [b, s, n*h_d] shape."""
    return torch.empty_like(q.contiguous())
2377
+
2378
+
2379
def _paged_generate_mha(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    page_table: torch.Tensor,
    k_cache: torch.Tensor,
    v_cache: torch.Tensor,
    cache_loc: torch.Tensor,
    input_pos: torch.Tensor,
    out: torch.Tensor,
    max_seq_len: int,
):
    """Decode-phase attention against a paged KV cache.

    Appends the new k/v token via the page table, then runs two-stage split-K
    attention over the pages, writing into ``out`` in-place. ``max_seq_len`` is
    passed explicitly since the paged cache shape no longer encodes it.
    """
    b, (n_heads, d_head) = q.shape[0], q.shape[-2:]
    PAGE_SIZE, n_kv_heads = k_cache.shape[1:3]
    device = q.device

    # one seq block per cache page so a program never straddles pages
    SEQ_BLOCK_SIZE = PAGE_SIZE  # 256
    num_blocks = (max_seq_len + SEQ_BLOCK_SIZE - 1) // SEQ_BLOCK_SIZE
    # stage-1 scratch: per-block partial values and logsumexp (-inf so empty
    # blocks drop out of the stage-2 reduction)
    stage1_output_values = torch.empty(
        b, n_heads, num_blocks, d_head, device=device, dtype=torch.float32
    )
    stage1_output_logsumexp = torch.empty(
        b, n_heads, num_blocks, device=device, dtype=torch.float32
    ) - float("inf")

    # write the single new k/v token per sequence into its page
    (
        update_paged_kv_cache[(b, n_kv_heads, 1)](
            k,
            v,
            None,
            None,
            k_cache,
            v_cache,
            cache_loc,
            input_pos,
            page_table,
            n_kv_heads,
            d_head,
            SEQ_BLOCK_SIZE,
            max_seq_len,
            PAGE_SIZE,
            page_table.stride(0),
            GENERATE_ONLY=True,
        ),
    )

    # stage 1: attend q against each page of the sequence
    attention_kv_paged_stage1[
        (
            b,
            n_heads,
            num_blocks,
        )
    ](
        q,
        k_cache,
        v_cache,
        cache_loc,
        page_table,
        input_pos,
        stage1_output_values,
        stage1_output_logsumexp,
        num_blocks,
        max_seq_len,
        n_heads,
        n_kv_heads,
        d_head,
        SEQ_BLOCK_SIZE,
        PAGE_SIZE,
        page_table.stride(0),
    )
    # stage 2: combine the per-block partials via their logsumexp weights
    attention_kv_stage2[(b, n_heads, 1)](
        stage1_output_values,
        stage1_output_logsumexp,
        out,
        input_pos,
        num_blocks,
        n_heads,
        d_head,
        SEQ_BLOCK_SIZE,
    )
2459
+
2460
+
2461
def _paged_context_mha(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    input_pos: torch.Tensor,
    cache_loc: torch.Tensor,
    page_table: torch.Tensor,
    k_cache: torch.Tensor,
    v_cache: torch.Tensor,
    seq_len: torch.Tensor,
    seq_start: torch.Tensor,
    out: torch.Tensor,
    max_seq_len: int,  # max cache length of sequence, kv_cache shape don't provide this info.
) -> None:
    """Mixed context+generate attention on flattened inputs with a paged cache.

    Writes k/v into pages resolved through ``page_table``, then runs flattened
    paged context attention into ``out`` in-place.
    """
    # NOTE: s_total == sum(seq_len)
    s_total, n_heads, d_head = q.shape
    PAGE_SIZE, n_kv_heads = k_cache.shape[1:3]
    BATCH_SIZE = len(input_pos)
    # one seq block per cache page so a program never straddles pages
    SEQ_BLOCK = PAGE_SIZE  # 32
    (
        update_paged_kv_cache[
            (BATCH_SIZE, n_kv_heads, (max(seq_len) + SEQ_BLOCK - 1) // SEQ_BLOCK)
        ](
            k,
            v,
            seq_len,
            seq_start,
            k_cache,
            v_cache,
            cache_loc,
            input_pos,
            page_table,
            n_kv_heads,
            d_head,
            SEQ_BLOCK,
            max_seq_len,
            PAGE_SIZE,
            page_table.stride(0),
            GENERATE_ONLY=False,
        ),
    )
    # standard 1/sqrt(d) attention scaling
    softmax_scale = 1.0 / math.sqrt(d_head)
    grid = (BATCH_SIZE, n_heads, (max(seq_len) + SEQ_BLOCK - 1) // SEQ_BLOCK)
    context_attention_kv_paged[grid](
        q,
        seq_len,
        seq_start,
        k_cache,
        v_cache,
        cache_loc,
        input_pos,
        page_table,
        softmax_scale,
        out,
        n_heads,
        n_kv_heads,
        d_head,
        SEQ_BLOCK,
        max_seq_len,
        PAGE_SIZE,
        page_table.stride(0),
        num_stages=2,
    )
2524
+
2525
+
2526
@torch.library.custom_op("attention::fused_mha_with_paged_cache", mutates_args=())
def fused_mha_with_paged_cache(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    input_pos: torch.Tensor,
    cache_loc: torch.Tensor,
    seq_len: torch.Tensor,
    seq_start: torch.Tensor,
    page_table: torch.Tensor,
    max_seq_len: int,
    k_cache: torch.Tensor,
    v_cache: torch.Tensor,
    freqs_cis: Optional[torch.Tensor],
) -> torch.Tensor:
    """Fused MHA with paged cache that takes raw input from q, k, v GEMMs.

    NOTE: this op can also handle seq_len==0, which might be useful for CUDAGRAPH.
    """
    # NOTE(review): the helpers below write into k_cache/v_cache in place, yet
    # mutates_args=() declares no mutation — confirm this is intentional (e.g.
    # caches treated as opaque persistent buffers for tracing/compile).
    # b, s info
    # NOTE: b, s are just the shapes of the input tensor q; not necessarily the number of sequences.
    # Generally speaking, we expect one of two cases here:
    # 1. b > 0, s==1: this indicates a generate-only batch of tokens.
    # 2. b==1, s > 0: this indicates a mixed context+generate phase. The actual number of sequences
    #    and number of tokens per sequence are encoded in seq_len and seq_start.
    # Assuming that context seq_len always > 0.
    b, s, d = q.shape
    # head_dim comes from the cache's last dim; q/k/v arrive flat as [b, s, n*head_dim].
    head_dim = k_cache.shape[-1]

    # reshapes with num_heads and head_dim
    # Generate-only keeps a [b, s] leading shape; mixed phase flattens to [b*s].
    if s == 1:
        bs_view = (b, s)
    else:
        bs_view = (b * s,)
    q = q.view(*bs_view, q.shape[2] // head_dim, head_dim)
    k = k.view(*bs_view, k.shape[2] // head_dim, head_dim)
    v = v.view(*bs_view, v.shape[2] // head_dim, head_dim)

    # rope embedding for generate-only or mixed
    # The two rope ops expect different position metadata; pick by phase.
    if freqs_cis is not None:
        if s == 1:
            rope_args = (freqs_cis, input_pos, "bsnd")
            fn_rope = torch.ops.rope.apply_rope_with_input_pos
        else:
            rope_args = (freqs_cis, input_pos, seq_len, seq_start)
            fn_rope = torch.ops.rope.apply_rope_on_flattened_inputs
        q = fn_rope(q, *rope_args)
        k = fn_rope(k, *rope_args)

    # run attention
    y = torch.empty_like(q)
    if s == 1:
        # generate-only phase
        _paged_generate_mha(
            q, k, v, page_table, k_cache, v_cache, cache_loc, input_pos, y, max_seq_len
        )
    else:
        # mixed context + generate phase
        _paged_context_mha(
            q,
            k,
            v,
            input_pos,
            cache_loc,
            page_table,
            k_cache,
            v_cache,
            seq_len,
            seq_start,
            y,
            max_seq_len,
        )

    return y.view(b, s, d)  # [b,s,n*h_d]
2600
+
2601
+
2602
@fused_mha_with_paged_cache.register_fake
def fused_mha_with_paged_cache_fake(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    input_pos: torch.Tensor,
    cache_loc: torch.Tensor,
    seq_len: torch.Tensor,
    seq_start: torch.Tensor,
    page_table: torch.Tensor,
    max_seq_len: int,
    k_cache: torch.Tensor,
    v_cache: torch.Tensor,
    freqs_cis: Optional[torch.Tensor],
) -> torch.Tensor:
    """Fake (meta) implementation: output matches q's shape/dtype; no compute."""
    q_template = q.contiguous()
    return torch.empty_like(q_template)
2618
+
2619
+
2620
@torch.library.custom_op("attention::prepare_fused_mha_metadata", mutates_args=())
def prepare_fused_mha_metadata(
    input_ids: torch.Tensor,
    seq_len: torch.Tensor,
    input_pos: torch.Tensor,
    cache_loc: torch.Tensor,
    pages_per_seq: torch.Tensor,
    page_size: int,
) -> List[torch.Tensor]:
    """Sanitize per-sequence metadata for the fused MHA ops.

    Truncates seq_len/input_pos/cache_loc to the active number of sequences and
    computes each sequence's start offset in the flattened token dimension.
    """
    num_seq = SequenceInfo._get_sanitized_num_sequences(input_ids, seq_len)
    active_lens = seq_len[:num_seq]
    # Exclusive prefix sum: sequence i starts after all tokens of sequences < i.
    starts = torch.zeros_like(active_lens)
    starts[1:] = torch.cumsum(seq_len[: num_seq - 1], 0)
    return (
        active_lens.clone(),
        input_pos[:num_seq].clone(),
        cache_loc[:num_seq].clone(),
        starts,
    )
2638
+
2639
+
2640
@prepare_fused_mha_metadata.register_fake
def prepare_fused_mha_metadata_fake(
    input_ids, seq_len, input_pos, cache_loc, pages_per_seq, page_size
):
    """Fake (meta) implementation mirroring the real op's four outputs."""
    templates = (seq_len, input_pos, cache_loc, seq_len)
    return tuple(torch.empty_like(t) for t in templates)
2650
+
2651
+
2652
@AttentionRegistry.register("TritonWithFlattenedInputs")
class TritonWithFlattenedInputs(AttentionDescriptor):
    """Attention backend descriptor wiring the Triton flattened-input MHA op
    into the AttentionRegistry (non-paged cache variant)."""

    @classmethod
    def is_paged(cls):
        """Return if the attention op is paged or not."""
        return False

    @classmethod
    def get_attention_op(cls):
        # Returns (op, count); presumably count is the number of qkv-style
        # tensor inputs consumed by the op — TODO confirm against registry contract.
        return torch.ops.attention.fused_flattened_mha_with_cache, 3

    @classmethod
    def get_prepare_metadata_op(cls):
        # (op, count); presumably the number of metadata tensors the op returns
        # — TODO confirm against registry contract.
        return torch.ops.attention.prepare_fused_mha_metadata, 4

    @classmethod
    def get_cache_initializers(cls, get_info):
        # Both k and v caches share one initializer; shape is
        # [num_pages, page_size, n_kv_heads, head_dim] even though this
        # descriptor is non-paged (si.is_paged is asserted False below).
        def _get_cache(si: SequenceInfo):
            assert not si.is_paged, "Paged cache not supported for TritonWithFlattenedInputs"
            attention_info = get_info()
            return torch.empty(
                si.num_pages,
                si.page_size,
                attention_info.num_kv_heads,
                attention_info.head_dim,
                device=si.device,
                # cache dtype override falls back to the model dtype
                dtype=attention_info.cache_config.dtype or attention_info.dtype,
            )

        return {"k_cache": _get_cache, "v_cache": _get_cache}

    @classmethod
    def get_global_buffer_initializers(cls, get_info):
        attention_info = get_info()
        head_dim = attention_info.head_dim
        pos_embd_config = attention_info.pos_embd_config

        def _get_freqs_cis(si: SequenceInfo):
            # No positional embedding -> empty placeholder buffer.
            if pos_embd_config.mode is None:
                return torch.empty(0, device=si.device)
            assert pos_embd_config.mode == "rope", f"Mode {pos_embd_config.mode=} not supported"
            assert pos_embd_config.rope_scale == 1.0, f"{pos_embd_config.rope_scale=} not supported"
            rope_theta = pos_embd_config.rope_theta
            # 2x max_seq_len of precomputed frequencies — presumably headroom
            # for cache positions beyond the nominal max; confirm with callers.
            return cls._precompute_freqs_cis(2 * si.max_seq_len, head_dim, rope_theta).to(si.device)

        # Buffer key encodes the full pos-embd config so distinct configs get
        # distinct buffers; '.' replaced since it would break the key format.
        k_full = "_".join(map(str, ["freqs_cis", *astuple(pos_embd_config)])).replace(".", "_")
        return {k_full: _get_freqs_cis}

    @staticmethod
    def _precompute_freqs_cis(
        seq_len: int, head_dim: int, rope_theta: Optional[float] = None
    ) -> torch.Tensor:
        """Precompute RoPE cos/sin table of shape [seq_len, head_dim//2, 2] (fp16)."""
        if rope_theta is None:
            rope_theta = 1e4
        # Standard RoPE inverse frequencies: theta^(-2i/d) for i in [0, d/2).
        freqs = 1.0 / (
            rope_theta ** (torch.arange(0, head_dim, 2)[: (head_dim // 2)].float() / head_dim)
        )
        t = torch.arange(seq_len)
        # Outer product: angle per (position, frequency) pair.
        freqs = torch.outer(t, freqs)
        freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
        # cos and sin (real and img) are packed
        cache = torch.stack([freqs_cis.real, freqs_cis.imag], dim=-1)
        return cache.to(dtype=torch.float16)