alexandretl commited on
Commit
d79da9a
·
1 Parent(s): 3b164a1

alpha normalize ademamix | mamba norms and gate | VWN | wnorm (nemotron-flash) | MG equivalence | fix IDM config saving | CCAv2 | MoBA | reduce lm head

Browse files
compute_loss.py ADDED
@@ -0,0 +1,422 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import glob
3
+ import pickle
4
+ from dataclasses import dataclass
5
+ from typing import Optional
6
+ import tyro
7
+ from tqdm.auto import tqdm
8
+ import numpy as np
9
+
10
+ import torch
11
+
12
+ from .configuration_dragon import DragonConfig
13
+ from .modeling_dragon import DragonForCausalLM
14
+
15
@dataclass
class Args:
    """CLI arguments for the standalone validation-loss script (parsed via tyro)."""
    # Checkpoint directory passed to `DragonForCausalLM.from_pretrained`;
    # its parent directory is expected to contain the pickled training args (args.pkl).
    load_dir: str
    # Path to a single validation .bin token shard (consumed once, stop_on_end=True).
    val_bin: str
19
+
20
@dataclass
class NanoArgs:
    """Full training-run configuration.

    Instances of this class are unpickled from the ``args.pkl`` saved at
    training time (see the script body below), so field names, order, and
    defaults must stay stable across versions or old checkpoints will not load.
    Most fields are forwarded verbatim into ``DragonConfig``.
    """
    resume_from: Optional[str] = None
    run_name : str = ""

    # arch - general
    d_model : int = 768
    n_heads : int = 6 # head dim 128 suggested by @Grad62304977
    head_dim: Optional[int] = None
    layers_config : str = 4*"lrdlr"  # per-layer mixer types, one char per layer
    expand_factor : int = 2 # expand factor for Mamba/Dragon
    rope_type_local: str = "" #p-rope
    rope_type_global: str = "" #p-rope
    rope_theta_local: float = 10000.0
    rope_theta_global: float = 0.0
    eps_rmsnorm: float = 1e-6
    mlp_expand: int = 4 # expand factor for MLP
    fused_loss_computation : bool = True # whether to use fused linear + cross entropy loss
    use_uscaling: bool = False
    uscaling_tau: float = 0.2
    zero_centered_gamma: bool = False
    zero_centered_gate: bool = False
    zero_centered_gate_type: int = 1 # 1, 2, 3, 4
    gate_attn: bool = False
    gate_gdn: bool = True
    gate_type: str = "elementwise" # elementwise (one per dim), headwise (one per head), kimi (lora)
    gate_act: str = "silu" # silu, sigmoid
    scalar_proj_as_hidden_matrix: bool = True
    normalization_type: str = "rmsnorm" # rmsnorm, seednorm
    seednorm_wd: bool = True
    seednorm_type: int = 1
    seednorm_rank: int = 1
    mixer_gn: bool = True
    mlp_linking : bool = False
    final_norm: bool = True
    layer_norm_scaling: bool = False # not read when using muP
    mlp_type: str = "simple" # simple, gated
    tie_lm_head: bool = False

    # MoE
    moe: bool = False
    moe_num_routed_experts: int = 2
    moe_routed_scaling_factor: float = 2.5
    moe_routed_intermediate_size: int = 768
    moe_shared_intermediate_size: int = 768

    # attention related
    n_kv_heads : int = 0  # 0 means "use n_heads" (see config_hf construction below)
    swa_window_size : int = 1024
    slw_warmup_iters: float = 0
    slw_start: int = 8 # window size at the start of training
    slw_increment: int = 64 # window size increment at each step
    softcap_local_attn: float = 0.0 # logit soft-capping for local attn logits, as per Gemma2 (0.0 = no soft-capping)
    softcap_global_attn: float = 0.0
    qk_norm: bool = True
    scalable_softmax: bool = True
    resformer : bool = False # Works only on f layers (DiffAttention)
    token_shift_attn: bool = False
    token_shift_gdn: bool = False
    token_conv1d_attn: bool = False
    token_conv1d_gdn: bool = True
    num_attention_heads_indexer: int = 8
    head_dim_indexer: int = 32
    dsa_q_lora_rank: int = 128
    dsa_topk: int = 512
    cca_seq_kernel_size: int = 4
    nsa_topk: int = 16
    nsa_block_size: int = 64
    nsa_window_size: int = 512
    num_signal_heads_diff: Optional[int] = None
    tpa_rank: int = 2
    shrink_qk_da: int = 2
    mla_kv_rank: int = 128

    # GDN related
    rope_gdn: Optional[str] = None # None, rope, (srope)
    head_dim_gdn: Optional[int] = None
    n_heads_gdn: int = 0
    n_kv_heads_gdn: int = 0
    shrink_qk_gdn: int = 2
    kda_allow_neg_eigval: bool = False
    kda_num_v_heads: Optional[int] = None
    mamba_mimo_dim: Optional[int] = 2
    mamba_ngroups: Optional[int] = 1
    mamba_d_state: int = 128
    mamba_headdim: int = 64
    mamba3_rope: bool = True
    mamba3_remove_BC_bias: bool = False
    mamba3_is_id_rms: bool = True
    mamba3_remove_conv: bool = True
    mamba3_is_A_dd: bool = True
    mamba3_add_trapezoid: bool = True

    # optim (not used by the eval script but required for unpickling)
    optim: str = "adamw" # adamw, spam, stable-spam, muon, muon_moonlight, splus
    second_order_optim : Optional[str] = None # snoo
    batch_size: int = 8*64 # batch size, in sequences, across all devices
    device_batch_size: int = 64 # batch size, in sequences, per device
    total_iterations: int = 1000 # number of iterations to run
    learning_rate: float = 1e-4
    weight_decay: float = 0.
    adam_beta1: float = 0.9
    adam_beta2: float = 0.95
    adam_eps: float = 1e-8
    warmup_iters: int = 200
    warmdown_iters: int = 3000
    warmdown_type: str = "linear" # linear, cosine
    grad_norm_clip: float = 1.0
    uscaling_mult_embed: float = 0
    uscaling_mult_scalar: float = 0
    uscaling_mult_head: float = 0
    init_std: float = 0.006
    patch_level_training: bool = False
    patch_level_training_size: int = 4
    second_order_lr: float = 0.68
    second_order_momentum: float = 0.37
    second_order_interval: int = 25

    # data
    vocab_size: int = 50304
    bos_id: int = 50256
    sequence_length: int = 1024
    intra_doc_masking: bool = False
    input_bin: Optional[str] = None
    input_val_bin: Optional[str] = None

    # evaluation and logging
    val_loss_every: int = 125
    val_iterations: int = 50 # 1 step = global bs * T tokens
    inspect_every: int = 0
    save_every: int = 1000
    log_dir: str = "logs/"
    wandb_project: str = "dragon_v1.5"
    wandb_name: Optional[str] = None
    log_wandb: bool = False

    load_arg_from_config: bool = True
    load_optim: bool = True
    load_sched: bool = True
    compile: bool = True
    compile_dynamic: bool = False

    # used during training
    slw_window: int = 0
164
+
165
+ def _peek_data_shard(filename):
166
+ with open(filename, "rb") as f:
167
+ header = np.frombuffer(f.read(256 * 4), dtype=np.int32)
168
+ if header[0] != 20240520:
169
+ print("ERROR: magic number mismatch in the data .bin file!")
170
+ print("---> HINT: Are you passing in a correct file with --input_bin?")
171
+ print("---> HINT: Dataset encoding changed recently, re-run data prepro or refer again to README")
172
+ exit(1)
173
+ assert header[1] == 1, "unsupported version"
174
+ ntok = int(header[2])
175
+ return ntok
176
+
177
+ def _load_data_shard(filename):
178
+ with open(filename, "rb") as f:
179
+ header = np.frombuffer(f.read(256 * 4), dtype=np.int32)
180
+ assert header[0] == 20240520, "magic number mismatch in the data .bin file"
181
+ assert header[1] == 1, "unsupported version"
182
+ ntok = int(header[2])
183
+ # memmap the token payload directly (uint16) after the 256*4B header
184
+ tokens = np.memmap(filename, dtype=np.uint16, mode="r", offset=256 * 4, shape=(ntok,))
185
+ assert tokens.size == ntok, "number of tokens read does not match header?"
186
+ return tokens
187
+
188
class DistributedDataLoader:
    """Shard-cycling token loader serving (inputs, targets) batches per rank.

    Each of ``num_processes`` ranks reads a disjoint, interleaved stripe of
    every shard matched by ``filename_pattern``. Targets are returned equal to
    the inputs — the model is presumably expected to shift labels internally
    (HF causal-LM convention); confirm against the model's loss computation.
    With ``stop_on_end=True`` the single shard is consumed once and
    ``next_batch`` raises StopIteration at the end (used for validation).
    """

    def __init__(self, filename_pattern, intra_doc_masking,B, T, process_rank, num_processes, bos_id, stop_on_end=False):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.intra_doc_masking = intra_doc_masking
        self.bos_id = bos_id
        self.B = B # micro batch size
        self.T = T
        self.stop_on_end = stop_on_end

        # glob files that match the pattern
        self.files = sorted(glob.glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"
        if self.stop_on_end:
            assert len(self.files) == 1, "Pass a single .bin path (not a pattern) when stop_on_end=True."

        # load and validate all data shards, count number of tokens in total
        ntok_total = 0
        self.shard_ntoks = []
        for fname in self.files:
            shard_ntok = _peek_data_shard(fname)
            #print(f"shard {fname} has {shard_ntok} tokens")
            # every shard must hold at least one full global batch (+1 token)
            assert shard_ntok >= num_processes * B * T + 1
            self.shard_ntoks.append(shard_ntok)
            ntok_total += int(shard_ntok)
        self.ntok_total = ntok_total

        # kick things off
        self.reset()

    def reset(self, shard=0):
        # Rewind to the given shard; each rank starts at its own stripe offset.
        self.current_shard = shard
        self.current_position = self.process_rank * self.B * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard])

    def advance(self): # advance to next data shard
        # Wraps around to shard 0 after the last one (infinite epoching).
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.B * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard])

        # Only rank 0 logs progress to avoid duplicated output.
        if self.process_rank == 0:
            shard_tokens = self.shard_ntoks[self.current_shard]
            cum_tokens = sum(self.shard_ntoks[: self.current_shard + 1])

            # Human-readable token counts (B = billions, M = millions).
            def _fmt(n):
                return f"{n/1e9:.2f}B" if n >= 1_000_000_000 else (
                    f"{n/1e6:.2f}M" if n >= 1_000_000 else str(n))

            print(
                f"Advancing to shard {self.current_shard}/{len(self.files)-1} "
                f"(this={_fmt(shard_tokens)} tok, cum={_fmt(cum_tokens)}/{_fmt(self.ntok_total)})"
            )

    def next_batch(self):
        """Return (x, y, cu_seqlens, max_seqlen, position_ids), all on CUDA
        (the last three are None unless intra_doc_masking is enabled)."""
        B = self.B
        T = self.T
        buf = self.tokens[self.current_position : self.current_position+B*T]
        # Copy out of the memmap and widen to int64 for torch embedding lookup.
        buf = np.asarray(buf, dtype=np.int64)
        x = torch.from_numpy(buf.reshape(B, T)) # inputs
        y = torch.from_numpy(buf.reshape(B, T)) # targets

        # compute cumulative document positions for intra-document masking
        cu = None
        maxlen = None
        position_ids = None
        if self.intra_doc_masking:
            # varlen attention metadata only supports a single sequence per batch
            assert self.B == 1
            # document boundaries = positions of the BOS token
            starts = (x == self.bos_id).nonzero(as_tuple=True)[1].to(torch.long)
            # ensure the window itself starts a document even if it begins mid-doc
            if starts.numel() == 0 or starts[0] != 0:
                starts = torch.cat([torch.zeros(1, dtype=torch.long), starts])
            ends = torch.cat([starts[1:], torch.tensor([x.numel()])])
            seqlens = (ends - starts).to(torch.int32)
            # cu_seqlens, max_seqlen.
            cu = torch.cat([torch.zeros(1, dtype=torch.int32), seqlens.cumsum(0)]).cuda().to(torch.int32)
            maxlen = int(seqlens.max())
            # position_ids.
            # token index relative to the start of its own document
            lengths = seqlens.to(torch.long)
            starts_per_token = torch.repeat_interleave(starts.to(torch.long), lengths)
            idx = torch.arange(T, device=x.device, dtype=torch.long)
            position_ids = (idx - starts_per_token).unsqueeze(0)

        # advance current position and load next shard if necessary
        self.current_position += B * T * self.num_processes
        if self.current_position + (B * T * self.num_processes + 1) > len(self.tokens):
            if self.stop_on_end:
                raise StopIteration
            else:
                self.advance()

        return x.cuda(), y.cuda(), cu, maxlen, position_ids
278
+
279
run_args = tyro.cli(Args)

# The training run pickles its full NanoArgs next to the checkpoint directory;
# reload it so the rebuilt config matches the checkpoint exactly.
saved_args_path = os.path.join(os.path.dirname(run_args.load_dir), "args.pkl")
print(f"Loading args from {saved_args_path}")
if not os.path.exists(saved_args_path):
    # Previously a missing file silently fell through and `print(args)` below
    # died with a NameError; fail early with an actionable message instead.
    raise FileNotFoundError(
        f"Saved training args not found at {saved_args_path}; "
        "pass --load_dir pointing inside the original run directory."
    )
with open(saved_args_path, "rb") as f:
    # NOTE(review): pickle is only safe on our own trusted checkpoints.
    saved_args = pickle.load(f)
args: NanoArgs = saved_args

print(args)

B, T = args.device_batch_size, args.sequence_length
# kept for parity with the training script; not used during evaluation
accumulation_steps = args.batch_size // (B * 1)

# stop_on_end=True: consume the validation file exactly once, then StopIteration.
val_loader = DistributedDataLoader(run_args.val_bin, False, B, T, 0, 1, args.bos_id, stop_on_end=True)
print(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")

# load model: rebuild the HF config from the saved training args so the
# checkpoint weights map onto an identically-shaped architecture.
config_hf = DragonConfig(
    tie_lm_head=args.tie_lm_head,
    mlp_type=args.mlp_type,
    layer_norm_scaling=args.layer_norm_scaling,
    mamba_d_state=args.mamba_d_state,
    mamba_headdim=args.mamba_headdim,
    mamba3_rope=args.mamba3_rope,
    mamba3_remove_BC_bias=args.mamba3_remove_BC_bias,
    mamba3_is_id_rms=args.mamba3_is_id_rms,
    mamba3_remove_conv=args.mamba3_remove_conv,
    mamba3_is_A_dd=args.mamba3_is_A_dd,
    mamba3_add_trapezoid=args.mamba3_add_trapezoid,
    moe=args.moe,
    moe_num_routed_experts=args.moe_num_routed_experts,
    moe_routed_scaling_factor=args.moe_routed_scaling_factor,
    moe_routed_intermediate_size=args.moe_routed_intermediate_size,
    moe_shared_intermediate_size=args.moe_shared_intermediate_size,
    intra_doc_masking=args.intra_doc_masking,
    seednorm_rank=args.seednorm_rank,
    seednorm_type=args.seednorm_type,
    final_norm=args.final_norm,
    mla_kv_rank=args.mla_kv_rank,
    rope_gdn=args.rope_gdn,
    shrink_qk_da=args.shrink_qk_da,
    shrink_qk_gdn=args.shrink_qk_gdn,
    mixer_gn=args.mixer_gn,
    kda_allow_neg_eigval=args.kda_allow_neg_eigval,
    kda_num_v_heads=args.kda_num_v_heads,
    seednorm_wd=args.seednorm_wd,
    normalization_type=args.normalization_type,
    tpa_rank=args.tpa_rank,
    num_signal_heads_diff=args.num_signal_heads_diff,
    scalar_proj_as_hidden_matrix=args.scalar_proj_as_hidden_matrix,
    token_shift_attn=args.token_shift_attn,
    token_shift_gdn=args.token_shift_gdn,
    token_conv1d_attn=args.token_conv1d_attn,
    token_conv1d_gdn=args.token_conv1d_gdn,
    patch_level_training=args.patch_level_training,
    patch_level_training_size=args.patch_level_training_size,
    nsa_topk=args.nsa_topk,
    nsa_block_size=args.nsa_block_size,
    nsa_window_size=args.nsa_window_size,
    cca_seq_kernel_size=args.cca_seq_kernel_size,
    head_dim=args.head_dim,
    head_dim_gdn=args.head_dim_gdn,
    num_attention_heads_gdn=args.n_heads_gdn,
    num_key_value_heads_gdn=args.n_kv_heads_gdn,
    zero_centered_gate=args.zero_centered_gate,
    zero_centered_gate_type=args.zero_centered_gate_type,
    scalable_softmax=args.scalable_softmax,
    mamba_mimo_dim=args.mamba_mimo_dim,
    mamba_ngroups=args.mamba_ngroups,
    resformer=args.resformer,
    gate_type=args.gate_type,
    gate_act=args.gate_act,
    gate_attn=args.gate_attn,
    gate_gdn=args.gate_gdn,
    fused_loss_computation=args.fused_loss_computation,
    qk_norm=args.qk_norm,
    num_attention_heads_indexer=args.num_attention_heads_indexer,
    head_dim_indexer=args.head_dim_indexer,
    dsa_q_lora_rank=args.dsa_q_lora_rank,
    dsa_topk=args.dsa_topk,
    zero_centered_gamma=args.zero_centered_gamma,
    vocab_size=args.vocab_size,
    max_position_embeddings=args.sequence_length,
    use_uscaling=args.use_uscaling,
    hidden_size=args.d_model,
    intermediate_size=args.d_model * args.mlp_expand,
    expand_factor=args.expand_factor,
    layers_config=args.layers_config,
    num_attention_heads=args.n_heads,
    # n_kv_heads == 0 is the training-side sentinel for "no GQA, use n_heads"
    num_key_value_heads=args.n_kv_heads if args.n_kv_heads > 0 else args.n_heads,
    initializer_range=args.init_std,
    softcap_local_attn=args.softcap_local_attn,
    softcap_global_attn=args.softcap_global_attn,
    norm_epsilon=args.eps_rmsnorm,
    use_cache=False,
    sliding_window_size=args.swa_window_size,
    rope_type_global=args.rope_type_global,
    rope_type_local=args.rope_type_local,
    rope_theta_global=args.rope_theta_global,
    rope_theta_local=args.rope_theta_local,
    uscaling_tau=args.uscaling_tau,
    mlp_linking=args.mlp_linking
)

model = DragonForCausalLM.from_pretrained(run_args.load_dir, config=config_hf, torch_dtype=torch.bfloat16)
model = model.cuda()

model = torch.compile(model, dynamic=args.compile_dynamic) if args.compile else model
model.eval()
ctx = torch.amp.autocast(device_type='cuda', dtype=torch.bfloat16)

val_loader.reset()
# Upper bound on full batches in the single validation shard (num_processes == 1 here).
total_steps = (val_loader.shard_ntoks[val_loader.current_shard] - 1) // (B * T * val_loader.num_processes)
pbar = tqdm(total=total_steps, desc="Validating", unit="step")
# fp32 scalar accumulator on device to avoid bf16 precision loss over many steps
val_loss_sum = torch.zeros((), device="cuda", dtype=torch.float32)
n_steps = 0
tok_per_step = B * T

with torch.no_grad():
    while True:
        try:
            inputs, targets, cu, maxlen, position_ids = val_loader.next_batch()
        except StopIteration:
            break
        with ctx:
            step_loss = model(
                input_ids=inputs,
                labels=targets,
                just_loss=True,
                cu_seqlens=cu,
                max_seqlen=maxlen,
                position_ids=position_ids,
            ).loss.detach()
        val_loss_sum += step_loss
        n_steps += 1
        avg = (val_loss_sum / n_steps).item()  # forces a device sync; acceptable at eval pace
        pbar.update(1)
        pbar.set_postfix(avg_loss=f"{avg:.4f}", ppl=f"{np.exp(avg):.2f}")
pbar.close()

assert n_steps > 0, "No batches read from the file; check B/T vs file size."
val_loss = (val_loss_sum / n_steps).item()
print(f"Validation Loss: {val_loss:.6f}. Perplexity: {np.exp(val_loss):.6f} (steps={n_steps}, tokens={n_steps*tok_per_step})")
configuration_dragon.py CHANGED
@@ -92,6 +92,15 @@ class DragonConfig(PretrainedConfig):
92
 
93
  def __init__(
94
  self,
 
 
 
 
 
 
 
 
 
95
  tie_lm_head: bool = False,
96
  mlp_type: str = "simple",
97
  layer_norm_scaling: bool = False,
@@ -103,6 +112,7 @@ class DragonConfig(PretrainedConfig):
103
  mamba3_remove_conv: bool = True,
104
  mamba3_is_A_dd: bool = True,
105
  mamba3_add_trapezoid: bool = True,
 
106
  moe: bool = False,
107
  moe_num_routed_experts: int = 2,
108
  moe_routed_scaling_factor: float = 2.5,
@@ -116,6 +126,7 @@ class DragonConfig(PretrainedConfig):
116
  shrink_qk_da: int = 2,
117
  shrink_qk_gdn: int = 2,
118
  mixer_gn: bool = True,
 
119
  kda_allow_neg_eigval: bool = False,
120
  kda_num_v_heads: Optional[int] = None,
121
  seednorm_wd: bool = True,
@@ -197,6 +208,15 @@ class DragonConfig(PretrainedConfig):
197
  mlp_linking=False,
198
  **kwargs,
199
  ):
 
 
 
 
 
 
 
 
 
200
  self.tie_lm_head = tie_lm_head
201
  self.mlp_type = mlp_type
202
  self.layer_norm_scaling = layer_norm_scaling
@@ -208,6 +228,7 @@ class DragonConfig(PretrainedConfig):
208
  self.mamba3_remove_conv = mamba3_remove_conv
209
  self.mamba3_is_A_dd = mamba3_is_A_dd
210
  self.mamba3_add_trapezoid = mamba3_add_trapezoid
 
211
  self.moe = moe
212
  self.moe_num_routed_experts = moe_num_routed_experts
213
  self.moe_routed_scaling_factor = moe_routed_scaling_factor
@@ -221,6 +242,7 @@ class DragonConfig(PretrainedConfig):
221
  self.shrink_qk_da = shrink_qk_da
222
  self.shrink_qk_gdn = shrink_qk_gdn
223
  self.mixer_gn = mixer_gn
 
224
  self.kda_allow_neg_eigval = kda_allow_neg_eigval
225
  self.kda_num_v_heads = kda_num_v_heads
226
  self.seednorm_wd = seednorm_wd
 
92
 
93
  def __init__(
94
  self,
95
+ reduce_lm_head: int = 0,
96
+ dataset_type: str = "hf",
97
+ vwn: bool = False,
98
+ vwn_m: int = 2,
99
+ vwn_n: int = 3,
100
+ vwn_wd_alpha_beta: bool = False,
101
+ vwn_dynamic: bool = True,
102
+ legacy_gate: bool = False,
103
+ init_gpt2: bool = False,
104
  tie_lm_head: bool = False,
105
  mlp_type: str = "simple",
106
  layer_norm_scaling: bool = False,
 
112
  mamba3_remove_conv: bool = True,
113
  mamba3_is_A_dd: bool = True,
114
  mamba3_add_trapezoid: bool = True,
115
+ mamba3_postgate_norm: bool = False,
116
  moe: bool = False,
117
  moe_num_routed_experts: int = 2,
118
  moe_routed_scaling_factor: float = 2.5,
 
126
  shrink_qk_da: int = 2,
127
  shrink_qk_gdn: int = 2,
128
  mixer_gn: bool = True,
129
+ gate_before_norm: bool = True,
130
  kda_allow_neg_eigval: bool = False,
131
  kda_num_v_heads: Optional[int] = None,
132
  seednorm_wd: bool = True,
 
208
  mlp_linking=False,
209
  **kwargs,
210
  ):
211
+ self.reduce_lm_head = reduce_lm_head
212
+ self.dataset_type = dataset_type
213
+ self.vwn = vwn
214
+ self.vwn_m = vwn_m
215
+ self.vwn_n = vwn_n
216
+ self.vwn_wd_alpha_beta = vwn_wd_alpha_beta
217
+ self.vwn_dynamic = vwn_dynamic
218
+ self.legacy_gate = legacy_gate
219
+ self.init_gpt2 = init_gpt2
220
  self.tie_lm_head = tie_lm_head
221
  self.mlp_type = mlp_type
222
  self.layer_norm_scaling = layer_norm_scaling
 
228
  self.mamba3_remove_conv = mamba3_remove_conv
229
  self.mamba3_is_A_dd = mamba3_is_A_dd
230
  self.mamba3_add_trapezoid = mamba3_add_trapezoid
231
+ self.mamba3_postgate_norm = mamba3_postgate_norm
232
  self.moe = moe
233
  self.moe_num_routed_experts = moe_num_routed_experts
234
  self.moe_routed_scaling_factor = moe_routed_scaling_factor
 
242
  self.shrink_qk_da = shrink_qk_da
243
  self.shrink_qk_gdn = shrink_qk_gdn
244
  self.mixer_gn = mixer_gn
245
+ self.gate_before_norm = gate_before_norm
246
  self.kda_allow_neg_eigval = kda_allow_neg_eigval
247
  self.kda_num_v_heads = kda_num_v_heads
248
  self.seednorm_wd = seednorm_wd
modeling_dragon.py CHANGED
@@ -21,6 +21,11 @@ from fla.ops.nsa.parallel import parallel_nsa
21
 
22
  from flash_attn.modules.mlp import GatedMlp
23
 
 
 
 
 
 
24
  try:
25
  from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined
26
  except ImportError:
@@ -54,6 +59,8 @@ try:
54
  except ImportError:
55
  chunk_kda, fused_recurrent_kda, fused_kda_gate, prepare_sequence_ids = None, None, None, None
56
 
 
 
57
  from torch.compiler import disable
58
 
59
  logger = logging.get_logger(__name__)
@@ -268,6 +275,13 @@ class DragonLinear(nn.Linear):
268
  out = super().forward(x)
269
  return ScaledGrad.apply(out, self.alpha_fwd, self.alpha_bwd)
270
 
 
 
 
 
 
 
 
271
  class HybridDragonDynamicCache(DynamicCache):
272
  """
273
  A dynamic cache that handle both the attention cache (which has a seq_len dimension) and the GDN cache
@@ -299,6 +313,10 @@ class HybridDragonDynamicCache(DynamicCache):
299
  self.q_conv_caches = []
300
  self.k_conv_caches = []
301
  self.v_conv_caches = []
 
 
 
 
302
 
303
  for idx, layer_type in enumerate(config.layers_config):
304
  if not layer_type == "r":
@@ -313,6 +331,8 @@ class HybridDragonDynamicCache(DynamicCache):
313
  self.q_conv_caches.append(None)
314
  self.k_conv_caches.append(None)
315
  self.v_conv_caches.append(None)
 
 
316
 
317
  self.window_size = config.sliding_window_size
318
  self.layers_config = config.layers_config
@@ -359,6 +379,15 @@ class HybridDragonDynamicCache(DynamicCache):
359
 
360
  def set_prev_hidden(self, layer_idx, h):
361
  self.cca_prev_hidden[layer_idx] = h
 
 
 
 
 
 
 
 
 
362
 
363
  # kv shift
364
  def get_last_kv(self, layer_idx):
@@ -568,6 +597,7 @@ class DragonAttention(nn.Module):
568
 
569
  projection_dim = self.head_dim * (self.num_attention_heads + 2 * (0 if reuse_kv else self.num_key_value_heads))
570
  self.linear_qkv = DragonLinear(config, config.hidden_size, projection_dim, bias=False)
 
571
 
572
  if self.config.token_shift_attn:
573
  if self.config.scalar_proj_as_hidden_matrix:
@@ -755,6 +785,187 @@ class DragonAttention(nn.Module):
755
 
756
  return attn_output, last_key_states, last_value_states
757
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
758
  class DragonTensorProductAttention(nn.Module):
759
  """
760
  Multi-headed attention from 'Attention Is All You Need' paper.
@@ -785,6 +996,8 @@ class DragonTensorProductAttention(nn.Module):
785
  self.W_A_v = DragonLinear(config, self.hidden_size, self.num_attention_heads * self.rank, bias=False)
786
  self.W_B_k = DragonLinear(config, self.hidden_size, self.rank * self.head_dim, bias=False)
787
  self.W_B_v = DragonLinear(config, self.hidden_size, self.rank * self.head_dim, bias=False)
 
 
788
 
789
  if self.config.token_shift_attn:
790
  if self.config.scalar_proj_as_hidden_matrix:
@@ -1156,6 +1369,246 @@ class DragonCompressedConvolutionalAttention(nn.Module):
1156
 
1157
  return attn_output, None, None
1158
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1159
  class DragonNativeSparseAttention(nn.Module):
1160
  """
1161
  Multi-headed attention from 'Attention Is All You Need' paper.
@@ -1696,6 +2149,7 @@ class DragonDifferentialAttention(nn.Module):
1696
 
1697
  projection_dim = self.head_qk_dim * self.num_attention_heads + self.head_qk_dim * self.num_key_value_heads + (self.head_v_dim * self.num_noise_heads//2)
1698
  self.linear_qkv = DragonLinear(config, config.hidden_size, projection_dim, bias=False)
 
1699
 
1700
  if self.config.token_shift_attn:
1701
  if self.config.scalar_proj_as_hidden_matrix:
@@ -2373,6 +2827,8 @@ class DragonDifferentialTensorProductAttention(nn.Module):
2373
  self.W_A_v = DragonLinear(config, self.hidden_size, self.num_noise_heads * self.rank, bias=False)
2374
  self.W_B_k = DragonLinear(config, self.hidden_size, self.rank * self.head_qk_dim, bias=False)
2375
  self.W_B_v = DragonLinear(config, self.hidden_size, self.rank * self.head_v_dim, bias=False)
 
 
2376
 
2377
  if self.config.token_shift_attn:
2378
  if self.config.scalar_proj_as_hidden_matrix:
@@ -3161,12 +3617,29 @@ class DragonGatedDeltaNet(nn.Module):
3161
  self.num_attention_heads*self.dk + self.n_kv_heads*self.dk + self.n_kv_heads*self.dv,
3162
  bias=False
3163
  )
 
3164
  self.linear_ba = DragonLinear(
3165
  config, config.hidden_size,
3166
  self.num_attention_heads + self.num_attention_heads, #+ self.num_attention_heads*self.dv, # b(H), a(H), g(H*dv)
3167
  bias=False
3168
  )
3169
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3170
  dt_min = config.time_step_min
3171
  dt_max = config.time_step_max
3172
  dt_init_floor = config.time_step_floor
@@ -3181,11 +3654,13 @@ class DragonGatedDeltaNet(nn.Module):
3181
  inv_dt = dt + torch.log(-torch.expm1(-dt))
3182
  with torch.no_grad():
3183
  self.dt_bias = nn.Parameter(inv_dt)
 
3184
 
3185
  assert A_init_range[0] > 0 and A_init_range[1] >= A_init_range[0]
3186
  A = torch.empty(self.n_heads_local, dtype=torch.float32).uniform_(*A_init_range)
3187
  A_log = torch.log(A) # Keep A_log in fp32
3188
  self.A_log = nn.Parameter(A_log)
 
3189
 
3190
  if self.config.rope_gdn == "rope":
3191
  self.rope_proj = DragonLinear(config, config.hidden_size, self.dk//4, bias=False)
@@ -3348,6 +3823,11 @@ class DragonGatedDeltaNet(nn.Module):
3348
  use_qk_l2norm_in_kernel=True
3349
  ) # (B L H dv)
3350
 
 
 
 
 
 
3351
  # update GDN cache
3352
  if cache_params is not None:
3353
  cache_params.ssm_caches[self.layer_idx] = ssm_cache
@@ -3381,6 +3861,9 @@ class DragonKimiDeltaAttention(nn.Module):
3381
  self.q_proj = DragonLinear(config, config.hidden_size, self.key_dim, bias=False)
3382
  self.k_proj = DragonLinear(config, config.hidden_size, self.key_dim, bias=False)
3383
  self.v_proj = DragonLinear(config, config.hidden_size, self.value_dim, bias=False)
 
 
 
3384
 
3385
  self.q_conv1d = ShortConvolution(
3386
  hidden_size=self.key_dim,
@@ -3413,10 +3896,21 @@ class DragonKimiDeltaAttention(nn.Module):
3413
  self.A_log = nn.Parameter(torch.log(torch.empty(self.num_q_heads, dtype=torch.float32).uniform_(1, 16)))
3414
  self.dt_bias = nn.Parameter(torch.zeros(self.key_dim, dtype=torch.float32))
3415
 
3416
- """self.g_proj = nn.Sequential(
3417
- DragonLinear(config, config.hidden_size, self.head_v_dim, bias=False),
3418
- DragonLinear(config, self.head_v_dim, self.value_dim, bias=True),
3419
- )"""
 
 
 
 
 
 
 
 
 
 
 
3420
 
3421
  @disable
3422
  def _kda_gate_call(self, g, A_log, head_k_dim, g_bias):
@@ -3427,6 +3921,7 @@ class DragonKimiDeltaAttention(nn.Module):
3427
  hidden_states: torch.Tensor,
3428
  position_embeddings: tuple[torch.Tensor, torch.Tensor],
3429
  cache_params: Optional[HybridDragonDynamicCache] = None,
 
3430
  **kwargs,
3431
  ):
3432
  _, q_len, _ = hidden_states.shape
@@ -3443,20 +3938,26 @@ class DragonKimiDeltaAttention(nn.Module):
3443
  conv_state_k = cache_params.k_conv_caches[self.layer_idx]
3444
  conv_state_v = cache_params.v_conv_caches[self.layer_idx]
3445
 
 
 
 
3446
  q, conv_state_q = self.q_conv1d(
3447
  x=self.q_proj(hidden_states),
3448
  cache=conv_state_q,
3449
  output_final_state=cache_params is not None,
 
3450
  )
3451
  k, conv_state_k = self.k_conv1d(
3452
  x=self.k_proj(hidden_states),
3453
  cache=conv_state_k,
3454
  output_final_state=cache_params is not None,
 
3455
  )
3456
  v, conv_state_v = self.v_conv1d(
3457
  x=self.v_proj(hidden_states),
3458
  cache=conv_state_v,
3459
  output_final_state=cache_params is not None,
 
3460
  )
3461
 
3462
  g = self.f_proj(hidden_states)
@@ -3482,6 +3983,7 @@ class DragonKimiDeltaAttention(nn.Module):
3482
  initial_state=None,
3483
  output_final_state=cache_params is not None,
3484
  use_qk_l2norm_in_kernel=True,
 
3485
  )
3486
  elif mode == 'fused_recurrent':
3487
  o, ssm_cache = fused_recurrent_kda(
@@ -3500,6 +4002,11 @@ class DragonKimiDeltaAttention(nn.Module):
3500
  #o = o * F.silu(rearrange(self.g_proj(hidden_states), '... (h d) -> ... h d', d=self.head_v_dim))
3501
  # TODO: other types of gates? as well as ZCG?
3502
 
 
 
 
 
 
3503
  if cache_params is not None:
3504
  cache_params.ssm_caches[self.layer_idx] = ssm_cache
3505
  cache_params.q_conv_caches[self.layer_idx] = conv_state_q
@@ -3549,8 +4056,8 @@ class DragonMamba3(nn.Module):
3549
  if config.mamba3_rope:
3550
  self.rope_proj = DragonLinear(config, self.d_model, self.num_rope_angles, bias=False)
3551
 
3552
- # Order: [z, x, B, C, dt]
3553
- d_in_proj = 2 * self.d_inner + 2 * self.d_state * self.ngroups + self.nheads
3554
 
3555
  if self.config.mamba3_is_A_dd:
3556
  self.A_proj = DragonLinear(config, self.d_model, self.nheads, bias=False, dtype=torch.float32)
@@ -3575,6 +4082,7 @@ class DragonMamba3(nn.Module):
3575
  self.dt_bias._no_weight_decay = True
3576
 
3577
  self.in_proj = DragonLinear(config, self.d_model, d_in_proj, bias=self.bias)
 
3578
 
3579
  self.B_bias, self.C_bias = None, None
3580
  if not config.mamba3_remove_BC_bias:
@@ -3604,18 +4112,36 @@ class DragonMamba3(nn.Module):
3604
  self.D = nn.Parameter(torch.ones(self.nheads))
3605
  self.D._no_weight_decay = True
3606
 
3607
- def forward(
 
 
 
 
 
 
 
 
 
 
3608
  self,
3609
  hidden_states: torch.Tensor,
3610
  cache_params: Optional[HybridDragonDynamicCache] = None,
 
3611
  **kwargs
3612
  ):
 
 
 
 
 
 
 
 
3613
  # Apply in_proj
3614
- zxBCdt = self.in_proj(hidden_states)
3615
- z, xBC, dd_dt = torch.split(
3616
- zxBCdt,
3617
  [
3618
- self.d_inner,
3619
  self.d_inner + 2 * self.d_state * self.ngroups,
3620
  self.nheads,
3621
  ],
@@ -3628,12 +4154,17 @@ class DragonMamba3(nn.Module):
3628
  _A = -torch.exp(self.A_log).unsqueeze(0).unsqueeze(0)
3629
  dt = F.softplus(dd_dt + self.dt_bias) # (B, L, N)
3630
 
 
 
 
 
3631
  if not self.config.mamba3_remove_conv:
3632
  xBC = causal_conv1d_fn(
3633
  x=xBC.transpose(1, 2),
3634
  weight=rearrange(self.conv1d.weight, "d 1 w -> d w"),
3635
  bias=self.conv1d.bias,
3636
  activation=self.activation,
 
3637
  ).transpose(1, 2) # (B, L, self.d_inner + 2 * ngroups * d_state)
3638
 
3639
  x, B, C = torch.split(
@@ -3699,10 +4230,6 @@ class DragonMamba3(nn.Module):
3699
 
3700
  x_scalar = (gamma_arr*_alpha_arr).to(torch.bfloat16)
3701
 
3702
- ssm_cache = None
3703
- if cache_params is not None:
3704
- ssm_cache = cache_params.ssm_caches[self.layer_idx]
3705
-
3706
  out = mamba_chunk_scan_discretized_combined(
3707
  x=x.bfloat16(),
3708
  A=A,
@@ -3714,19 +4241,26 @@ class DragonMamba3(nn.Module):
3714
  CB_sum=CB_sum,
3715
  D=self.D,
3716
  z=None,
3717
- initial_states=ssm_cache,
3718
- return_final_states=cache_params is not None,
 
3719
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
3720
 
3721
- if cache_params is not None:
3722
- y, ssm_cache = out
3723
- cache_params.ssm_caches[self.layer_idx] = ssm_cache
3724
- else:
3725
- y = out
3726
-
3727
- y = rearrange(y, "b l h p -> b l (h p)")
3728
- y = y*self.act(z)
3729
- y = rearrange(y, "b l (h p) -> b l h p", h=self.nheads).to(x.dtype)
3730
 
3731
  return y, None, None
3732
 
@@ -3747,6 +4281,7 @@ class DragonMamba2(nn.Module):
3747
  # Order: [x, B, C, dt]
3748
  d_in_proj = self.d_inner + 2 * self.ngroups * self.d_state + self.nheads
3749
  self.in_proj = DragonLinear(config, self.d_model, d_in_proj, bias=False)
 
3750
 
3751
  if not self.config.mamba3_remove_conv:
3752
  conv_dim = self.d_inner + 2 * self.ngroups * self.d_state
@@ -3784,6 +4319,15 @@ class DragonMamba2(nn.Module):
3784
  self.D = nn.Parameter(torch.ones(self.nheads))
3785
  self.D._no_weight_decay = True
3786
 
 
 
 
 
 
 
 
 
 
3787
  def forward(self, hidden_states, **kwargs):
3788
  """
3789
  u: (B, L, D)
@@ -3830,6 +4374,12 @@ class DragonMamba2(nn.Module):
3830
  initial_states=None,
3831
  )
3832
 
 
 
 
 
 
 
3833
  return y, None, None
3834
 
3835
  class DragonMamba3Mimo(nn.Module):
@@ -3844,11 +4394,13 @@ class DragonMamba3Mimo(nn.Module):
3844
  "when creating this class."
3845
  )
3846
 
 
 
3847
  self.d_model = config.hidden_size
3848
- self.d_state = 64
3849
  self.conv_init = None
3850
  self.expand = 2
3851
- self.headdim = 128
3852
  self.ngroups = config.mamba_ngroups
3853
  self.activation = "swish"
3854
  self.bias = False
@@ -3863,14 +4415,12 @@ class DragonMamba3Mimo(nn.Module):
3863
  self.dt_init_floor = 1e-4
3864
  self.mimo_dim = config.mamba_mimo_dim
3865
  self.mimo_proj_block_order = 1
3866
-
3867
 
3868
  self.d_inner = int(self.expand * self.d_model)
3869
  assert self.d_inner % self.headdim == 0
3870
  self.nheads = self.d_inner // self.headdim
3871
  self.dr_out_dim = self.d_inner // self.mimo_proj_block_order
3872
 
3873
-
3874
  self.split_tensor_size = int(self.d_state * self.rope_fraction)
3875
  if self.split_tensor_size % 2 != 0:
3876
  self.split_tensor_size -= 1
@@ -3896,6 +4446,7 @@ class DragonMamba3Mimo(nn.Module):
3896
  self.dt_bias._no_weight_decay = True
3897
 
3898
  self.in_proj = DragonLinear(config, self.d_model, d_in_proj, bias=self.bias)
 
3899
 
3900
  self.B_bias = nn.Parameter(torch.ones((self.mimo_dim, self.nheads, self.d_state)), requires_grad=True)
3901
  self.C_bias = nn.Parameter(torch.ones((self.mimo_dim, self.nheads, self.d_state)), requires_grad=True)
@@ -3927,11 +4478,14 @@ class DragonMamba3Mimo(nn.Module):
3927
  self.in_proj_mimo_z = nn.Parameter(in_proj_mimo_z_init_weights, requires_grad=True)
3928
  self.out_proj_mimo = nn.Parameter(out_proj_mimo_init_weights, requires_grad=True)
3929
 
3930
-
3931
  # D "skip" parameter
3932
  self.D = nn.Parameter(torch.ones(self.nheads))
3933
  self.D._no_weight_decay = True
3934
 
 
 
 
 
3935
  def forward(self, hidden_states, **kwargs):
3936
  # Apply in_proj
3937
  zxBCdt = self.in_proj(hidden_states)
@@ -4024,7 +4578,7 @@ class DragonMamba3Mimo(nn.Module):
4024
  _beta_arr = torch.roll(beta_arr, shifts=-1, dims=1)
4025
 
4026
  x_scalar = (gamma_arr*_alpha_arr + _beta_arr).to(torch.bfloat16)
4027
-
4028
  z = rearrange(z, "b l r (h p) -> b l r h p", p=self.headdim)
4029
 
4030
  y = mamba_mimo_chunk_scan_discretized_fused_combined(
@@ -4037,10 +4591,15 @@ class DragonMamba3Mimo(nn.Module):
4037
  gamma=gamma_arr,
4038
  CB_sum=CB_sum,
4039
  D=self.D,
4040
- z=z,
4041
  )
4042
 
4043
  y = rearrange(y, "b l r h p -> b l r (h p)")
 
 
 
 
 
4044
  #if seqlen_og is not None:
4045
  # y = rearrange(y, "b l r d -> (b l) r d")
4046
 
@@ -4067,7 +4626,9 @@ class DragonMLP(nn.Module):
4067
  self.lambda1 = nn.Parameter(torch.zeros(self.link_size)) # sigmoid->0.5
4068
  else :
4069
  self.fc_1 = DragonLinear(config, config.hidden_size, intermediate_size, bias=False)
 
4070
  self.fc_2 = DragonLinear(config, intermediate_size, config.hidden_size, bias=False)
 
4071
  self.register_buffer("_2_sqrt_5", torch.tensor(2/math.sqrt(5)) if config.use_uscaling else torch.tensor(1.), persistent=False)
4072
 
4073
  def forward(self, hidden_states):
@@ -4096,7 +4657,9 @@ class DragonGatedMLP(nn.Module):
4096
  self.intermediate_size = intermediate_size
4097
 
4098
  self.fc_1 = DragonLinear(config, config.hidden_size, num_active_experts*self.intermediate_size, bias=False)
 
4099
  self.fc_2 = DragonLinear(config, num_active_experts*self.intermediate_size, config.hidden_size, bias=False)
 
4100
  self.register_buffer("_2_sqrt_5", torch.tensor(2/math.sqrt(5)) if config.use_uscaling else torch.tensor(1.), persistent=False)
4101
 
4102
  def forward(self, hidden_states, gates):
@@ -4174,6 +4737,11 @@ class DragonMonoBlock(GradientCheckpointingLayer):
4174
  head_dim = self.mixer.head_dim
4175
  num_attention_heads = self.mixer.num_q_heads
4176
  use_gate = config.gate_attn
 
 
 
 
 
4177
  elif layer_type == 'n':
4178
  self.mixer = DragonNativeSparseAttention(config, reuse_kv=False, layer_idx=layer_idx)
4179
  head_dim = self.mixer.head_dim
@@ -4203,7 +4771,7 @@ class DragonMonoBlock(GradientCheckpointingLayer):
4203
  self.mixer = DragonMamba3(config, layer_idx=layer_idx)
4204
  head_dim = self.mixer.headdim
4205
  num_attention_heads = self.mixer.nheads
4206
- use_gate = False
4207
  elif layer_type == '2':
4208
  self.mixer = DragonMamba2(config, layer_idx=layer_idx)
4209
  head_dim = self.mixer.headdim
@@ -4214,6 +4782,11 @@ class DragonMonoBlock(GradientCheckpointingLayer):
4214
  head_dim = self.mixer.headdim
4215
  num_attention_heads = self.mixer.nheads
4216
  use_gate = False # inside Mamba3Mimo
 
 
 
 
 
4217
  else:
4218
  raise ValueError(f"Unknown layer type: {layer_type}")
4219
 
@@ -4233,6 +4806,7 @@ class DragonMonoBlock(GradientCheckpointingLayer):
4233
  self.gate_proj.is_scalar_weight = True
4234
  else:
4235
  raise ValueError(f"Unknown gate_type: {self.config.gate_type}")
 
4236
  if self.config.zero_centered_gate:
4237
  val = 1.
4238
  if self.config.zero_centered_gate_type==3:
@@ -4253,6 +4827,7 @@ class DragonMonoBlock(GradientCheckpointingLayer):
4253
  self.use_gate = use_gate
4254
 
4255
  self.mixer_proj = DragonLinear(config, head_dim*num_attention_heads, config.hidden_size, bias=False)
 
4256
  if config.mixer_gn:
4257
  self.mixer_group_norm = DragonHeadWiseRMSNorm(n_heads=num_attention_heads, d_head=head_dim, eps=config.norm_epsilon, zero_centered_gamma=config.zero_centered_gamma)
4258
 
@@ -4299,6 +4874,8 @@ class DragonMonoBlock(GradientCheckpointingLayer):
4299
  cu_seqlens=cu_seqlens,
4300
  max_seqlen=max_seqlen,
4301
  ) # (B, L, E*D)
 
 
4302
  if self.use_gate:
4303
  if self.config.gate_type == "elementwise" or self.config.gate_type == "kimi":
4304
  g_proj = self.gate_proj(hidden_states).view(hidden_states.size(0), hidden_states.size(1), self.num_attention_heads, self.head_dim).to(y_mixer.dtype)
@@ -4313,7 +4890,7 @@ class DragonMonoBlock(GradientCheckpointingLayer):
4313
  y_mixer = y_mixer * (self.gate_act(g_proj) + self.gate_bias)
4314
  elif self.config.zero_centered_gate_type == 3 or self.config.zero_centered_gate_type == 4:
4315
  y_mixer = y_mixer * self.gate_act(g_proj + self.gate_bias)
4316
- if self.config.mixer_gn:
4317
  y_mixer = self.mixer_group_norm(y_mixer)
4318
  y_mixer = y_mixer.view(y_mixer.size(0), y_mixer.size(1), -1)
4319
  y_mixer = self.mixer_proj(y_mixer)
@@ -4327,6 +4904,282 @@ class DragonMonoBlock(GradientCheckpointingLayer):
4327
 
4328
  return hidden_states, last_key_states, last_value_states
4329
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4330
  class DragonBlock(GradientCheckpointingLayer):
4331
  def __init__(self, config: DragonConfig, layer_idx: int, layer_type: str):
4332
  super().__init__()
@@ -4412,13 +5265,13 @@ class DragonPreTrainedModel(PreTrainedModel):
4412
  "attentions": DragonBlock,
4413
  }
4414
 
4415
- def _init_weights(self, module): # TODO: ??
4416
  if isinstance(module, (DragonLinear, nn.Conv1d)):
4417
  if module.bias is not None:
4418
  nn.init.zeros_(module.bias)
4419
- nn.init.normal_(module.weight, mean=0., std=1. if self.config.use_uscaling else 0.006)
4420
  elif isinstance(module, nn.Embedding):
4421
- nn.init.normal_(module.weight, mean=0., std=1. if self.config.use_uscaling else 0.006)
4422
 
4423
  @dataclass
4424
  class DragonOutput(ModelOutput):
@@ -4473,19 +5326,31 @@ class DragonModel(DragonPreTrainedModel):
4473
  self.vocab_size = config.vocab_size
4474
 
4475
  self.embedding = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
4476
- self.layers = nn.ModuleList([DragonBlock(config, layer_idx=i, layer_type=layer) if layer in ['l', 'r', 'd'] else DragonMonoBlock(config, layer_idx=i, layer_type=layer) for i, layer in enumerate(config.layers_config)])
 
 
 
 
 
 
 
4477
 
4478
  if self.config.rope_type_global != '' or self.config.rope_type_local != '':
4479
  self.rotary_emb = DragonRotaryEmbedding(config, head_dim=config.head_dim if config.head_dim else (config.expand_factor*config.hidden_size)//config.num_attention_heads, theta=config.rope_theta_local) # only for SWA
4480
  else:
4481
  self.rotary_emb = None
4482
 
 
 
 
 
 
4483
  if self.config.final_norm:
4484
  self.final_norm = DragonNorm(config, config.hidden_size)
4485
 
4486
  self.gradient_checkpointing = False
4487
  self.post_init()
4488
-
4489
  def get_input_embeddings(self):
4490
  return self.embedding
4491
 
@@ -4514,6 +5379,8 @@ class DragonModel(DragonPreTrainedModel):
4514
 
4515
  if inputs_embeds is None:
4516
  inputs_embeds = self.embedding(input_ids)
 
 
4517
 
4518
  if self.config.patch_level_training:
4519
  # (B, KL, D) => (B, L, D) OR (B, L, D) ==> (B, L//K, D)
@@ -4570,12 +5437,21 @@ class DragonModel(DragonPreTrainedModel):
4570
  )
4571
  shared_kv = (last_k, last_v)
4572
 
 
 
 
 
 
 
4573
  if self.config.final_norm:
4574
  hidden_states = self.final_norm(hidden_states)
4575
 
4576
  if output_hidden_states:
4577
  all_hidden_states = all_hidden_states + (hidden_states,)
4578
 
 
 
 
4579
  return DragonOutput(
4580
  last_hidden_state=hidden_states,
4581
  past_key_values=past_key_values if use_cache else None,
@@ -4589,11 +5465,23 @@ class DragonForCausalLM(DragonPreTrainedModel, GenerationMixin):
4589
  self.config = config
4590
  self.model = DragonModel(config)
4591
  self.vocab_size = config.vocab_size
4592
- self.lm_head = DragonLinear(config, config.hidden_size, config.vocab_size, bias=False, alpha_fwd=1/config.hidden_size, alpha_bwd=1/math.sqrt(config.hidden_size))
 
 
 
 
 
 
 
4593
  self.post_init()
4594
  if config.tie_lm_head:
4595
  self.lm_head.weight = self.model.embedding.weight
4596
 
 
 
 
 
 
4597
  def forward(
4598
  self,
4599
  input_ids: Optional[torch.LongTensor] = None,
@@ -4639,7 +5527,10 @@ class DragonForCausalLM(DragonPreTrainedModel, GenerationMixin):
4639
  labels = labels.to(hidden_states.device)
4640
 
4641
  if linear_cross_entropy is None or not self.config.fused_loss_computation:
4642
- logits = self.lm_head(hidden_states.to(self.lm_head.weight.dtype)[:, slice_indices, :]).float()
 
 
 
4643
  if not self.config.patch_level_training:
4644
  shift_logits = logits[..., :-1, :].contiguous()
4645
  shift_labels = labels[..., 1:].contiguous()
@@ -4653,6 +5544,7 @@ class DragonForCausalLM(DragonPreTrainedModel, GenerationMixin):
4653
  loss = loss + F.nll_loss(log_probs, shift_labels[:, i])
4654
  loss = loss / self.config.patch_level_training_size
4655
  else:
 
4656
  assert not self.config.patch_level_training, "Fused loss computation is not supported with patch-level training."
4657
  loss = linear_cross_entropy(
4658
  hidden_states[:, slice_indices, :].view(-1, hidden_states.size(-1)),
 
21
 
22
  from flash_attn.modules.mlp import GatedMlp
23
 
24
+ try:
25
+ import flash_moba
26
+ except ImportError:
27
+ flash_moba = None
28
+
29
  try:
30
  from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined
31
  except ImportError:
 
59
  except ImportError:
60
  chunk_kda, fused_recurrent_kda, fused_kda_gate, prepare_sequence_ids = None, None, None, None
61
 
62
+ from mamba_ssm.ops.triton.layernorm_gated import RMSNorm as RMSNormGated
63
+
64
  from torch.compiler import disable
65
 
66
  logger = logging.get_logger(__name__)
 
275
  out = super().forward(x)
276
  return ScaledGrad.apply(out, self.alpha_fwd, self.alpha_bwd)
277
 
278
+ class DragonScale(nn.Module):
279
+ def __init__(self, s: float):
280
+ super().__init__()
281
+ self.s = s
282
+ def forward(self, x):
283
+ return x * self.s
284
+
285
  class HybridDragonDynamicCache(DynamicCache):
286
  """
287
  A dynamic cache that handle both the attention cache (which has a seq_len dimension) and the GDN cache
 
313
  self.q_conv_caches = []
314
  self.k_conv_caches = []
315
  self.v_conv_caches = []
316
+ # cca v2
317
+ self.conv_states = []
318
+ self.prev_hs = []
319
+ self.has_previous_state = False
320
 
321
  for idx, layer_type in enumerate(config.layers_config):
322
  if not layer_type == "r":
 
331
  self.q_conv_caches.append(None)
332
  self.k_conv_caches.append(None)
333
  self.v_conv_caches.append(None)
334
+ self.conv_states.append(None)
335
+ self.prev_hs.append(None)
336
 
337
  self.window_size = config.sliding_window_size
338
  self.layers_config = config.layers_config
 
379
 
380
  def set_prev_hidden(self, layer_idx, h):
381
  self.cca_prev_hidden[layer_idx] = h
382
+
383
+ # cca v2
384
+ def update_conv_state(self, layer_idx: int, new_conv_state: torch.Tensor) -> torch.Tensor:
385
+ if not self.has_previous_state:
386
+ self.conv_states[layer_idx] = new_conv_state#.to(self.conv_states.device)
387
+ else:
388
+ self.conv_states[layer_idx] = self.conv_states[layer_idx].roll(shifts=-1, dims=-1)
389
+ self.conv_states[layer_idx][:, :, -1] = new_conv_state[:, 0, :]#.to(self.conv_states.device)
390
+ return self.conv_states[layer_idx]
391
 
392
  # kv shift
393
  def get_last_kv(self, layer_idx):
 
597
 
598
  projection_dim = self.head_dim * (self.num_attention_heads + 2 * (0 if reuse_kv else self.num_key_value_heads))
599
  self.linear_qkv = DragonLinear(config, config.hidden_size, projection_dim, bias=False)
600
+ self.linear_qkv.norm_case_1 = True
601
 
602
  if self.config.token_shift_attn:
603
  if self.config.scalar_proj_as_hidden_matrix:
 
785
 
786
  return attn_output, last_key_states, last_value_states
787
 
788
+ class DragonMoBAttention(nn.Module):
789
+ def __init__(self, config: DragonConfig, reuse_kv: bool, layer_idx: Optional[int], **kwargs):
790
+ super().__init__()
791
+ self.config = config
792
+ self.layer_idx = layer_idx
793
+ if layer_idx is None:
794
+ logger.warning_once(
795
+ f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
796
+ "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
797
+ "when creating this class."
798
+ )
799
+ self.num_attention_heads = config.num_attention_heads
800
+ self.num_key_value_heads = config.num_key_value_heads
801
+ self.hidden_size = config.hidden_size
802
+ self.head_dim = config.head_dim # if config.head_dim else config.hidden_size * config.expand_factor // self.num_attention_heads
803
+ self.qk_norm = config.qk_norm
804
+ self.window_size = config.sliding_window_size
805
+ self.block_size = config.nsa_block_size
806
+ self.topk = config.nsa_topk
807
+ self.reuse_kv = reuse_kv
808
+
809
+ projection_dim = self.head_dim * (self.num_attention_heads + 2 * (0 if reuse_kv else self.num_key_value_heads))
810
+ self.linear_qkv = DragonLinear(config, config.hidden_size, projection_dim, bias=False)
811
+ self.linear_qkv.norm_case_1 = True
812
+
813
+ if self.config.token_shift_attn:
814
+ if self.config.scalar_proj_as_hidden_matrix:
815
+ self.shift_proj_k = DragonLinear(config, self.hidden_size, self.num_key_value_heads, bias=False)
816
+ self.shift_proj_v = DragonLinear(config, self.hidden_size, self.num_key_value_heads, bias=False)
817
+ else:
818
+ self.shift_proj_k = DragonLinear(config, self.hidden_size, self.num_key_value_heads, bias=False, alpha_bwd=1., alpha_fwd=1.)
819
+ self.shift_proj_v = DragonLinear(config, self.hidden_size, self.num_key_value_heads, bias=False, alpha_bwd=1., alpha_fwd=1.)
820
+ self.shift_proj_k.is_scalar_weight = True
821
+ self.shift_proj_v.is_scalar_weight = True
822
+
823
+ if self.config.token_conv1d_attn:
824
+ self.conv_size = config.conv_kernel
825
+ self.conv_dim = self.num_attention_heads * self.head_dim + self.num_key_value_heads * self.head_dim + self.num_key_value_heads * self.head_dim
826
+ self.qkv_conv1d = nn.Conv1d(in_channels=self.conv_dim, out_channels=self.conv_dim, bias=False, kernel_size=self.conv_size, groups=self.conv_dim, padding=self.conv_size-1)
827
+ self.causal_conv1d_fn = causal_conv1d_fn
828
+ self.causal_conv1d_update = causal_conv1d_update or torch_causal_conv1d_update
829
+
830
+ if self.qk_norm:
831
+ self.q_norm = DragonNorm(config, self.head_dim)
832
+ if not reuse_kv:
833
+ self.k_norm = DragonNorm(config, self.head_dim)
834
+
835
+ def forward(
836
+ self,
837
+ hidden_states: torch.Tensor,
838
+ position_embeddings: tuple[torch.Tensor, torch.Tensor],
839
+ position_ids: Optional[torch.LongTensor] = None,
840
+ cache_params: Optional[HybridDragonDynamicCache] = None,
841
+ key_value_last_layer: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
842
+ **kwargs,
843
+ ):
844
+ _, q_len, _ = hidden_states.shape
845
+ use_precomputed_states = (cache_params is not None and q_len == 1)
846
+
847
+ # Q, K, V projections.
848
+ if not self.reuse_kv:
849
+ query_states, key_states, value_states = get_query_key_value_tensors(self, hidden_states)
850
+ else:
851
+ query_states = get_query_key_value_tensors(self, hidden_states)
852
+ key_states, value_states = key_value_last_layer
853
+ last_key_states, last_value_states = None, None
854
+
855
+ # token-shift.
856
+ if self.config.token_shift_attn and not self.reuse_kv:
857
+ alpha_k = torch.sigmoid(self.shift_proj_k(hidden_states).float()).float().to(key_states.dtype).unsqueeze(-1) # (B, L, Hkv, 1)
858
+ alpha_v = torch.sigmoid(self.shift_proj_v(hidden_states).float()).float().to(value_states.dtype).unsqueeze(-1) # (B, L, Hkv, 1)
859
+
860
+ if cache_params is not None:
861
+ k_prev, v_prev = cache_params.get_last_kv(self.layer_idx)
862
+ if k_prev is None:
863
+ k_prev, v_prev = torch.zeros_like(key_states[:, :1]), torch.zeros_like(value_states[:, :1])
864
+ cache_params.set_last_kv(self.layer_idx, key_states[:, -1:], value_states[:, -1:])
865
+ else:
866
+ k_prev = F.pad(key_states, (0, 0, 0, 0, 1, 0))[:, :-1] # (B, L, H, D)
867
+ v_prev = F.pad(value_states, (0, 0, 0, 0, 1, 0))[:, :-1] # (B, L, H, D)
868
+
869
+ key_states = alpha_k * k_prev + (1 - alpha_k) * key_states
870
+ value_states = alpha_v * v_prev + (1 - alpha_v) * value_states
871
+
872
+ # conv.
873
+ if self.config.token_conv1d_attn:
874
+ assert not self.reuse_kv, "not supported"
875
+ # --- pack for conv ---
876
+ q_proj = rearrange(query_states, "b l h d -> b l (h d)")
877
+ k_proj = rearrange(key_states, "b l g d -> b l (g d)")
878
+ v_proj = rearrange(value_states, "b l g d -> b l (g d)")
879
+ mixed_qkv = torch.cat([q_proj, k_proj, v_proj], dim=-1).transpose(1, 2) # (B,C,L)
880
+
881
+ if cache_params is not None:
882
+ conv_cache = cache_params.conv_caches[self.layer_idx]
883
+
884
+ if use_precomputed_states:
885
+ mixed_qkv = self.causal_conv1d_update(
886
+ mixed_qkv,
887
+ conv_cache,
888
+ self.qkv_conv1d.weight.squeeze(1),
889
+ self.qkv_conv1d.bias,
890
+ 'silu',
891
+ ) # conv_cache is updated in-place here
892
+ else:
893
+ if cache_params is not None:
894
+ conv_cache = F.pad(mixed_qkv, (self.conv_size - mixed_qkv.shape[-1], 0))
895
+ cache_params.conv_caches[self.layer_idx] = conv_cache
896
+ if self.causal_conv1d_fn is not None:
897
+ mixed_qkv = self.causal_conv1d_fn(
898
+ x=mixed_qkv,
899
+ weight=self.qkv_conv1d.weight.squeeze(1),
900
+ bias=self.qkv_conv1d.bias,
901
+ activation='silu',
902
+ seq_idx=None,
903
+ )
904
+ else:
905
+ mixed_qkv = F.silu(self.qkv_conv1d(mixed_qkv)[:, :, :q_len])
906
+
907
+ # split back
908
+ mixed_qkv = mixed_qkv.transpose(1, 2)
909
+ q_proj, k_proj, v_proj = torch.split(
910
+ mixed_qkv,
911
+ [self.num_attention_heads*self.head_dim, self.num_key_value_heads*self.head_dim, self.num_key_value_heads*self.head_dim],
912
+ dim=-1,
913
+ )
914
+ query_states = rearrange(q_proj, "b l (h d) -> b l h d", h=self.num_attention_heads)
915
+ key_states = rearrange(k_proj, "b l (g d) -> b l g d", g=self.num_key_value_heads)
916
+ value_states = rearrange(v_proj, "b l (g d) -> b l g d", g=self.num_key_value_heads)
917
+
918
+ # QK-norm.
919
+ if self.qk_norm:
920
+ query_states = self.q_norm(query_states)
921
+ if not self.reuse_kv:
922
+ key_states = self.k_norm(key_states)
923
+
924
+ # RoPE.
925
+ if self.config.rope_theta_local > 0.0:
926
+ cos, sin = position_embeddings
927
+ if self.config.rope_type_local == "rope":
928
+ query_states = apply_rotary_emb(query_states, cos, sin)
929
+ if not self.reuse_kv:
930
+ key_states = apply_rotary_emb(key_states, cos, sin)
931
+ elif self.config.rope_type_local == "p-rope":
932
+ query_states = apply_p_rotary_emb(query_states, cos, sin, p=0.5)
933
+ if not self.reuse_kv:
934
+ key_states = apply_p_rotary_emb(key_states, cos, sin)
935
+ else:
936
+ raise ValueError(f"Unknow rope type : {self.config.rope_type_local}")
937
+
938
+ # KV-cache.
939
+ if not self.reuse_kv and cache_params is not None:
940
+ key_states, value_states = cache_params.update(key_states, value_states, self.layer_idx)
941
+
942
+ # save k,v for next layer (*after* norm and RoPE and kv-cache update)
943
+ if not self.reuse_kv:
944
+ last_key_states, last_value_states = key_states, value_states
945
+
946
+ # attention computation.
947
+ B, L, _, _ = query_states.shape
948
+ cu_seqlens = torch.arange(0, (B + 1) * L, step=L, dtype=torch.int32, device=query_states.device)
949
+ attn_output = flash_moba.flash_moba_varlen_func(
950
+ q=query_states.bfloat16().view(B*L, self.num_attention_heads, self.head_dim),
951
+ k=key_states.bfloat16().view(B*L, self.num_key_value_heads, self.head_dim),
952
+ v=value_states.bfloat16().view(B*L, self.num_key_value_heads, self.head_dim),
953
+ cu_seqlens_q=cu_seqlens,
954
+ cu_seqlens_k=cu_seqlens,
955
+ max_seqlen_q=L,
956
+ max_seqlen_k=L,
957
+ moba_chunk_size=self.block_size,
958
+ moba_topk=self.topk,
959
+ causal=True,
960
+ ).view(B, L, self.num_attention_heads, self.head_dim)
961
+ # softmax scale...
962
+ # softcap...
963
+
964
+ #if cache_params is not None and not self.reuse_kv:
965
+ # cache_params.trim(self.layer_idx)
966
+
967
+ return attn_output, last_key_states, last_value_states
968
+
969
  class DragonTensorProductAttention(nn.Module):
970
  """
971
  Multi-headed attention from 'Attention Is All You Need' paper.
 
996
  self.W_A_v = DragonLinear(config, self.hidden_size, self.num_attention_heads * self.rank, bias=False)
997
  self.W_B_k = DragonLinear(config, self.hidden_size, self.rank * self.head_dim, bias=False)
998
  self.W_B_v = DragonLinear(config, self.hidden_size, self.rank * self.head_dim, bias=False)
999
+ self.c_q.norm_case_1 = True
1000
+ # todo : norm others?
1001
 
1002
  if self.config.token_shift_attn:
1003
  if self.config.scalar_proj_as_hidden_matrix:
 
1369
 
1370
  return attn_output, None, None
1371
 
1372
+ class DragonCompressedConvolutionalAttention2(nn.Module):
1373
+ def __init__(self, config: DragonConfig, layer_idx: Optional[int], **kwargs):
1374
+ super().__init__()
1375
+ self.config = config
1376
+ assert layer_idx is not None
1377
+ self.layer_idx = layer_idx
1378
+ if layer_idx is None:
1379
+ logger.warning_once(
1380
+ f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
1381
+ "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
1382
+ "when creating this class."
1383
+ )
1384
+
1385
+ self.hidden_size = config.hidden_size
1386
+ self.window_size = config.sliding_window_size
1387
+
1388
+ self.cca_time0 = 2
1389
+ self.cca_time1 = 2
1390
+ self.padding0 = self.cca_time0 - 1
1391
+ self.padding1 = self.cca_time1 - 1
1392
+ self.total_padding = self.padding0 + self.padding1
1393
+
1394
+ self.num_kv_heads = 5 # config.num_key_value_heads
1395
+ self.num_q_heads = 10 # config.num_attention_heads
1396
+ self.num_heads = config.num_attention_heads
1397
+
1398
+ # Geometry
1399
+ self.head_dim = config.head_dim
1400
+ self.latent_k_dim = self.num_kv_heads * self.head_dim
1401
+ self.latent_q_dim = self.num_q_heads * self.head_dim
1402
+ self.sqrt_head_dim = float(math.sqrt(self.head_dim))
1403
+ self.gqa_groups = self.num_q_heads // self.num_kv_heads
1404
+ assert self.num_q_heads % self.num_kv_heads == 0, "q_heads must be a multiple of k_heads"
1405
+ assert (self.latent_k_dim + self.latent_q_dim) == (self.num_kv_heads + self.num_q_heads) * self.head_dim
1406
+
1407
+ # Projections
1408
+ self.linear_q = nn.Linear(self.hidden_size, self.latent_q_dim, bias=self.config.attention_bias)
1409
+ self.linear_k = nn.Linear(self.hidden_size, self.latent_k_dim, bias=self.config.attention_bias)
1410
+ self.val_proj1 = nn.Linear(self.hidden_size, self.latent_k_dim // 2, bias=self.config.attention_bias)
1411
+ self.val_proj2 = nn.Linear(self.hidden_size, self.latent_k_dim // 2, bias=self.config.attention_bias)
1412
+
1413
+ # Depthwise + grouped conv along sequence
1414
+ in_out_ch = self.latent_k_dim + self.latent_q_dim
1415
+ self.conv_qk = nn.Sequential(
1416
+ nn.Conv1d(
1417
+ in_channels=in_out_ch,
1418
+ out_channels=in_out_ch,
1419
+ kernel_size=self.cca_time0,
1420
+ groups=in_out_ch,
1421
+ padding=0,
1422
+ stride=1,
1423
+ ),
1424
+ nn.Conv1d(
1425
+ in_channels=in_out_ch,
1426
+ out_channels=in_out_ch,
1427
+ kernel_size=self.cca_time1,
1428
+ groups=(self.num_kv_heads + self.num_q_heads),
1429
+ padding=0,
1430
+ stride=1,
1431
+ ),
1432
+ )
1433
+
1434
+ # Per-k head temperature
1435
+ self.temp = nn.Parameter(torch.zeros(self.num_kv_heads))
1436
+
1437
+ def forward(
1438
+ self,
1439
+ hidden_states: torch.Tensor,
1440
+ position_embeddings: tuple[torch.Tensor, torch.Tensor],
1441
+ cache_params: Optional[HybridDragonDynamicCache],
1442
+ **kwargs,
1443
+ ):
1444
+ """
1445
+ hidden_states: [B, S, E] (HF layout)
1446
+ returns:
1447
+ query: [B, S, num_q_heads*head_dim]
1448
+ key : [B, S, num_k_heads*head_dim]
1449
+ value: [B, S, num_k_heads*head_dim]
1450
+ """
1451
+
1452
+ past_key_values = cache_params
1453
+ batch_size, seq_length, _ = hidden_states.shape
1454
+
1455
+ # ---- Switch to [S, B, H] ----
1456
+ hs = hidden_states.transpose(0, 1).contiguous() # [S, B, H]
1457
+ # Time-shifted stream for v2 (pad one at the front along sequence)
1458
+ hs_d = F.pad(hs[:-1], pad=(0, 0, 0, 0, 1, 0)) # [S, B, H]
1459
+
1460
+ # Q/K in the full space
1461
+ q = self.linear_q(hs) # [S, B, latent_q_dim]
1462
+ k = self.linear_k(hs) # [S, B, latent_k_dim]
1463
+ qk_packed0 = torch.cat([q, k], dim=-1) # [S, B, latent_q + latent_k]
1464
+
1465
+ # Pre-mean tensors in head form (for "qk_mean_{q,k}" calc)
1466
+ query_pre = qk_packed0[..., : self.latent_q_dim].view(
1467
+ *qk_packed0.shape[:2], self.num_q_heads, self.head_dim
1468
+ ) # [S, B, qh, dh]
1469
+
1470
+ key_pre = qk_packed0[..., self.latent_q_dim :].view(
1471
+ *qk_packed0.shape[:2], self.num_kv_heads, self.head_dim
1472
+ ) # [S, B, kh, dh]
1473
+ key_pre = (
1474
+ key_pre.unsqueeze(-2)
1475
+ .repeat(1, 1, 1, self.gqa_groups, 1)
1476
+ .view(*qk_packed0.shape[:2], self.num_q_heads, self.head_dim)
1477
+ ) # [S, B, qh, dh]
1478
+
1479
+ # Means for residual mixing
1480
+ qk_mean_q = (query_pre + key_pre) / 2
1481
+ qk_mean_k = qk_mean_q.view(*qk_mean_q.shape[:2], self.num_kv_heads, self.gqa_groups, -1).mean(dim=-2)
1482
+
1483
+ if past_key_values is not None:
1484
+ if past_key_values.has_previous_state:
1485
+ # Generation
1486
+ qk_packed0 = qk_packed0.transpose(0, 1) # [B, 1, H]
1487
+ qk_packed0_cached = past_key_values.conv_states[self.layer_idx] # [B, H, 2]
1488
+ qk_packed0_cat = torch.cat([qk_packed0_cached, qk_packed0.transpose(1, 2)], dim=-1) # [B, H, 3]
1489
+ qk_packed3 = self.conv_qk(qk_packed0_cat).permute(2, 0, 1) # [S, B, E]
1490
+ qk_packed0_cache = past_key_values.update_conv_state(
1491
+ layer_idx=self.layer_idx, new_conv_state=qk_packed0
1492
+ ) # [B, H, 2]
1493
+
1494
+ else:
1495
+ # Prefill
1496
+ qk_packed0_transposed = qk_packed0.permute(1, 2, 0) # [S, B, H] -> [B, H, S]
1497
+ conv_states = nn.functional.pad(
1498
+ qk_packed0_transposed,
1499
+ (
1500
+ self.cca_time0 - qk_packed0_transposed.shape[-1],
1501
+ 0,
1502
+ ),
1503
+ )
1504
+ qk_packed0_cache = past_key_values.update_conv_state(
1505
+ layer_idx=self.layer_idx, new_conv_state=conv_states
1506
+ )
1507
+ # Convs over sequence: [S, B, E] -> [B, E, S] -> pad -> conv ->
1508
+ # [S, B, E]
1509
+ qk_packed1 = qk_packed0.permute(1, 2, 0) # [B, E, S]
1510
+ qk_packed2 = F.pad(qk_packed1, (self.total_padding, 0))
1511
+ qk_packed3 = self.conv_qk(qk_packed2).permute(2, 0, 1) # [S, B, E]
1512
+
1513
+ else:
1514
+ # Convs over sequence: [S, B, E] -> [B, E, S] -> pad -> conv -> [S,
1515
+ # B, E]
1516
+ qk_packed1 = qk_packed0.permute(1, 2, 0) # [B, E, S]
1517
+ qk_packed2 = F.pad(qk_packed1, (self.total_padding, 0))
1518
+ qk_packed3 = self.conv_qk(qk_packed2).permute(2, 0, 1) # [S, B, E]
1519
+
1520
+ # Build queries/keys from conv output + means
1521
+ query = (
1522
+ qk_packed3[..., : self.latent_q_dim].view(*qk_packed3.shape[:2], self.num_q_heads, self.head_dim)
1523
+ + qk_mean_q
1524
+ ) # [S, B, qh, dh]
1525
+
1526
+ key = (
1527
+ qk_packed3[..., self.latent_q_dim :].view(*qk_packed3.shape[:2], self.num_kv_heads, self.head_dim)
1528
+ + qk_mean_k
1529
+ ) # [S, B, kh, dh]
1530
+
1531
+ # Values from the two time streams
1532
+ v1 = self.val_proj1(hs) # [S, B, latent_k_dim/2]
1533
+ if past_key_values is not None:
1534
+ if past_key_values.has_previous_state:
1535
+ # Generation
1536
+ # [B, H]
1537
+ hs_d = past_key_values.prev_hs[self.layer_idx].clone()
1538
+ hs_d = hs_d.unsqueeze(0) # [1, B, H]
1539
+ else:
1540
+ past_key_values.prev_hs[self.layer_idx] = torch.zeros(batch_size, self.hidden_size, device=hs.device, dtype=hs.dtype)
1541
+ past_key_values.prev_hs[self.layer_idx].copy_(hs[-1, :, :])
1542
+
1543
+ v2 = self.val_proj2(hs_d) # [S, B, latent_k_dim/2]
1544
+ value = (
1545
+ torch.cat([v1, v2], dim=-1).contiguous().view(*hs.shape[:2], self.num_kv_heads, self.head_dim)
1546
+ ) # [S, B, kh, dh]
1547
+
1548
+ # L2-normalize per head, then scale
1549
+ query_norm = query.norm(p=2, dim=-1, keepdim=True)
1550
+ key_norm = key.norm(p=2, dim=-1, keepdim=True)
1551
+
1552
+ key = (key * (self.sqrt_head_dim / key_norm)) * self.temp[None, None].unsqueeze(-1)
1553
+ query = query * (self.sqrt_head_dim / query_norm)
1554
+
1555
+ # Flatten head axis, then return to HF layout [B, S, ...]
1556
+ query = query.view(*query.shape[:2], self.num_q_heads * self.head_dim).transpose(0, 1).contiguous()
1557
+ key = key.view(*key.shape[:2], self.num_kv_heads * self.head_dim).transpose(0, 1).contiguous()
1558
+ value = value.view(*value.shape[:2], self.num_kv_heads * self.head_dim).transpose(0, 1).contiguous()
1559
+
1560
+ query_states = query
1561
+ key_states = key
1562
+ value_states = value
1563
+
1564
+ query_states = query_states.view(batch_size, seq_length, self.num_q_heads, self.head_dim)
1565
+ key_states = key_states.view(batch_size, seq_length, self.num_kv_heads, self.head_dim)
1566
+ value_states = value_states.view(batch_size, seq_length, self.num_kv_heads, self.head_dim)
1567
+
1568
+ # RoPE.
1569
+ if self.config.rope_theta_local > 0.0:
1570
+ cos, sin = position_embeddings
1571
+ if self.config.rope_type_local == "rope":
1572
+ query_states = apply_rotary_emb(query_states, cos, sin)
1573
+ key_states = apply_rotary_emb(key_states, cos, sin)
1574
+ elif self.config.rope_type_local == "p-rope":
1575
+ query_states = apply_p_rotary_emb(query_states, cos, sin, p=0.5)
1576
+ key_states = apply_p_rotary_emb(key_states, cos, sin)
1577
+ else:
1578
+ raise ValueError(f"Unknow rope type : {self.config.rope_type_local}")
1579
+
1580
+ # KV-cache.
1581
+ if past_key_values is not None:
1582
+ key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx)
1583
+
1584
+ # attention computation.
1585
+ wsize = min(self.window_size, self.config.slw_wsize) if self.config.slw_wsize > 0 else self.window_size
1586
+
1587
+ if ATTN_IMPL == "eager":
1588
+ attention_interface = lambda q, k, v, wsize, **kw: eager_attention_forward(q, k, v, window_size=(wsize, 0), **kw)
1589
+ elif ATTN_IMPL == "flex":
1590
+ if wsize != self.last_wsize:
1591
+ self.last_wsize = self.build_mask(wsize)
1592
+ attention_interface = lambda q, k, v, softmax_scale, **kw: flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=create_block_mask(self.attn_mask, B=None, H=None, Q_LEN=q.size(1), KV_LEN=k.size(1)), score_mod=self.score_mod, scale=softmax_scale, enable_gqa=self.num_attention_heads > self.num_key_value_heads).transpose(1, 2)
1593
+ elif ATTN_IMPL == "fa2":
1594
+ attention_interface = lambda q, k, v, wsize, **kw: flash_attn_func(q, k, v, window_size=(wsize, 0), **kw)
1595
+ elif ATTN_IMPL == "fa3":
1596
+ attention_interface = lambda q, k, v, wsize, **kw: flash_attn_func(q, k, v, window_size=(wsize, 0), **kw)[0]
1597
+ else:
1598
+ raise ValueError(f"Unknown ATTN_IMPL: {ATTN_IMPL}")
1599
+
1600
+ attn_output = attention_interface(
1601
+ query_states.bfloat16(),
1602
+ key_states.bfloat16(),
1603
+ value_states.bfloat16(),
1604
+ causal=True,
1605
+ wsize=wsize,
1606
+ softcap=self.config.softcap_local_attn,
1607
+ softmax_scale=None if not self.config.use_uscaling else 1/self.head_dim,
1608
+ )
1609
+
1610
+ return attn_output, None, None
1611
+
1612
  class DragonNativeSparseAttention(nn.Module):
1613
  """
1614
  Multi-headed attention from 'Attention Is All You Need' paper.
 
2149
 
2150
  projection_dim = self.head_qk_dim * self.num_attention_heads + self.head_qk_dim * self.num_key_value_heads + (self.head_v_dim * self.num_noise_heads//2)
2151
  self.linear_qkv = DragonLinear(config, config.hidden_size, projection_dim, bias=False)
2152
+ self.linear_qkv.norm_case_1 = True
2153
 
2154
  if self.config.token_shift_attn:
2155
  if self.config.scalar_proj_as_hidden_matrix:
 
2827
  self.W_A_v = DragonLinear(config, self.hidden_size, self.num_noise_heads * self.rank, bias=False)
2828
  self.W_B_k = DragonLinear(config, self.hidden_size, self.rank * self.head_qk_dim, bias=False)
2829
  self.W_B_v = DragonLinear(config, self.hidden_size, self.rank * self.head_v_dim, bias=False)
2830
+ self.c_q.norm_case_1 = True
2831
+ # todo: norm others?
2832
 
2833
  if self.config.token_shift_attn:
2834
  if self.config.scalar_proj_as_hidden_matrix:
 
3617
  self.num_attention_heads*self.dk + self.n_kv_heads*self.dk + self.n_kv_heads*self.dv,
3618
  bias=False
3619
  )
3620
+ self.linear_qkv.norm_case_1 = True
3621
  self.linear_ba = DragonLinear(
3622
  config, config.hidden_size,
3623
  self.num_attention_heads + self.num_attention_heads, #+ self.num_attention_heads*self.dv, # b(H), a(H), g(H*dv)
3624
  bias=False
3625
  )
3626
 
3627
+ if config.legacy_gate:
3628
+ if config.gate_type == 'kimi':
3629
+ self.linear_g = nn.Sequential(
3630
+ DragonLinear(config, config.hidden_size, self.dv, bias=False),
3631
+ DragonLinear(config, self.dv, self.n_kv_heads*self.dv, bias=True),
3632
+ )
3633
+ self.output_norm = FusedRMSNormGated(hidden_size=self.dv, eps=config.norm_epsilon, activation='sigmoid')
3634
+ else:
3635
+ self.linear_g = DragonLinear(
3636
+ config, config.hidden_size,
3637
+ self.n_kv_heads * self.dv,
3638
+ bias=False
3639
+ )
3640
+ self.output_norm = FusedRMSNormGated(hidden_size=self.dv, eps=config.norm_epsilon)
3641
+ self.linear_g.norm_case_1 = True
3642
+
3643
  dt_min = config.time_step_min
3644
  dt_max = config.time_step_max
3645
  dt_init_floor = config.time_step_floor
 
3654
  inv_dt = dt + torch.log(-torch.expm1(-dt))
3655
  with torch.no_grad():
3656
  self.dt_bias = nn.Parameter(inv_dt)
3657
+ self.dt_bias._no_weight_decay = True
3658
 
3659
  assert A_init_range[0] > 0 and A_init_range[1] >= A_init_range[0]
3660
  A = torch.empty(self.n_heads_local, dtype=torch.float32).uniform_(*A_init_range)
3661
  A_log = torch.log(A) # Keep A_log in fp32
3662
  self.A_log = nn.Parameter(A_log)
3663
+ self.A_log._no_weight_decay = True
3664
 
3665
  if self.config.rope_gdn == "rope":
3666
  self.rope_proj = DragonLinear(config, config.hidden_size, self.dk//4, bias=False)
 
3823
  use_qk_l2norm_in_kernel=True
3824
  ) # (B L H dv)
3825
 
3826
+ if self.config.legacy_gate:
3827
+ g = self.linear_g(hidden_states) # (B, L, H*dv)
3828
+ g = rearrange(g, "b l (h d) -> b l h d", h=self.n_kv_heads)
3829
+ o = self.output_norm(o, g)
3830
+
3831
  # update GDN cache
3832
  if cache_params is not None:
3833
  cache_params.ssm_caches[self.layer_idx] = ssm_cache
 
3861
  self.q_proj = DragonLinear(config, config.hidden_size, self.key_dim, bias=False)
3862
  self.k_proj = DragonLinear(config, config.hidden_size, self.key_dim, bias=False)
3863
  self.v_proj = DragonLinear(config, config.hidden_size, self.value_dim, bias=False)
3864
+ self.q_proj.norm_case_1 = True
3865
+ self.k_proj.norm_case_1 = True
3866
+ self.v_proj.norm_case_1 = True
3867
 
3868
  self.q_conv1d = ShortConvolution(
3869
  hidden_size=self.key_dim,
 
3896
  self.A_log = nn.Parameter(torch.log(torch.empty(self.num_q_heads, dtype=torch.float32).uniform_(1, 16)))
3897
  self.dt_bias = nn.Parameter(torch.zeros(self.key_dim, dtype=torch.float32))
3898
 
3899
+ if config.legacy_gate:
3900
+ if config.gate_type == 'kimi':
3901
+ self.linear_g = nn.Sequential(
3902
+ DragonLinear(config, config.hidden_size, self.head_v_dim, bias=False),
3903
+ DragonLinear(config, self.head_v_dim, self.num_attention_heads*self.head_v_dim, bias=True),
3904
+ )
3905
+ self.output_norm = FusedRMSNormGated(hidden_size=self.head_v_dim, eps=config.norm_epsilon, activation='sigmoid')
3906
+ else:
3907
+ self.linear_g = DragonLinear(
3908
+ config, config.hidden_size,
3909
+ self.num_attention_heads * self.head_v_dim,
3910
+ bias=False
3911
+ )
3912
+ self.output_norm = FusedRMSNormGated(hidden_size=self.head_v_dim, eps=config.norm_epsilon)
3913
+ self.linear_g.norm_case_1 = True
3914
 
3915
  @disable
3916
  def _kda_gate_call(self, g, A_log, head_k_dim, g_bias):
 
3921
  hidden_states: torch.Tensor,
3922
  position_embeddings: tuple[torch.Tensor, torch.Tensor],
3923
  cache_params: Optional[HybridDragonDynamicCache] = None,
3924
+ cu_seqlens: Optional[torch.Tensor] = None,
3925
  **kwargs,
3926
  ):
3927
  _, q_len, _ = hidden_states.shape
 
3938
  conv_state_k = cache_params.k_conv_caches[self.layer_idx]
3939
  conv_state_v = cache_params.v_conv_caches[self.layer_idx]
3940
 
3941
+ seq_idx = None
3942
+ if cu_seqlens is not None:
3943
+ seq_idx = prepare_sequence_ids(cu_seqlens).to(torch.int32).unsqueeze(0)
3944
  q, conv_state_q = self.q_conv1d(
3945
  x=self.q_proj(hidden_states),
3946
  cache=conv_state_q,
3947
  output_final_state=cache_params is not None,
3948
+ seq_idx=seq_idx,
3949
  )
3950
  k, conv_state_k = self.k_conv1d(
3951
  x=self.k_proj(hidden_states),
3952
  cache=conv_state_k,
3953
  output_final_state=cache_params is not None,
3954
+ seq_idx=seq_idx,
3955
  )
3956
  v, conv_state_v = self.v_conv1d(
3957
  x=self.v_proj(hidden_states),
3958
  cache=conv_state_v,
3959
  output_final_state=cache_params is not None,
3960
+ seq_idx=seq_idx,
3961
  )
3962
 
3963
  g = self.f_proj(hidden_states)
 
3983
  initial_state=None,
3984
  output_final_state=cache_params is not None,
3985
  use_qk_l2norm_in_kernel=True,
3986
+ cu_seqlens=cu_seqlens,
3987
  )
3988
  elif mode == 'fused_recurrent':
3989
  o, ssm_cache = fused_recurrent_kda(
 
4002
  #o = o * F.silu(rearrange(self.g_proj(hidden_states), '... (h d) -> ... h d', d=self.head_v_dim))
4003
  # TODO: other types of gates? as well as ZCG?
4004
 
4005
+ if self.config.legacy_gate:
4006
+ g = self.linear_g(hidden_states) # (B, L, H*dv)
4007
+ g = rearrange(g, "b l (h d) -> b l h d", h=self.num_attention_heads)
4008
+ o = self.output_norm(o, g)
4009
+
4010
  if cache_params is not None:
4011
  cache_params.ssm_caches[self.layer_idx] = ssm_cache
4012
  cache_params.q_conv_caches[self.layer_idx] = conv_state_q
 
4056
  if config.mamba3_rope:
4057
  self.rope_proj = DragonLinear(config, self.d_model, self.num_rope_angles, bias=False)
4058
 
4059
+ # Order: [x, B, C, dt]
4060
+ d_in_proj = self.d_inner + 2 * self.d_state * self.ngroups + self.nheads
4061
 
4062
  if self.config.mamba3_is_A_dd:
4063
  self.A_proj = DragonLinear(config, self.d_model, self.nheads, bias=False, dtype=torch.float32)
 
4082
  self.dt_bias._no_weight_decay = True
4083
 
4084
  self.in_proj = DragonLinear(config, self.d_model, d_in_proj, bias=self.bias)
4085
+ self.in_proj.norm_case_1 = True
4086
 
4087
  self.B_bias, self.C_bias = None, None
4088
  if not config.mamba3_remove_BC_bias:
 
4112
  self.D = nn.Parameter(torch.ones(self.nheads))
4113
  self.D._no_weight_decay = True
4114
 
4115
+ if config.legacy_gate:
4116
+ self.linear_g = DragonLinear(
4117
+ config, config.hidden_size,
4118
+ self.d_inner,
4119
+ bias=False,
4120
+ )
4121
+ self.linear_g.norm_case_1 = True
4122
+ if config.mamba3_postgate_norm:
4123
+ self.output_norm = RMSNormGated(self.d_inner, eps=config.norm_epsilon, norm_before_gate=False)
4124
+
4125
+ def forward(
4126
  self,
4127
  hidden_states: torch.Tensor,
4128
  cache_params: Optional[HybridDragonDynamicCache] = None,
4129
+ cu_seqlens: Optional[torch.Tensor] = None,
4130
  **kwargs
4131
  ):
4132
+ cached_len = None
4133
+ if cache_params is not None:
4134
+ hidden_states_cached = cache_params.ssm_caches[self.layer_idx] # (B, L, D)
4135
+ if hidden_states_cached is not None:
4136
+ cached_len = hidden_states_cached.shape[1]
4137
+ hidden_states = torch.cat([hidden_states_cached, hidden_states], dim=1) # (B, L+1, D)
4138
+ cache_params.ssm_caches[self.layer_idx] = hidden_states
4139
+
4140
  # Apply in_proj
4141
+ xBCdt = self.in_proj(hidden_states) # (B, l, D), l=1 when decoding
4142
+ xBC, dd_dt = torch.split(
4143
+ xBCdt,
4144
  [
 
4145
  self.d_inner + 2 * self.d_state * self.ngroups,
4146
  self.nheads,
4147
  ],
 
4154
  _A = -torch.exp(self.A_log).unsqueeze(0).unsqueeze(0)
4155
  dt = F.softplus(dd_dt + self.dt_bias) # (B, L, N)
4156
 
4157
+ seq_idx = None
4158
+ if cu_seqlens is not None:
4159
+ seq_idx = prepare_sequence_ids(cu_seqlens).to(torch.int32).unsqueeze(0)
4160
+
4161
  if not self.config.mamba3_remove_conv:
4162
  xBC = causal_conv1d_fn(
4163
  x=xBC.transpose(1, 2),
4164
  weight=rearrange(self.conv1d.weight, "d 1 w -> d w"),
4165
  bias=self.conv1d.bias,
4166
  activation=self.activation,
4167
+ seq_idx=seq_idx,
4168
  ).transpose(1, 2) # (B, L, self.d_inner + 2 * ngroups * d_state)
4169
 
4170
  x, B, C = torch.split(
 
4230
 
4231
  x_scalar = (gamma_arr*_alpha_arr).to(torch.bfloat16)
4232
 
 
 
 
 
4233
  out = mamba_chunk_scan_discretized_combined(
4234
  x=x.bfloat16(),
4235
  A=A,
 
4241
  CB_sum=CB_sum,
4242
  D=self.D,
4243
  z=None,
4244
+ initial_states=None, # ssm_cache,
4245
+ return_final_states=False, # cache_params is not None,
4246
+ seq_idx=seq_idx,
4247
  )
4248
+ y = out
4249
+
4250
+ if self.config.legacy_gate:
4251
+ if not self.config.mamba3_postgate_norm:
4252
+ g = self.linear_g(hidden_states) # (B, L, d_inner)
4253
+ y = rearrange(y, "b l h p -> b l (h p)")
4254
+ y = y * F.silu(g)
4255
+ y = rearrange(y, "b l (h p) -> b l h p", h=self.nheads)
4256
+ else:
4257
+ g = self.linear_g(hidden_states) # (B, L, d_inner)
4258
+ y = rearrange(y, "b l h p -> b l (h p)")
4259
+ y = self.output_norm(y, g)
4260
+ y = rearrange(y, "b l (h p) -> b l h p", h=self.nheads)
4261
 
4262
+ if cached_len and cached_len > 0:
4263
+ y = y[:, cached_len:, :] # keep only the new Ln steps
 
 
 
 
 
 
 
4264
 
4265
  return y, None, None
4266
 
 
4281
  # Order: [x, B, C, dt]
4282
  d_in_proj = self.d_inner + 2 * self.ngroups * self.d_state + self.nheads
4283
  self.in_proj = DragonLinear(config, self.d_model, d_in_proj, bias=False)
4284
+ self.in_proj.norm_case_1 = True
4285
 
4286
  if not self.config.mamba3_remove_conv:
4287
  conv_dim = self.d_inner + 2 * self.ngroups * self.d_state
 
4319
  self.D = nn.Parameter(torch.ones(self.nheads))
4320
  self.D._no_weight_decay = True
4321
 
4322
+ if config.legacy_gate:
4323
+ self.linear_g = DragonLinear(
4324
+ config, config.hidden_size,
4325
+ self.d_inner,
4326
+ bias=False,
4327
+ )
4328
+ self.linear_g.norm_case_1 = True
4329
+ self.output_norm = RMSNormGated(self.d_inner, eps=config.norm_epsilon, norm_before_gate=False)
4330
+
4331
  def forward(self, hidden_states, **kwargs):
4332
  """
4333
  u: (B, L, D)
 
4374
  initial_states=None,
4375
  )
4376
 
4377
+ if self.config.legacy_gate:
4378
+ g = self.linear_g(hidden_states) # (B, L, d_inner)
4379
+ y = rearrange(y, "b l h p -> b l (h p)")
4380
+ y = self.output_norm(y, g)
4381
+ y = rearrange(y, "b l (h p) -> b l h p", h=self.nheads)
4382
+
4383
  return y, None, None
4384
 
4385
  class DragonMamba3Mimo(nn.Module):
 
4394
  "when creating this class."
4395
  )
4396
 
4397
+ assert not self.config.gate_gdn, "gate must done inside the mimo mamba3 block."
4398
+
4399
  self.d_model = config.hidden_size
4400
+ self.d_state = config.mamba_d_state
4401
  self.conv_init = None
4402
  self.expand = 2
4403
+ self.headdim = config.mamba_headdim
4404
  self.ngroups = config.mamba_ngroups
4405
  self.activation = "swish"
4406
  self.bias = False
 
4415
  self.dt_init_floor = 1e-4
4416
  self.mimo_dim = config.mamba_mimo_dim
4417
  self.mimo_proj_block_order = 1
 
4418
 
4419
  self.d_inner = int(self.expand * self.d_model)
4420
  assert self.d_inner % self.headdim == 0
4421
  self.nheads = self.d_inner // self.headdim
4422
  self.dr_out_dim = self.d_inner // self.mimo_proj_block_order
4423
 
 
4424
  self.split_tensor_size = int(self.d_state * self.rope_fraction)
4425
  if self.split_tensor_size % 2 != 0:
4426
  self.split_tensor_size -= 1
 
4446
  self.dt_bias._no_weight_decay = True
4447
 
4448
  self.in_proj = DragonLinear(config, self.d_model, d_in_proj, bias=self.bias)
4449
+ self.in_proj.norm_case_1 = True
4450
 
4451
  self.B_bias = nn.Parameter(torch.ones((self.mimo_dim, self.nheads, self.d_state)), requires_grad=True)
4452
  self.C_bias = nn.Parameter(torch.ones((self.mimo_dim, self.nheads, self.d_state)), requires_grad=True)
 
4478
  self.in_proj_mimo_z = nn.Parameter(in_proj_mimo_z_init_weights, requires_grad=True)
4479
  self.out_proj_mimo = nn.Parameter(out_proj_mimo_init_weights, requires_grad=True)
4480
 
 
4481
  # D "skip" parameter
4482
  self.D = nn.Parameter(torch.ones(self.nheads))
4483
  self.D._no_weight_decay = True
4484
 
4485
+ if config.legacy_gate:
4486
+ if config.mamba3_postgate_norm:
4487
+ self.output_norm = RMSNormGated(self.d_inner, eps=config.norm_epsilon, norm_before_gate=False)
4488
+
4489
  def forward(self, hidden_states, **kwargs):
4490
  # Apply in_proj
4491
  zxBCdt = self.in_proj(hidden_states)
 
4578
  _beta_arr = torch.roll(beta_arr, shifts=-1, dims=1)
4579
 
4580
  x_scalar = (gamma_arr*_alpha_arr + _beta_arr).to(torch.bfloat16)
4581
+
4582
  z = rearrange(z, "b l r (h p) -> b l r h p", p=self.headdim)
4583
 
4584
  y = mamba_mimo_chunk_scan_discretized_fused_combined(
 
4591
  gamma=gamma_arr,
4592
  CB_sum=CB_sum,
4593
  D=self.D,
4594
+ z=z if not (self.config.legacy_gate and self.config.mamba3_postgate_norm) else None,
4595
  )
4596
 
4597
  y = rearrange(y, "b l r h p -> b l r (h p)")
4598
+
4599
+ if self.config.legacy_gate and self.config.mamba3_postgate_norm:
4600
+ z = rearrange(z, "b l r h p -> b l r (h p)")
4601
+ y = self.output_norm(y, z)
4602
+
4603
  #if seqlen_og is not None:
4604
  # y = rearrange(y, "b l r d -> (b l) r d")
4605
 
 
4626
  self.lambda1 = nn.Parameter(torch.zeros(self.link_size)) # sigmoid->0.5
4627
  else :
4628
  self.fc_1 = DragonLinear(config, config.hidden_size, intermediate_size, bias=False)
4629
+ self.fc_1.norm_case_1 = True
4630
  self.fc_2 = DragonLinear(config, intermediate_size, config.hidden_size, bias=False)
4631
+ self.fc_2.norm_case_2 = True
4632
  self.register_buffer("_2_sqrt_5", torch.tensor(2/math.sqrt(5)) if config.use_uscaling else torch.tensor(1.), persistent=False)
4633
 
4634
  def forward(self, hidden_states):
 
4657
  self.intermediate_size = intermediate_size
4658
 
4659
  self.fc_1 = DragonLinear(config, config.hidden_size, num_active_experts*self.intermediate_size, bias=False)
4660
+ self.fc_1.norm_case_1 = True
4661
  self.fc_2 = DragonLinear(config, num_active_experts*self.intermediate_size, config.hidden_size, bias=False)
4662
+ self.fc_2.norm_case_2 = True
4663
  self.register_buffer("_2_sqrt_5", torch.tensor(2/math.sqrt(5)) if config.use_uscaling else torch.tensor(1.), persistent=False)
4664
 
4665
  def forward(self, hidden_states, gates):
 
4737
  head_dim = self.mixer.head_dim
4738
  num_attention_heads = self.mixer.num_q_heads
4739
  use_gate = config.gate_attn
4740
+ elif layer_type == 'C':
4741
+ self.mixer = DragonCompressedConvolutionalAttention2(config, layer_idx=layer_idx)
4742
+ head_dim = self.mixer.head_dim
4743
+ num_attention_heads = self.mixer.num_q_heads
4744
+ use_gate = config.gate_attn
4745
  elif layer_type == 'n':
4746
  self.mixer = DragonNativeSparseAttention(config, reuse_kv=False, layer_idx=layer_idx)
4747
  head_dim = self.mixer.head_dim
 
4771
  self.mixer = DragonMamba3(config, layer_idx=layer_idx)
4772
  head_dim = self.mixer.headdim
4773
  num_attention_heads = self.mixer.nheads
4774
+ use_gate = config.gate_gdn
4775
  elif layer_type == '2':
4776
  self.mixer = DragonMamba2(config, layer_idx=layer_idx)
4777
  head_dim = self.mixer.headdim
 
4782
  head_dim = self.mixer.headdim
4783
  num_attention_heads = self.mixer.nheads
4784
  use_gate = False # inside Mamba3Mimo
4785
+ elif layer_type == 'b':
4786
+ self.mixer = DragonMoBAttention(config, reuse_kv=False, layer_idx=layer_idx)
4787
+ head_dim = self.mixer.head_dim
4788
+ num_attention_heads = self.mixer.num_attention_heads
4789
+ use_gate = config.gate_attn
4790
  else:
4791
  raise ValueError(f"Unknown layer type: {layer_type}")
4792
 
 
4806
  self.gate_proj.is_scalar_weight = True
4807
  else:
4808
  raise ValueError(f"Unknown gate_type: {self.config.gate_type}")
4809
+ self.gate_proj.norm_case_1 = True
4810
  if self.config.zero_centered_gate:
4811
  val = 1.
4812
  if self.config.zero_centered_gate_type==3:
 
4827
  self.use_gate = use_gate
4828
 
4829
  self.mixer_proj = DragonLinear(config, head_dim*num_attention_heads, config.hidden_size, bias=False)
4830
+ self.mixer_proj.norm_case_2 = True
4831
  if config.mixer_gn:
4832
  self.mixer_group_norm = DragonHeadWiseRMSNorm(n_heads=num_attention_heads, d_head=head_dim, eps=config.norm_epsilon, zero_centered_gamma=config.zero_centered_gamma)
4833
 
 
4874
  cu_seqlens=cu_seqlens,
4875
  max_seqlen=max_seqlen,
4876
  ) # (B, L, E*D)
4877
+ if self.config.mixer_gn and not self.config.gate_before_norm:
4878
+ y_mixer = self.mixer_group_norm(y_mixer)
4879
  if self.use_gate:
4880
  if self.config.gate_type == "elementwise" or self.config.gate_type == "kimi":
4881
  g_proj = self.gate_proj(hidden_states).view(hidden_states.size(0), hidden_states.size(1), self.num_attention_heads, self.head_dim).to(y_mixer.dtype)
 
4890
  y_mixer = y_mixer * (self.gate_act(g_proj) + self.gate_bias)
4891
  elif self.config.zero_centered_gate_type == 3 or self.config.zero_centered_gate_type == 4:
4892
  y_mixer = y_mixer * self.gate_act(g_proj + self.gate_bias)
4893
+ if self.config.mixer_gn and self.config.gate_before_norm:
4894
  y_mixer = self.mixer_group_norm(y_mixer)
4895
  y_mixer = y_mixer.view(y_mixer.size(0), y_mixer.size(1), -1)
4896
  y_mixer = self.mixer_proj(y_mixer)
 
4904
 
4905
  return hidden_states, last_key_states, last_value_states
4906
 
4907
class DragonGHyperConnection(nn.Module):
    """Generalized (gated) hyper-connection over `n_in` virtual residual streams.

    Mixes `n_in` width-expanded residual streams into `m` "real" streams for the
    mixer/MLP input (width connection) and redistributes the sub-layer output back
    across the streams (depth connection). The mixing matrices are a learned static
    part (`static_alpha` / `static_beta`) plus an optional input-dependent dynamic
    part (tanh-gated, enabled by `config.vwn_dynamic`).

    NOTE(review): this appears to follow the hyper-connections / virtual-width
    formulation (static + dynamic alpha/beta) — confirm against the reference paper.
    """

    def __init__(self, config: DragonConfig, m, n_in=3):
        """
        Args:
            config: model config; reads `hidden_size`, `vwn_dynamic`, `vwn_wd_alpha_beta`.
            m: number of "real" streams fed to the sub-layer (hidden width is split
               into chunks of `hidden_size // m`).
            n_in: number of virtual residual streams carried between layers.
        """
        super().__init__()
        self.config = config
        self.m, self.n_in = m, n_in
        dim = self.config.hidden_size
        # 1/sqrt(chunk_dim) scale applied before tanh in the dynamic projection.
        self.factor = 1.0 / math.sqrt(dim // self.m)

        # Initialize static beta: cyclic pattern — stream j receives the sub-layer
        # output through real-stream slot (j % m). Stored transposed: (n_in, m).
        static_beta_tensor = torch.zeros(self.m, n_in)
        for j in range(n_in):
            static_beta_tensor[j % self.m, j] = 1.0
        self.static_beta = nn.Parameter(static_beta_tensor.T.contiguous())

        # Initialize static alpha: block matrix of shape (n_in, m + n_in).
        # First m columns select the sub-layer input (identity on the first m
        # streams); remaining n_in columns carry the identity residual path.
        init_alpha = torch.cat([torch.eye(self.m), torch.eye(self.m), torch.zeros((self.m, self.n_in - self.m))], dim=1)
        if self.n_in > self.m:
            # Extra virtual streams (beyond m) only pass through the residual part.
            part2 = torch.cat([torch.zeros((self.n_in - self.m, self.m * 2)), torch.eye(self.n_in - self.m)], dim=1)
            init_alpha = torch.cat([init_alpha, part2], dim=0)
        self.static_alpha = nn.Parameter(init_alpha.contiguous())

        # Dynamic parameters: per-chunk linear maps producing input-dependent
        # corrections to alpha (m + n_in cols) and beta (m cols). Zero-init so the
        # module starts as the purely static connection.
        self.dynamic_alpha_fn = nn.Parameter(torch.zeros((dim // self.m, self.m + self.n_in)))
        self.dynamic_beta_fn = nn.Parameter(torch.zeros((dim // self.m, self.m)))
        # Flag consumed by the optimizer setup elsewhere — these DO get weight decay.
        self.dynamic_alpha_fn.requires_weight_decay = True
        self.dynamic_beta_fn.requires_weight_decay = True
        if self.config.vwn_dynamic:
            # Learnable per-entry scale on the dynamic correction (ones-init).
            self.dynamic_alpha_scale = nn.Parameter(torch.ones_like(self.static_alpha))
            self.dynamic_beta_scale = nn.Parameter(torch.ones_like(self.static_beta))
            if config.vwn_wd_alpha_beta:
                self.dynamic_alpha_scale.requires_weight_decay = True
                self.dynamic_beta_scale.requires_weight_decay = True
        else:
            # Zero buffers disable the dynamic term entirely (static-only mixing).
            self.register_buffer("dynamic_alpha_scale", torch.zeros_like(self.static_alpha), persistent=False)
            self.register_buffer("dynamic_beta_scale", torch.zeros_like(self.static_beta), persistent=False)

        # Norm applied per stream chunk before the dynamic projection.
        self.layer_norm = DragonNorm(config, dim//self.m)

    def _base_width_connection(self, h, dynamic_fn, dynamic_scale, static_scale):
        """Compute the mixed streams and the beta (depth) coefficients.

        Args:
            h: (..., n_in * chunk_dim) expanded hidden states.
            dynamic_fn: (2m + n_in, chunk_dim) stacked dynamic alpha/beta projection.
            dynamic_scale / static_scale: (n_in, 2m + n_in) stacked alpha|beta scales.

        Returns:
            (mix_h, beta): mix_h has shape (..., m + n_in, chunk_dim) — first m rows
            are the sub-layer input streams, last n_in rows the residual streams;
            beta has shape (flat_batch, n_in, m).
        """
        h_shape = h.shape
        # Recover N (= n_in) and M (= m) from the concatenated scale matrix:
        # NMM = (M + N) alpha columns + M beta columns.
        N, NMM = static_scale.shape
        M = (NMM - N) // 2
        # Flatten leading dims; split hidden dim into N stream chunks.
        h_reshape = h.reshape((h_shape[:-1].numel(),) + (N, h_shape[-1] // N))
        norm_h = self.layer_norm(h_reshape)
        # Dynamic correction (tanh-bounded, scaled) added to the static matrices.
        alpha_beta = (F.tanh(norm_h @ dynamic_fn.T.to(dtype=norm_h.dtype) * self.factor) * dynamic_scale[None, ...] + static_scale[None, ...])
        alpha, beta = torch.split(alpha_beta, (M + N, M), dim=-1)
        # Mix streams: (chunk_dim, N) @ (N, M + N) per flattened batch element.
        mix_h = (h_reshape.transpose(1, 2) @ alpha.to(dtype=h_reshape.dtype)).transpose(1, 2)
        return mix_h.reshape(h_shape[:-1] + mix_h.shape[1:]), beta

    def width_connection(self, h):
        """Public entry: stack the alpha/beta static+dynamic params and mix `h`."""
        dynamic_fn = torch.concat([self.dynamic_alpha_fn.T, self.dynamic_beta_fn.T], dim=0)
        dynamic_scale = torch.concat([self.dynamic_alpha_scale, self.dynamic_beta_scale], dim=-1).contiguous()
        static_scale = torch.concat([self.static_alpha, self.static_beta], dim=-1)
        return self._base_width_connection(h, dynamic_fn.to(dtype=h.dtype), dynamic_scale.to(dtype=h.dtype), static_scale.to(dtype=h.dtype))

    def depth_connection(self, mix_h, h_o, beta, sqrt_one_minus_tau, sqrt_tau):
        """Fold the sub-layer output back into the virtual streams.

        Args:
            mix_h: (..., m + n_in, chunk_dim) output of `width_connection`; rows
                   [m:] are the residual streams.
            h_o: (B, L, m * chunk_dim) sub-layer output on the real streams.
            beta: (flat_batch, n_in, m) depth coefficients from `width_connection`.
            sqrt_one_minus_tau / sqrt_tau: u-scaling residual weights (1.0 when
                   u-scaling is off — see buffers registered by the caller block).

        Returns:
            (B, L, n_in * chunk_dim) updated expanded hidden states.
        """
        h_o_shape = h_o.shape
        h_o = h_o.reshape(h_o_shape[:-1] + (self.m, h_o_shape[-1] // self.m))
        # Broadcast beta back to (B, L, n_in, m) and project output onto streams.
        h_i = beta.view(h_o.shape[:2] + beta.shape[1:]).to(dtype=h_o.dtype) @ h_o
        # Weighted residual add against the pass-through streams (rows m:).
        h = sqrt_tau * h_i + sqrt_one_minus_tau * mix_h[..., self.m:, :]
        h_shape = h.shape
        return h.reshape(h_shape[:-2] + (h_shape[-2] * h_shape[-1],)).contiguous()
4969
+
4970
+ class DragonMonoVirtualBlock(GradientCheckpointingLayer):
4971
+ def __init__(self, config: DragonConfig, layer_idx: int, layer_type: str):
4972
+ super().__init__()
4973
+ self.config = config
4974
+ self.layer_idx = layer_idx
4975
+
4976
+ assert self.config.vwn
4977
+
4978
+ if layer_type == 'g':
4979
+ self.mixer = DragonGatedDeltaNet(config, layer_idx=layer_idx)
4980
+ head_dim = self.mixer.head_dim
4981
+ num_attention_heads = self.mixer.num_attention_heads
4982
+ use_gate = config.gate_gdn
4983
+ elif layer_type == 'f':
4984
+ self.mixer = DragonDifferentialAttention(config, layer_idx=layer_idx)
4985
+ head_dim = self.mixer.head_dim
4986
+ num_attention_heads = self.mixer.num_signal_heads
4987
+ use_gate = config.gate_attn
4988
+ elif layer_type == 's':
4989
+ self.mixer = DragonDeepSeekSparseAttention(config, reuse_kv=False, layer_idx=layer_idx)
4990
+ head_dim = self.mixer.head_dim
4991
+ num_attention_heads = self.mixer.num_attention_heads
4992
+ use_gate = config.gate_attn
4993
+ elif layer_type == 'm':
4994
+ self.mixer = DragonDynamicMaskAttention(config, reuse_kv=False, layer_idx=layer_idx)
4995
+ head_dim = self.mixer.head_dim
4996
+ num_attention_heads = self.mixer.num_attention_heads
4997
+ use_gate = config.gate_attn
4998
+ elif layer_type == 'w':
4999
+ self.mixer = DragonAttention(config, reuse_kv=False, layer_idx=layer_idx)
5000
+ head_dim = self.mixer.head_dim
5001
+ num_attention_heads = self.mixer.num_attention_heads
5002
+ use_gate = config.gate_attn
5003
+ elif layer_type == 'p':
5004
+ self.mixer = DragonSlidingWindowRecurrenceAttention(config)
5005
+ head_dim = self.mixer.head_dim
5006
+ num_attention_heads = self.mixer.num_attention_heads
5007
+ use_gate = config.gate_attn
5008
+ elif layer_type == 'c':
5009
+ self.mixer = DragonCompressedConvolutionalAttention(config, layer_idx=layer_idx)
5010
+ head_dim = self.mixer.head_dim
5011
+ num_attention_heads = self.mixer.num_q_heads
5012
+ use_gate = config.gate_attn
5013
+ elif layer_type == 'C':
5014
+ self.mixer = DragonCompressedConvolutionalAttention2(config, layer_idx=layer_idx)
5015
+ head_dim = self.mixer.head_dim
5016
+ num_attention_heads = self.mixer.num_q_heads
5017
+ use_gate = config.gate_attn
5018
+ elif layer_type == 'n':
5019
+ self.mixer = DragonNativeSparseAttention(config, reuse_kv=False, layer_idx=layer_idx)
5020
+ head_dim = self.mixer.head_dim
5021
+ num_attention_heads = self.mixer.num_attention_heads
5022
+ use_gate = config.gate_attn
5023
+ elif layer_type == 't':
5024
+ self.mixer = DragonTensorProductAttention(config, reuse_kv=False, layer_idx=layer_idx)
5025
+ head_dim = self.mixer.head_dim
5026
+ num_attention_heads = self.mixer.num_attention_heads
5027
+ use_gate = config.gate_attn
5028
+ elif layer_type == 'T':
5029
+ self.mixer = DragonDifferentialTensorProductAttention(config, layer_idx=layer_idx)
5030
+ head_dim = self.mixer.head_dim
5031
+ num_attention_heads = self.mixer.num_signal_heads
5032
+ use_gate = config.gate_attn
5033
+ elif layer_type == 'A':
5034
+ self.mixer = DragonDifferentialMultiLatentAttention(config, layer_idx=layer_idx)
5035
+ head_dim = self.mixer.head_dim
5036
+ num_attention_heads = self.mixer.num_signal_heads
5037
+ use_gate = config.gate_attn
5038
+ elif layer_type == 'k':
5039
+ self.mixer = DragonKimiDeltaAttention(config, layer_idx=layer_idx)
5040
+ head_dim = self.mixer.head_dim
5041
+ num_attention_heads = self.mixer.num_attention_heads
5042
+ use_gate = config.gate_gdn
5043
+ elif layer_type == '3':
5044
+ self.mixer = DragonMamba3(config, layer_idx=layer_idx)
5045
+ head_dim = self.mixer.headdim
5046
+ num_attention_heads = self.mixer.nheads
5047
+ use_gate = config.gate_gdn
5048
+ elif layer_type == '2':
5049
+ self.mixer = DragonMamba2(config, layer_idx=layer_idx)
5050
+ head_dim = self.mixer.headdim
5051
+ num_attention_heads = self.mixer.nheads
5052
+ use_gate = config.gate_gdn
5053
+ elif layer_type == 'M':
5054
+ self.mixer = DragonMamba3Mimo(config, layer_idx=layer_idx)
5055
+ head_dim = self.mixer.headdim
5056
+ num_attention_heads = self.mixer.nheads
5057
+ use_gate = False # inside Mamba3Mimo
5058
+ else:
5059
+ raise ValueError(f"Unknown layer type: {layer_type}")
5060
+
5061
+ if use_gate:
5062
+ if self.config.gate_type == "elementwise":
5063
+ self.gate_proj = DragonLinear(self.config, config.hidden_size, num_attention_heads*head_dim, bias=False)
5064
+ elif self.config.gate_type == "kimi":
5065
+ self.gate_proj = nn.Sequential(
5066
+ DragonLinear(config, config.hidden_size, head_dim, bias=False),
5067
+ DragonLinear(config, head_dim, num_attention_heads*head_dim, bias=True),
5068
+ )
5069
+ elif self.config.gate_type == "headwise":
5070
+ if self.config.scalar_proj_as_hidden_matrix:
5071
+ self.gate_proj = DragonLinear(self.config, config.hidden_size, num_attention_heads, bias=False)
5072
+ else:
5073
+ self.gate_proj = DragonLinear(self.config, config.hidden_size, num_attention_heads, bias=False, alpha_fwd=1., alpha_bwd=1.)
5074
+ self.gate_proj.is_scalar_weight = True
5075
+ else:
5076
+ raise ValueError(f"Unknown gate_type: {self.config.gate_type}")
5077
+ self.gate_proj.norm_case_1 = True
5078
+ if self.config.zero_centered_gate:
5079
+ val = 1.
5080
+ if self.config.zero_centered_gate_type==3:
5081
+ val = 1.28 # F.silu(E(g) + 1.28) = 1
5082
+ elif self.config.zero_centered_gate_type==4:
5083
+ val = 1.15 # E(silu(g + 1.15)) = 1
5084
+ self.register_buffer("gate_bias", torch.tensor(val), persistent=False)
5085
+ else:
5086
+ self.register_buffer("gate_bias", torch.tensor(0.), persistent=False)
5087
+ if self.config.gate_act == "silu":
5088
+ self.gate_act = F.silu
5089
+ elif self.config.gate_act == "sigmoid":
5090
+ self.gate_act = F.sigmoid
5091
+ else:
5092
+ raise ValueError(f"Unknown gate_act: {self.config.gate_act}")
5093
+ self.num_attention_heads = num_attention_heads
5094
+ self.head_dim = head_dim
5095
+ self.use_gate = use_gate
5096
+
5097
+ self.mixer_proj = DragonLinear(config, head_dim*num_attention_heads, config.hidden_size, bias=False)
5098
+ self.mixer_proj.norm_case_2 = True
5099
+ if config.mixer_gn:
5100
+ self.mixer_group_norm = DragonHeadWiseRMSNorm(n_heads=num_attention_heads, d_head=head_dim, eps=config.norm_epsilon, zero_centered_gamma=config.zero_centered_gamma)
5101
+
5102
+ self.input_norm = DragonNorm(config, config.hidden_size)
5103
+ self.postmixer_norm = DragonNorm(config, config.hidden_size)
5104
+ self.mixer_ghyper_connection = DragonGHyperConnection(config, m=config.vwn_m, n_in=config.vwn_n)
5105
+ self.mlp_ghyper_connection = DragonGHyperConnection(config, m=config.vwn_m, n_in=config.vwn_n)
5106
+ if not config.moe:
5107
+ if config.mlp_type == "simple":
5108
+ self.mlp = DragonMLP(config)
5109
+ elif config.mlp_type == "gated":
5110
+ self.mlp = GatedMlp(in_features=config.hidden_size, hidden_features=config.intermediate_size, out_features=config.hidden_size, activation=F.silu, bias1=False, bias2=False)
5111
+ else:
5112
+ self.mlp = DragonMoE(config)
5113
+
5114
+ if config.use_uscaling or not config.layer_norm_scaling:
5115
+ self.register_buffer("lns", torch.tensor(1.0), persistent=False)
5116
+ else:
5117
+ self.register_buffer("lns", torch.tensor(1. / math.sqrt(layer_idx + (2 if config.old_lns else 1))), persistent=False)
5118
+ self.register_buffer("sqrt_tau", torch.sqrt(torch.tensor(self.config.uscaling_tau)) if config.use_uscaling else torch.tensor(1.0), persistent=False)
5119
+ self.register_buffer("sqrt_one_minus_tau", torch.sqrt(torch.tensor(1.0 - self.config.uscaling_tau)) if config.use_uscaling else torch.tensor(1.0), persistent=False)
5120
+
5121
+ def forward(
5122
+ self,
5123
+ hidden_states: torch.Tensor,
5124
+ position_ids: Optional[torch.LongTensor] = None,
5125
+ cache_params: Optional[HybridDragonDynamicCache] = None,
5126
+ cache_position: Optional[torch.LongTensor] = None,
5127
+ position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
5128
+ key_value_last_layer: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
5129
+ cu_seqlens: Optional[torch.Tensor] = None,
5130
+ max_seqlen: Optional[int] = None,
5131
+ **kwargs,
5132
+ ):
5133
+ # hidden_states : (B, L, D'). D' = n/m D (expanded width)
5134
+
5135
+ # MIXER.
5136
+ mix_h, beta = self.mixer_ghyper_connection.width_connection(hidden_states)
5137
+ mix_h_shape = mix_h.shape
5138
+ h = mix_h[..., :self.config.vwn_m, :].reshape(mix_h_shape[:-2] + (self.config.vwn_m * mix_h_shape[-1],))
5139
+ # h is (B, L, D)
5140
+ h = self.lns * self.input_norm(h)
5141
+ y_mixer, last_key_states, last_value_states = self.mixer(
5142
+ hidden_states=h,
5143
+ position_embeddings=position_embeddings,
5144
+ position_ids=position_ids,
5145
+ cache_params=cache_params,
5146
+ key_value_last_layer=key_value_last_layer,
5147
+ cu_seqlens=cu_seqlens,
5148
+ max_seqlen=max_seqlen,
5149
+ ) # (B, L, E*D)
5150
+ if self.config.mixer_gn and not self.config.gate_before_norm:
5151
+ y_mixer = self.mixer_group_norm(y_mixer)
5152
+ if self.use_gate:
5153
+ if self.config.gate_type == "elementwise" or self.config.gate_type == "kimi":
5154
+ g_proj = self.gate_proj(h).view(h.size(0), h.size(1), self.num_attention_heads, self.head_dim).to(y_mixer.dtype)
5155
+ elif self.config.gate_type == "headwise":
5156
+ g_proj = self.gate_proj(h).unsqueeze(-1).to(y_mixer.dtype)
5157
+ else:
5158
+ raise ValueError(f"Unknown gate_type: {self.config.gate_type}")
5159
+ if self.config.zero_centered_gate_type == 1:
5160
+ y_mixer = y_mixer * self.gate_act(g_proj)
5161
+ y_mixer = y_mixer + self.gate_bias
5162
+ elif self.config.zero_centered_gate_type == 2:
5163
+ y_mixer = y_mixer * (self.gate_act(g_proj) + self.gate_bias)
5164
+ elif self.config.zero_centered_gate_type == 3 or self.config.zero_centered_gate_type == 4:
5165
+ y_mixer = y_mixer * self.gate_act(g_proj + self.gate_bias)
5166
+ if self.config.mixer_gn and self.config.gate_before_norm:
5167
+ y_mixer = self.mixer_group_norm(y_mixer)
5168
+ y_mixer = y_mixer.view(y_mixer.size(0), y_mixer.size(1), -1)
5169
+ y_mixer = self.mixer_proj(y_mixer) # (B, L, D)
5170
+ h = self.mixer_ghyper_connection.depth_connection(mix_h, y_mixer, beta, self.sqrt_one_minus_tau, self.sqrt_tau) # (B, L, D')
5171
+
5172
+ # MLP.
5173
+ mix_h, beta = self.mlp_ghyper_connection.width_connection(h)
5174
+ mix_h_shape = mix_h.shape
5175
+ h = mix_h[..., :self.config.vwn_m, :].reshape(mix_h_shape[:-2] + (self.config.vwn_m * mix_h_shape[-1],))
5176
+ # h is (B, L, D)
5177
+ h = self.lns * self.postmixer_norm(h)
5178
+ y_mlp = self.mlp(h) # (B, L, D)
5179
+ h = self.mlp_ghyper_connection.depth_connection(mix_h, y_mlp, beta, self.sqrt_one_minus_tau, self.sqrt_tau) # (B, L, D')
5180
+
5181
+ return h, 0, 0
5182
+
5183
  class DragonBlock(GradientCheckpointingLayer):
5184
  def __init__(self, config: DragonConfig, layer_idx: int, layer_type: str):
5185
  super().__init__()
 
5265
  "attentions": DragonBlock,
5266
  }
5267
 
5268
+ def _init_weights(self, module):
5269
  if isinstance(module, (DragonLinear, nn.Conv1d)):
5270
  if module.bias is not None:
5271
  nn.init.zeros_(module.bias)
5272
+ nn.init.normal_(module.weight, mean=0., std=self.config.initializer_range)
5273
  elif isinstance(module, nn.Embedding):
5274
+ nn.init.normal_(module.weight, mean=0., std=self.config.initializer_range)
5275
 
5276
  @dataclass
5277
  class DragonOutput(ModelOutput):
 
5326
  self.vocab_size = config.vocab_size
5327
 
5328
  self.embedding = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
5329
+ if self.config.vwn:
5330
+ self.hidden_size_expanded = int(config.vwn_n/config.vwn_m * config.hidden_size)
5331
+ self.expand_embedding = DragonLinear(config, config.hidden_size, self.hidden_size_expanded, bias=False)
5332
+
5333
+ if not self.config.vwn:
5334
+ self.layers = nn.ModuleList([DragonBlock(config, layer_idx=i, layer_type=layer) if layer in ['l', 'r', 'd'] else DragonMonoBlock(config, layer_idx=i, layer_type=layer) for i, layer in enumerate(config.layers_config)])
5335
+ else:
5336
+ self.layers = nn.ModuleList([DragonBlock(config, layer_idx=i, layer_type=layer) if layer in ['l', 'r', 'd'] else DragonMonoVirtualBlock(config, layer_idx=i, layer_type=layer) for i, layer in enumerate(config.layers_config)])
5337
 
5338
  if self.config.rope_type_global != '' or self.config.rope_type_local != '':
5339
  self.rotary_emb = DragonRotaryEmbedding(config, head_dim=config.head_dim if config.head_dim else (config.expand_factor*config.hidden_size)//config.num_attention_heads, theta=config.rope_theta_local) # only for SWA
5340
  else:
5341
  self.rotary_emb = None
5342
 
5343
+ if self.config.vwn:
5344
+ if int(self.config.vwn_n/self.config.vwn_m) == 8:
5345
+ self.gn = torch.nn.GroupNorm(num_groups=self.hidden_size_expanded//config.hidden_size, num_channels=self.hidden_size_expanded, eps=config.norm_epsilon, affine=False) # todo : zcg ?
5346
+ self.reduce_h = DragonLinear(config, self.hidden_size_expanded, config.hidden_size, bias=False)
5347
+
5348
  if self.config.final_norm:
5349
  self.final_norm = DragonNorm(config, config.hidden_size)
5350
 
5351
  self.gradient_checkpointing = False
5352
  self.post_init()
5353
+
5354
  def get_input_embeddings(self):
5355
  return self.embedding
5356
 
 
5379
 
5380
  if inputs_embeds is None:
5381
  inputs_embeds = self.embedding(input_ids)
5382
+ if self.config.vwn:
5383
+ inputs_embeds = self.expand_embedding(inputs_embeds) # (B, L, D')
5384
 
5385
  if self.config.patch_level_training:
5386
  # (B, KL, D) => (B, L, D) OR (B, L, D) ==> (B, L//K, D)
 
5437
  )
5438
  shared_kv = (last_k, last_v)
5439
 
5440
+ if self.config.vwn:
5441
+ if int(self.config.vwn_n/self.config.vwn_m) == 8:
5442
+ B, L, D = hidden_states.shape
5443
+ hidden_states = self.gn(hidden_states.reshape(-1, D)).view(B, L, D)
5444
+ hidden_states = self.reduce_h(hidden_states) # back to (B, L, D)
5445
+
5446
  if self.config.final_norm:
5447
  hidden_states = self.final_norm(hidden_states)
5448
 
5449
  if output_hidden_states:
5450
  all_hidden_states = all_hidden_states + (hidden_states,)
5451
 
5452
+ if past_key_values and not past_key_values.has_previous_state:
5453
+ past_key_values.has_previous_state = True
5454
+
5455
  return DragonOutput(
5456
  last_hidden_state=hidden_states,
5457
  past_key_values=past_key_values if use_cache else None,
 
5465
  self.config = config
5466
  self.model = DragonModel(config)
5467
  self.vocab_size = config.vocab_size
5468
+ bwd = 1/math.sqrt(config.hidden_size) if config.dataset_type == "hf" else 1/config.hidden_size
5469
+ if config.reduce_lm_head == 0:
5470
+ self.lm_head = DragonLinear(config, config.hidden_size, config.vocab_size, bias=False, alpha_fwd=1/config.hidden_size, alpha_bwd=bwd)
5471
+ else:
5472
+ self.lm_head = nn.Sequential(
5473
+ DragonLinear(config, config.hidden_size, config.reduce_lm_head, bias=False, alpha_fwd=1./math.sqrt(config.reduce_lm_head)),
5474
+ DragonLinear(config, config.reduce_lm_head, config.vocab_size, bias=False, alpha_fwd=1/config.hidden_size, alpha_bwd=bwd),
5475
+ )
5476
  self.post_init()
5477
  if config.tie_lm_head:
5478
  self.lm_head.weight = self.model.embedding.weight
5479
 
5480
+ if config.init_gpt2:
5481
+ for pn, p in self.named_parameters():
5482
+ if pn.endswith('fc2.weight') or pn.endswith('mixer_proj.weight'):
5483
+ torch.nn.init.normal_(p, mean=0.0, std=config.initializer_range/math.sqrt(2 * len(config.layers_config)))
5484
+
5485
  def forward(
5486
  self,
5487
  input_ids: Optional[torch.LongTensor] = None,
 
5527
  labels = labels.to(hidden_states.device)
5528
 
5529
  if linear_cross_entropy is None or not self.config.fused_loss_computation:
5530
+ if not self.config.reduce_lm_head:
5531
+ logits = self.lm_head(hidden_states.to(self.lm_head.weight.dtype)[:, slice_indices, :]).float()
5532
+ else:
5533
+ logits = self.lm_head(hidden_states.to(self.lm_head[0].weight.dtype)[:, slice_indices, :]).float()
5534
  if not self.config.patch_level_training:
5535
  shift_logits = logits[..., :-1, :].contiguous()
5536
  shift_labels = labels[..., 1:].contiguous()
 
5544
  loss = loss + F.nll_loss(log_probs, shift_labels[:, i])
5545
  loss = loss / self.config.patch_level_training_size
5546
  else:
5547
+ assert not self.config.reduce_lm_head
5548
  assert not self.config.patch_level_training, "Fused loss computation is not supported with patch-level training."
5549
  loss = linear_cross_entropy(
5550
  hidden_states[:, slice_indices, :].view(-1, hidden_states.size(-1)),
optimizers/Ademamix.py CHANGED
@@ -46,7 +46,7 @@ class AdEMAMix(Optimizer):
46
  """
47
 
48
  def __init__(self, params, lr=1e-3, betas=(0.9, 0.999, 0.999), alpha=8.0,
49
- beta3_warmup=None, alpha_warmup=None, eps=1e-8,
50
  weight_decay=0):
51
  if not 0.0 <= lr:
52
  raise ValueError("Invalid learning rate: {}".format(lr))
@@ -62,6 +62,7 @@ class AdEMAMix(Optimizer):
62
  raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
63
  if not 0.0 <= alpha:
64
  raise ValueError("Invalid alpha value: {}".format(alpha))
 
65
  defaults = dict(lr=lr, betas=betas, eps=eps, alpha=alpha, beta3_warmup=beta3_warmup,
66
  alpha_warmup=alpha_warmup, weight_decay=weight_decay)
67
  super(AdEMAMix, self).__init__(params, defaults)
@@ -139,6 +140,8 @@ class AdEMAMix(Optimizer):
139
  exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
140
 
141
  denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(eps)
 
 
142
 
143
  update = (exp_avg_fast.div(bias_correction1) + alpha * exp_avg_slow) / denom
144
 
 
46
  """
47
 
48
  def __init__(self, params, lr=1e-3, betas=(0.9, 0.999, 0.999), alpha=8.0,
49
+ beta3_warmup=None, alpha_warmup=None, eps=1e-8, normalize_alpha=False,
50
  weight_decay=0):
51
  if not 0.0 <= lr:
52
  raise ValueError("Invalid learning rate: {}".format(lr))
 
62
  raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
63
  if not 0.0 <= alpha:
64
  raise ValueError("Invalid alpha value: {}".format(alpha))
65
+ self.normalize_alpha = normalize_alpha
66
  defaults = dict(lr=lr, betas=betas, eps=eps, alpha=alpha, beta3_warmup=beta3_warmup,
67
  alpha_warmup=alpha_warmup, weight_decay=weight_decay)
68
  super(AdEMAMix, self).__init__(params, defaults)
 
140
  exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
141
 
142
  denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(eps)
143
+ if self.normalize_alpha:
144
+ denom = denom * (1.0 + alpha)
145
 
146
  update = (exp_avg_fast.div(bias_correction1) + alpha * exp_avg_slow) / denom
147
 
training_dragon.py CHANGED
@@ -41,7 +41,8 @@ class NanoArgs:
41
  rope_theta_local: float = 10000.0
42
  rope_theta_global: float = 0.0
43
  eps_rmsnorm: float = 1e-6
44
- mlp_expand: int = 4 # expand factor for MLP
 
45
  fused_loss_computation : bool = True # whether to use fused linear + cross entropy loss
46
  use_uscaling: bool = False
47
  uscaling_tau: float = 0.2
@@ -58,11 +59,19 @@ class NanoArgs:
58
  seednorm_type: int = 1
59
  seednorm_rank: int = 1
60
  mixer_gn: bool = True
 
61
  mlp_linking : bool = False
62
  final_norm: bool = True
63
  layer_norm_scaling: bool = False # not read when using muP
64
  mlp_type: str = "simple" # simple, gated
65
  tie_lm_head: bool = False
 
 
 
 
 
 
 
66
 
67
  # MoE
68
  moe: bool = False
@@ -117,6 +126,7 @@ class NanoArgs:
117
  mamba3_remove_conv: bool = True
118
  mamba3_is_A_dd: bool = True
119
  mamba3_add_trapezoid: bool = True
 
120
 
121
  # optim
122
  optim: str = "adamw" # adamw, spam, stable-spam, muon, muon_moonlight, splus
@@ -129,6 +139,8 @@ class NanoArgs:
129
  adam_beta1: float = 0.9
130
  adam_beta2: float = 0.95
131
  adam_eps: float = 1e-8
 
 
132
  warmup_iters: int = 200
133
  warmdown_iters: int = 3000
134
  warmdown_type: str = "linear" # linear, cosine
@@ -142,6 +154,8 @@ class NanoArgs:
142
  second_order_lr: float = 0.68
143
  second_order_momentum: float = 0.37
144
  second_order_interval: int = 25
 
 
145
 
146
  # data
147
  vocab_size: int = 50304
@@ -150,6 +164,7 @@ class NanoArgs:
150
  intra_doc_masking: bool = False
151
  input_bin: Optional[str] = None
152
  input_val_bin: Optional[str] = None
 
153
 
154
  # evaluation and logging
155
  val_loss_every: int = 125
@@ -170,7 +185,34 @@ class NanoArgs:
170
  # used during training
171
  slw_window: int = 0
172
 
173
- def _peek_data_shard(filename):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  with open(filename, "rb") as f:
175
  header = np.frombuffer(f.read(256 * 4), dtype=np.int32)
176
  if header[0] != 20240520:
@@ -182,25 +224,22 @@ def _peek_data_shard(filename):
182
  ntok = int(header[2])
183
  return ntok
184
 
185
- def _load_data_shard(filename):
186
- with open(filename, "rb") as f:
187
- header = np.frombuffer(f.read(256 * 4), dtype=np.int32)
188
- assert header[0] == 20240520, "magic number mismatch in the data .bin file"
189
- assert header[1] == 1, "unsupported version"
190
- ntok = int(header[2])
191
- # memmap the token payload directly (uint16) after the 256*4B header
192
- tokens = np.memmap(filename, dtype=np.uint16, mode="r", offset=256 * 4, shape=(ntok,))
193
- assert tokens.size == ntok, "number of tokens read does not match header?"
194
- return tokens
195
 
196
  class DistributedDataLoader:
197
- def __init__(self, filename_pattern, intra_doc_masking,B, T, process_rank, num_processes, bos_id):
198
  self.process_rank = process_rank
199
  self.num_processes = num_processes
200
  self.intra_doc_masking = intra_doc_masking
201
  self.bos_id = bos_id
202
  self.B = B # micro batch size
203
  self.T = T
 
204
 
205
  # glob files that match the pattern
206
  self.files = sorted(glob.glob(filename_pattern))
@@ -210,7 +249,7 @@ class DistributedDataLoader:
210
  ntok_total = 0
211
  self.shard_ntoks = []
212
  for fname in self.files:
213
- shard_ntok = _peek_data_shard(fname)
214
  #print(f"shard {fname} has {shard_ntok} tokens")
215
  assert shard_ntok >= num_processes * B * T + 1
216
  self.shard_ntoks.append(shard_ntok)
@@ -223,12 +262,12 @@ class DistributedDataLoader:
223
  def reset(self, shard=0):
224
  self.current_shard = shard
225
  self.current_position = self.process_rank * self.B * self.T
226
- self.tokens = _load_data_shard(self.files[self.current_shard])
227
 
228
  def advance(self): # advance to next data shard
229
  self.current_shard = (self.current_shard + 1) % len(self.files)
230
  self.current_position = self.process_rank * self.B * self.T
231
- self.tokens = _load_data_shard(self.files[self.current_shard])
232
 
233
  if self.process_rank == 0:
234
  shard_tokens = self.shard_ntoks[self.current_shard]
@@ -282,30 +321,38 @@ def param_groups_mup(model, base_lr_hidden, base_lr_scalar, base_lr_embed, base_
282
  groups, seen = [], set()
283
  id2name = {id(p): n for n, p in model.named_parameters()}
284
 
285
- for mod in model.modules():
286
  if isinstance(mod, nn.Linear):
287
  pname = id2name.get(id(mod.weight), "")
288
  is_scalar = getattr(mod, "is_scalar_weight", False)
289
  fan_in = mod.weight.shape[1]
290
- scale = 1 / math.sqrt(fan_in)
291
  if "lm_head" in pname:
 
292
  lr_scaled = base_lr_head
293
  wd_scaled = 0.0
 
294
  elif is_scalar:
 
295
  lr_scaled = base_lr_scalar
296
  wd_scaled = 0.0
 
297
  else:
 
298
  lr_scaled = base_lr_hidden * scale
299
  wd_scaled = wd / lr_scaled
 
300
 
301
  groups.append({"params": [mod.weight], "lr": lr_scaled, "weight_decay": wd_scaled})
302
  seen.add(mod.weight)
303
 
 
 
304
  if mod.bias is not None:
 
305
  groups.append({"params": [mod.bias], "lr": base_lr_scalar, "weight_decay": 0.0})
306
  seen.add(mod.bias)
307
 
308
- for p in model.parameters():
309
  if p in seen:
310
  continue
311
  pname = id2name.get(id(p), "<unnamed>")
@@ -318,11 +365,15 @@ def param_groups_mup(model, base_lr_hidden, base_lr_scalar, base_lr_embed, base_
318
  lr_scaled = base_lr_scalar
319
 
320
  wd_scaled = 0.
 
321
  if getattr(p, "requires_weight_decay", False):
322
  wd_scaled = wd / lr_scaled
 
323
 
324
  groups.append({"params": [p], "lr": lr_scaled, "weight_decay": wd_scaled})
325
 
 
 
326
  return groups
327
 
328
  args = tyro.cli(NanoArgs)
@@ -341,6 +392,9 @@ if args.mlp_type == "gated":
341
  print("problem: gated MLP with MoE is not supported, because we use FA backend")
342
  exit(0)
343
 
 
 
 
344
  # set up DDP (distributed data parallel).
345
  assert torch.cuda.is_available()
346
  dist.init_process_group(
@@ -434,13 +488,22 @@ tokenizer = transformers.AutoTokenizer.from_pretrained("/leonardo_work/BOOST_LCu
434
  # load dataloaders.
435
  #if args.patch_level_training:
436
  # assert T % args.patch_level_training_size == 0, "sequence length must be divisible by patch level training size in reduced mode"
437
- train_loader = DistributedDataLoader(args.input_bin, args.intra_doc_masking, B, T, ddp_rank, ddp_world_size, args.bos_id)
438
- val_loader = DistributedDataLoader(args.input_val_bin, args.intra_doc_masking, B, T, ddp_rank, ddp_world_size, args.bos_id)
439
  print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
440
  print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
441
 
442
  # load model.
443
  config_hf = DragonConfig(
 
 
 
 
 
 
 
 
 
444
  tie_lm_head=args.tie_lm_head,
445
  mlp_type=args.mlp_type,
446
  layer_norm_scaling=args.layer_norm_scaling,
@@ -452,6 +515,7 @@ config_hf = DragonConfig(
452
  mamba3_remove_conv=args.mamba3_remove_conv,
453
  mamba3_is_A_dd=args.mamba3_is_A_dd,
454
  mamba3_add_trapezoid=args.mamba3_add_trapezoid,
 
455
  moe=args.moe,
456
  moe_num_routed_experts=args.moe_num_routed_experts,
457
  moe_routed_scaling_factor=args.moe_routed_scaling_factor,
@@ -466,6 +530,7 @@ config_hf = DragonConfig(
466
  shrink_qk_da=args.shrink_qk_da,
467
  shrink_qk_gdn=args.shrink_qk_gdn,
468
  mixer_gn=args.mixer_gn,
 
469
  kda_allow_neg_eigval=args.kda_allow_neg_eigval,
470
  kda_num_v_heads=args.kda_num_v_heads,
471
  seednorm_wd=args.seednorm_wd,
@@ -508,7 +573,7 @@ config_hf = DragonConfig(
508
  max_position_embeddings=args.sequence_length,
509
  use_uscaling=args.use_uscaling,
510
  hidden_size=args.d_model,
511
- intermediate_size=args.d_model * args.mlp_expand,
512
  expand_factor=args.expand_factor,
513
  layers_config=args.layers_config,
514
  num_attention_heads=args.n_heads,
@@ -535,18 +600,14 @@ else:
535
  model = model.cuda()
536
  print0(model)
537
 
538
- """# check here that the init std is as expected: # TODO TEMPORARY
539
  with torch.no_grad():
540
- wstd = model.model.embedding.weight.std().item()
541
- print0(f"Model weight init std: {wstd:.6f} (expected {args.init_std})")
542
- assert abs(wstd - args.init_std) / args.init_std < 0.1, f"weight init std {wstd} deviates from expected {args.init_std} by more than 10%"
543
-
544
- # check on another we
545
- lstd = model.model.layers[0].attn.linear_qkv.weight.std().item()
546
- print0(f"Model first layer attention QKV weight init std: {lstd:.6f} (expected {args.init_std})")
547
-
548
- lstd = model.model.layers[0].lin_attn.qkv_conv1d.weight.std().item()
549
- print0(f"Model first layer conv QKV weight init std: {lstd:.6f} (expected {args.init_std})")"""
550
 
551
  # count params. (total & active)
552
  num_params = sum(p.numel() for p in model.parameters())
@@ -570,7 +631,7 @@ ctx = torch.amp.autocast(device_type='cuda', dtype=torch.bfloat16)
570
 
571
  if args.intra_doc_masking:
572
  print0("!!! Using intra-document masking !!!")
573
- print0("It is only compatible with GDN (conv+chunk), DA and GDTPA layers. For DA/GDTPA, kv shift is also compatible. All other config will not have intra-doc masking support!!")
574
 
575
  # load optimizers & schedulers.
576
  if args.use_uscaling:
@@ -587,18 +648,38 @@ if args.use_uscaling:
587
  optimizer = torch.optim.AdamW(param_list, betas=(args.adam_beta1, args.adam_beta2), eps=args.adam_eps)
588
  elif args.optim == "ademamix":
589
  from .optimizers.Ademamix import AdEMAMix
590
- beta3_warmup = alpha_warmup = args.total_iterations
591
- optimizer = AdEMAMix(param_list, beta3_warmup=beta3_warmup, alpha_warmup=alpha_warmup, weight_decay=args.weight_decay)
 
592
  else:
593
  raise ValueError(f"Unknown optimizer for unit scaling: {args.optim}")
594
  else:
595
  if args.optim == "adamw":
596
- optimizer = torch.optim.AdamW(raw_model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay, betas=(args.adam_beta1, args.adam_beta2), eps=args.adam_eps)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
597
  elif args.optim == "ademamix":
598
  from .optimizers.Ademamix import AdEMAMix
599
 
600
- beta3_warmup = alpha_warmup = args.total_iterations
601
- optimizer = AdEMAMix(raw_model.parameters(), lr=args.learning_rate, beta3_warmup=beta3_warmup, alpha_warmup=alpha_warmup, weight_decay=args.weight_decay)
 
602
  else:
603
  raise ValueError(f"Unknown Optimizer: {args.optim}")
604
  if args.second_order_optim == "snoo":
@@ -624,7 +705,7 @@ def get_lr_wsd(num_iterations, warmup_iters, warmdown_iters, it):
624
  if args.warmdown_type == "linear":
625
  sched_func = partial(get_lr_wsd, args.total_iterations, args.warmup_iters, args.warmdown_iters)
626
  schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, sched_func) for opt in optimizers]
627
- elif args.warmdown_type == "cosine":
628
  sched = get_wsd_schedule(
629
  optimizers[0],
630
  num_warmup_steps=args.warmup_iters,
@@ -632,7 +713,7 @@ elif args.warmdown_type == "cosine":
632
  num_training_steps=args.total_iterations,
633
  min_lr_ratio=0.,
634
  warmup_type='linear',
635
- decay_type='cosine',
636
  )
637
  schedulers = [sched]
638
  else:
@@ -721,8 +802,11 @@ for iter_ in range(start_iter, start_iter+args.total_iterations+1):
721
  # save model & tokenizer to make evaluation easier.
722
  tokenizer.save_pretrained(save_dir)
723
  state_dict_bf16 = {k: v.detach().to(torch.bfloat16).cpu() for k, v in uncompiled_model.state_dict().items()}
 
 
724
  uncompiled_model.config.torch_dtype = torch.bfloat16
725
  uncompiled_model.save_pretrained(save_dir, safe_serialization=True, state_dict=state_dict_bf16)
 
726
  # save training state.
727
  train_state = dict(
728
  iteration=iter_,
@@ -757,6 +841,18 @@ for iter_ in range(start_iter, start_iter+args.total_iterations+1):
757
  (loss / accumulation_steps).backward()
758
  else:
759
  (loss / accumulation_steps).backward() # just sync on the last step
 
 
 
 
 
 
 
 
 
 
 
 
760
  # clip those gradients.
761
  if args.grad_norm_clip is not None:
762
  grad_norm = nn.utils.clip_grad_norm_(model.parameters(), max_norm=args.grad_norm_clip, foreach=True)
@@ -771,13 +867,26 @@ for iter_ in range(start_iter, start_iter+args.total_iterations+1):
771
  # null those gradients.
772
  model.zero_grad(set_to_none=True)
773
 
 
 
 
 
 
 
 
 
 
 
 
 
 
774
  # ----------- LOGGING SECTION -----------
775
  approx_training_time_ms = training_time_ms + 1000 * (time.perf_counter() - t0)
776
  avg_step_time = approx_training_time_ms / (iter_ + 1 - WARMUP_SKIP) if iter_ >= start_iter+WARMUP_SKIP else 0
777
  extra = " ".join(f"{k}:{v}" for k, v in (to_log or {}).items())
778
  print0(f"iteration:{iter_+1:0{len(str(start_iter+args.total_iterations))}d}/{args.total_iterations} train_loss:{train_loss.item():.4f} lr: {schedulers[0].get_last_lr()[0]:.4f} train_time:{approx_training_time_ms:.0f}ms step_avg:{avg_step_time:.2f}ms {extra}")
779
  if master_process:
780
- wandb.log({'train_loss': train_loss.item(), 'step_avg_time': avg_step_time, **{f'lr_{i}': sched.get_last_lr()[0] for i, sched in enumerate(schedulers)}, 'grad_norm': grad_norm.item(), **to_log}, step=iter_)
781
 
782
  print0(f"peak memory consumption during training: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB")
783
  print0("Training complete.")
 
41
  rope_theta_local: float = 10000.0
42
  rope_theta_global: float = 0.0
43
  eps_rmsnorm: float = 1e-6
44
+ mlp_expand: float = 4. # expand factor for MLP
45
+ intermediate_size: Optional[int] = None
46
  fused_loss_computation : bool = True # whether to use fused linear + cross entropy loss
47
  use_uscaling: bool = False
48
  uscaling_tau: float = 0.2
 
59
  seednorm_type: int = 1
60
  seednorm_rank: int = 1
61
  mixer_gn: bool = True
62
+ gate_before_norm: bool = True
63
  mlp_linking : bool = False
64
  final_norm: bool = True
65
  layer_norm_scaling: bool = False # not read when using muP
66
  mlp_type: str = "simple" # simple, gated
67
  tie_lm_head: bool = False
68
+ legacy_gate: bool = False
69
+ vwn: bool = False
70
+ vwn_m: int = 2
71
+ vwn_n: int = 3
72
+ vwn_wd_alpha_beta: bool = False
73
+ vwn_dynamic: bool = True
74
+ reduce_lm_head: int = 0
75
 
76
  # MoE
77
  moe: bool = False
 
126
  mamba3_remove_conv: bool = True
127
  mamba3_is_A_dd: bool = True
128
  mamba3_add_trapezoid: bool = True
129
+ mamba3_postgate_norm: bool = False # only works if legacy_gate is True!!
130
 
131
  # optim
132
  optim: str = "adamw" # adamw, spam, stable-spam, muon, muon_moonlight, splus
 
139
  adam_beta1: float = 0.9
140
  adam_beta2: float = 0.95
141
  adam_eps: float = 1e-8
142
+ alpha_normalize: bool = False # whether to normalize update by (1+alpha) in AdEMAMix
143
+ alpha_ademamix: float = 8.0
144
  warmup_iters: int = 200
145
  warmdown_iters: int = 3000
146
  warmdown_type: str = "linear" # linear, cosine
 
154
  second_order_lr: float = 0.68
155
  second_order_momentum: float = 0.37
156
  second_order_interval: int = 25
157
+ init_gpt2: bool = False
158
+ wnorm: bool = False # as in nemotron-flash (2511.18890)
159
 
160
  # data
161
  vocab_size: int = 50304
 
164
  intra_doc_masking: bool = False
165
  input_bin: Optional[str] = None
166
  input_val_bin: Optional[str] = None
167
+ dataset_type: str = "hf" # hf, mg
168
 
169
  # evaluation and logging
170
  val_loss_every: int = 125
 
185
  # used during training
186
  slw_window: int = 0
187
 
188
+ def _peek_data_shard(filename, dataset_type='hf'):
189
+ if dataset_type == 'hf':
190
+ return _peek_hf_shard(filename)
191
+ elif dataset_type == 'mg':
192
+ return _peek_mg_shard(filename)
193
+ else:
194
+ raise ValueError(f"unknown dataset type: {dataset_type}")
195
+
196
+ def _load_data_shard(filename, dataset_type='hf'):
197
+ if dataset_type == 'hf':
198
+ return _load_hf_shard(filename)
199
+ elif dataset_type == 'mg':
200
+ return _load_mg_shard(filename)
201
+ else:
202
+ raise ValueError(f"unknown dataset type: {dataset_type}")
203
+
204
+ def _load_hf_shard(filename):
205
+ with open(filename, "rb") as f:
206
+ header = np.frombuffer(f.read(256 * 4), dtype=np.int32)
207
+ assert header[0] == 20240520, "magic number mismatch in the data .bin file"
208
+ assert header[1] == 1, "unsupported version"
209
+ ntok = int(header[2])
210
+ # memmap the token payload directly (uint16) after the 256*4B header
211
+ tokens = np.memmap(filename, dtype=np.uint16, mode="r", offset=256 * 4, shape=(ntok,))
212
+ assert tokens.size == ntok, "number of tokens read does not match header?"
213
+ return tokens
214
+
215
+ def _peek_hf_shard(filename):
216
  with open(filename, "rb") as f:
217
  header = np.frombuffer(f.read(256 * 4), dtype=np.int32)
218
  if header[0] != 20240520:
 
224
  ntok = int(header[2])
225
  return ntok
226
 
227
+ def _peek_mg_shard(filename):
228
+ tokens = np.memmap(filename, dtype=np.uint16, mode="r")
229
+ return int(tokens.size)
230
+
231
+ def _load_mg_shard(filename):
232
+ return np.memmap(filename, dtype=np.uint16, mode="r")
 
 
 
 
233
 
234
  class DistributedDataLoader:
235
+ def __init__(self, filename_pattern, intra_doc_masking,B, T, process_rank, num_processes, bos_id, dataset_type='hf'):
236
  self.process_rank = process_rank
237
  self.num_processes = num_processes
238
  self.intra_doc_masking = intra_doc_masking
239
  self.bos_id = bos_id
240
  self.B = B # micro batch size
241
  self.T = T
242
+ self.dataset_type = dataset_type
243
 
244
  # glob files that match the pattern
245
  self.files = sorted(glob.glob(filename_pattern))
 
249
  ntok_total = 0
250
  self.shard_ntoks = []
251
  for fname in self.files:
252
+ shard_ntok = _peek_data_shard(fname, dataset_type=self.dataset_type)
253
  #print(f"shard {fname} has {shard_ntok} tokens")
254
  assert shard_ntok >= num_processes * B * T + 1
255
  self.shard_ntoks.append(shard_ntok)
 
262
  def reset(self, shard=0):
263
  self.current_shard = shard
264
  self.current_position = self.process_rank * self.B * self.T
265
+ self.tokens = _load_data_shard(self.files[self.current_shard], dataset_type=self.dataset_type)
266
 
267
  def advance(self): # advance to next data shard
268
  self.current_shard = (self.current_shard + 1) % len(self.files)
269
  self.current_position = self.process_rank * self.B * self.T
270
+ self.tokens = _load_data_shard(self.files[self.current_shard], dataset_type=self.dataset_type)
271
 
272
  if self.process_rank == 0:
273
  shard_tokens = self.shard_ntoks[self.current_shard]
 
321
  groups, seen = [], set()
322
  id2name = {id(p): n for n, p in model.named_parameters()}
323
 
324
+ for name, mod in model.named_modules():
325
  if isinstance(mod, nn.Linear):
326
  pname = id2name.get(id(mod.weight), "")
327
  is_scalar = getattr(mod, "is_scalar_weight", False)
328
  fan_in = mod.weight.shape[1]
 
329
  if "lm_head" in pname:
330
+ scale = 1
331
  lr_scaled = base_lr_head
332
  wd_scaled = 0.0
333
+ wd_mult = 0.0
334
  elif is_scalar:
335
+ scale = 1
336
  lr_scaled = base_lr_scalar
337
  wd_scaled = 0.0
338
+ wd_mult = 0.0
339
  else:
340
+ scale = 1 / math.sqrt(fan_in)
341
  lr_scaled = base_lr_hidden * scale
342
  wd_scaled = wd / lr_scaled
343
+ wd_mult = 1/lr_scaled
344
 
345
  groups.append({"params": [mod.weight], "lr": lr_scaled, "weight_decay": wd_scaled})
346
  seen.add(mod.weight)
347
 
348
+ print(f"param {name}.weight | shape {mod.weight.shape} | scale {scale} | wd_mult={wd_mult:.3e}")
349
+
350
  if mod.bias is not None:
351
+ assert False
352
  groups.append({"params": [mod.bias], "lr": base_lr_scalar, "weight_decay": 0.0})
353
  seen.add(mod.bias)
354
 
355
+ for name, p in model.named_parameters():
356
  if p in seen:
357
  continue
358
  pname = id2name.get(id(p), "<unnamed>")
 
365
  lr_scaled = base_lr_scalar
366
 
367
  wd_scaled = 0.
368
+ wd_mult = 0.
369
  if getattr(p, "requires_weight_decay", False):
370
  wd_scaled = wd / lr_scaled
371
+ wd_mult = 1/lr_scaled
372
 
373
  groups.append({"params": [p], "lr": lr_scaled, "weight_decay": wd_scaled})
374
 
375
+ print(f"param {name} | shape {p.shape} | scale {1.} | wd_mult={wd_mult:.3e}")
376
+
377
  return groups
378
 
379
  args = tyro.cli(NanoArgs)
 
392
  print("problem: gated MLP with MoE is not supported, because we use FA backend")
393
  exit(0)
394
 
395
+ if args.legacy_gate:
396
+ assert not args.gate_gdn, "legacy_gate is not compatible with gate_gdn."
397
+
398
  # set up DDP (distributed data parallel).
399
  assert torch.cuda.is_available()
400
  dist.init_process_group(
 
# load dataloaders.
#if args.patch_level_training:
#    assert T % args.patch_level_training_size == 0, "sequence length must be divisible by patch level training size in reduced mode"
# Train/val loaders share every argument except the input path.
shared_loader_args = (args.intra_doc_masking, B, T, ddp_rank, ddp_world_size, args.bos_id, args.dataset_type)
train_loader = DistributedDataLoader(args.input_bin, *shared_loader_args)
val_loader = DistributedDataLoader(args.input_val_bin, *shared_loader_args)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
495
 
496
  # load model.
497
  config_hf = DragonConfig(
498
+ reduce_lm_head=args.reduce_lm_head,
499
+ dataset_type=args.dataset_type,
500
+ vwn=args.vwn,
501
+ vwn_m=args.vwn_m,
502
+ vwn_n=args.vwn_n,
503
+ vwn_wd_alpha_beta=args.vwn_wd_alpha_beta,
504
+ vwn_dynamic=args.vwn_dynamic,
505
+ legacy_gate=args.legacy_gate,
506
+ init_gpt2=args.init_gpt2,
507
  tie_lm_head=args.tie_lm_head,
508
  mlp_type=args.mlp_type,
509
  layer_norm_scaling=args.layer_norm_scaling,
 
515
  mamba3_remove_conv=args.mamba3_remove_conv,
516
  mamba3_is_A_dd=args.mamba3_is_A_dd,
517
  mamba3_add_trapezoid=args.mamba3_add_trapezoid,
518
+ mamba3_postgate_norm=args.mamba3_postgate_norm,
519
  moe=args.moe,
520
  moe_num_routed_experts=args.moe_num_routed_experts,
521
  moe_routed_scaling_factor=args.moe_routed_scaling_factor,
 
530
  shrink_qk_da=args.shrink_qk_da,
531
  shrink_qk_gdn=args.shrink_qk_gdn,
532
  mixer_gn=args.mixer_gn,
533
+ gate_before_norm=args.gate_before_norm,
534
  kda_allow_neg_eigval=args.kda_allow_neg_eigval,
535
  kda_num_v_heads=args.kda_num_v_heads,
536
  seednorm_wd=args.seednorm_wd,
 
573
  max_position_embeddings=args.sequence_length,
574
  use_uscaling=args.use_uscaling,
575
  hidden_size=args.d_model,
576
+ intermediate_size=int(args.d_model * args.mlp_expand) if args.intermediate_size is None else args.intermediate_size,
577
  expand_factor=args.expand_factor,
578
  layers_config=args.layers_config,
579
  num_attention_heads=args.n_heads,
 
600
  model = model.cuda()
601
  print0(model)
602
 
 
# Dump mean/std of every parameter tensor — a quick sanity check that the
# initialization matches expectations.
with torch.no_grad():
    for pname, param in model.named_parameters():
        # Skip degenerate entries (empty tensors).
        if param is None or param.numel() == 0:
            continue
        vals = param.detach().float()
        mean, std = vals.mean().item(), vals.std(unbiased=False).item()
        print0(f"{pname:60s} shape={tuple(param.shape)} mean={mean:+.4e} std={std:.4e}")
 
 
 
611
 
612
  # count params. (total & active)
613
  num_params = sum(p.numel() for p in model.parameters())
 
631
 
# Warn loudly when intra-document masking is on: only a subset of layer
# types actually honors it (see message below).
if args.intra_doc_masking:
    idm_notes = (
        "!!! Using intra-document masking !!!",
        "It is only compatible with GDN (conv+chunk), KDA (conv+chunk), DA and GDTPA layers. For DA/GDTPA, kv shift is also compatible. All other config will not have intra-doc masking support!!",
    )
    for note in idm_notes:
        print0(note)
635
 
636
  # load optimizers & schedulers.
637
  if args.use_uscaling:
 
648
  optimizer = torch.optim.AdamW(param_list, betas=(args.adam_beta1, args.adam_beta2), eps=args.adam_eps)
649
  elif args.optim == "ademamix":
650
  from .optimizers.Ademamix import AdEMAMix
651
+ beta3_warmup = args.total_iterations
652
+ alpha_warmup = args.total_iterations
653
+ optimizer = AdEMAMix(param_list, beta3_warmup=beta3_warmup, alpha_warmup=alpha_warmup, normalize_alpha=args.alpha_normalize, alpha=args.alpha_ademamix, weight_decay=args.weight_decay)
654
  else:
655
  raise ValueError(f"Unknown optimizer for unit scaling: {args.optim}")
656
  else:
657
  if args.optim == "adamw":
658
+ #optimizer = torch.optim.AdamW(raw_model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay, betas=(args.adam_beta1, args.adam_beta2), eps=args.adam_eps)
659
+ decay_params = []
660
+ no_decay_params = []
661
+ for name, p in raw_model.named_parameters():
662
+ if not p.requires_grad:
663
+ continue
664
+ if getattr(p, "_no_weight_decay", False):
665
+ no_decay_params.append(p)
666
+ else:
667
+ decay_params.append(p)
668
+ optimizer = torch.optim.AdamW(
669
+ [
670
+ {"params": decay_params, "weight_decay": args.weight_decay},
671
+ {"params": no_decay_params, "weight_decay": 0.0},
672
+ ],
673
+ lr=args.learning_rate,
674
+ betas=(args.adam_beta1, args.adam_beta2),
675
+ eps=args.adam_eps,
676
+ )
677
  elif args.optim == "ademamix":
678
  from .optimizers.Ademamix import AdEMAMix
679
 
680
+ beta3_warmup = args.total_iterations
681
+ alpha_warmup = args.total_iterations
682
+ optimizer = AdEMAMix(raw_model.parameters(), lr=args.learning_rate, beta3_warmup=beta3_warmup, alpha_warmup=alpha_warmup, normalize_alpha=args.alpha_normalize, alpha=args.alpha_ademamix, weight_decay=args.weight_decay)
683
  else:
684
  raise ValueError(f"Unknown Optimizer: {args.optim}")
685
  if args.second_order_optim == "snoo":
 
705
  if args.warmdown_type == "linear":
706
  sched_func = partial(get_lr_wsd, args.total_iterations, args.warmup_iters, args.warmdown_iters)
707
  schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, sched_func) for opt in optimizers]
708
+ elif args.warmdown_type == "cosine" or args.warmdown_type == "1-sqrt":
709
  sched = get_wsd_schedule(
710
  optimizers[0],
711
  num_warmup_steps=args.warmup_iters,
 
713
  num_training_steps=args.total_iterations,
714
  min_lr_ratio=0.,
715
  warmup_type='linear',
716
+ decay_type=args.warmdown_type,
717
  )
718
  schedulers = [sched]
719
  else:
 
# save model & tokenizer to make evaluation easier.
tokenizer.save_pretrained(save_dir)
state_dict_bf16 = {k: v.detach().to(torch.bfloat16).cpu() for k, v in uncompiled_model.state_dict().items()}
# Temporarily clear intra_doc_masking in the saved config (presumably so
# reloaded checkpoints don't apply training-only masking — confirm against
# modeling code). Restore the live value even if save_pretrained raises,
# otherwise a failed save would leave the in-memory config corrupted.
idm_og = uncompiled_model.config.intra_doc_masking
uncompiled_model.config.intra_doc_masking = False
uncompiled_model.config.torch_dtype = torch.bfloat16
try:
    uncompiled_model.save_pretrained(save_dir, safe_serialization=True, state_dict=state_dict_bf16)
finally:
    uncompiled_model.config.intra_doc_masking = idm_og
810
  # save training state.
811
  train_state = dict(
812
  iteration=iter_,
 
841
  (loss / accumulation_steps).backward()
842
  else:
843
  (loss / accumulation_steps).backward() # just sync on the last step
# Per-parameter grad-norm logging is currently disabled; keep the empty dict
# so the wandb.log call below can splat it unconditionally. (Removed the
# dead triple-quoted block of commented-out code that used to sit here — it
# was a no-op string expression.)
individual_grad_norms = {}
856
  # clip those gradients.
857
  if args.grad_norm_clip is not None:
858
  grad_norm = nn.utils.clip_grad_norm_(model.parameters(), max_norm=args.grad_norm_clip, foreach=True)
 
867
  # null those gradients.
868
  model.zero_grad(set_to_none=True)
869
 
# Wnorm: after the optimizer update, renormalize flagged weight matrices to
# unit L2 norm. `norm_case_1` normalizes along dim=1 (per row),
# `norm_case_2` along dim=0 (per column); the flags are presumably set on
# modules at model-build time — confirm against the model code.
if args.wnorm:
    with torch.no_grad():
        for m in model.modules():
            if getattr(m, "norm_case_1", False):
                norm_dim = 1
            elif getattr(m, "norm_case_2", False):
                norm_dim = 0
            else:
                continue
            W = getattr(m, "weight", None)
            if W is None:
                # FIX: the original dereferenced W unconditionally and would
                # crash with AttributeError on a flagged module without a
                # `weight` tensor.
                continue
            # Norm in fp32 for stability; clamp to avoid division by zero.
            denom = W.float().norm(p=2, dim=norm_dim, keepdim=True).clamp_min(1e-8).to(W.dtype)
            W.div_(denom)
882
+
883
  # ----------- LOGGING SECTION -----------
884
  approx_training_time_ms = training_time_ms + 1000 * (time.perf_counter() - t0)
885
  avg_step_time = approx_training_time_ms / (iter_ + 1 - WARMUP_SKIP) if iter_ >= start_iter+WARMUP_SKIP else 0
886
  extra = " ".join(f"{k}:{v}" for k, v in (to_log or {}).items())
887
  print0(f"iteration:{iter_+1:0{len(str(start_iter+args.total_iterations))}d}/{args.total_iterations} train_loss:{train_loss.item():.4f} lr: {schedulers[0].get_last_lr()[0]:.4f} train_time:{approx_training_time_ms:.0f}ms step_avg:{avg_step_time:.2f}ms {extra}")
888
  if master_process:
889
+ wandb.log({'train_loss': train_loss.item(), 'step_avg_time': avg_step_time, **{f'lr_{i}': sched.get_last_lr()[0] for i, sched in enumerate(schedulers)}, 'grad_norm': grad_norm.item(), **to_log, **individual_grad_norms}, step=iter_)
890
 
891
  print0(f"peak memory consumption during training: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB")
892
  print0("Training complete.")