alexandretl committed
Commit 58b82e2 · 1 Parent(s): bc8288b

mamba3 flags | mamba3 default state size to 128, headdim to 64 | mamba2 | fix mamba3 mimo (JG) | (fake) moe | intra doc maskiiiing (with SS) | seednorm tests | coord checks

configuration_dragon.py CHANGED
@@ -92,6 +92,21 @@ class DragonConfig(PretrainedConfig):
 
     def __init__(
         self,
+        mamba3_rope: bool = True,
+        mamba3_remove_BC_bias: bool = False,
+        mamba3_is_id_rms: bool = True,
+        mamba3_remove_conv: bool = True,
+        mamba3_is_A_dd: bool = True,
+        mamba3_add_trapezoid: bool = True,
+        moe: bool = False,
+        moe_num_routed_experts: int = 2,
+        moe_routed_scaling_factor: float = 2.5,
+        moe_routed_intermediate_size: int = 768,
+        moe_shared_intermediate_size: int = 768,
+        intra_doc_masking: bool = False,
+        seednorm_rank: int = 1,
+        seednorm_type: int = 1,
+        final_norm: bool = True,
         mla_kv_rank: int = 128,
         shrink_qk_da: int = 2,
         shrink_qk_gdn: int = 2,
@@ -119,6 +134,7 @@ class DragonConfig(PretrainedConfig):
         scalable_softmax: bool = True,
         resformer: bool = False,
         mamba_mimo_dim : int = 4,
+        mamba_ngroups : int = 1,
         gate_type: str = "elementwise",
         gate_act: str = "silu",
         gate_attn: bool = False,
@@ -163,7 +179,7 @@ class DragonConfig(PretrainedConfig):
         rope_type_local="rope",
         rope_type_global="",
         rope_theta_local=163.,
-        rope_theta_global=10000.,
+        rope_theta_global=0.,
         uscaling_tau=0.2,
         attention_dropout=0.,
         hidden_dropout=0.,
@@ -176,6 +192,21 @@ class DragonConfig(PretrainedConfig):
         mlp_linking=False,
         **kwargs,
     ):
+        self.mamba3_rope = mamba3_rope
+        self.mamba3_remove_BC_bias = mamba3_remove_BC_bias
+        self.mamba3_is_id_rms = mamba3_is_id_rms
+        self.mamba3_remove_conv = mamba3_remove_conv
+        self.mamba3_is_A_dd = mamba3_is_A_dd
+        self.mamba3_add_trapezoid = mamba3_add_trapezoid
+        self.moe = moe
+        self.moe_num_routed_experts = moe_num_routed_experts
+        self.moe_routed_scaling_factor = moe_routed_scaling_factor
+        self.moe_routed_intermediate_size = moe_routed_intermediate_size
+        self.moe_shared_intermediate_size = moe_shared_intermediate_size
+        self.intra_doc_masking = intra_doc_masking
+        self.seednorm_rank = seednorm_rank
+        self.seednorm_type = seednorm_type
+        self.final_norm = final_norm
         self.mla_kv_rank = mla_kv_rank
         self.shrink_qk_da = shrink_qk_da
         self.shrink_qk_gdn = shrink_qk_gdn
@@ -228,6 +259,7 @@ class DragonConfig(PretrainedConfig):
         self.scalable_softmax = scalable_softmax
         self.resformer = resformer
         self.mamba_mimo_dim = mamba_mimo_dim
+        self.mamba_ngroups = mamba_ngroups
 
         self.vocab_size = vocab_size
         self.tie_word_embeddings = tie_word_embeddings
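
For reference, a minimal sketch of exercising the new options when building a config. The values simply restate the defaults introduced in the hunks above; the flat import path is an assumption (in this repo the file is configuration_dragon.py).

from configuration_dragon import DragonConfig  # import path assumed for a local checkout

# Illustrative only: the values below restate the defaults added in this commit.
config = DragonConfig(
    mamba3_rope=True,               # Mamba-3 ablation flags added above
    mamba3_remove_BC_bias=False,
    mamba3_remove_conv=True,
    mamba3_add_trapezoid=True,
    mamba_ngroups=1,                # new knob next to mamba_mimo_dim
    moe=False,                      # (fake) MoE path from the commit message
    intra_doc_masking=False,
    seednorm_type=1,                # selects DragonSeeDNorm / ...Type2/3/4 (see modeling diff)
    rope_theta_global=0.,           # default changed from 10000. to 0. in this commit
)
print(config.mamba3_rope, config.seednorm_type)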
coordcheck_utils.py ADDED
@@ -0,0 +1,472 @@
+# Copyright 2022 Microsoft Corporation.
+
+"""
+Adapted from https://github.com/microsoft/mup
+In short, it has been largely simplified.
+"""
+
+import os
+from copy import copy
+from itertools import product
+
+import numpy as np
+import pandas as pd
+import torch
+import torch.nn.functional as F
+
+from tqdm import tqdm
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+FDICT = {'l1': lambda x: torch.abs(x).mean(dtype=torch.float32)}
+
+def convert_fdict(d):
+    '''convert a dict `d` with string values to function values.
+    Input:
+        d: a dict whose values are either strings or functions
+    Output:
+        a new dict, with the same keys as `d`, but the string values are
+        converted to functions using `FDICT`.
+    '''
+    return dict([
+        ((k, FDICT[v]) if isinstance(v, str) else (k, v))
+        for k, v in d.items()])
+
+def _record_coords(records, width, modulename, t,
+                   output_fdict=None, input_fdict=None, param_fdict=None):
+    '''Returns a forward hook that records coordinate statistics.
+
+    Returns a forward hook that records statistics regarding the output, input,
+    and/or parameters of a `nn.Module`. This hook is intended to run only once,
+    on the timestep specified by `t`.
+
+    On forward pass, the returned hook calculates statistics specified in
+    `output_fdict`, `input_fdict`, and `param_fdict`, such as the normalized l1
+    norm, of output, input, and/or parameters of the module. The statistics are
+    recorded along with the `width`, `modulename`, and `t` (the time step) as a
+    dict and inserted into `records` (which should be a list). More precisely,
+    for each output, input, and/or parameter, the inserted dict is of the form
+
+    {
+        'width': width, 'module': modified_modulename, 't': t,
+        # keys are keys in fdict
+        'l1': 0.241, 'l2': 0.420, 'mean': 0.0, ...
+    }
+
+    where `modified_modulename` is a string that combines the `modulename` with
+    an indicator of which output, input, or parameter tensor is the statistics
+    computed over.
+
+    The `*_fdict` inputs should be dictionaries with string keys and whose
+    values can either be functions or strings. The string values are converted
+    to functions via `convert_fdict`. The default values of `*_dict` inputs are
+    converted to `output_fdict = dict(l1=FDICT['l1'])`, `input_fdict = {}`,
+    `param_fdict = {}`, i.e., only the average coordinate size (`l1`) of the
+    output activations are recorded.
+
+    Inputs:
+        records:
+            list to append coordinate data to
+        width:
+            width of the model. This is used only for plotting coord check later
+            on, so it can be any notion of width.
+        modulename:
+            string name of the module. This is used only for plotting coord check.
+        t:
+            timestep of training. This is used only for plotting coord check.
+        output_fdict, input_fdict, param_fdict:
+            dicts with string keys and whose values can either be functions or
+            strings. The string values are converted to functions via
+            `convert_fdict`
+    Output:
+        a forward hook that records statistics regarding the output, input,
+        and/or parameters of a `nn.Module`, as discussed above.
+    '''
+    if output_fdict is None:
+        output_fdict = dict(l1=FDICT['l1'])
+    else:
+        output_fdict = convert_fdict(output_fdict)
+    if input_fdict is None:
+        input_fdict = {}
+    else:
+        input_fdict = convert_fdict(input_fdict)
+    if param_fdict is None:
+        param_fdict = {}
+    else:
+        param_fdict = convert_fdict(param_fdict)
+    def f(module, input, output):
+        def get_stat(d, x, fdict):
+            if isinstance(x, (tuple, list)):
+                for i, _x in enumerate(x):
+                    _d = copy(d)
+                    _d['module'] += f'[{i}]'
+                    get_stat(_d, _x, fdict)
+            elif isinstance(x, dict):
+                for name, _x in x.items():
+                    _d = copy(d)
+                    _d['module'] += f'[{name}]'
+                    get_stat(_d, _x, fdict)
+            elif isinstance(x, torch.Tensor):
+                _d = copy(d)
+                for fname, f in fdict.items():
+                    _d[fname] = f(x).item()
+                records.append(_d)
+            elif x is None:
+                pass
+            else:
+                raise NotImplementedError(f'Unexpected output type: {type(x)}')
+        with torch.no_grad():
+            ret = {
+                'width': width,
+                'module': modulename,
+                't': t
+            }
+
+            # output stats
+            if isinstance(output, (tuple, list)):
+                for i, out in enumerate(output):
+                    _ret = copy(ret)
+                    _ret['module'] += f':out[{i}]'
+                    get_stat(_ret, out, output_fdict)
+            elif isinstance(output, dict):
+                for name, out in output.items():
+                    _ret = copy(ret)
+                    _ret['module'] += f':out[{name}]'
+                    get_stat(_ret, out, output_fdict)
+            elif isinstance(output, torch.Tensor):
+                _ret = copy(ret)
+                for fname, f in output_fdict.items():
+                    _ret[fname] = f(output).item()
+                records.append(_ret)
+            else:
+                raise NotImplementedError(f'Unexpected output type: {type(output)}')
+
+            # input stats
+            if input_fdict:
+                if isinstance(input, (tuple, list)):
+                    for i, out in enumerate(input):
+                        _ret = copy(ret)
+                        _ret['module'] += f':in[{i}]'
+                        get_stat(_ret, out, input_fdict)
+                elif isinstance(input, dict):
+                    for name, out in input.items():
+                        _ret = copy(ret)
+                        _ret['module'] += f':in[{name}]'
+                        get_stat(_ret, out, input_fdict)
+                elif isinstance(input, torch.Tensor):
+                    _ret = copy(ret)
+                    for fname, f in input_fdict.items():
+                        _ret[fname] = f(input).item()
+                    records.append(_ret)
+                else:
+                    raise NotImplementedError(f'Unexpected output type: {type(input)}')
+
+            # param stats
+            if param_fdict:
+                for name, p in module.named_parameters():
+                    _ret = copy(ret)
+                    _ret['module'] += f':param[{name}]'
+                    for fname, f in param_fdict.items():
+                        _ret[fname] = f(p).item()
+                    records.append(_ret)
+
+    return f
+
+def _get_coord_data(models, dataloader, optcls, nsteps=5,
+                    dict_in_out=False, flatten_input=False, flatten_output=False,
+                    output_name='loss', lossfn='xent', filter_module_by_name=None,
+                    fix_data=True, cuda=True, nseeds=1,
+                    output_fdict=None, input_fdict=None, param_fdict=None,
+                    show_progress=True, one_hot_target=False):
+    '''Inner method for `get_coord_data`.
+
+    Train the models in `models` with optimizer given by `optcls` and data from
+    `dataloader` for `nsteps` steps, and record coordinate statistics specified
+    by `output_fdict`, `input_fdict`, `param_fdict`. By default, only `l1` is
+    computed for output activations of each module.
+
+    Inputs:
+        models:
+            a dict of lazy models, where the keys are numbers indicating width.
+            Each entry of `models` is a function that instantiates a model given
+            nothing.
+        dataloader:
+            an iterator whose elements are either Huggingface style dicts, if
+            `dict_in_out` is True, or (input, label). If `fix_data` is True
+            (which is the default), then only the first element of `dataloader`
+            is used in a loop and the rest of `dataloader` is ignored.
+        optcls:
+            a function so that `optcls(model)` gives an optimizer used to train
+            the model.
+        nsteps:
+            number of steps to train the model
+        dict_in_out:
+            whether the data loader contains Huggingface-style dict input and
+            output. Default: False
+        flatten_input:
+            if not `dict_in_out`, reshape the input to be
+            `input.view(input.shape[0], -1)`. Typically used for testing MLPs.
+        flatten_output:
+            if not `dict_in_out`, reshape the label to be `label.view(-1,
+            input.shape[-1])`.
+        output_name:
+            if `dict_in_out`, this is the key for the loss value if the output
+            is a dict. If the output is not a dict, then we assume the first
+            element of the output is the loss.
+        lossfn:
+            loss function to use if not `dict_in_out`. Can be either a string from
+            ['xent', 'mse', 'nll', 'l1'] or a python `callable` such that
+            `lossfn(output, target)` returns the loss value. Examples of valid
+            `callable`s are `F.cross_entropy`, `F.mse_loss`, etc, where `F` is
+            `torch.nn.functional`. Default: 'xent'
+        filter_module_by_name:
+            a function that returns a bool given module names (from
+            `model.named_modules()`), or None. If not None, then only modules
+            whose name yields True will be recorded.
+        cuda:
+            whether to use cuda or not. Default: True
+        nseeds:
+            number of times to repeat the training, each with different seeds.
+        output_fdict, input_fdict, param_fdict:
+            function dicts to be used in `_record_coords`. By default, only `l1`
+            is computed for output activations of each module.
+        show_progress:
+            show progress using tqdm. Default: True
+        one_hot_target:
+            convert target label into a one-hot vector. This typically is only
+            used for `'mse'` or `'l1'` losses in classification tasks.
+            Default: False
+    Output:
+        a pandas DataFrame containing recorded results. The column names are
+        `'width', 'module', 't'` as well as names of statistics recorded, such
+        as `'l1'` (see `FDICT` for other premade statistics that can be
+        collected).
+
+    Breaking Changes:
+        In v1.0.0, when `lossfn=='mse'`, the target is automatically converted
+        to a one hot vector before loss computation. Starting in v1.1.0, this
+        behavior is turned off, and the user needs to explicitly turn on this
+        behavior by setting `one_hot_target=True`.
+
+    '''
+    df = []
+    if fix_data:
+        batch = next(iter(dataloader))
+        dataloader = [batch] * nsteps
+    if show_progress:
+        pbar = tqdm(total=nseeds * len(models))
+
+    for i in range(nseeds):
+        torch.manual_seed(i)
+        for width, model in models.items():
+            model = model()
+            model = model.train()
+            if cuda:
+                model = model.cuda()
+            optimizer = optcls(model)
+            for batch_idx, batch in enumerate(dataloader, 1):
+                remove_hooks = []
+                # add hooks
+                for name, module in model.named_modules():
+                    if filter_module_by_name and not filter_module_by_name(name):
+                        continue
+                    remove_hooks.append(module.register_forward_hook(
+                        _record_coords(df, width, name, batch_idx,
+                            output_fdict=output_fdict,
+                            input_fdict=input_fdict,
+                            param_fdict=param_fdict)))
+                if dict_in_out:
+                    (data, target) = batch
+                    loss = model(input_ids=data, labels=target).loss
+                else:
+                    assert False, "Not implemented for non-dict input/output."
+                optimizer.zero_grad()
+                loss.backward()
+                optimizer.step()
+
+                # remove hooks
+                for handle in remove_hooks:
+                    handle.remove()
+
+                if batch_idx == nsteps: break
+            if show_progress:
+                pbar.update(1)
+    if show_progress:
+        pbar.close()
+    return pd.DataFrame(df)
+
+
+def get_coord_data(models, dataloader, optcls, nsteps, **kwargs):
+    '''Get coord data for coord check.
+
+    Train the models in `models` with data from `dataloader` and optimizer
+    specified by `optimizer` and `lr` for `nsteps` steps, and record coordinate
+    statistics specified by `output_fdict`, `input_fdict`, `param_fdict`. By
+    default, only `l1` is computed for output activations of each module.
+
+    This function wraps around `_get_coord_data`, with the main difference being
+    user can specify common optimizers via a more convenient interface.
+
+    Inputs:
+        models:
+            a dict of lazy models, where the keys are numbers indicating width.
+            Each entry of `models` is a function that instantiates a model given
+            nothing.
+        dataloader:
+            an iterator whose elements are either Huggingface style dicts, if
+            `dict_in_out` is True, or (input, label). If `fix_data` is True
+            (which is the default), then only the first element of `dataloader`
+            is used in a loop and the rest of `dataloader` is ignored.
+        optimizer:
+            a string in `['sgd', 'adam', 'adamw']`, with default being `'sgd'`.
+        lr:
+            learning rate. By default is 0.1 for `'sgd'` and 1e-3 for others.
+        mup:
+            If True, then use the optimizer from `mup.optim`; otherwise, use the
+            one from `torch.optim`.
+        filter_trainable_by_name:
+            a function that returns a bool given module names (from
+            `model.named_modules()`), or None. If not None, then only modules
+            whose name yields True will be trained.
+        nsteps:
+            number of steps to train the model
+        dict_in_out:
+            whether the data loader contains Huggingface-style dict input and
+            output. Default: False
+        flatten_input:
+            if not `dict_in_out`, reshape the input to be
+            `input.view(input.shape[0], -1)`. Typically used for testing MLPs.
+        flatten_output:
+            if not `dict_in_out`, reshape the label to be `label.view(-1,
+            input.shape[-1])`.
+        output_name:
+            if `dict_in_out`, this is the key for the loss value if the output
+            is a dict. If the output is not a dict, then we assume the first
+            element of the output is the loss.
+        lossfn:
+            loss function to use if not `dict_in_out`. Can be either a string from
+            ['xent', 'mse', 'nll', 'l1'] or a python `callable` such that
+            `lossfn(output, target)` returns the loss value. Examples of valid
+            `callable`s are `F.cross_entropy`, `F.mse_loss`, etc, where `F` is
+            `torch.nn.functional`. Default: 'xent'
+        filter_module_by_name:
+            a function that returns a bool given module names (from
+            `model.named_modules()`), or None. If not None, then only modules
+            whose name yields True will be recorded.
+        cuda:
+            whether to use cuda or not. Default: True
+        nseeds:
+            number of times to repeat the training, each with different seeds.
+        output_fdict, input_fdict, param_fdict:
+            function dicts to be used in `_record_coords`. By default, only `l1`
+            is computed for output activations of each module.
+        show_progress:
+            show progress using tqdm. Default: True
+        one_hot_target:
+            convert target label into a one-hot vector. This typically is only
+            used for `'mse'` or `'l1'` losses in classification tasks.
+            Default: False
+    Output:
+        a pandas DataFrame containing recorded results. The column names are
+        `'width', 'module', 't'` as well as names of statistics recorded, such
+        as `'l1'` (see `FDICT` for other premade statistics that can be
+        collected).
+
+    Breaking Changes:
+        In v1.0.0, when `lossfn=='mse'`, the target is automatically converted
+        to a one hot vector before loss computation. Starting in v1.1.0, this
+        behavior is turned off, and the user needs to explicitly turn on this
+        behavior by setting `one_hot_target=True`.
+    '''
+
+    data = _get_coord_data(models, dataloader, optcls, nsteps, dict_in_out=True, **kwargs)
+    return data
+
+
+def plot_coord_data(df, y='l1', save_to=None, suptitle=None, x='width', hue='module',
+                    legend='full', name_contains=None, name_not_contains=None, module_list=None,
+                    loglog=True, logbase=2, face_color=None, subplot_width=5,
+                    subplot_height=4):
+    '''Plot coord check data `df` obtained from `get_coord_data`.
+
+    Input:
+        df:
+            a pandas DataFrame obtained from `get_coord_data`
+        y:
+            the column of `df` to plot on the y-axis. Default: `'l1'`
+        save_to:
+            path to save the resulting figure, or None. Default: None.
+        suptitle:
+            The title of the entire figure.
+        x:
+            the column of `df` to plot on the x-axis. Default: `'width'`
+        hue:
+            the column of `df` to represent as color. Default: `'module'`
+        legend:
+            'auto', 'brief', 'full', or False. This is passed to `seaborn.lineplot`.
+        name_contains, name_not_contains:
+            only plot modules whose name contains `name_contains` and does not contain `name_not_contains`
+        module_list:
+            only plot modules that are given in the list, overrides `name_contains` and `name_not_contains`
+        loglog:
+            whether to use loglog scale. Default: True
+        logbase:
+            the log base, if using loglog scale. Default: 2
+        face_color:
+            background color of the plot. Default: None (which means white)
+        subplot_width, subplot_height:
+            The width and height for each timestep's subplot. More precisely,
+            the figure size will be
+            `(subplot_width*number_of_time_steps, subplot_height)`.
+            Default: 5, 4
+
+    Output:
+        the `matplotlib` figure object
+    '''
+    ### preprocessing
+    df = copy(df)
+    df = df[df.module != '']  # nn.Sequential has name '', which duplicates the output layer
+    if module_list is not None:
+        df = df[df['module'].isin(module_list)]
+    else:
+        if name_contains is not None:
+            df = df[df['module'].str.contains(name_contains)]
+        if name_not_contains is not None:
+            df = df[~(df['module'].str.contains(name_not_contains))]
+    try:
+        df['module'] = pd.to_numeric(df['module'])  # for nn.Sequential, module names are numerical
+    except ValueError:
+        pass
+
+    ts = df.t.unique()
+
+    sns.set()
+
+    def tight_layout(plt):
+        plt.tight_layout(rect=[0, 0.03, 1, 0.95])
+
+    ### plot
+    fig = plt.figure(figsize=(subplot_width * len(ts), subplot_height))
+    hue_order = sorted(set(df['module']))
+    if face_color is not None:
+        fig.patch.set_facecolor(face_color)
+    ymin, ymax = min(df[y]), max(df[y])
+    for t in ts:
+        t = int(t)
+        plt.subplot(1, len(ts), t)
+        sns.lineplot(x=x, y=y, data=df[df.t == t], hue=hue, hue_order=hue_order, legend=None)  # to show legend, set legend if t == 1 else None
+        plt.title(f't={t}')
+        if t != 1:
+            plt.ylabel('')
+        if loglog:
+            plt.loglog(base=logbase)
+        ax = plt.gca()
+        ax.set_ylim([ymin, ymax])
+    if suptitle:
+        plt.suptitle(suptitle)
+    tight_layout(plt)
+    if save_to is not None:
+        plt.savefig(save_to)
+        print(f'coord check plot saved to {save_to}')
+
+    return fig
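
The only statistic registered by default is `l1`: the mean absolute coordinate of a tensor, computed in float32. A tiny self-contained check of that definition (toy tensor, not taken from any training run):

import torch

l1 = lambda x: torch.abs(x).mean(dtype=torch.float32)  # same definition as FDICT['l1'] above
x = torch.tensor([[1.0, -3.0], [0.0, 2.0]])
print(l1(x))  # tensor(1.5000) = (1 + 3 + 0 + 2) / 4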
coordchecking_dragon.py ADDED
@@ -0,0 +1,154 @@
+from dataclasses import dataclass
+import tyro
+from pathlib import Path
+
+import math
+import torch
+import torch.nn as nn
+from torch.utils.data import Dataset, DataLoader
+
+from .configuration_dragon import DragonConfig
+from .modeling_dragon import DragonForCausalLM
+from .coordcheck_utils import get_coord_data, plot_coord_data
+
+# TRITON_HOME="/p/project1/jureap140/temp" python make_coord_check.py
+
+@dataclass
+class Args:
+    save_dir: Path
+    mup: bool = False
+    learning_rate: float = 1e-2
+    layers_config: str = "gggTgggTgggTggg"
+args = tyro.cli(Args)
+
+batch_size = 8
+batch_len = 1024
+max_value = 100
+
+widths = [128, 512, 1024, 2048]
+n_heads = [4, 8, 16, 32]
+d_head = 64
+
+class RandomDataset(Dataset):
+    def __len__(self):
+        return 9999999
+
+    def __getitem__(self, _):
+        data = torch.randint(low=0, high=max_value, size=(batch_size, batch_len))
+        return data.cuda(), data.cuda()
+
+def lazy_model(width):
+    config_hf = DragonConfig(
+        layers_config=args.layers_config,
+        hidden_size=width,
+        intermediate_size=4*width,
+        tpa_rank=4,
+        token_shift_attn=True,
+        head_dim=d_head,
+        shrink_qk_da=1,
+        num_attention_heads=n_heads[widths.index(width)],
+        num_signal_heads_diff=n_heads[widths.index(width)]-n_heads[widths.index(width)]//4,
+        num_key_value_heads=n_heads[widths.index(width)],
+        head_dim_gdn=d_head,
+        shrink_qk_gdn=2,
+        num_attention_heads_gdn=n_heads[widths.index(width)],
+        zero_centered_gate=True,
+        zero_centered_gate_type=4,
+        mamba_mimo_dim=4,
+        mamba_ngroups=1,
+        gate_attn=True,
+        zero_centered_gamma=True,
+        vocab_size=max_value,
+        max_position_embeddings=1024,
+        use_uscaling=True,
+        uscaling_tau=0.2,
+        initializer_range=1.,
+        use_cache=False,
+    )
+
+    if args.mup:
+        config_hf.use_uscaling = True
+        config_hf.initializer_range = 1.0
+    else:
+        config_hf.use_uscaling = False
+        config_hf.initializer_range = 0.006
+
+    return lambda: DragonForCausalLM(config_hf).to("cuda")
+
+def param_groups_mup(model, base_lr_hidden, base_lr_scalar, base_lr_embed, base_lr_head, wd):
+    groups, seen = [], set()
+    id2name = {id(p): n for n, p in model.named_parameters()}
+
+    for mod in model.modules():
+        if isinstance(mod, nn.Linear):
+            pname = id2name.get(id(mod.weight), "")
+            is_scalar = getattr(mod, "is_scalar_weight", False)
+            fan_in = mod.weight.shape[1]
+            scale = 1 / math.sqrt(fan_in)
+            if "lm_head" in pname:
+                lr_scaled = base_lr_head
+                wd_scaled = 0.0
+            elif is_scalar:
+                lr_scaled = base_lr_scalar
+                wd_scaled = 0.0
+            else:
+                lr_scaled = base_lr_hidden * scale
+                wd_scaled = wd / lr_scaled
+
+            groups.append({"params": [mod.weight], "lr": lr_scaled, "weight_decay": wd_scaled})
+            seen.add(mod.weight)
+
+            if mod.bias is not None:
+                groups.append({"params": [mod.bias], "lr": base_lr_scalar, "weight_decay": 0.0})
+                seen.add(mod.bias)
+
+    for p in model.parameters():
+        if p in seen:
+            continue
+        pname = id2name.get(id(p), "<unnamed>")
+
+        if "embedding" in pname:
+            #fan_out = p.shape[1] # nn.Embedding is transposed
+            #lr_scaled = base_lr / math.sqrt(fan_out) # u-muP
+            lr_scaled = base_lr_embed
+        else:
+            lr_scaled = base_lr_scalar
+
+        wd_scaled = 0.
+        if getattr(p, "requires_weight_decay", False):
+            wd_scaled = wd / lr_scaled
+
+        groups.append({"params": [p], "lr": lr_scaled, "weight_decay": wd_scaled})
+
+    return groups
+
+models = {width: lazy_model(width) for width in widths}
+
+dataset = RandomDataset()
+loader = DataLoader(dataset, batch_size=None, shuffle=True)
+iter_ = iter(loader)
+
+def get_optim(model):
+    if args.mup:
+        param_list = param_groups_mup(
+            model,
+            base_lr_hidden=args.learning_rate,
+            base_lr_scalar=2**-6,
+            base_lr_embed=2**-4,
+            base_lr_head=2**-6,
+            wd=0.,
+        )
+        optimizer = torch.optim.AdamW(param_list, betas=(0.9, 0.95), eps=1e-8)
+    else:
+        optimizer = torch.optim.AdamW(model.parameters(), lr=args.learning_rate, weight_decay=0., betas=(0.9, 0.95), eps=1e-8)
+    return optimizer
+optcls = lambda model: get_optim(model)
+
+df = get_coord_data(models, iter_, optcls, nsteps=10)
+
+if args.mup:
+    name = f"mup_{args.learning_rate}_{args.layers_config}.png"
+else:
+    name = f"sp_{args.learning_rate}_{args.layers_config}.png"
+
+plot_coord_data(df, legend="full", save_to=args.save_dir / name)
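
One design choice in `param_groups_mup` above is worth spelling out: hidden weights receive `weight_decay = wd / lr_scaled`. With AdamW's decoupled decay, the per-step shrinkage of a weight is `lr * weight_decay * w`, so dividing by the width-dependent learning rate keeps the effective decay at `wd * w` for every width (the script passes `wd=0.`, so this only matters when a nonzero decay is used). A toy check with made-up numbers:

import math

base_lr_hidden, wd = 1e-2, 0.1   # illustrative values only
for fan_in in (128, 2048):
    lr_scaled = base_lr_hidden / math.sqrt(fan_in)   # muP-style 1/sqrt(fan_in) scaling
    wd_scaled = wd / lr_scaled
    # AdamW decoupled decay per step: lr_scaled * wd_scaled == wd, independent of width
    print(fan_in, lr_scaled * wd_scaled)             # 0.1 for both widths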
inspecting_dragon.py CHANGED
@@ -19,9 +19,13 @@ class NanoArgs:
     # arch - general
     d_model : int = 768
     n_heads : int = 6 # head dim 128 suggested by @Grad62304977
+    head_dim: Optional[int] = None
     layers_config : str = 4*"lrdlr"
-    expand_factor : int = 1 # expand factor for Mamba/Dragon
+    expand_factor : int = 2 # expand factor for Mamba/Dragon
+    rope_type_local: str = "" #p-rope
+    rope_type_global: str = "" #p-rope
     rope_theta_local: float = 10000.0
+    rope_theta_global: float = 0.0
     eps_rmsnorm: float = 1e-6
     mlp_expand: int = 4 # expand factor for MLP
     fused_loss_computation : bool = True # whether to use fused linear + cross entropy loss
@@ -32,9 +36,14 @@ class NanoArgs:
     zero_centered_gate_type: int = 1 # 1, 2, 3, 4
     gate_attn: bool = False
     gate_gdn: bool = True
-    gate_type: str = "elementwise" # elementwise (one per dim), headwise (one per head)
+    gate_type: str = "elementwise" # elementwise (one per dim), headwise (one per head), kimi (lora)
     gate_act: str = "silu" # silu, sigmoid
     scalar_proj_as_hidden_matrix: bool = True
+    normalization_type: str = "rmsnorm" # rmsnorm, seednorm
+    seednorm_wd: bool = True
+    mixer_gn: bool = True
+    mlp_linking : bool = False
+    final_norm: bool = True
 
     # attention related
     n_kv_heads : int = 0
@@ -46,26 +55,38 @@ class NanoArgs:
     softcap_global_attn: float = 0.0
     qk_norm: bool = True
     scalable_softmax: bool = True
-    token_shift: bool = False
+    resformer : bool = False # Works only on f layers (DiffAttention)
+    token_shift_attn: bool = False
+    token_shift_gdn: bool = False
+    token_conv1d_attn: bool = False
+    token_conv1d_gdn: bool = True
     num_attention_heads_indexer: int = 8
    head_dim_indexer: int = 32
     dsa_q_lora_rank: int = 128
     dsa_topk: int = 512
-    cca_head_dim: int = 128
     cca_seq_kernel_size: int = 4
-    nsa_head_dim: int = 128
     nsa_topk: int = 16
     nsa_block_size: int = 64
     nsa_window_size: int = 512
+    num_signal_heads_diff: Optional[int] = None
+    tpa_rank: int = 2
+    shrink_qk_da: int = 2
+    mla_kv_rank: int = 128
 
     # GDN related
     rope_gdn: Optional[str] = None # None, rope, (srope)
+    head_dim_gdn: Optional[int] = None
     n_heads_gdn: int = 0
     n_kv_heads_gdn: int = 0
+    shrink_qk_gdn: int = 2
+    kda_allow_neg_eigval: bool = False
+    kda_num_v_heads: Optional[int] = None
+    mamba_mimo_dim: Optional[int] = 2
+    mamba_ngroups: Optional[int] = 1
 
     # optim
     optim: str = "adamw" # adamw, spam, stable-spam, muon, muon_moonlight, splus
-    second_order_optim : Optional[str] = None #Snoo
+    second_order_optim : Optional[str] = None # snoo
     batch_size: int = 8*64 # batch size, in sequences, across all devices
     device_batch_size: int = 64 # batch size, in sequences, per device
     total_iterations: int = 1000 # number of iterations to run
@@ -83,14 +104,13 @@ class NanoArgs:
     init_std: float = 0.006
     patch_level_training: bool = False
     patch_level_training_size: int = 4
-    patch_level_training_mode: str = "reduced" # reduced = ask L tokens, treat L//K. full = ask K*L tokens, treat L.
+    second_order_lr: float = 0.68
+    second_order_momentum: float = 0.37
+    second_order_interval: int = 25
 
     # data
     vocab_size: int = 50304
     sequence_length: int = 1024
-    use_patch_level_training: bool = False
-    patch_size: int = 4
-    patch_training_fraction: float = 0.67
     input_bin: Optional[str] = None
     input_val_bin: Optional[str] = None
 
@@ -116,21 +136,39 @@ args = tyro.cli(NanoArgs)
 
 # load model.
 config_hf = DragonConfig(
+    final_norm=args.final_norm,
+    mla_kv_rank=args.mla_kv_rank,
+    rope_gdn=args.rope_gdn,
+    shrink_qk_da=args.shrink_qk_da,
+    shrink_qk_gdn=args.shrink_qk_gdn,
+    mixer_gn=args.mixer_gn,
+    kda_allow_neg_eigval=args.kda_allow_neg_eigval,
+    kda_num_v_heads=args.kda_num_v_heads,
+    seednorm_wd=args.seednorm_wd,
+    normalization_type=args.normalization_type,
+    tpa_rank=args.tpa_rank,
+    num_signal_heads_diff=args.num_signal_heads_diff,
     scalar_proj_as_hidden_matrix=args.scalar_proj_as_hidden_matrix,
-    token_shift=args.token_shift,
+    token_shift_attn=args.token_shift_attn,
+    token_shift_gdn=args.token_shift_gdn,
+    token_conv1d_attn=args.token_conv1d_attn,
+    token_conv1d_gdn=args.token_conv1d_gdn,
     patch_level_training=args.patch_level_training,
     patch_level_training_size=args.patch_level_training_size,
-    nsa_head_dim=args.nsa_head_dim,
     nsa_topk=args.nsa_topk,
     nsa_block_size=args.nsa_block_size,
     nsa_window_size=args.nsa_window_size,
-    cca_head_dim=args.cca_head_dim,
     cca_seq_kernel_size=args.cca_seq_kernel_size,
+    head_dim=args.head_dim,
+    head_dim_gdn=args.head_dim_gdn,
    num_attention_heads_gdn=args.n_heads_gdn,
     num_key_value_heads_gdn=args.n_kv_heads_gdn,
     zero_centered_gate=args.zero_centered_gate,
     zero_centered_gate_type=args.zero_centered_gate_type,
     scalable_softmax=args.scalable_softmax,
+    mamba_mimo_dim=args.mamba_mimo_dim,
+    mamba_ngroups=args.mamba_ngroups,
+    resformer=args.resformer,
     gate_type=args.gate_type,
     gate_act=args.gate_act,
     gate_attn=args.gate_attn,
@@ -157,8 +195,12 @@ config_hf = DragonConfig(
     norm_epsilon=args.eps_rmsnorm,
     use_cache=False,
     sliding_window_size=args.swa_window_size,
+    rope_type_global=args.rope_type_global,
+    rope_type_local=args.rope_type_local,
+    rope_theta_global=args.rope_theta_global,
     rope_theta_local=args.rope_theta_local,
     uscaling_tau=args.uscaling_tau,
+    mlp_linking=args.mlp_linking
 )
 
 model = DragonForCausalLM(config_hf)
modeling_dragon.py CHANGED
@@ -19,11 +19,20 @@ from transformers.utils import ModelOutput, logging
19
 
20
  from fla.ops.nsa.parallel import parallel_nsa
21
 
 
 
 
 
 
22
  try:
23
  from dragon_mamba3_ops.siso_variant.ssd_combined_fused import mamba_chunk_scan_discretized_combined
 
24
  from dragon_mamba3_ops.angle_cumsum import angle_dt
25
  from dragon_mamba3_ops.rotary_mamba import rotary_qk
26
- except ImportError:
 
 
 
27
  mamba_chunk_scan_discretized_combined, angle_dt, rotary_qk = None, None, None
28
 
29
  try:
@@ -39,8 +48,9 @@ try:
39
  from fla.ops.kda import chunk_kda, fused_recurrent_kda
40
  from fla.ops.kda.gate import fused_kda_gate
41
  from fla.modules import FusedRMSNormGated, ShortConvolution
 
42
  except ImportError:
43
- chunk_kda, fused_recurrent_kda, fused_kda_gate = None, None, None
44
 
45
  from torch.compiler import disable
46
 
@@ -56,13 +66,14 @@ ATTN_IMPL = "eager"
56
  try:
57
  import flash_attn_interface # FA3
58
  flash_attn_func = flash_attn_interface.flash_attn_func
 
59
  _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
60
  if not _flash_supports_window_size:
61
  raise ImportError("flash_attn_func does not support window_size parameter. Please update to more recent flash_attn version")
62
  ATTN_IMPL = "fa3"
63
  except ImportError:
64
  try:
65
- from flash_attn import flash_attn_func # FA2
66
  ATTN_IMPL = "fa2"
67
  except ImportError:
68
  try:
@@ -123,7 +134,16 @@ class DragonNorm(nn.Module):
123
  if config.normalization_type == "rmsnorm":
124
  self.norm = DragonRMSNorm(hidden_size, eps=config.norm_epsilon, zero_centered_gamma=config.zero_centered_gamma)
125
  elif config.normalization_type == "seednorm":
126
- self.norm = DragonSeeDNorm(config, hidden_size, eps=config.norm_epsilon)
 
 
 
 
 
 
 
 
 
127
  else:
128
  raise ValueError(f"Unknown normalization_type: {config.normalization_type}")
129
 
@@ -159,6 +179,54 @@ class DragonSeeDNorm(nn.Module):
159
  dynamic_scale = rescale.unsqueeze(-1) * self.alpha # (B, L, D)
160
  return (dynamic_scale + self.gamma) * self.rms(hidden_states)
161
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  class DragonLayerNorm(nn.Module):
163
  def __init__(self, hidden_size, eps=1e-6): # TODO: ZCG ?
164
  super().__init__()
@@ -1696,6 +1764,8 @@ class DragonDifferentialAttention(nn.Module):
1696
  hidden_states: torch.Tensor,
1697
  position_ids: Optional[torch.LongTensor] = None,
1698
  cache_params: Optional[HybridDragonDynamicCache] = None,
 
 
1699
  **kwargs,
1700
  ):
1701
  _, q_len, _ = hidden_states.shape
@@ -1747,6 +1817,17 @@ class DragonDifferentialAttention(nn.Module):
1747
  k_prev = F.pad(key_states, (0, 0, 0, 0, 1, 0))[:, :-1] # (B, L, H, D)
1748
  v_prev = F.pad(value_states, (0, 0, 0, 0, 1, 0))[:, :-1] # (B, L, H, D)
1749
 
 
 
 
 
 
 
 
 
 
 
 
1750
  key_states = alpha_k * k_prev + (1 - alpha_k) * key_states
1751
  value_states = alpha_v * v_prev + (1 - alpha_v) * value_states
1752
 
@@ -1859,18 +1940,28 @@ class DragonDifferentialAttention(nn.Module):
1859
  elif DIFF_ATTN_IMPL == "fa2":
1860
  def diff_attention_interface(q, k, v, wsize, **kw):
1861
  if self.head_qk_dim == self.head_v_dim:
1862
- return flash_attn_func(q, k, v, window_size=(wsize, 0), **kw)
 
 
 
1863
  D = v.size(3)
1864
  v1 = v[:, :, :, :D//2]
1865
  v2 = v[:, :, :, D//2:]
1866
- o1 = flash_attn_func(q, k, v1, window_size=(wsize, 0), **kw)
1867
- o2 = flash_attn_func(q, k, v2, window_size=(wsize, 0), **kw)
 
 
 
 
1868
  o = torch.cat([o1, o2], dim=-1)
1869
  return o
1870
  elif DIFF_ATTN_IMPL == "fa3":
1871
  def diff_attention_interface(q, k, v, wsize, **kw):
1872
  if self.head_qk_dim == self.head_v_dim:
1873
- return flash_attn_func(q, k, v, window_size=(wsize, 0), **kw)[0]
 
 
 
1874
  D = v.size(3)
1875
  v1 = v[:, :, :, :D//2]
1876
  v2 = v[:, :, :, D//2:]
@@ -2350,6 +2441,8 @@ class DragonDifferentialTensorProductAttention(nn.Module):
2350
  hidden_states: torch.Tensor,
2351
  position_ids: Optional[torch.LongTensor] = None,
2352
  cache_params: Optional[HybridDragonDynamicCache] = None,
 
 
2353
  **kwargs,
2354
  ):
2355
  b, q_len, _ = hidden_states.shape
@@ -2398,6 +2491,17 @@ class DragonDifferentialTensorProductAttention(nn.Module):
2398
  k_prev = F.pad(key_states, (0, 0, 0, 0, 1, 0))[:, :-1] # (B, L, H, D)
2399
  v_prev = F.pad(value_states, (0, 0, 0, 0, 1, 0))[:, :-1] # (B, L, H, D)
2400
 
 
 
 
 
 
 
 
 
 
 
 
2401
  key_states = alpha_k * k_prev + (1 - alpha_k) * key_states
2402
  value_states = alpha_v * v_prev + (1 - alpha_v) * value_states
2403
 
@@ -2510,7 +2614,10 @@ class DragonDifferentialTensorProductAttention(nn.Module):
2510
  elif DIFF_ATTN_IMPL == "fa2":
2511
  def diff_attention_interface(q, k, v, wsize, **kw):
2512
  if self.head_qk_dim == self.head_v_dim:
2513
- return flash_attn_func(q, k, v, window_size=(wsize, 0), **kw)
 
 
 
2514
  D = v.size(3)
2515
  v1 = v[:, :, :, :D//2]
2516
  v2 = v[:, :, :, D//2:]
@@ -2521,7 +2628,10 @@ class DragonDifferentialTensorProductAttention(nn.Module):
2521
  elif DIFF_ATTN_IMPL == "fa3":
2522
  def diff_attention_interface(q, k, v, wsize, **kw):
2523
  if self.head_qk_dim == self.head_v_dim:
2524
- return flash_attn_func(q, k, v, window_size=(wsize, 0), **kw)[0]
 
 
 
2525
  D = v.size(3)
2526
  v1 = v[:, :, :, :D//2]
2527
  v2 = v[:, :, :, D//2:]
@@ -3102,6 +3212,7 @@ class DragonGatedDeltaNet(nn.Module):
3102
  hidden_states: torch.Tensor,
3103
  position_embeddings: tuple[torch.Tensor, torch.Tensor],
3104
  cache_params: Optional[HybridDragonDynamicCache] = None,
 
3105
  **kwargs,
3106
  ):
3107
  _, q_len, _ = hidden_states.shape
@@ -3164,12 +3275,15 @@ class DragonGatedDeltaNet(nn.Module):
3164
  conv_cache = F.pad(mixed_qkv, (self.conv_size - mixed_qkv.shape[-1], 0))
3165
  cache_params.conv_caches[self.layer_idx] = conv_cache
3166
  if self.causal_conv1d_fn is not None:
 
 
 
3167
  mixed_qkv = self.causal_conv1d_fn(
3168
  x=mixed_qkv,
3169
  weight=self.qkv_conv1d.weight.squeeze(1),
3170
  bias=self.qkv_conv1d.bias,
3171
  activation='silu',
3172
- seq_idx=None,
3173
  )
3174
  else:
3175
  mixed_qkv = F.silu(self.qkv_conv1d(mixed_qkv)[:, :, :q_len])
@@ -3216,7 +3330,8 @@ class DragonGatedDeltaNet(nn.Module):
3216
  scale=None if not self.config.use_uscaling else 1/self.dk,
3217
  initial_state=None,
3218
  output_final_state=cache_params is not None,
3219
- use_qk_l2norm_in_kernel=True
 
3220
  ) # (B L H dv)
3221
  else:
3222
  o, ssm_cache = self.recurrent_gated_delta_rule(
@@ -3404,19 +3519,16 @@ class DragonMamba3(nn.Module):
3404
  )
3405
 
3406
  self.d_model = config.hidden_size
3407
- self.d_state = 64
3408
  self.conv_init = None
3409
  self.expand = 2
3410
- self.headdim = 128
3411
- self.ngroups = 20
3412
  self.activation = "swish"
3413
  self.bias = False
3414
- self.conv_bias = True
3415
  self.chunk_size = 128
3416
  self.A_floor = 1e-4
3417
  self.rope_fraction = 0.5
3418
- self.remove_conv = True
3419
- self.add_conv_activation = False
3420
  self.dt_min = 0.001
3421
  self.dt_max = 0.1
3422
  self.dt_init_floor = 1e-4
@@ -3432,13 +3544,24 @@ class DragonMamba3(nn.Module):
3432
  if self.split_tensor_size == 0:
3433
  return
3434
 
3435
- self.rope_proj = DragonLinear(config, self.d_model, self.num_rope_angles, bias=False)
 
3436
 
3437
  # Order: [x, B, C, dt]
3438
  d_in_proj = self.d_inner + 2 * self.d_state * self.ngroups + self.nheads
3439
 
3440
- self.A_proj = DragonLinear(config, self.d_model, self.nheads, bias=False, dtype=torch.float32)
3441
- self.trapezoid_proj = DragonLinear(config, self.d_model, self.nheads, bias=False)
 
 
 
 
 
 
 
 
 
 
3442
 
3443
  _dt = torch.exp(
3444
  torch.rand(self.nheads) * (math.log(self.dt_max) - math.log(self.dt_min))
@@ -3447,21 +3570,25 @@ class DragonMamba3(nn.Module):
3447
  _dt = torch.clamp(_dt, min=self.dt_init_floor)
3448
  _dt_bias = _dt + torch.log(-torch.expm1(-_dt))
3449
  self.dt_bias = nn.Parameter(_dt_bias, requires_grad=True)
 
3450
 
3451
  self.in_proj = DragonLinear(config, self.d_model, d_in_proj, bias=self.bias)
3452
 
3453
- self.B_bias = nn.Parameter(torch.ones((self.nheads, self.d_state)), requires_grad=True)
3454
- self.C_bias = nn.Parameter(torch.ones((self.nheads, self.d_state)), requires_grad=True)
 
 
3455
 
3456
- self.B_norm = DragonNorm(config, self.d_state)
3457
- self.C_norm = DragonNorm(config, self.d_state)
 
3458
 
3459
- if not self.remove_conv:
3460
  conv_dim = self.d_inner + 2 * self.d_state * self.ngroups
3461
  self.conv1d = nn.Conv1d(
3462
  in_channels=conv_dim,
3463
  out_channels=conv_dim,
3464
- bias=self.conv_bias,
3465
  kernel_size=4,
3466
  groups=conv_dim,
3467
  )
@@ -3473,8 +3600,14 @@ class DragonMamba3(nn.Module):
3473
 
3474
  # D "skip" parameter
3475
  self.D = nn.Parameter(torch.ones(self.nheads))
 
3476
 
3477
- def forward(self, hidden_states, **kwargs):
 
 
 
 
 
3478
  # Apply in_proj
3479
  xBCdt = self.in_proj(hidden_states)
3480
  xBC, dd_dt = torch.split(
@@ -3485,16 +3618,19 @@ class DragonMamba3(nn.Module):
3485
  ],
3486
  dim=-1)
3487
 
3488
- _A = -F.softplus((self.A_proj(hidden_states.to(torch.float32))).to(torch.float32)) # (B, L, N)
3489
- _A = torch.clamp(_A, max=-self.A_floor)
 
 
 
3490
  dt = F.softplus(dd_dt + self.dt_bias) # (B, L, N)
3491
 
3492
- if not self.remove_conv:
3493
  xBC = causal_conv1d_fn(
3494
  x=xBC.transpose(1, 2),
3495
  weight=rearrange(self.conv1d.weight, "d 1 w -> d w"),
3496
  bias=self.conv1d.bias,
3497
- activation=self.activation if self.add_conv_activation else None,
3498
  ).transpose(1, 2) # (B, L, self.d_inner + 2 * ngroups * d_state)
3499
 
3500
  x, B, C = torch.split(
@@ -3507,37 +3643,64 @@ class DragonMamba3(nn.Module):
3507
  B = rearrange(B, "b l (g n) -> b l g n", g=self.ngroups)
3508
  C = rearrange(C, "b l (g n) -> b l g n", g=self.ngroups)
3509
 
3510
- B = self.B_norm(B)
3511
- C = self.C_norm(C)
 
3512
 
3513
  if self.ngroups != self.nheads:
3514
  B = B.expand(-1, -1, self.nheads, -1) # (B, L, N, S)
3515
  C = C.expand(-1, -1, self.nheads, -1) # (B, L, N, S)
3516
 
3517
- angle = self.rope_proj(hidden_states) # (B, L, S)
3518
- angle = angle.unsqueeze(-2).expand(-1, -1, self.nheads, -1) # (B, L, G, S)
3519
- angle = angle_dt(angle, dt)
 
3520
 
3521
- C, B, CB_sum = rotary_qk(q=C, k=B, angle=angle, bias_q=self.C_bias, bias_k=self.B_bias, conjugate=False, inplace=False)
 
 
 
 
 
 
 
 
 
 
 
3522
 
3523
  x = rearrange(x, "b l (h p) -> b l h p", p=self.headdim)
3524
 
3525
  A = _A * dt
3526
  gating_factor = dt # B, L, N
3527
 
3528
- trap = F.sigmoid(self.trapezoid_proj(hidden_states)) # (B, L, N)
 
3529
 
3530
- alpha_arr = torch.exp(A)
3531
- beta_arr = (1-trap)*gating_factor*alpha_arr
3532
- gamma_arr = trap*gating_factor
3533
 
3534
- # roll alpha and beta to the left by 1
3535
- _alpha_arr = torch.roll(alpha_arr, shifts=-1, dims=1)
3536
- _beta_arr = torch.roll(beta_arr, shifts=-1, dims=1)
3537
 
3538
- x_scalar = (gamma_arr*_alpha_arr + _beta_arr).to(torch.bfloat16)
 
 
 
 
 
 
 
 
 
 
 
 
 
3539
 
3540
- y = mamba_chunk_scan_discretized_combined(
3541
  x=x.bfloat16(),
3542
  A=A,
3543
  B=B.bfloat16(),
@@ -3547,11 +3710,117 @@ class DragonMamba3(nn.Module):
3547
  gamma=gamma_arr,
3548
  CB_sum=CB_sum,
3549
  D=self.D,
3550
- z=None
 
 
3551
  )
3552
 
 
 
 
 
 
 
3553
  return y, None, None
3554
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3555
 
3556
  class DragonMamba3Mimo(nn.Module):
3557
  def __init__(self, config: DragonConfig, layer_idx: Optional[int]):
@@ -3570,7 +3839,7 @@ class DragonMamba3Mimo(nn.Module):
3570
  self.conv_init = None
3571
  self.expand = 2
3572
  self.headdim = 128
3573
- self.ngroups = 20
3574
  self.activation = "swish"
3575
  self.bias = False
3576
  self.conv_bias = True
@@ -3604,7 +3873,7 @@ class DragonMamba3Mimo(nn.Module):
3604
  # Order: [z, x, B, C, dt]
3605
  d_in_proj = 2 * self.d_inner + 2 * self.d_state * self.ngroups * self.mimo_dim + self.nheads
3606
 
3607
- self.A_proj = DragonLinear(config, self.d_model, self.nheads, bias=False) # dtype=float32
3608
  self.trapezoid_proj = DragonLinear(config, self.d_model, self.nheads, bias=False)
3609
 
3610
  _dt = torch.exp(
@@ -3618,9 +3887,9 @@ class DragonMamba3Mimo(nn.Module):
3618
 
3619
  self.in_proj = DragonLinear(config, self.d_model, d_in_proj, bias=self.bias)
3620
 
3621
- self.B_bias = nn.Parameter(torch.ones((self.nheads, self.d_state)), requires_grad=True)
3622
- self.C_bias = nn.Parameter(torch.ones((self.nheads, self.d_state)), requires_grad=True)
3623
-
3624
  self.B_norm = DragonNorm(config, self.d_state)
3625
  self.C_norm = DragonNorm(config, self.d_state)
3626
 
@@ -3655,9 +3924,9 @@ class DragonMamba3Mimo(nn.Module):
3655
 
3656
  def forward(self, hidden_states, **kwargs):
3657
  # Apply in_proj
3658
- xBCdt = self.in_proj(hidden_states)
3659
  z, xBC, dd_dt = torch.split(
3660
- xBCdt,
3661
  [
3662
  self.d_inner,
3663
  self.d_inner + 2 * self.d_state * self.ngroups * self.mimo_dim,
@@ -3719,14 +3988,15 @@ class DragonMamba3Mimo(nn.Module):
3719
  C = self.C_norm(C)
3720
 
3721
  if self.ngroups != self.nheads:
3722
- B = B.expand(-1, -1, self.nheads, -1) # (B, L, R, N, S)
3723
- C = C.expand(-1, -1, self.nheads, -1) # (B, L, R, N, S)
 
3724
 
3725
  angle = self.rope_proj(hidden_states) # (B, L, S)
3726
  angle = angle.unsqueeze(-2).expand(-1, -1, self.nheads, -1) # (B, L, G, S)
3727
  angle = angle_dt(angle, dt)
3728
 
3729
- C, B, CB_sum = rotary_qk(q=C, k=B, angle=angle, bias_q=self.C_bias, bias_k=self.B_bias, conjugate=False, inplace=False)
3730
 
3731
  x = rearrange(x, "b l r (h p) -> b l r h p", p=self.headdim)
3732
 
@@ -3747,7 +4017,7 @@ class DragonMamba3Mimo(nn.Module):
3747
 
3748
  z = rearrange(z, "b l r (h p) -> b l r h p", p=self.headdim)
3749
 
3750
- y = mamba_chunk_scan_discretized_combined(
3751
  x=x.bfloat16(),
3752
  A=A.bfloat16(),
3753
  B=B.bfloat16(),
@@ -3761,31 +4031,33 @@ class DragonMamba3Mimo(nn.Module):
3761
  )
3762
 
3763
  y = rearrange(y, "b l r h p -> b l r (h p)")
3764
- if seqlen_og is not None:
3765
- y = rearrange(y, "b l r d -> (b l) r d")
3766
 
3767
  # Perform MIMO down projection (mimo_rank*d_inner -> d_inner)
3768
  y = rearrange(y, "b l r d -> b l (r d)")
3769
  y = rearrange(y, "b l (g d) -> b l g d", g=self.mimo_dim*self.mimo_proj_block_order)
3770
  y = torch.einsum("blgd,drg->bldr", y, self.out_proj_mimo)
3771
  y = rearrange(y, "b l d r -> b l (d r)")
 
3772
 
3773
  return y, None, None
3774
 
3775
  class DragonMLP(nn.Module):
3776
- def __init__(self, config: DragonConfig):
3777
  super().__init__()
3778
  self.config = config
 
3779
  #print("previous MLP : ", PREVIOUS_MLP)
3780
  self.link_size = 16
3781
  self.mlp_linking = config.mlp_linking and PREVIOUS_MLP is not None
3782
  if self.mlp_linking:
3783
  self.previous_mlp = PREVIOUS_MLP
3784
- self.fc_1 = DragonLinear(config, config.hidden_size, config.intermediate_size, bias=False)
3785
  self.lambda1 = nn.Parameter(torch.zeros(self.link_size)) # sigmoid->0.5
3786
  else :
3787
- self.fc_1 = DragonLinear(config, config.hidden_size, config.intermediate_size, bias=False)
3788
- self.fc_2 = DragonLinear(config, config.intermediate_size, config.hidden_size, bias=False)
3789
  self.register_buffer("_2_sqrt_5", torch.tensor(2/math.sqrt(5)) if config.use_uscaling else torch.tensor(1.), persistent=False)
3790
 
3791
  def forward(self, hidden_states):
@@ -3803,7 +4075,51 @@ class DragonMLP(nn.Module):
3803
  return hidden_states
3804
 
3805
  def get_mlp_link(self):
3806
- return self.mlp_link
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3807
 
3808
  PREVIOUS_MLP = None
3809
  class DragonMonoBlock(GradientCheckpointingLayer):
@@ -3878,6 +4194,16 @@ class DragonMonoBlock(GradientCheckpointingLayer):
3878
  head_dim = self.mixer.headdim
3879
  num_attention_heads = self.mixer.nheads
3880
  use_gate = config.gate_gdn
 
 
 
 
 
 
 
 
 
 
3881
  else:
3882
  raise ValueError(f"Unknown layer type: {layer_type}")
3883
 
@@ -3922,7 +4248,10 @@ class DragonMonoBlock(GradientCheckpointingLayer):
3922
 
3923
  self.input_norm = DragonNorm(config, config.hidden_size)
3924
  self.postmixer_norm = DragonNorm(config, config.hidden_size)
3925
- self.mlp = DragonMLP(config)
 
 
 
3926
  global PREVIOUS_MLP
3927
  PREVIOUS_MLP = self.mlp
3928
 
@@ -3938,6 +4267,8 @@ class DragonMonoBlock(GradientCheckpointingLayer):
3938
  cache_position: Optional[torch.LongTensor] = None,
3939
  position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
3940
  key_value_last_layer: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
 
 
3941
  **kwargs,
3942
  ):
3943
  # MIXER.
@@ -3949,6 +4280,8 @@ class DragonMonoBlock(GradientCheckpointingLayer):
3949
  position_ids=position_ids,
3950
  cache_params=cache_params,
3951
  key_value_last_layer=key_value_last_layer,
 
 
3952
  ) # (B, L, E*D)
3953
  if self.use_gate:
3954
  if self.config.gate_type == "elementwise" or self.config.gate_type == "kimi":
@@ -4126,8 +4459,13 @@ class DragonModel(DragonPreTrainedModel):
4126
  self.embedding = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
4127
  self.layers = nn.ModuleList([DragonBlock(config, layer_idx=i, layer_type=layer) if layer in ['l', 'r', 'd'] else DragonMonoBlock(config, layer_idx=i, layer_type=layer) for i, layer in enumerate(config.layers_config)])
4128
 
4129
- self.rotary_emb = DragonRotaryEmbedding(config, head_dim=config.head_dim if config.head_dim else (config.expand_factor*config.hidden_size)//config.num_attention_heads, theta=config.rope_theta_local) # only for SWA
4130
- self.final_norm = DragonNorm(config, config.hidden_size)
 
 
 
 
 
4131
 
4132
  self.gradient_checkpointing = False
4133
  self.post_init()
@@ -4148,6 +4486,8 @@ class DragonModel(DragonPreTrainedModel):
4148
  cache_position: Optional[torch.LongTensor] = None,
4149
  output_hidden_states: Optional[bool] = None,
4150
  inputs_embeds: Optional[torch.FloatTensor] = None,
 
 
4151
  **kwargs
4152
  ) -> DragonOutput:
4153
  B, L = input_ids.shape if input_ids is not None else inputs_embeds.shape[:2]
@@ -4191,7 +4531,10 @@ class DragonModel(DragonPreTrainedModel):
4191
 
4192
  all_hidden_states = () if output_hidden_states else None
4193
 
4194
- position_embeddings = self.rotary_emb(hidden_states, position_ids)
 
 
 
4195
 
4196
  shared_kv = (None, None)
4197
  for block in self.layers:
@@ -4205,11 +4548,14 @@ class DragonModel(DragonPreTrainedModel):
4205
  cache_position=cache_position,
4206
  position_embeddings=position_embeddings,
4207
  key_value_last_layer=shared_kv,
 
 
4208
  **kwargs,
4209
  )
4210
  shared_kv = (last_k, last_v)
4211
 
4212
- hidden_states = self.final_norm(hidden_states)
 
4213
 
4214
  if output_hidden_states:
4215
  all_hidden_states = all_hidden_states + (hidden_states,)
@@ -4242,6 +4588,9 @@ class DragonForCausalLM(DragonPreTrainedModel, GenerationMixin):
4242
  cache_position: Optional[torch.Tensor] = None,
4243
  output_hidden_states: Optional[bool] = None,
4244
  attention_mask: Optional[torch.Tensor] = None,
 
 
 
4245
  token_type_ids=None,
4246
  **kwargs,
4247
  ) -> DragonCausalLMOutput:
@@ -4256,6 +4605,8 @@ class DragonForCausalLM(DragonPreTrainedModel, GenerationMixin):
4256
  cache_position=cache_position,
4257
  inputs_embeds=inputs_embeds,
4258
  output_hidden_states=output_hidden_states,
 
 
4259
  **kwargs,
4260
  )
4261
 
@@ -4299,9 +4650,9 @@ class DragonForCausalLM(DragonPreTrainedModel, GenerationMixin):
4299
 
4300
  return DragonCausalLMOutput(
4301
  loss=loss,
4302
- logits=logits,
4303
- past_key_values=outputs.past_key_values,
4304
- hidden_states=outputs.hidden_states,
4305
  )
4306
  DragonForCausalLM.register_for_auto_class("AutoModelForCausalLM")
4307
 
 
19
 
20
  from fla.ops.nsa.parallel import parallel_nsa
21
 
22
+ try:
23
+ from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined
24
+ except ImportError:
25
+ mamba_chunk_scan_combined = None
26
+
27
  try:
28
  from dragon_mamba3_ops.siso_variant.ssd_combined_fused import mamba_chunk_scan_discretized_combined
29
+ from dragon_mamba3_ops.mimo_variant.ssd_mimo import mamba_chunk_scan_discretized_fused_combined as mamba_mimo_chunk_scan_discretized_fused_combined
30
  from dragon_mamba3_ops.angle_cumsum import angle_dt
31
  from dragon_mamba3_ops.rotary_mamba import rotary_qk
32
+ from dragon_mamba3_ops.rotary_mamba_mimo import rotary_qk as mimo_rotary_qk
33
+ except ImportError as exc:
34
+ print("Warning: No Mamba-3 found !")
35
+ print(exc)
36
  mamba_chunk_scan_discretized_combined, angle_dt, rotary_qk = None, None, None
37
 
38
  try:
 
48
  from fla.ops.kda import chunk_kda, fused_recurrent_kda
49
  from fla.ops.kda.gate import fused_kda_gate
50
  from fla.modules import FusedRMSNormGated, ShortConvolution
51
+ from fla.ops.utils import prepare_sequence_ids
52
  except ImportError:
53
+ chunk_kda, fused_recurrent_kda, fused_kda_gate, prepare_sequence_ids = None, None, None, None
54
 
55
  from torch.compiler import disable
56
 
 
66
  try:
67
  import flash_attn_interface # FA3
68
  flash_attn_func = flash_attn_interface.flash_attn_func
69
+ flash_attn_varlen_func = flash_attn_interface.flash_attn_varlen_func
70
  _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
71
  if not _flash_supports_window_size:
72
  raise ImportError("flash_attn_func does not support window_size parameter. Please update to more recent flash_attn version")
73
  ATTN_IMPL = "fa3"
74
  except ImportError:
75
  try:
76
+ from flash_attn import flash_attn_func, flash_attn_varlen_func # FA2
77
  ATTN_IMPL = "fa2"
78
  except ImportError:
79
  try:
 
134
  if config.normalization_type == "rmsnorm":
135
  self.norm = DragonRMSNorm(hidden_size, eps=config.norm_epsilon, zero_centered_gamma=config.zero_centered_gamma)
136
  elif config.normalization_type == "seednorm":
137
+ if config.seednorm_type == 1:
138
+ self.norm = DragonSeeDNorm(config, hidden_size, eps=config.norm_epsilon)
139
+ elif config.seednorm_type == 2:
140
+ self.norm = DragonSeeDNormType2(config, hidden_size, eps=config.norm_epsilon)
141
+ elif config.seednorm_type == 3:
142
+ self.norm = DragonSeeDNormType3(config, hidden_size, eps=config.norm_epsilon)
143
+ elif config.seednorm_type == 4:
144
+ self.norm = DragonSeeDNormType4(config, hidden_size, eps=config.norm_epsilon)
145
+ else:
146
+ raise ValueError(f"Unknown seednorm_type: {config.seednorm_type}")
147
  else:
148
  raise ValueError(f"Unknown normalization_type: {config.normalization_type}")
149
 
 
179
  dynamic_scale = rescale.unsqueeze(-1) * self.alpha # (B, L, D)
180
  return (dynamic_scale + self.gamma) * self.rms(hidden_states)
181
 
182
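+ # The variants below differ in how the dynamic rescale is built (description of the code that follows):
+ # Type2: scalar rescale tanh(beta(x)) of shape (B, L, 1), broadcast over channels through alpha and added to gamma.
+ # Type3: low-rank per-channel rescale tanh(beta(x)) with beta = (hidden_size -> seednorm_rank -> hidden_size), added to gamma.
+ # Type4: low-rank rescale passed through silu(. + 1.15) and used as a pure multiplicative gate (no gamma).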
+ class DragonSeeDNormType2(nn.Module):
183
+ def __init__(self, config: DragonConfig, hidden_size, eps=1e-6):
184
+ super().__init__()
185
+ self.hidden_size = hidden_size
186
+
187
+ self.beta = DragonLinear(config, hidden_size, 1, bias=False)
188
+ self.alpha = nn.Parameter(torch.ones(hidden_size) * 1.)
189
+ if config.seednorm_wd:
190
+ self.alpha.requires_weight_decay = True
191
+ self.gamma = nn.Parameter(torch.ones(hidden_size))
192
+ self.rms = nn.RMSNorm(hidden_size, eps=eps, elementwise_affine=False)
193
+
194
+ def forward(self, hidden_states):
195
+ rescale = F.tanh(self.beta(hidden_states)) # (B, L, 1)
196
+ dynamic_scale = rescale * self.alpha # (B, L, D)
197
+ return (dynamic_scale + self.gamma) * self.rms(hidden_states)
198
+
199
+ class DragonSeeDNormType3(nn.Module):
200
+ def __init__(self, config: DragonConfig, hidden_size, eps=1e-6):
201
+ super().__init__()
202
+ self.hidden_size = hidden_size
203
+
204
+ self.beta = nn.Sequential(
205
+ DragonLinear(config, hidden_size, config.seednorm_rank, bias=False),
206
+ DragonLinear(config, config.seednorm_rank, hidden_size, bias=False),
207
+ )
208
+ self.gamma = nn.Parameter(torch.ones(hidden_size))
209
+ self.rms = nn.RMSNorm(hidden_size, eps=eps, elementwise_affine=False)
210
+
211
+ def forward(self, hidden_states):
212
+ dynamic_rescale = F.tanh(self.beta(hidden_states)) # (B, L, D)
213
+ return (dynamic_rescale + self.gamma) * self.rms(hidden_states)
214
+
215
+ class DragonSeeDNormType4(nn.Module):
216
+ def __init__(self, config: DragonConfig, hidden_size, eps=1e-6):
217
+ super().__init__()
218
+ self.hidden_size = hidden_size
219
+
220
+ self.beta = nn.Sequential(
221
+ DragonLinear(config, hidden_size, config.seednorm_rank, bias=False),
222
+ DragonLinear(config, config.seednorm_rank, hidden_size, bias=False),
223
+ )
224
+ self.rms = nn.RMSNorm(hidden_size, eps=eps, elementwise_affine=False)
225
+
226
+ def forward(self, hidden_states):
227
+ dynamic_rescale = F.silu(self.beta(hidden_states) + 1.15) # (B, L, D)
228
+ return dynamic_rescale * self.rms(hidden_states)
229
+
230
  class DragonLayerNorm(nn.Module):
231
  def __init__(self, hidden_size, eps=1e-6): # TODO: ZCG ?
232
  super().__init__()
 
1764
  hidden_states: torch.Tensor,
1765
  position_ids: Optional[torch.LongTensor] = None,
1766
  cache_params: Optional[HybridDragonDynamicCache] = None,
1767
+ cu_seqlens: Optional[torch.Tensor] = None,
1768
+ max_seqlen: Optional[int] = None,
1769
  **kwargs,
1770
  ):
1771
  _, q_len, _ = hidden_states.shape
 
1817
  k_prev = F.pad(key_states, (0, 0, 0, 0, 1, 0))[:, :-1] # (B, L, H, D)
1818
  v_prev = F.pad(value_states, (0, 0, 0, 0, 1, 0))[:, :-1] # (B, L, H, D)
1819
 
1820
+ if position_ids is not None:
1821
+ # first token of each doc has pos==0
1822
+ doc_start = (position_ids == 0) # (B, L) bool
1823
+ m = doc_start.unsqueeze(-1).unsqueeze(-1) # (B, L, 1, 1) bool
1824
+
1825
+ # zero the previous contribution at boundaries
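+ # with alpha zeroed, the convex mix below reduces to the current token's k/v at document starts, so nothing leaks across documents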
1826
+ k_prev = k_prev.masked_fill(m, 0)
1827
+ v_prev = v_prev.masked_fill(m, 0)
1828
+ alpha_k = alpha_k.masked_fill(m, 0)
1829
+ alpha_v = alpha_v.masked_fill(m, 0)
1830
+
1831
  key_states = alpha_k * k_prev + (1 - alpha_k) * key_states
1832
  value_states = alpha_v * v_prev + (1 - alpha_v) * value_states
1833
 
 
1940
  elif DIFF_ATTN_IMPL == "fa2":
1941
  def diff_attention_interface(q, k, v, wsize, **kw):
1942
  if self.head_qk_dim == self.head_v_dim:
1943
+ if not self.config.intra_doc_masking:
1944
+ return flash_attn_func(q, k, v, window_size=(wsize, 0), **kw)
1945
+ else:
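+ # the varlen kernels take packed (total_tokens, heads, dim) tensors; intra-doc masking forces batch size 1, so q[0]/k[0]/v[0] are the packed sequences and the batch dim is re-added to the output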
1946
+ return flash_attn_varlen_func(q[0], k[0], v[0], cu_seqlens_q=cu_seqlens, cu_seqlens_k=cu_seqlens, max_seqlen_q=max_seqlen, max_seqlen_k=max_seqlen, window_size=(wsize, 0), **kw).unsqueeze(0)
1947
  D = v.size(3)
1948
  v1 = v[:, :, :, :D//2]
1949
  v2 = v[:, :, :, D//2:]
1950
+ if not self.config.intra_doc_masking:
1951
+ o1 = flash_attn_func(q, k, v1, window_size=(wsize, 0), **kw)
1952
+ o2 = flash_attn_func(q, k, v2, window_size=(wsize, 0), **kw)
1953
+ else:
1954
+ o1 = flash_attn_varlen_func(q[0], k[0], v1[0], cu_seqlens_q=cu_seqlens, cu_seqlens_k=cu_seqlens, max_seqlen_q=max_seqlen, max_seqlen_k=max_seqlen, window_size=(wsize, 0), **kw).unsqueeze(0)
1955
+ o2 = flash_attn_varlen_func(q[0], k[0], v2[0], cu_seqlens_q=cu_seqlens, cu_seqlens_k=cu_seqlens, max_seqlen_q=max_seqlen, max_seqlen_k=max_seqlen, window_size=(wsize, 0), **kw).unsqueeze(0)
1956
  o = torch.cat([o1, o2], dim=-1)
1957
  return o
1958
  elif DIFF_ATTN_IMPL == "fa3":
1959
  def diff_attention_interface(q, k, v, wsize, **kw):
1960
  if self.head_qk_dim == self.head_v_dim:
1961
+ if not self.config.intra_doc_masking:
1962
+ return flash_attn_func(q, k, v, window_size=(wsize, 0), **kw)[0]
1963
+ else:
1964
+ return flash_attn_varlen_func(q[0], k[0], v[0], cu_seqlens_q=cu_seqlens, cu_seqlens_k=cu_seqlens, max_seqlen_q=max_seqlen, max_seqlen_k=max_seqlen, window_size=(wsize, 0), **kw)[0].unsqueeze(0)
1965
  D = v.size(3)
1966
  v1 = v[:, :, :, :D//2]
1967
  v2 = v[:, :, :, D//2:]
 
2441
  hidden_states: torch.Tensor,
2442
  position_ids: Optional[torch.LongTensor] = None,
2443
  cache_params: Optional[HybridDragonDynamicCache] = None,
2444
+ cu_seqlens: Optional[torch.Tensor] = None,
2445
+ max_seqlen: Optional[int] = None,
2446
  **kwargs,
2447
  ):
2448
  b, q_len, _ = hidden_states.shape
 
2491
  k_prev = F.pad(key_states, (0, 0, 0, 0, 1, 0))[:, :-1] # (B, L, H, D)
2492
  v_prev = F.pad(value_states, (0, 0, 0, 0, 1, 0))[:, :-1] # (B, L, H, D)
2493
 
2494
+ if position_ids is not None:
2495
+ # first token of each doc has pos==0
2496
+ doc_start = (position_ids == 0) # (B, L) bool
2497
+ m = doc_start.unsqueeze(-1).unsqueeze(-1) # (B, L, 1, 1) bool
2498
+
2499
+ # zero the previous contribution at boundaries
2500
+ k_prev = k_prev.masked_fill(m, 0)
2501
+ v_prev = v_prev.masked_fill(m, 0)
2502
+ alpha_k = alpha_k.masked_fill(m, 0)
2503
+ alpha_v = alpha_v.masked_fill(m, 0)
2504
+
2505
  key_states = alpha_k * k_prev + (1 - alpha_k) * key_states
2506
  value_states = alpha_v * v_prev + (1 - alpha_v) * value_states
2507
 
 
2614
  elif DIFF_ATTN_IMPL == "fa2":
2615
  def diff_attention_interface(q, k, v, wsize, **kw):
2616
  if self.head_qk_dim == self.head_v_dim:
2617
+ if not self.config.intra_doc_masking:
2618
+ return flash_attn_func(q, k, v, window_size=(wsize, 0), **kw)
2619
+ else:
2620
+ return flash_attn_varlen_func(q[0], k[0], v[0], cu_seqlens_q=cu_seqlens, cu_seqlens_k=cu_seqlens, max_seqlen_q=max_seqlen, max_seqlen_k=max_seqlen, window_size=(wsize, 0), **kw).unsqueeze(0)
2621
  D = v.size(3)
2622
  v1 = v[:, :, :, :D//2]
2623
  v2 = v[:, :, :, D//2:]
 
2628
  elif DIFF_ATTN_IMPL == "fa3":
2629
  def diff_attention_interface(q, k, v, wsize, **kw):
2630
  if self.head_qk_dim == self.head_v_dim:
2631
+ if not self.config.intra_doc_masking:
2632
+ return flash_attn_func(q, k, v, window_size=(wsize, 0), **kw)[0]
2633
+ else:
2634
+ return flash_attn_varlen_func(q[0], k[0], v[0], cu_seqlens_q=cu_seqlens, cu_seqlens_k=cu_seqlens, max_seqlen_q=max_seqlen, max_seqlen_k=max_seqlen, window_size=(wsize, 0), **kw)[0].unsqueeze(0)
2635
  D = v.size(3)
2636
  v1 = v[:, :, :, :D//2]
2637
  v2 = v[:, :, :, D//2:]
 
3212
  hidden_states: torch.Tensor,
3213
  position_embeddings: tuple[torch.Tensor, torch.Tensor],
3214
  cache_params: Optional[HybridDragonDynamicCache] = None,
3215
+ cu_seqlens: Optional[torch.Tensor] = None,
3216
  **kwargs,
3217
  ):
3218
  _, q_len, _ = hidden_states.shape
 
3275
  conv_cache = F.pad(mixed_qkv, (self.conv_size - mixed_qkv.shape[-1], 0))
3276
  cache_params.conv_caches[self.layer_idx] = conv_cache
3277
  if self.causal_conv1d_fn is not None:
3278
+ seq_idx = None
3279
+ if cu_seqlens is not None:
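+ # per-token sequence ids derived from cu_seqlens tell the causal conv where documents start, so conv state is not carried across document boundaries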
3280
+ seq_idx = prepare_sequence_ids(cu_seqlens).to(torch.int32).unsqueeze(0)
3281
  mixed_qkv = self.causal_conv1d_fn(
3282
  x=mixed_qkv,
3283
  weight=self.qkv_conv1d.weight.squeeze(1),
3284
  bias=self.qkv_conv1d.bias,
3285
  activation='silu',
3286
+ seq_idx=seq_idx,
3287
  )
3288
  else:
3289
  mixed_qkv = F.silu(self.qkv_conv1d(mixed_qkv)[:, :, :q_len])
 
3330
  scale=None if not self.config.use_uscaling else 1/self.dk,
3331
  initial_state=None,
3332
  output_final_state=cache_params is not None,
3333
+ use_qk_l2norm_in_kernel=True,
3334
+ cu_seqlens=cu_seqlens,
3335
  ) # (B L H dv)
3336
  else:
3337
  o, ssm_cache = self.recurrent_gated_delta_rule(
 
3519
  )
3520
 
3521
  self.d_model = config.hidden_size
3522
+ self.d_state = 128
3523
  self.conv_init = None
3524
  self.expand = 2
3525
+ self.headdim = 64
3526
+ self.ngroups = config.mamba_ngroups
3527
  self.activation = "swish"
3528
  self.bias = False
 
3529
  self.chunk_size = 128
3530
  self.A_floor = 1e-4
3531
  self.rope_fraction = 0.5
 
 
3532
  self.dt_min = 0.001
3533
  self.dt_max = 0.1
3534
  self.dt_init_floor = 1e-4
 
3544
  if self.split_tensor_size == 0:
3545
  return
3546
 
3547
+ if config.mamba3_rope:
3548
+ self.rope_proj = DragonLinear(config, self.d_model, self.num_rope_angles, bias=False)
3549
 
3550
  # Order: [x, B, C, dt]
3551
  d_in_proj = self.d_inner + 2 * self.d_state * self.ngroups + self.nheads
3552
 
3553
+ if self.config.mamba3_is_A_dd:
3554
+ self.A_proj = DragonLinear(config, self.d_model, self.nheads, bias=False, dtype=torch.float32)
3555
+ else:
3556
+ A_init_range = (1, 16)
3557
+ assert A_init_range[0] > 0 and A_init_range[1] >= A_init_range[0]
3558
+ A = torch.empty(self.nheads, dtype=torch.float32).uniform_(*A_init_range)
3559
+ A_log = torch.log(A).to(dtype=torch.float32)
3560
+ self.A_log = nn.Parameter(A_log)
3561
+ self.A_log._no_weight_decay = True
3562
+
3563
+ if config.mamba3_add_trapezoid:
3564
+ self.trapezoid_proj = DragonLinear(config, self.d_model, self.nheads, bias=False)
3565
 
3566
  _dt = torch.exp(
3567
  torch.rand(self.nheads) * (math.log(self.dt_max) - math.log(self.dt_min))
 
3570
  _dt = torch.clamp(_dt, min=self.dt_init_floor)
3571
  _dt_bias = _dt + torch.log(-torch.expm1(-_dt))
3572
  self.dt_bias = nn.Parameter(_dt_bias, requires_grad=True)
3573
+ self.dt_bias._no_weight_decay = True
3574
 
3575
  self.in_proj = DragonLinear(config, self.d_model, d_in_proj, bias=self.bias)
3576
 
3577
+ self.B_bias, self.C_bias = None, None
3578
+ if not config.mamba3_remove_BC_bias:
3579
+ self.B_bias = nn.Parameter(torch.ones((self.nheads, self.d_state)), requires_grad=True)
3580
+ self.C_bias = nn.Parameter(torch.ones((self.nheads, self.d_state)), requires_grad=True)
3581
 
3582
+ if config.mamba3_is_id_rms:
3583
+ self.B_norm = DragonNorm(config, self.d_state)
3584
+ self.C_norm = DragonNorm(config, self.d_state)
3585
 
3586
+ if not config.mamba3_remove_conv:
3587
  conv_dim = self.d_inner + 2 * self.d_state * self.ngroups
3588
  self.conv1d = nn.Conv1d(
3589
  in_channels=conv_dim,
3590
  out_channels=conv_dim,
3591
+ bias=False,
3592
  kernel_size=4,
3593
  groups=conv_dim,
3594
  )
 
3600
 
3601
  # D "skip" parameter
3602
  self.D = nn.Parameter(torch.ones(self.nheads))
3603
+ self.D._no_weight_decay = True
3604
 
3605
+ def forward(
3606
+ self,
3607
+ hidden_states: torch.Tensor,
3608
+ cache_params: Optional[HybridDragonDynamicCache] = None,
3609
+ **kwargs
3610
+ ):
3611
  # Apply in_proj
3612
  xBCdt = self.in_proj(hidden_states)
3613
  xBC, dd_dt = torch.split(
 
3618
  ],
3619
  dim=-1)
3620
 
3621
+ if self.config.mamba3_is_A_dd:
3622
+ _A = -F.softplus((self.A_proj(hidden_states.to(torch.float32))).to(torch.float32)) # (B, L, N)
3623
+ _A = torch.clamp(_A, max=-self.A_floor)
3624
+ else:
3625
+ _A = -torch.exp(self.A_log).unsqueeze(0).unsqueeze(0)
3626
  dt = F.softplus(dd_dt + self.dt_bias) # (B, L, N)
3627
 
3628
+ if not self.config.mamba3_remove_conv:
3629
  xBC = causal_conv1d_fn(
3630
  x=xBC.transpose(1, 2),
3631
  weight=rearrange(self.conv1d.weight, "d 1 w -> d w"),
3632
  bias=self.conv1d.bias,
3633
+ activation=self.activation,
3634
  ).transpose(1, 2) # (B, L, self.d_inner + 2 * ngroups * d_state)
3635
 
3636
  x, B, C = torch.split(
 
3643
  B = rearrange(B, "b l (g n) -> b l g n", g=self.ngroups)
3644
  C = rearrange(C, "b l (g n) -> b l g n", g=self.ngroups)
3645
 
3646
+ if self.config.mamba3_is_id_rms:
3647
+ B = self.B_norm(B)
3648
+ C = self.C_norm(C)
3649
 
3650
  if self.ngroups != self.nheads:
3651
  B = B.expand(-1, -1, self.nheads, -1) # (B, L, N, S)
3652
  C = C.expand(-1, -1, self.nheads, -1) # (B, L, N, S)
3653
 
3654
+ if self.config.mamba3_rope:
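+ # data-dependent rotary for the SSM: per-token angles from rope_proj are accumulated through dt (angle_dt) and applied to C and B (the Q/K analogues); rotary_qk also returns sum(C*B), mirroring the CB_sum computed in the non-rope branch below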
3655
+ angle = self.rope_proj(hidden_states) # (B, L, S)
3656
+ angle = angle.unsqueeze(-2).expand(-1, -1, self.nheads, -1) # (B, L, G, S)
3657
+ angle = angle_dt(angle, dt)
3658
 
3659
+ C, B, CB_sum = rotary_qk(q=C, k=B, angle=angle, bias_q=self.C_bias, bias_k=self.B_bias, conjugate=False, inplace=False)
3660
+ else:
3661
+ if not self.config.mamba3_remove_BC_bias:
3662
+ og_dtpe = B.dtype
3663
+ B = (B + self.B_bias).to(og_dtpe)
3664
+ C = (C + self.C_bias).to(og_dtpe)
3665
+
3666
+ CB_sum = torch.sum(
3667
+ B.to(torch.float32)*C.to(torch.float32),
3668
+ dim=-1,
3669
+ keepdim=False
3670
+ )
3671
 
3672
  x = rearrange(x, "b l (h p) -> b l h p", p=self.headdim)
3673
 
3674
  A = _A * dt
3675
  gating_factor = dt # B, L, N
3676
 
3677
+ if self.config.mamba3_add_trapezoid:
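+ # trapezoid rule: dt is split between the current step (gamma = trap*dt) and the previous step carried through the decay (beta = (1-trap)*dt*alpha), with alpha = exp(A) the per-step decay; the rolls pair each position with the next step's coefficients for the fused scan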
3678
+ trap = F.sigmoid(self.trapezoid_proj(hidden_states)) # (B, L, N)
3679
 
3680
+ alpha_arr = torch.exp(A)
3681
+ beta_arr = (1-trap)*gating_factor*alpha_arr
3682
+ gamma_arr = trap*gating_factor
3683
 
3684
+ # roll alpha and beta to the left by 1
3685
+ _alpha_arr = torch.roll(alpha_arr, shifts=-1, dims=1)
3686
+ _beta_arr = torch.roll(beta_arr, shifts=-1, dims=1)
3687
 
3688
+ x_scalar = (gamma_arr*_alpha_arr + _beta_arr).to(torch.bfloat16)
3689
+ else:
3690
+ alpha_arr = torch.exp(A)
3691
+ beta_arr = torch.zeros_like(alpha_arr)
3692
+ gamma_arr = gating_factor
3693
+
3694
+ # roll alpha to the left by 1
3695
+ _alpha_arr = torch.roll(alpha_arr, shifts=-1, dims=1)
3696
+
3697
+ x_scalar = (gamma_arr*_alpha_arr).to(torch.bfloat16)
3698
+
3699
+ ssm_cache = None
3700
+ if cache_params is not None:
3701
+ ssm_cache = cache_params.ssm_caches[self.layer_idx]
3702
 
3703
+ out = mamba_chunk_scan_discretized_combined(
3704
  x=x.bfloat16(),
3705
  A=A,
3706
  B=B.bfloat16(),
 
3710
  gamma=gamma_arr,
3711
  CB_sum=CB_sum,
3712
  D=self.D,
3713
+ z=None,
3714
+ initial_states=ssm_cache,
3715
+ return_final_states=cache_params is not None,
3716
  )
3717
 
3718
+ if cache_params is not None:
3719
+ y, ssm_cache = out
3720
+ cache_params.ssm_caches[self.layer_idx] = ssm_cache
3721
+ else:
3722
+ y = out
3723
+
3724
  return y, None, None
3725
 
3726
+ class DragonMamba2(nn.Module):
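+ # plain Mamba-2 mixer (no z gating, no output norm), built directly on mamba_ssm's mamba_chunk_scan_combined kernel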
3727
+ def __init__(self, config: DragonConfig, layer_idx: Optional[int]):
3728
+ super().__init__()
3729
+ self.d_model = config.hidden_size
3730
+ self.d_state = 128
3731
+ self.expand = 2
3732
+ self.d_inner = self.expand * self.d_model
3733
+ self.headdim = 64
3734
+ self.ngroups = config.mamba_ngroups
3735
+ assert self.d_inner % self.headdim == 0
3736
+ self.nheads = self.d_inner // self.headdim
3737
+ self.layer_idx = layer_idx
3738
+
3739
+ # Order: [x, B, C, dt]
3740
+ d_in_proj = self.d_inner + 2 * self.ngroups * self.d_state + self.nheads
3741
+ self.in_proj = DragonLinear(config, self.d_model, d_in_proj, bias=False)
3742
+
3743
+ conv_dim = self.d_inner + 2 * self.ngroups * self.d_state
3744
+ self.conv1d = nn.Conv1d(
3745
+ in_channels=conv_dim,
3746
+ out_channels=conv_dim,
3747
+ bias=False,
3748
+ kernel_size=4,
3749
+ groups=conv_dim,
3750
+ padding=4-1,
3751
+ )
3752
+ self.act = nn.SiLU()
3753
+
3754
+ # Initialize log dt bias
3755
+ dt_min=0.001
3756
+ dt_max=0.1
3757
+ dt_init_floor=1e-4
3758
+ dt_limit=(0.0, float("inf"))
3759
+ dt = torch.exp(torch.rand(self.nheads) * (math.log(dt_max) - math.log(dt_min)) + math.log(dt_min))
3760
+ dt = torch.clamp(dt, min=dt_init_floor)
3761
+ # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
3762
+ inv_dt = dt + torch.log(-torch.expm1(-dt))
3763
+ self.dt_bias = nn.Parameter(inv_dt)
3764
+ self.dt_bias._no_weight_decay = True
3765
+
3766
+ # A parameter
3767
+ A_init_range=(1, 16)
3768
+ assert A_init_range[0] > 0 and A_init_range[1] >= A_init_range[0]
3769
+ A = torch.empty(self.nheads, dtype=torch.float32).uniform_(*A_init_range)
3770
+ A_log = torch.log(A)
3771
+ self.A_log = nn.Parameter(A_log)
3772
+ self.A_log._no_weight_decay = True
3773
+
3774
+ # D "skip" parameter
3775
+ self.D = nn.Parameter(torch.ones(self.nheads))
3776
+ self.D._no_weight_decay = True
3777
+
3778
+ def forward(self, hidden_states, **kwargs):
3779
+ """
3780
+ hidden_states: (B, L, D)
3781
+ Returns: same shape as hidden_states
3782
+ """
3783
+ _, seqlen, _ = hidden_states.shape
3784
+
3785
+ zxbcdt = self.in_proj(hidden_states) # (B, L, d_in_proj)
3786
+ A = -torch.exp(self.A_log) # (nheads,)
3787
+
3788
+ xBC, dt = torch.split(
3789
+ zxbcdt, [self.d_inner + 2 * self.ngroups * self.d_state, self.nheads], dim=-1
3790
+ )
3791
+ dt = F.softplus(dt + self.dt_bias) # (B, L, nheads)
3792
+
3793
+ # 1D Convolution
3794
+ if causal_conv1d_fn is None:
3795
+ xBC = self.act(
3796
+ self.conv1d(xBC.transpose(1, 2)).transpose(1, 2)
3797
+ ) # (B, L, self.d_inner + 2 * ngroups * d_state)
3798
+ xBC = xBC[:, :seqlen, :]
3799
+ else:
3800
+ xBC = causal_conv1d_fn(
3801
+ x=xBC.transpose(1, 2),
3802
+ weight=rearrange(self.conv1d.weight, "d 1 w -> d w"),
3803
+ bias=self.conv1d.bias,
3804
+ activation="swish",
3805
+ ).transpose(1, 2)
3806
+
3807
+ # Split into 3 main branches: X, B, C
3808
+ # These correspond to V, K, Q respectively in the SSM/attention duality
3809
+ x, B, C = torch.split(xBC, [self.d_inner, self.ngroups * self.d_state, self.ngroups * self.d_state], dim=-1)
3810
+ y = mamba_chunk_scan_combined(
3811
+ rearrange(x, "b l (h p) -> b l h p", p=self.headdim),
3812
+ dt,
3813
+ A,
3814
+ rearrange(B, "b l (g n) -> b l g n", g=self.ngroups),
3815
+ rearrange(C, "b l (g n) -> b l g n", g=self.ngroups),
3816
+ chunk_size=256,
3817
+ D=self.D,
3818
+ z=None,
3819
+ seq_idx=None,
3820
+ initial_states=None,
3821
+ )
3822
+
3823
+ return y, None, None
3824
 
3825
  class DragonMamba3Mimo(nn.Module):
3826
  def __init__(self, config: DragonConfig, layer_idx: Optional[int]):
 
3839
  self.conv_init = None
3840
  self.expand = 2
3841
  self.headdim = 128
3842
+ self.ngroups = config.mamba_ngroups
3843
  self.activation = "swish"
3844
  self.bias = False
3845
  self.conv_bias = True
 
3873
  # Order: [z, x, B, C, dt]
3874
  d_in_proj = 2 * self.d_inner + 2 * self.d_state * self.ngroups * self.mimo_dim + self.nheads
3875
 
3876
+ self.A_proj = DragonLinear(config, self.d_model, self.nheads, bias=False, dtype=torch.float32)
3877
  self.trapezoid_proj = DragonLinear(config, self.d_model, self.nheads, bias=False)
3878
 
3879
  _dt = torch.exp(
 
3887
 
3888
  self.in_proj = DragonLinear(config, self.d_model, d_in_proj, bias=self.bias)
3889
 
3890
+ self.B_bias = nn.Parameter(torch.ones((self.mimo_dim, self.nheads, self.d_state)), requires_grad=True)
3891
+ self.C_bias = nn.Parameter(torch.ones((self.mimo_dim, self.nheads, self.d_state)), requires_grad=True)
3892
+
3893
  self.B_norm = DragonNorm(config, self.d_state)
3894
  self.C_norm = DragonNorm(config, self.d_state)
3895
 
 
3924
 
3925
  def forward(self, hidden_states, **kwargs):
3926
  # Apply in_proj
3927
+ zxBCdt = self.in_proj(hidden_states)
3928
  z, xBC, dd_dt = torch.split(
3929
+ zxBCdt,
3930
  [
3931
  self.d_inner,
3932
  self.d_inner + 2 * self.d_state * self.ngroups * self.mimo_dim,
 
3988
  C = self.C_norm(C)
3989
 
3990
  if self.ngroups != self.nheads:
3991
+ B = B.expand(-1, -1, -1, self.nheads, -1) # (B, L, R, N, S)
3992
+ C = C.expand(-1, -1, -1, self.nheads, -1) # (B, L, R, N, S)
3993
+
3994
 
3995
  angle = self.rope_proj(hidden_states) # (B, L, S)
3996
  angle = angle.unsqueeze(-2).expand(-1, -1, self.nheads, -1) # (B, L, G, S)
3997
  angle = angle_dt(angle, dt)
3998
 
3999
+ C, B, CB_sum = mimo_rotary_qk(q=C, k=B, angle=angle, bias_q=self.C_bias, bias_k=self.B_bias, conjugate=False, inplace=False)
4000
 
4001
  x = rearrange(x, "b l r (h p) -> b l r h p", p=self.headdim)
4002
 
 
4017
 
4018
  z = rearrange(z, "b l r (h p) -> b l r h p", p=self.headdim)
4019
 
4020
+ y = mamba_mimo_chunk_scan_discretized_fused_combined(
4021
  x=x.bfloat16(),
4022
  A=A.bfloat16(),
4023
  B=B.bfloat16(),
 
4031
  )
4032
 
4033
  y = rearrange(y, "b l r h p -> b l r (h p)")
4034
+ #if seqlen_og is not None:
4035
+ # y = rearrange(y, "b l r d -> (b l) r d")
4036
 
4037
  # Perform MIMO down projection (mimo_rank*d_inner -> d_inner)
4038
  y = rearrange(y, "b l r d -> b l (r d)")
4039
  y = rearrange(y, "b l (g d) -> b l g d", g=self.mimo_dim*self.mimo_proj_block_order)
4040
  y = torch.einsum("blgd,drg->bldr", y, self.out_proj_mimo)
4041
  y = rearrange(y, "b l d r -> b l (d r)")
4042
+ y = rearrange(y, "b l (h d) -> b l h d", d=self.headdim)
4043
 
4044
  return y, None, None
4045
 
4046
  class DragonMLP(nn.Module):
4047
+ def __init__(self, config: DragonConfig, intermediate_size: Optional[int] = None):
4048
  super().__init__()
4049
  self.config = config
4050
+ intermediate_size = intermediate_size or config.intermediate_size
4051
  #print("previous MLP : ", PREVIOUS_MLP)
4052
  self.link_size = 16
4053
  self.mlp_linking = config.mlp_linking and PREVIOUS_MLP is not None
4054
  if self.mlp_linking:
4055
  self.previous_mlp = PREVIOUS_MLP
4056
+ self.fc_1 = DragonLinear(config, config.hidden_size, intermediate_size, bias=False)
4057
  self.lambda1 = nn.Parameter(torch.zeros(self.link_size)) # sigmoid->0.5
4058
  else :
4059
+ self.fc_1 = DragonLinear(config, config.hidden_size, intermediate_size, bias=False)
4060
+ self.fc_2 = DragonLinear(config, intermediate_size, config.hidden_size, bias=False)
4061
  self.register_buffer("_2_sqrt_5", torch.tensor(2/math.sqrt(5)) if config.use_uscaling else torch.tensor(1.), persistent=False)
4062
 
4063
  def forward(self, hidden_states):
 
4075
  return hidden_states
4076
 
4077
  def get_mlp_link(self):
4078
+ mlp_link = self.mlp_link
4079
+ self.mlp_link = None
4080
+ return mlp_link
4081
+
4082
+ class DragonGatedMLP(nn.Module):
4083
+ def __init__(self, config: DragonConfig, intermediate_size: Optional[int] = None, num_active_experts: int = 1):
4084
+ super().__init__()
4085
+ self.config = config
4086
+ self.intermediate_size = intermediate_size
4087
+
4088
+ self.fc_1 = DragonLinear(config, config.hidden_size, num_active_experts*self.intermediate_size, bias=False)
4089
+ self.fc_2 = DragonLinear(config, num_active_experts*self.intermediate_size, config.hidden_size, bias=False)
4090
+ self.register_buffer("_2_sqrt_5", torch.tensor(2/math.sqrt(5)) if config.use_uscaling else torch.tensor(1.), persistent=False)
4091
+
4092
+ def forward(self, hidden_states, gates):
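+ # gates: (B, L, E) routing weights; every expert slice of the fused fc_1 output is computed and scaled by its gate before the shared fc_2 down-projection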
4093
+ B, L, D = hidden_states.size()
4094
+ hidden_states = self.fc_1(hidden_states) # (B, L, E*D)
4095
+ hidden_states = self._2_sqrt_5 * F.relu(hidden_states).square().view(B, L, -1, self.intermediate_size) # (B, L, E, D)
4096
+ hidden_states = hidden_states * gates.unsqueeze(-1) # (B, L, E, D)
4097
+ hidden_states = self.fc_2(hidden_states.view(B, L, -1)) # (B, L, D)
4098
+ return hidden_states
4099
+
4100
+ class DragonMoE(nn.Module):
4101
+ def __init__(self, config: DragonConfig):
4102
+ super().__init__()
4103
+ self.config = config
4104
+ self.num_experts = config.moe_num_routed_experts
4105
+ self.routed_scaling_factor = config.moe_routed_scaling_factor
4106
+
4107
+ self.router = DragonLinear(config, config.hidden_size, self.num_experts, bias=False, dtype=torch.float32)
4108
+ self.experts = DragonGatedMLP(config, config.moe_routed_intermediate_size, self.num_experts)
4109
+ if config.moe_shared_intermediate_size > 0:
4110
+ self.shared_expert = DragonMLP(config, config.moe_shared_intermediate_size)
4111
+
4112
+ def forward(self, hidden_states):
4113
+ # compute gating score.
4114
+ weights = F.sigmoid(self.router(hidden_states.to(torch.float32))) # (B, L, experts)
4115
+ weights = weights / weights.sum(dim=-1, keepdim=True) # (B, L, experts)
4116
+ weights = (weights * self.routed_scaling_factor).to(hidden_states.dtype)
4117
+ # forward through (routed) experts.
4118
+ y = self.experts(hidden_states, weights) # (B, L, D)
4119
+ # forward through shared expert.
4120
+ if self.config.moe_shared_intermediate_size > 0:
4121
+ y = y + self.shared_expert(hidden_states)
4122
+ return y
4123
 
4124
  PREVIOUS_MLP = None
4125
  class DragonMonoBlock(GradientCheckpointingLayer):
 
4194
  head_dim = self.mixer.headdim
4195
  num_attention_heads = self.mixer.nheads
4196
  use_gate = config.gate_gdn
4197
+ elif layer_type == '2':
4198
+ self.mixer = DragonMamba2(config, layer_idx=layer_idx)
4199
+ head_dim = self.mixer.headdim
4200
+ num_attention_heads = self.mixer.nheads
4201
+ use_gate = config.gate_gdn
4202
+ elif layer_type == 'M':
4203
+ self.mixer = DragonMamba3Mimo(config, layer_idx=layer_idx)
4204
+ head_dim = self.mixer.headdim
4205
+ num_attention_heads = self.mixer.nheads
4206
+ use_gate = False # inside Mamba3Mimo
4207
  else:
4208
  raise ValueError(f"Unknown layer type: {layer_type}")
4209
 
 
4248
 
4249
  self.input_norm = DragonNorm(config, config.hidden_size)
4250
  self.postmixer_norm = DragonNorm(config, config.hidden_size)
4251
+ if not config.moe:
4252
+ self.mlp = DragonMLP(config)
4253
+ else:
4254
+ self.mlp = DragonMoE(config)
4255
  global PREVIOUS_MLP
4256
  PREVIOUS_MLP = self.mlp
4257
 
 
4267
  cache_position: Optional[torch.LongTensor] = None,
4268
  position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
4269
  key_value_last_layer: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
4270
+ cu_seqlens: Optional[torch.Tensor] = None,
4271
+ max_seqlen: Optional[int] = None,
4272
  **kwargs,
4273
  ):
4274
  # MIXER.
 
4280
  position_ids=position_ids,
4281
  cache_params=cache_params,
4282
  key_value_last_layer=key_value_last_layer,
4283
+ cu_seqlens=cu_seqlens,
4284
+ max_seqlen=max_seqlen,
4285
  ) # (B, L, E*D)
4286
  if self.use_gate:
4287
  if self.config.gate_type == "elementwise" or self.config.gate_type == "kimi":
 
4459
  self.embedding = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
4460
  self.layers = nn.ModuleList([DragonBlock(config, layer_idx=i, layer_type=layer) if layer in ['l', 'r', 'd'] else DragonMonoBlock(config, layer_idx=i, layer_type=layer) for i, layer in enumerate(config.layers_config)])
4461
 
4462
+ if self.config.rope_type_global != '' or self.config.rope_type_local != '':
4463
+ self.rotary_emb = DragonRotaryEmbedding(config, head_dim=config.head_dim if config.head_dim else (config.expand_factor*config.hidden_size)//config.num_attention_heads, theta=config.rope_theta_local) # only for SWA
4464
+ else:
4465
+ self.rotary_emb = None
4466
+
4467
+ if self.config.final_norm:
4468
+ self.final_norm = DragonNorm(config, config.hidden_size)
4469
 
4470
  self.gradient_checkpointing = False
4471
  self.post_init()
 
4486
  cache_position: Optional[torch.LongTensor] = None,
4487
  output_hidden_states: Optional[bool] = None,
4488
  inputs_embeds: Optional[torch.FloatTensor] = None,
4489
+ cu_seqlens: Optional[torch.Tensor] = None,
4490
+ max_seqlen: Optional[int] = None,
4491
  **kwargs
4492
  ) -> DragonOutput:
4493
  B, L = input_ids.shape if input_ids is not None else inputs_embeds.shape[:2]
 
4531
 
4532
  all_hidden_states = () if output_hidden_states else None
4533
 
4534
+ if self.rotary_emb is not None:
4535
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
4536
+ else:
4537
+ position_embeddings = None
4538
 
4539
  shared_kv = (None, None)
4540
  for block in self.layers:
 
4548
  cache_position=cache_position,
4549
  position_embeddings=position_embeddings,
4550
  key_value_last_layer=shared_kv,
4551
+ cu_seqlens=cu_seqlens,
4552
+ max_seqlen=max_seqlen,
4553
  **kwargs,
4554
  )
4555
  shared_kv = (last_k, last_v)
4556
 
4557
+ if self.config.final_norm:
4558
+ hidden_states = self.final_norm(hidden_states)
4559
 
4560
  if output_hidden_states:
4561
  all_hidden_states = all_hidden_states + (hidden_states,)
 
4588
  cache_position: Optional[torch.Tensor] = None,
4589
  output_hidden_states: Optional[bool] = None,
4590
  attention_mask: Optional[torch.Tensor] = None,
4591
+ just_loss: Optional[bool] = False,
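+ # when just_loss is True, only the loss is populated in the returned DragonCausalLMOutput (logits / cache / hidden states come back as None)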
4592
+ cu_seqlens: Optional[torch.Tensor] = None,
4593
+ max_seqlen: Optional[int] = None,
4594
  token_type_ids=None,
4595
  **kwargs,
4596
  ) -> DragonCausalLMOutput:
 
4605
  cache_position=cache_position,
4606
  inputs_embeds=inputs_embeds,
4607
  output_hidden_states=output_hidden_states,
4608
+ cu_seqlens=cu_seqlens,
4609
+ max_seqlen=max_seqlen,
4610
  **kwargs,
4611
  )
4612
 
 
4650
 
4651
  return DragonCausalLMOutput(
4652
  loss=loss,
4653
+ logits=logits if not just_loss else None,
4654
+ past_key_values=outputs.past_key_values if not just_loss else None,
4655
+ hidden_states=outputs.hidden_states if not just_loss else None,
4656
  )
4657
  DragonForCausalLM.register_for_auto_class("AutoModelForCausalLM")
4658
 
training_dragon.py CHANGED
@@ -35,8 +35,8 @@ class NanoArgs:
35
  head_dim: Optional[int] = None
36
  layers_config : str = 4*"lrdlr"
37
  expand_factor : int = 2 # expand factor for Mamba/Dragon
38
- rope_type_local: str = "rope" #p-rope
39
- rope_type_global: str = "rope" #p-rope
40
  rope_theta_local: float = 10000.0
41
  rope_theta_global: float = 0.0
42
  eps_rmsnorm: float = 1e-6
@@ -54,8 +54,18 @@ class NanoArgs:
54
  scalar_proj_as_hidden_matrix: bool = True
55
  normalization_type: str = "rmsnorm" # rmsnorm, seednorm
56
  seednorm_wd: bool = True
 
 
57
  mixer_gn: bool = True
58
  mlp_linking : bool = False
 
 
 
 
 
 
 
 
59
 
60
  # attention related
61
  n_kv_heads : int = 0
@@ -93,6 +103,14 @@ class NanoArgs:
93
  shrink_qk_gdn: int = 2
94
  kda_allow_neg_eigval: bool = False
95
  kda_num_v_heads: Optional[int] = None
 
 
 
 
 
 
 
 
96
 
97
  # optim
98
  optim: str = "adamw" # adamw, spam, stable-spam, muon, muon_moonlight, splus
@@ -120,7 +138,9 @@ class NanoArgs:
120
 
121
  # data
122
  vocab_size: int = 50304
 
123
  sequence_length: int = 1024
 
124
  input_bin: Optional[str] = None
125
  input_val_bin: Optional[str] = None
126
 
@@ -138,6 +158,7 @@ class NanoArgs:
138
  load_optim: bool = True
139
  load_sched: bool = True
140
  compile: bool = True
 
141
 
142
  # used during training
143
  slw_window: int = 0
@@ -166,9 +187,11 @@ def _load_data_shard(filename):
166
  return tokens
167
 
168
  class DistributedDataLoader:
169
- def __init__(self, filename_pattern, B, T, process_rank, num_processes):
170
  self.process_rank = process_rank
171
  self.num_processes = num_processes
 
 
172
  self.B = B # micro batch size
173
  self.T = T
174
 
@@ -221,12 +244,32 @@ class DistributedDataLoader:
221
  x = torch.from_numpy(buf.reshape(B, T)) # inputs
222
  y = torch.from_numpy(buf.reshape(B, T)) # targets
223
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
  # advance current position and load next shard if necessary
225
  self.current_position += B * T * self.num_processes
226
  if self.current_position + (B * T * self.num_processes + 1) > len(self.tokens):
227
  self.advance()
228
 
229
- return x.cuda(), y.cuda()
230
 
231
  def param_groups_mup(model, base_lr_hidden, base_lr_scalar, base_lr_embed, base_lr_head, wd):
232
  groups, seen = [], set()
@@ -277,6 +320,11 @@ def param_groups_mup(model, base_lr_hidden, base_lr_scalar, base_lr_embed, base_
277
 
278
  args = tyro.cli(NanoArgs)
279
 
 
 
 
 
 
280
  # set up DDP (distributed data parallel).
281
  assert torch.cuda.is_available()
282
  dist.init_process_group(
@@ -293,6 +341,8 @@ torch.cuda.set_device(device)
293
  print(f"using device: {device}")
294
  master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
295
  torch._dynamo.config.optimize_ddp=False
 
 
296
 
297
  # setup logging.
298
  resume_dir = None
@@ -363,16 +413,33 @@ if args.patch_level_training:
363
  assert args.batch_size % (B * ddp_world_size) == 0
364
  accumulation_steps = args.batch_size // (B * ddp_world_size)
365
 
 
 
366
  # load dataloaders.
367
  #if args.patch_level_training:
368
  # assert T % args.patch_level_training_size == 0, "sequence length must be divisible by patch level training size in reduced mode"
369
- train_loader = DistributedDataLoader(args.input_bin, B, T, ddp_rank, ddp_world_size)
370
- val_loader = DistributedDataLoader(args.input_val_bin, B, T, ddp_rank, ddp_world_size)
371
  print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
372
  print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
373
 
374
  # load model.
375
  config_hf = DragonConfig(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
376
  mla_kv_rank=args.mla_kv_rank,
377
  rope_gdn=args.rope_gdn,
378
  shrink_qk_da=args.shrink_qk_da,
@@ -402,6 +469,8 @@ config_hf = DragonConfig(
402
  zero_centered_gate=args.zero_centered_gate,
403
  zero_centered_gate_type=args.zero_centered_gate_type,
404
  scalable_softmax=args.scalable_softmax,
 
 
405
  resformer=args.resformer,
406
  gate_type=args.gate_type,
407
  gate_act=args.gate_act,
@@ -461,7 +530,7 @@ with torch.no_grad():
461
  # count params. (total & active)
462
  num_params = sum(p.numel() for p in model.parameters())
463
  """model.eval()
464
- x, y = train_loader.next_batch()
465
  with torch.amp.autocast(device_type='cuda', dtype=torch.bfloat16):
466
  model(input_ids=x[[0], [0]].unsqueeze(0)).logits.sum().backward()
467
  num_active = sum(p.grad.count_nonzero() for p in model.parameters() if p.grad is not None)
@@ -472,12 +541,16 @@ print0(f"number of total parameters: {num_params}")
472
 
473
  # DDP & compile.
474
  uncompiled_model = model
475
- model = torch.compile(model, dynamic=True) if args.compile else model
476
  model.train()
477
  model = DDP(model, device_ids=[ddp_local_rank], find_unused_parameters=args.resformer)
478
  raw_model = model.module
479
  ctx = torch.amp.autocast(device_type='cuda', dtype=torch.bfloat16)
480
 
 
 
 
 
481
  # load optimizers & schedulers.
482
  if args.use_uscaling:
483
  #assert args.optim == "adamw", "uscaling is only supported with AdamW optimizer currently"
@@ -553,9 +626,7 @@ WARMUP_SKIP = 10
553
 
554
  # begin training.
555
  train_loader.reset()
556
- #tokenizer = transformers.AutoTokenizer.from_pretrained("openai-community/gpt2", use_fast=True) # for saving
557
- tokenizer = transformers.AutoTokenizer.from_pretrained("/leonardo_work/BOOST_LCustodi/script/training/temp/hf_models/gpt2", use_fast=True)
558
- x, y = train_loader.next_batch()
559
 
560
  for iter_ in range(start_iter, start_iter+args.total_iterations+1):
561
  last_iter = (iter_ == start_iter+args.total_iterations)
@@ -588,9 +659,9 @@ for iter_ in range(start_iter, start_iter+args.total_iterations+1):
588
  val_loss = torch.zeros((), device=device, dtype=torch.float32)
589
  for _ in range(args.val_iterations):
590
  for _ in range(accumulation_steps):
591
- inputs, targets = val_loader.next_batch()
592
  with ctx:
593
- val_loss += model(input_ids=inputs, labels=targets).loss.detach()
594
  val_loss /= args.val_iterations * accumulation_steps
595
  dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
596
  val_loss = val_loss.item()
@@ -641,10 +712,10 @@ for iter_ in range(start_iter, start_iter+args.total_iterations+1):
641
  for i in range(1, accumulation_steps+1):
642
  # forward pass.
643
  with ctx:
644
- loss = model(input_ids=x, labels=y).loss
645
  train_loss = loss.detach()
646
  # prepare next batch.
647
- x, y = train_loader.next_batch()
648
  # backward pass.
649
  if i < accumulation_steps:
650
  with model.no_sync():
 
35
  head_dim: Optional[int] = None
36
  layers_config : str = 4*"lrdlr"
37
  expand_factor : int = 2 # expand factor for Mamba/Dragon
38
+ rope_type_local: str = "" #p-rope
39
+ rope_type_global: str = "" #p-rope
40
  rope_theta_local: float = 10000.0
41
  rope_theta_global: float = 0.0
42
  eps_rmsnorm: float = 1e-6
 
54
  scalar_proj_as_hidden_matrix: bool = True
55
  normalization_type: str = "rmsnorm" # rmsnorm, seednorm
56
  seednorm_wd: bool = True
57
+ seednorm_type: int = 1
58
+ seednorm_rank: int = 1
59
  mixer_gn: bool = True
60
  mlp_linking : bool = False
61
+ final_norm: bool = True
62
+
63
+ # MoE
64
+ moe: bool = False
65
+ moe_num_routed_experts: int = 2
66
+ moe_routed_scaling_factor: float = 2.5
67
+ moe_routed_intermediate_size: int = 768
68
+ moe_shared_intermediate_size: int = 768
69
 
70
  # attention related
71
  n_kv_heads : int = 0
 
103
  shrink_qk_gdn: int = 2
104
  kda_allow_neg_eigval: bool = False
105
  kda_num_v_heads: Optional[int] = None
106
+ mamba_mimo_dim: Optional[int] = 2
107
+ mamba_ngroups: Optional[int] = 1
108
+ mamba3_rope: bool = True
109
+ mamba3_remove_BC_bias: bool = False
110
+ mamba3_is_id_rms: bool = True
111
+ mamba3_remove_conv: bool = True
112
+ mamba3_is_A_dd: bool = True
113
+ mamba3_add_trapezoid: bool = True
114
 
115
  # optim
116
  optim: str = "adamw" # adamw, spam, stable-spam, muon, muon_moonlight, splus
 
138
 
139
  # data
140
  vocab_size: int = 50304
141
+ bos_id: int = 50256
142
  sequence_length: int = 1024
143
+ intra_doc_masking: bool = False
144
  input_bin: Optional[str] = None
145
  input_val_bin: Optional[str] = None
146
 
 
158
  load_optim: bool = True
159
  load_sched: bool = True
160
  compile: bool = True
161
+ compile_dynamic: bool = False
162
 
163
  # used during training
164
  slw_window: int = 0
 
187
  return tokens
188
 
189
  class DistributedDataLoader:
190
+ def __init__(self, filename_pattern, intra_doc_masking, B, T, process_rank, num_processes, bos_id):
191
  self.process_rank = process_rank
192
  self.num_processes = num_processes
193
+ self.intra_doc_masking = intra_doc_masking
194
+ self.bos_id = bos_id
195
  self.B = B # micro batch size
196
  self.T = T
197
 
 
244
  x = torch.from_numpy(buf.reshape(B, T)) # inputs
245
  y = torch.from_numpy(buf.reshape(B, T)) # targets
246
 
247
+ # compute cu_seqlens, max_seqlen and per-document position_ids for intra-document masking
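+ # e.g. T=8 with bos tokens at positions {0, 3, 5}: seqlens=[3, 2, 3], cu_seqlens=[0, 3, 5, 8], max_seqlen=3, position_ids=[[0, 1, 2, 0, 1, 0, 1, 2]]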
248
+ cu = None
249
+ maxlen = None
250
+ position_ids = None
251
+ if self.intra_doc_masking:
252
+ assert self.B == 1
253
+ starts = (x == self.bos_id).nonzero(as_tuple=True)[1].to(torch.long)
254
+ if starts.numel() == 0 or starts[0] != 0:
255
+ starts = torch.cat([torch.zeros(1, dtype=torch.long), starts])
256
+ ends = torch.cat([starts[1:], torch.tensor([x.numel()])])
257
+ seqlens = (ends - starts).to(torch.int32)
258
+ # cu_seqlens, max_seqlen.
259
+ cu = torch.cat([torch.zeros(1, dtype=torch.int32), seqlens.cumsum(0)]).cuda().to(torch.int32)
260
+ maxlen = int(seqlens.max())
261
+ # position_ids.
262
+ lengths = seqlens.to(torch.long)
263
+ starts_per_token = torch.repeat_interleave(starts.to(torch.long), lengths)
264
+ idx = torch.arange(T, device=x.device, dtype=torch.long)
265
+ position_ids = (idx - starts_per_token).unsqueeze(0).cuda() # keep on device, like cu_seqlens
266
+
267
  # advance current position and load next shard if necessary
268
  self.current_position += B * T * self.num_processes
269
  if self.current_position + (B * T * self.num_processes + 1) > len(self.tokens):
270
  self.advance()
271
 
272
+ return x.cuda(), y.cuda(), cu, maxlen, position_ids
273
 
274
  def param_groups_mup(model, base_lr_hidden, base_lr_scalar, base_lr_embed, base_lr_head, wd):
275
  groups, seen = [], set()
 
320
 
321
  args = tyro.cli(NanoArgs)
322
 
323
+ if args.intra_doc_masking:
324
+ if args.device_batch_size != 1:
325
+ args.device_batch_size = 1
326
+ print("!!! Forcing device_batch_size to 1 for intra-document masking !!!")
327
+
328
  # set up DDP (distributed data parallel).
329
  assert torch.cuda.is_available()
330
  dist.init_process_group(
 
341
  print(f"using device: {device}")
342
  master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
343
  torch._dynamo.config.optimize_ddp=False
344
+ if args.compile_dynamic:
345
+ torch._dynamo.config.allow_unspec_int_on_nn_module=True
346
 
347
  # setup logging.
348
  resume_dir = None
 
413
  assert args.batch_size % (B * ddp_world_size) == 0
414
  accumulation_steps = args.batch_size // (B * ddp_world_size)
415
 
416
+ tokenizer = transformers.AutoTokenizer.from_pretrained("/leonardo_work/BOOST_LCustodi/script/training/temp/hf_models/gpt2", use_fast=True)
417
+
418
  # load dataloaders.
419
  #if args.patch_level_training:
420
  # assert T % args.patch_level_training_size == 0, "sequence length must be divisible by patch level training size in reduced mode"
421
+ train_loader = DistributedDataLoader(args.input_bin, args.intra_doc_masking, B, T, ddp_rank, ddp_world_size, args.bos_id)
422
+ val_loader = DistributedDataLoader(args.input_val_bin, args.intra_doc_masking, B, T, ddp_rank, ddp_world_size, args.bos_id)
423
  print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
424
  print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
425
 
426
  # load model.
427
  config_hf = DragonConfig(
428
+ mamba3_rope=args.mamba3_rope,
429
+ mamba3_remove_BC_bias=args.mamba3_remove_BC_bias,
430
+ mamba3_is_id_rms=args.mamba3_is_id_rms,
431
+ mamba3_remove_conv=args.mamba3_remove_conv,
432
+ mamba3_is_A_dd=args.mamba3_is_A_dd,
433
+ mamba3_add_trapezoid=args.mamba3_add_trapezoid,
434
+ moe=args.moe,
435
+ moe_num_routed_experts=args.moe_num_routed_experts,
436
+ moe_routed_scaling_factor=args.moe_routed_scaling_factor,
437
+ moe_routed_intermediate_size=args.moe_routed_intermediate_size,
438
+ moe_shared_intermediate_size=args.moe_shared_intermediate_size,
439
+ intra_doc_masking=args.intra_doc_masking,
440
+ seednorm_rank=args.seednorm_rank,
441
+ seednorm_type=args.seednorm_type,
442
+ final_norm=args.final_norm,
443
  mla_kv_rank=args.mla_kv_rank,
444
  rope_gdn=args.rope_gdn,
445
  shrink_qk_da=args.shrink_qk_da,
 
469
  zero_centered_gate=args.zero_centered_gate,
470
  zero_centered_gate_type=args.zero_centered_gate_type,
471
  scalable_softmax=args.scalable_softmax,
472
+ mamba_mimo_dim=args.mamba_mimo_dim,
473
+ mamba_ngroups=args.mamba_ngroups,
474
  resformer=args.resformer,
475
  gate_type=args.gate_type,
476
  gate_act=args.gate_act,
 
530
  # count params. (total & active)
531
  num_params = sum(p.numel() for p in model.parameters())
532
  """model.eval()
533
+ x, y, _, _, _ = train_loader.next_batch()
534
  with torch.amp.autocast(device_type='cuda', dtype=torch.bfloat16):
535
  model(input_ids=x[[0], [0]].unsqueeze(0)).logits.sum().backward()
536
  num_active = sum(p.grad.count_nonzero() for p in model.parameters() if p.grad is not None)
 
541
 
542
  # DDP & compile.
543
  uncompiled_model = model
544
+ model = torch.compile(model, dynamic=args.compile_dynamic) if args.compile else model
545
  model.train()
546
  model = DDP(model, device_ids=[ddp_local_rank], find_unused_parameters=args.resformer)
547
  raw_model = model.module
548
  ctx = torch.amp.autocast(device_type='cuda', dtype=torch.bfloat16)
549
 
550
+ if args.intra_doc_masking:
551
+ print0("!!! Using intra-document masking !!!")
552
+ print0("It is only compatible with GDN (conv+chunk), DA and GDTPA layers. For DA/GDTPA, kv shift is also compatible. All other config will not have intra-doc masking support!!")
553
+
554
  # load optimizers & schedulers.
555
  if args.use_uscaling:
556
  #assert args.optim == "adamw", "uscaling is only supported with AdamW optimizer currently"
 
626
 
627
  # begin training.
628
  train_loader.reset()
629
+ x, y, cu, maxlen, position_ids = train_loader.next_batch()
 
 
630
 
631
  for iter_ in range(start_iter, start_iter+args.total_iterations+1):
632
  last_iter = (iter_ == start_iter+args.total_iterations)
 
659
  val_loss = torch.zeros((), device=device, dtype=torch.float32)
660
  for _ in range(args.val_iterations):
661
  for _ in range(accumulation_steps):
662
+ inputs, targets, cu, maxlen, position_ids = val_loader.next_batch()
663
  with ctx:
664
+ val_loss += model(input_ids=inputs, labels=targets, just_loss=True, cu_seqlens=cu, max_seqlen=maxlen, position_ids=position_ids).loss.detach()
665
  val_loss /= args.val_iterations * accumulation_steps
666
  dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
667
  val_loss = val_loss.item()
 
712
  for i in range(1, accumulation_steps+1):
713
  # forward pass.
714
  with ctx:
715
+ loss = model(input_ids=x, labels=y, just_loss=True, cu_seqlens=cu, max_seqlen=maxlen, position_ids=position_ids).loss
716
  train_loss = loss.detach()
717
  # prepare next batch.
718
+ x, y, cu, maxlen, position_ids = train_loader.next_batch()
719
  # backward pass.
720
  if i < accumulation_steps:
721
  with model.no_sync():