KitsuVp
/

NeoLLM

@@ -414,10 +414,59 @@ class NeoLLMConfig(PretrainedConfig):
         Li, H., Zhao, T., Cai, D. & Sproat, R. (2026). *REPO: Language Models
         with Context Re-Positioning.* arXiv:2512.14391.
         Xiao, D., Meng, Q., Li, S. & Yuan, X. (2025). *MUDDFormer: Breaking
         Residual Bottlenecks in Transformers via Multiway Dynamic Dense
         Connections.* arXiv:2502.12170.
         use_mudd (:obj:`bool`, *optional*, defaults to ``False``):
             Enable **Multiway Dynamic Dense (MUDD) connections** (Xiao et al.,
             2025). Replaces standard residual connections with learned,
@@ -478,6 +527,172 @@ class NeoLLMConfig(PretrainedConfig):
             Enables independent rescaling per stream when
             ``mudd_dense_type="qkvr"``. Adds 2 × SeeDNorm parameters per
             decoder layer. Ignored when ``mudd_dense_type="l"``.
     """
     model_type = "neollm"
@@ -519,7 +734,7 @@ class NeoLLMConfig(PretrainedConfig):
         directional_routing_temp=3.0,
         # ── Attention Residuals (Kimi Team, 2026) ─────────────────────────
         use_attn_res=False,
-        attn_res_num_blocks=4,
         fan_ratio=0.125,
         fan_ratio_ffn=0.0625,
         dropout_rate=0.1,
@@ -557,13 +772,42 @@ class NeoLLMConfig(PretrainedConfig):
         versatile_gumbel_temp_end=0.1,
         versatile_gumbel_temp_decay=0.99984,
         versatile_aux_loss_weight=1e-5,
         # ── MUDD connections (Xiao et al., 2025) ─────────────────────────
         use_mudd=False,
         mudd_dense_type="qkvr",
         mudd_dynamic_dense=True,
-        mudd_round64=False,
         mudd_expand_last=True,
         mudd_sepln=False,
         **kwargs,
     ):
         # ── Generator / tying consistency ─────────────────────────────────
@@ -612,12 +856,41 @@ class NeoLLMConfig(PretrainedConfig):
                 f"num_hidden_layers={num_hidden_layers}."
             )
-        # ── MUDD: validate and resolve ────────────────────────────────────
-        if use_mudd and use_attn_res:
             raise ValueError(
-                "`use_mudd=True` and `use_attn_res=True` are mutually exclusive. "
-                "Both mechanisms replace residual aggregation across depth and "
-                "cannot be active simultaneously. Set exactly one to True."
             )
         if use_mudd and mudd_dense_type not in ("qkvr", "l"):
             raise ValueError(
@@ -732,6 +1005,12 @@ class NeoLLMConfig(PretrainedConfig):
         self.repo_start_layer              = repo_start_layer
         self.repo_d_p                      = repo_d_p
         # ── MUDD connections (Xiao et al., 2025) ─────────────────────────
         self.use_mudd                      = use_mudd
         self.mudd_dense_type               = mudd_dense_type
@@ -740,6 +1019,29 @@ class NeoLLMConfig(PretrainedConfig):
         self.mudd_expand_last              = mudd_expand_last
         self.mudd_sepln                    = mudd_sepln
         # ── VersatileFFN (Nie et al., 2026) ───────────────────────────────
         self.use_versatile_ffn             = use_versatile_ffn
         self.versatile_total_experts       = versatile_total_experts

         Li, H., Zhao, T., Cai, D. & Sproat, R. (2026). *REPO: Language Models
         with Context Re-Positioning.* arXiv:2512.14391.
+        Heddes, M. et al. (2025). *DeepCrossAttention: Supercharging
+        Transformer Residual Connections.* arXiv:2502.06785.
         Xiao, D., Meng, Q., Li, S. & Yuan, X. (2025). *MUDDFormer: Breaking
         Residual Bottlenecks in Transformers via Multiway Dynamic Dense
         Connections.* arXiv:2502.12170.
+        use_dca (:obj:`bool`, *optional*, defaults to ``False``):
+            Enable **DeepCrossAttention (DCA)** (Heddes et al., 2025).
+            Replaces standard residual connections with three independent
+            GRN-v3 modules per decoder layer — one each for the Query,
+            Key, and Value streams — that dynamically aggregate outputs of
+            all preceding layers with both dimension-dependent (static) and
+            input-dependent (dynamic) learned weights.
+            **k-DCA efficiency** (``dca_k``): only the first and last ``k``
+            layer outputs are kept in the stack, bounding memory and
+            computation at ``O(2k)`` rather than ``O(L)``.
+            **Mutual exclusion**: ``use_dca``, ``use_attn_res``, and
+            ``use_mudd`` are mutually exclusive. Set at most one to True.
+            Reference: Heddes, M. et al. (2025). *DeepCrossAttention:
+            Supercharging Transformer Residual Connections.*
+            arXiv:2502.06785.
+        dca_k (:obj:`int`, *optional*, defaults to 1):
+            Number of first and last layer outputs retained in the
+            depth-wise stack for k-DCA. With ``k=2`` the stack contains
+            at most 4 tensors (first 2 + last 2) regardless of depth.
+            Paper Table 1 results (24-layer model on LM1B):
+            - ``k=1`` — 0.33× time to transformer PPL, PPL 14.48  ← default
+            - ``k=2`` — 0.33× time, PPL 14.41  ← best trade-off in the paper
+            - ``k=4`` — 0.37× time, PPL 14.50
+            - ``k=24`` (full) — 0.39× time, PPL 14.35
+            Smaller k gives faster training and lower inference latency
+            at a very small perplexity cost. k=2 is the best
+            efficiency-quality trade-off per the paper.
+        dca_use_final_grn (:obj:`bool`, *optional*, defaults to ``True``):
+            Apply a final GRN-v3 aggregation (``num_outputs=1``) over the
+            k-selected depth stack after all decoder layers, before the
+            output norm and lm_head. This matches the DCAGPT architecture
+            exactly. The final GRN collapses all depth history into the
+            final hidden representation using learned weights, rather than
+            using the raw last-layer output.
+        dca_grn_eps (:obj:`float`, *optional*, defaults to 1e-6):
+            Epsilon for the no-scale RMSNorm inside each GRN-v3 module.
         use_mudd (:obj:`bool`, *optional*, defaults to ``False``):
             Enable **Multiway Dynamic Dense (MUDD) connections** (Xiao et al.,
             2025). Replaces standard residual connections with learned,
             Enables independent rescaling per stream when
             ``mudd_dense_type="qkvr"``. Adds 2 × SeeDNorm parameters per
             decoder layer. Ignored when ``mudd_dense_type="l"``.
+        use_stacktrans (:obj:`bool`, *optional*, defaults to ``False``):
+            Enable **StackTrans** (Zhang et al., NeurIPS 2025): inserts a
+            differentiable multi-head hidden-state stack between each pair of
+            Transformer layers, providing an explicit push/pop memory that
+            allows the model to learn Chomsky-hierarchy grammars (regular
+            expressions and deterministic context-free grammars) and improves
+            compositional generalisation and reasoning.
+            The stack is positioned at the **very beginning** of each decoder
+            layer forward pass, before the attention sublayer, so the
+            attention computation sees the stack-enriched hidden state.
+            **Mutual exclusion**: ``use_stacktrans`` cannot be active
+            simultaneously with ``use_attn_res``, ``use_mudd``, or
+            ``use_dca`` because all four alter the information flow entering
+            the residual stream. Enable at most one of the four.
+            Reference: Zhang, K. et al. (2025). *Recursive Transformer:
+            Boosting Reasoning Ability with State Stack.* NeurIPS 2025.
+        stacktrans_num_heads (:obj:`int`, *optional*, defaults to 4):
+            Number of independent stack heads ``H``. Each head maintains its
+            own low-rank stack of dimension ``ds = stacktrans_stack_d_model //
+            H``. The paper ablation (Fig. 4a) shows that ``H = 4`` is the
+            optimal trade-off: performance plateaus past this value while
+            overhead grows.
+        stacktrans_stack_slots (:obj:`int`, *optional*, defaults to 24):
+            Maximum depth ``S`` of the stack (number of slots). Overflow
+            elements are truncated to zero (a form of forgetting). The paper
+            ablation (Fig. 4c) shows that ``S = 24`` is optimal; increasing
+            to 32 yields no measurable gain.
+        stacktrans_stack_d_model (:obj:`int`, *optional*, defaults to 64):
+            Total dimensionality of the low-rank stack space, equal to
+            ``H × ds``. The hidden state is projected down from
+            ``hidden_size`` to this dimension before stack operations, and
+            projected back up afterward. The paper uses ``H=4, ds=16``
+            (stack_d_model=64). From the ablation (Fig. 4b), ``ds`` in the
+            range 16–64 provides the best efficiency–quality trade-off.
+        stacktrans_forward_bs (:obj:`int`, *optional*, defaults to 1):
+            Batch size of the internal ``k_cache`` and ``action_cache``
+            buffers used for autoregressive generation. Must be ≥ the
+            actual generation batch size. At training time these buffers are
+            never used (``enable_cache=False``). Increasing this above 1
+            is only needed for batched generation.
+        use_laurel (:obj:`bool`, *optional*, defaults to ``False``):
+            Enable the **LAuReL** framework (Menghani, Kumar & Kumar, ICML
+            2025): a learned generalisation of the canonical residual
+            connection that augments the residual stream with lightweight
+            learnable components, improving model quality with minimal
+            parameter overhead.
+            Standard Pre-LN residual connection:
+            .. math::
+                x_{i+1} = f(x_i) + x_i
+            LAuReL replaces this with:
+            .. math::
+                x_{i+1} = \\alpha \\cdot f(x_i) + g(x_i)
+            where :math:`\\alpha` is a learned scalar and :math:`g` is a
+            learned linear function. Two sub-variants are controlled
+            independently by ``use_laurel_rw`` and ``use_laurel_lr``.
+            In NeoLLM, LAuReL is applied to **both** residual connections
+            per decoder layer (attention and MLP), immediately before GPAS:
+            - Attention: ``GPAS(LAuReL(attn_out, residual_attn))``
+            - MLP:       ``GPAS(LAuReL(delta_m,  residual_mlp))``
+            GPAS then operates on the LAuReL-combined stream with its
+            stop-gradient scaling, so the two techniques remain
+            structurally orthogonal.
+            **Mutual exclusion**: ``use_laurel`` is incompatible with
+            ``use_attn_res``, ``use_mudd``, and ``use_dca`` because those
+            three replace the residual streams (``residual_attn``,
+            ``residual_mlp``) with custom-aggregated tensors whose
+            statistical properties differ from the standard accumulated
+            hidden state that LAuReL's initialisation guarantees assume.
+            ``use_laurel`` is compatible with ``use_stacktrans``.
+            Reference: Menghani, G., Kumar, R. & Kumar, S. (ICML 2025).
+            *LAuReL: Learned Augmented Residual Layer.* arXiv:2411.07501.
+        use_laurel_rw (:obj:`bool`, *optional*, defaults to ``True``):
+            Enable the **Residual Weights** (RW) sub-variant of LAuReL.
+            Requires ``use_laurel=True``.
+            Assigns independently learned scalar weights to the nonlinear
+            component :math:`f(x_i)` and the residual :math:`x_i`:
+            .. math::
+                x_{i+1} = \\alpha \\cdot f(x_i) + \\beta \\cdot g_{\\text{LR}}(x_i)
+            where :math:`[\\alpha, \\beta] = \\operatorname{softmax}([a, b])`
+            with :math:`a, b \\in \\mathbb{R}` learnable scalars (2 parameters
+            per LAuReL instantiation). Softmax normalisation prevents
+            unbounded growth confirmed by the paper's ablation.
+            Initialized with :math:`a = b = 0`, giving
+            :math:`\\alpha = \\beta = 0.5` at step 0 — the model quickly
+            learns the optimal weighting. In earlier layers the nonlinear
+            component dominates; in deeper layers the residual gains
+            relative importance, adaptively mitigating vanishing gradients.
+            When ``use_laurel_lr=False``, the RW formula becomes:
+            .. math::
+                x_{i+1} = \\alpha \\cdot f(x_i) + \\beta \\cdot x_i
+            When both RW and LR are active (recommended combination):
+            .. math::
+                x_{i+1} = \\alpha \\cdot f(x_i)
+                         + \\beta \\cdot (A B x_i + x_i)
+        use_laurel_lr (:obj:`bool`, *optional*, defaults to ``True``):
+            Enable the **Low-Rank** (LR) sub-variant of LAuReL.
+            Requires ``use_laurel=True``.
+            Introduces a learnable low-rank linear transformation of the
+            residual :math:`x_i` that runs in parallel with the nonlinear
+            :math:`f(x_i)`:
+            .. math::
+                x_{i+1} = f(x_i) + W x_i,
+                \\quad W = A B + I,
+                \\quad A \\in \\mathbb{R}^{D \\times r},\\;
+                B^{\\top} \\in \\mathbb{R}^{D \\times r}
+            Equivalently written as a LoRA-style decomposition:
+            .. math::
+                x_{i+1} = f(x_i) + x_i + \\underbrace{A (B x_i)}_{\\text{low-rank term}}
+            where :math:`B \\in \\mathbb{R}^{r \\times D}` (down-projection,
+            column-orthogonal init) and :math:`A \\in \\mathbb{R}^{D \\times r}`
+            (up-projection, **zero init**). Zero init on :math:`A` ensures
+            the low-rank term is exactly zero at step 0, so the model starts
+            as a standard residual connection. Parameter count per LAuReL
+            layer: :math:`2rD`.
+            Adds :math:`2rD` parameters per residual connection (2 per decoder
+            layer since both attention and MLP residuals are augmented).
+            For hidden_size=512 and ``laurel_lr_rank=32``:
+            :math:`2 \\times 32 \\times 512 = 32{,}768` parameters per
+            residual connection, i.e. 65,536 per decoder layer.
+            The paper recommends :math:`r \\in \\{32, 48, 64\\}` for LLMs.
+            For NeoLLM at 135M with 12 layers, ``r=32`` adds ≈786K parameters
+            (≈0.6% of total) — small, but above the overhead the paper
+            reports (0.012%–0.1% for 1B–4B models).
+        laurel_lr_rank (:obj:`int`, *optional*, defaults to 32):
+            Rank :math:`r` of the low-rank matrices :math:`A` and :math:`B`
+            in the LAuReL-LR sub-variant. Controls the capacity vs. overhead
+            trade-off. The paper's ablation (Figure 3) shows performance
+            peaks at :math:`r \\in \\{16, 32\\}` for ResNet; for LLMs,
+            :math:`r \\in \\{32, 48, 64\\}` are recommended. Ignored when
+            ``use_laurel_lr=False``.
     """
     model_type = "neollm"
         directional_routing_temp=3.0,
         # ── Attention Residuals (Kimi Team, 2026) ─────────────────────────
         use_attn_res=False,
+        attn_res_num_blocks=2,
         fan_ratio=0.125,
         fan_ratio_ffn=0.0625,
         dropout_rate=0.1,
         versatile_gumbel_temp_end=0.1,
         versatile_gumbel_temp_decay=0.99984,
         versatile_aux_loss_weight=1e-5,
+        # ── DCA (Heddes et al., 2025) ─────────────────────────────────────
+        use_dca=False,
+        dca_k=1,
+        dca_use_final_grn=True,
+        dca_grn_eps=1e-6,
         # ── MUDD connections (Xiao et al., 2025) ─────────────────────────
         use_mudd=False,
         mudd_dense_type="qkvr",
         mudd_dynamic_dense=True,
+        mudd_round64=True,
         mudd_expand_last=True,
         mudd_sepln=False,
+        # ── StackTrans (Zhang et al., NeurIPS 2025) ───────────────────────
+        use_stacktrans=False,
+        stacktrans_num_heads=4,
+        stacktrans_stack_slots=24,
+        stacktrans_stack_d_model=64,
+        stacktrans_forward_bs=1,
+        # ── LAuReL (Menghani, Kumar & Kumar, ICML 2025) ───────────────────
+        use_laurel=False,
+        use_laurel_rw=True,
+        use_laurel_lr=True,
+        laurel_lr_rank=32,
+        # ── GatedDeltaNet linear attention (Yang et al., 2024) ───────────
+        # Replaces full attention every `linear_attention_every_n` layers
+        # (0-indexed: layers 2, 5, 8, ... for every_n=3).
+        # REPO applies to linear attention layers when both
+        # use_repo=True and use_repo_in_linear_attn=True.
+        use_linear_attention=False,
+        linear_attention_every_n=3,
+        use_repo_in_linear_attn=False,
+        linear_conv_kernel_dim=4,
+        linear_key_head_dim=32,
+        linear_value_head_dim=32,
+        linear_num_key_heads=8,
+        linear_num_value_heads=16,
         **kwargs,
     ):
         # ── Generator / tying consistency ─────────────────────────────────
                 f"num_hidden_layers={num_hidden_layers}."
             )
+        # ── Residual-replacement mutex ────────────────────────────────────
+        _active = [n for n, f in [('use_dca', use_dca),
+                                   ('use_mudd', use_mudd),
+                                   ('use_attn_res', use_attn_res)] if f]
+        if len(_active) > 1:
             raise ValueError(
+                f"use_dca, use_mudd, and use_attn_res are mutually exclusive. "
+                f"Got {_active} simultaneously. Set exactly one to True."
+            )
+        # ── StackTrans / residual-replacement mutex ───────────────────────
+        if use_stacktrans and len(_active) > 0:
+            raise ValueError(
+                f"use_stacktrans is mutually exclusive with use_attn_res, "
+                f"use_mudd, and use_dca. Got use_stacktrans=True alongside "
+                f"{_active}. Set exactly one residual-replacement flag to True."
+            )
+        # ── LAuReL / residual-replacement mutex ───────────────────────────
+        # LAuReL's initialisation guarantees (BA=0 at step 0) assume that
+        # residual_attn and residual_mlp are standard accumulated hidden
+        # states. MUDD, DCA, and AttnRes replace these with custom-
+        # aggregated tensors, invalidating the assumption.
+        # LAuReL IS compatible with use_stacktrans (different position).
+        if use_laurel and len(_active) > 0:
+            raise ValueError(
+                f"use_laurel is mutually exclusive with use_attn_res, "
+                f"use_mudd, and use_dca (residual tensors are not standard "
+                f"accumulated hidden states when those flags are active). "
+                f"Got use_laurel=True alongside {_active}."
+            )
+        if use_laurel and not (use_laurel_rw or use_laurel_lr):
+            raise ValueError(
+                "use_laurel=True requires at least one of "
+                "use_laurel_rw=True or use_laurel_lr=True."
             )
         if use_mudd and mudd_dense_type not in ("qkvr", "l"):
             raise ValueError(
         self.repo_start_layer              = repo_start_layer
         self.repo_d_p                      = repo_d_p
+        # ── DCA (Heddes et al., 2025) ─────────────────────────────────────
+        self.use_dca             = use_dca
+        self.dca_k               = dca_k
+        self.dca_use_final_grn   = dca_use_final_grn
+        self.dca_grn_eps         = dca_grn_eps
         # ── MUDD connections (Xiao et al., 2025) ─────────────────────────
         self.use_mudd                      = use_mudd
         self.mudd_dense_type               = mudd_dense_type
         self.mudd_expand_last              = mudd_expand_last
         self.mudd_sepln                    = mudd_sepln
+        # ── StackTrans (Zhang et al., NeurIPS 2025) ───────────────────────
+        self.use_stacktrans                = use_stacktrans
+        self.stacktrans_num_heads          = stacktrans_num_heads
+        self.stacktrans_stack_slots        = stacktrans_stack_slots
+        self.stacktrans_stack_d_model      = stacktrans_stack_d_model
+        self.stacktrans_forward_bs         = stacktrans_forward_bs
+        # ── LAuReL (Menghani, Kumar & Kumar, ICML 2025) ───────────────────
+        self.use_laurel                    = use_laurel
+        self.use_laurel_rw                 = use_laurel_rw
+        self.use_laurel_lr                 = use_laurel_lr
+        self.laurel_lr_rank                = laurel_lr_rank
+        # ── GatedDeltaNet linear attention ────────────────────────────────
+        self.use_linear_attention     = use_linear_attention
+        self.linear_attention_every_n = linear_attention_every_n
+        self.use_repo_in_linear_attn  = use_repo_in_linear_attn
+        self.linear_conv_kernel_dim   = linear_conv_kernel_dim
+        self.linear_key_head_dim      = linear_key_head_dim
+        self.linear_value_head_dim    = linear_value_head_dim
+        self.linear_num_key_heads     = linear_num_key_heads
+        self.linear_num_value_heads   = linear_num_value_heads
         # ── VersatileFFN (Nie et al., 2026) ───────────────────────────────
         self.use_versatile_ffn             = use_versatile_ffn
         self.versatile_total_experts       = versatile_total_experts