KitsuVp
/

NeoLLM

@@ -413,6 +413,71 @@ class NeoLLMConfig(PretrainedConfig):
         Li, H., Zhao, T., Cai, D. & Sproat, R. (2026). *REPO: Language Models
         with Context Re-Positioning.* arXiv:2512.14391.
     """
     model_type = "neollm"
@@ -454,7 +519,7 @@ class NeoLLMConfig(PretrainedConfig):
         directional_routing_temp=3.0,
         # ── Attention Residuals (Kimi Team, 2026) ─────────────────────────
         use_attn_res=False,
-        attn_res_num_blocks=0,
         fan_ratio=0.125,
         fan_ratio_ffn=0.0625,
         dropout_rate=0.1,
@@ -492,6 +557,13 @@ class NeoLLMConfig(PretrainedConfig):
         versatile_gumbel_temp_end=0.1,
         versatile_gumbel_temp_decay=0.99984,
         versatile_aux_loss_weight=1e-5,
         **kwargs,
     ):
         # ── Generator / tying consistency ─────────────────────────────────
@@ -540,6 +612,18 @@ class NeoLLMConfig(PretrainedConfig):
                 f"num_hidden_layers={num_hidden_layers}."
             )
         # ── VersatileFFN: validate expert configuration ────────────────────
         if use_versatile_ffn:
             if not (1 <= versatile_active_experts < versatile_total_experts):
@@ -648,6 +732,14 @@ class NeoLLMConfig(PretrainedConfig):
         self.repo_start_layer              = repo_start_layer
         self.repo_d_p                      = repo_d_p
         # ── VersatileFFN (Nie et al., 2026) ───────────────────────────────
         self.use_versatile_ffn             = use_versatile_ffn
         self.versatile_total_experts       = versatile_total_experts

         Li, H., Zhao, T., Cai, D. & Sproat, R. (2026). *REPO: Language Models
         with Context Re-Positioning.* arXiv:2512.14391.
+        Xiao, D., Meng, Q., Li, S. & Yuan, X. (2025). *MUDDFormer: Breaking
+        Residual Bottlenecks in Transformers via Multiway Dynamic Dense
+        Connections.* arXiv:2502.12170.
+        use_mudd (:obj:`bool`, *optional*, defaults to ``False``):
+            Enable **Multiway Dynamic Dense (MUDD) connections** (Xiao et al.,
+            2025). Replaces standard residual connections with learned,
+            input-dependent depth-wise aggregation over all preceding layer
+            outputs, producing up to four decoupled input streams (Q, K, V, R)
+            for each Transformer block.
+            **Mutually exclusive with** ``use_attn_res``. Both mechanisms
+            replace residual aggregation and cannot be active simultaneously.
+            Reference: Xiao, D. et al. (2025). *MUDDFormer: Breaking Residual
+            Bottlenecks in Transformers via Multiway Dynamic Dense Connections.*
+            arXiv:2502.12170.
+        mudd_dense_type (:obj:`str`, *optional*, defaults to ``"qkvr"``):
+            Stream configuration for the DA modules. Two options:
+            - ``"qkvr"``: four independent aggregated streams, one each for the
+              Query, Key, Value and Residual inputs of every Transformer block.
+              This is the full MUDDFormer configuration and the main
+              contribution of the paper. Cross-layer communication bandwidth is
+              expanded 4× relative to single-stream approaches.
+            - ``"l"``: a single aggregated stream applied only to the residual
+              path (equivalent to DDFormer / DenseFormer-dynamic).
+            Ablation (Table 5 of the paper): removing any single stream hurts
+            performance; the value stream benefits most.
+        mudd_dynamic_dense (:obj:`bool`, *optional*, defaults to ``True``):
+            Whether to generate connection weights dynamically from the current
+            hidden state (``True``, MUDDFormer) or use only learned static
+            scalar weights (``False``, equivalent to DenseFormer).
+            Dynamic weights are computed position-wise via a two-layer MLP:
+            .. math::
+                A_i(X_i) = \text{GELU}(\text{RMSNorm}(X_i)\,W_1)\,W_2 + a_i
+            where :math:`a_i` is a learnable static prior (initialized as
+            identity on the current layer). Setting this to ``False`` disables
+            :math:`W_1` and :math:`W_2`, retaining only the static bias.
+        mudd_round64 (:obj:`bool`, *optional*, defaults to ``False``):
+            Round the inner hidden dimension of each DA module up to the
+            nearest multiple of 64 for hardware-aligned tensor operations.
+            Recommended for training on CUDA devices. Slightly increases
+            parameter count but improves throughput.
+        mudd_expand_last (:obj:`bool`, *optional*, defaults to ``True``):
+            Multiply the DA module hidden dimension by 4 for the final
+            Transformer layer. The last layer's aggregation benefits from
+            higher capacity because it summarizes the entire depth of the
+            network before the output projection.
+        mudd_sepln (:obj:`bool`, *optional*, defaults to ``False``):
+            Use separate SeeDNorm pre-normalization layers for the K and V
+            input streams (Q already uses the existing ``input_layernorm``).
+            Enables independent rescaling per stream when
+            ``mudd_dense_type="qkvr"``. Adds 2 × SeeDNorm parameters per
+            decoder layer. Ignored when ``mudd_dense_type="l"``.
     """
     model_type = "neollm"
         directional_routing_temp=3.0,
         # ── Attention Residuals (Kimi Team, 2026) ─────────────────────────
         use_attn_res=False,
+        attn_res_num_blocks=4,
         fan_ratio=0.125,
         fan_ratio_ffn=0.0625,
         dropout_rate=0.1,
         versatile_gumbel_temp_end=0.1,
         versatile_gumbel_temp_decay=0.99984,
         versatile_aux_loss_weight=1e-5,
+        # ── MUDD connections (Xiao et al., 2025) ─────────────────────────
+        use_mudd=False,
+        mudd_dense_type="qkvr",
+        mudd_dynamic_dense=True,
+        mudd_round64=False,
+        mudd_expand_last=True,
+        mudd_sepln=False,
         **kwargs,
     ):
         # ── Generator / tying consistency ─────────────────────────────────
                 f"num_hidden_layers={num_hidden_layers}."
             )
+        # ── MUDD: validate and resolve ────────────────────────────────────
+        if use_mudd and use_attn_res:
+            raise ValueError(
+                "`use_mudd=True` and `use_attn_res=True` are mutually exclusive. "
+                "Both mechanisms replace residual aggregation across depth and "
+                "cannot be active simultaneously. Set exactly one to True."
+            )
+        if use_mudd and mudd_dense_type not in ("qkvr", "l"):
+            raise ValueError(
+                f"`mudd_dense_type` must be 'qkvr' or 'l', got '{mudd_dense_type}'."
+            )
         # ── VersatileFFN: validate expert configuration ────────────────────
         if use_versatile_ffn:
             if not (1 <= versatile_active_experts < versatile_total_experts):
         self.repo_start_layer              = repo_start_layer
         self.repo_d_p                      = repo_d_p
+        # ── MUDD connections (Xiao et al., 2025) ─────────────────────────
+        self.use_mudd                      = use_mudd
+        self.mudd_dense_type               = mudd_dense_type
+        self.mudd_dynamic_dense            = mudd_dynamic_dense
+        self.mudd_round64                  = mudd_round64
+        self.mudd_expand_last              = mudd_expand_last
+        self.mudd_sepln                    = mudd_sepln
         # ── VersatileFFN (Nie et al., 2026) ───────────────────────────────
         self.use_versatile_ffn             = use_versatile_ffn
         self.versatile_total_experts       = versatile_total_experts