KitsuVp
/

NeoLLM

@@ -349,6 +349,75 @@ class NeoLLMConfig(PretrainedConfig):
             is less rich than the full hidden representation. Ignored
             when ``use_repo=False``.
     Constraints:
         - ``use_jtokm=True`` requires ``use_token_generator=True``.
         - ``1 ≤ jtokm_top_k < jtokm_num_experts`` when ``use_jtokm=True``.
@@ -358,6 +427,9 @@ class NeoLLMConfig(PretrainedConfig):
         - ``repo_start_layer`` must satisfy
           ``0 <= repo_start_layer < num_hidden_layers`` when
           ``use_repo=True``.
     Examples::
@@ -413,6 +485,9 @@ class NeoLLMConfig(PretrainedConfig):
         Li, H., Zhao, T., Cai, D. & Sproat, R. (2026). *REPO: Language Models
         with Context Re-Positioning.* arXiv:2512.14391.
     """
     model_type = "neollm"
@@ -492,6 +567,11 @@ class NeoLLMConfig(PretrainedConfig):
         versatile_gumbel_temp_end=0.1,
         versatile_gumbel_temp_decay=0.99984,
         versatile_aux_loss_weight=1e-5,
         **kwargs,
     ):
         # ── Generator / tying consistency ─────────────────────────────────
@@ -554,6 +634,24 @@ class NeoLLMConfig(PretrainedConfig):
                     f"`versatile_total_experts` ({versatile_total_experts})."
                 )
         super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
         # ── Core Transformer ──────────────────────────────────────────────
@@ -658,6 +756,12 @@ class NeoLLMConfig(PretrainedConfig):
         self.versatile_gumbel_temp_decay   = versatile_gumbel_temp_decay
         self.versatile_aux_loss_weight     = versatile_aux_loss_weight
         self.auto_map = {
             "AutoConfig":           "configuration_neollm.NeoLLMConfig",
             "AutoModel":            "modeling_neollm.NeoLLMModel",

             is less rich than the full hidden representation. Ignored
             when ``use_repo=False``.
+        use_laurel (:obj:`bool`, *optional*, defaults to ``True``):
+            Enable the Learned Augmented Residual Layer (LAUREL) framework
+            (Menghani, Kumar & Kumar, ICML 2025). LAUREL generalises the
+            canonical residual connection:
+            .. math::
+                x_{i+1} = \\alpha \\cdot f(x_i) + g(x_i)
+            where :math:`g` is a learned linear function operating on the
+            residual stream.  Applied independently to both the attention
+            and MLP sublayers of every decoder layer.
+            At least one of ``use_laurel_rw`` or ``use_laurel_lr`` must be
+            ``True`` when this flag is active; both may be active
+            simultaneously, producing the combined **LAUREL-RW+LR** variant
+            (paper eq. 5).
+            Incompatible with ``use_attn_res=True`` — both methods modify
+            the residual stream and their interaction is undefined.
+            Reference: Menghani, G., Kumar, R. & Kumar, S. (2025).
+            *LAUREL: Learned Augmented Residual Layer.* ICML 2025.
+        use_laurel_rw (:obj:`bool`, *optional*, defaults to ``True``):
+            Enable the **LAUREL-RW** (Residual Weights) variant.  Assigns
+            learned scalar weights :math:`\\alpha_s, \\beta_s` to the
+            sublayer output and residual respectively:
+            .. math::
+                x_{i+1} = \\alpha_s \\cdot f(x_i) + \\beta_s \\cdot x_i
+            :math:`\\alpha_s, \\beta_s = \\text{softmax}([\\tilde{\\alpha},
+            \\tilde{\\beta}])` so that they are non-negative and sum to 1,
+            preventing unbounded growth (paper §2.1).  Adds **2 parameters
+            per sublayer** (4 per decoder layer).
+            When combined with ``use_laurel_lr=True`` (LAUREL-RW+LR,
+            paper eq. 5):
+            .. math::
+                x_{i+1} = \\alpha_s \\cdot f(x_i)
+                         + \\beta_s \\cdot (B A x_i + x_i)
+            Ignored when ``use_laurel=False``.
+        use_laurel_lr (:obj:`bool`, *optional*, defaults to ``True``):
+            Enable the **LAUREL-LR** (Low-Rank) variant.  Augments the
+            residual with a rank-``laurel_lr_rank`` correction:
+            .. math::
+                x_{i+1} = f(x_i) + B A x_i + x_i
+            where :math:`A \\in \\mathbb{R}^{D \\times r}` and
+            :math:`B \\in \\mathbb{R}^{r \\times D}` are learnable matrices
+            (paper eq. 3).  :math:`A` is initialised with column-orthogonal
+            values :math:`A_{i,j} = 1/\\sqrt{rD}` if :math:`i \\bmod r = j`
+            else 0; :math:`B` is initialised to zero — matching the LoRA
+            convention and ensuring the residual starts as identity
+            (paper §3.3).  Adds **2·r·D parameters per sublayer**
+            (4·r·D per decoder layer).
+            Ignored when ``use_laurel=False``.
+        laurel_lr_rank (:obj:`int`, *optional*, defaults to ``32``):
+            Rank ``r`` of the low-rank matrices in LAUREL-LR.  The paper
+            recommends :math:`r \\in \\{32, 48, 64\\}` for LLMs
+            (paper §3.3).  Ignored when ``use_laurel=False`` or
+            ``use_laurel_lr=False``.
     Constraints:
         - ``use_jtokm=True`` requires ``use_token_generator=True``.
         - ``1 ≤ jtokm_top_k < jtokm_num_experts`` when ``use_jtokm=True``.
         - ``repo_start_layer`` must satisfy
           ``0 <= repo_start_layer < num_hidden_layers`` when
           ``use_repo=True``.
+        - ``use_laurel=True`` is incompatible with ``use_attn_res=True``.
+        - When ``use_laurel=True``, at least one of ``use_laurel_rw`` or
+          ``use_laurel_lr`` must be ``True``.
     Examples::
         Li, H., Zhao, T., Cai, D. & Sproat, R. (2026). *REPO: Language Models
         with Context Re-Positioning.* arXiv:2512.14391.
+        Menghani, G., Kumar, R. & Kumar, S. (2025). *LAUREL: Learned Augmented
+        Residual Layer.* ICML 2025. arXiv:2411.07501.
     """
     model_type = "neollm"
         versatile_gumbel_temp_end=0.1,
         versatile_gumbel_temp_decay=0.99984,
         versatile_aux_loss_weight=1e-5,
+        # ── LAuReL: Learned Augmented Residual Layer (Menghani et al., 2025) ─
+        use_laurel=True,
+        use_laurel_rw=True,
+        use_laurel_lr=True,
+        laurel_lr_rank=32,
         **kwargs,
     ):
         # ── Generator / tying consistency ─────────────────────────────────
                     f"`versatile_total_experts` ({versatile_total_experts})."
                 )
+        # ── LAuReL: mutual exclusion and sub-flag consistency ─────────────
+        # use_laurel and use_attn_res both modify the residual stream and are
+        # structurally incompatible: AttnRes replaces the accumulation entirely
+        # with learned depth-wise attention, while LAuReL scales/augments the
+        # additive residual in-place.
+        if use_laurel and use_attn_res:
+            raise ValueError(
+                "`use_laurel=True` is incompatible with `use_attn_res=True`. "
+                "Both methods modify the residual stream: AttnRes replaces it "
+                "with depth-wise softmax attention, while LAuReL applies learned "
+                "scalar/low-rank augmentation in-place. Enable at most one."
+            )
+        if use_laurel and not use_laurel_rw and not use_laurel_lr:
+            raise ValueError(
+                "`use_laurel=True` requires at least one sub-variant to be active. "
+                "Set `use_laurel_rw=True` and/or `use_laurel_lr=True`."
+            )
         super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
         # ── Core Transformer ──────────────────────────────────────────────
         self.versatile_gumbel_temp_decay   = versatile_gumbel_temp_decay
         self.versatile_aux_loss_weight     = versatile_aux_loss_weight
+        # ── LAuReL: Learned Augmented Residual Layer (Menghani et al., 2025) ─
+        self.use_laurel                    = use_laurel
+        self.use_laurel_rw                 = use_laurel_rw
+        self.use_laurel_lr                 = use_laurel_lr
+        self.laurel_lr_rank                = laurel_lr_rank
         self.auto_map = {
             "AutoConfig":           "configuration_neollm.NeoLLMConfig",
             "AutoModel":            "modeling_neollm.NeoLLMModel",