KitsuVp committed on
Commit
758569d
·
verified ·
1 Parent(s): 033bc2c

Update configuration_neollm.py

Browse files
Files changed (1) hide show
  1. configuration_neollm.py +27 -0
configuration_neollm.py CHANGED
@@ -247,6 +247,28 @@ class NeoLLMConfig(PretrainedConfig):
247
  Coefficient ``λ`` for the load-balancing auxiliary loss.
248
  jtokm_norm_eps (:obj:`float`, *optional*, defaults to 1e-6):
249
  Epsilon for L2 normalisation of modulation vectors.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
 
251
  Constraints:
252
  - ``use_jtokm=True`` requires ``use_token_generator=True``.
@@ -353,6 +375,8 @@ class NeoLLMConfig(PretrainedConfig):
353
  jtokm_num_modes=4,
354
  jtokm_aux_loss_weight=1e-4,
355
  jtokm_norm_eps=1e-6,
 
 
356
  **kwargs,
357
  ):
358
  # ── Generator / tying consistency ─────────────────────────────────
@@ -463,6 +487,9 @@ class NeoLLMConfig(PretrainedConfig):
463
  self.jtokm_aux_loss_weight = jtokm_aux_loss_weight
464
  self.jtokm_norm_eps = jtokm_norm_eps
465
 
 
 
 
466
  self.auto_map = {
467
  "AutoConfig": "configuration_neollm.NeoLLMConfig",
468
  "AutoModel": "modeling_neollm.NeoLLMModel",
 
247
  Coefficient ``λ`` for the load-balancing auxiliary loss.
248
  jtokm_norm_eps (:obj:`float`, *optional*, defaults to 1e-6):
249
  Epsilon for L2 normalisation of modulation vectors.
250
+ use_hadamard_o_proj (:obj:`bool`, *optional*, defaults to ``False``):
251
+ Replace the dense ``W_O ∈ R^{d×d}`` output projection in every
252
+ multi-head attention block with a fixed Walsh–Hadamard Transform
253
+ followed by a learnable per-channel affine rescaling
254
+ ``α ⊙ FWHT(x)/√d + β``.
255
+
256
+ The WHT is a parameter-free orthogonal matrix whose singular values
257
+ are all identically 1, so the effective condition number is
258
+ ``κ = 1`` by construction and cannot grow during training. This
259
+ directly addresses the high-κ pathology (κ up to 10^5) observed in
260
+ the dense ``o_proj`` matrices, which causes FP8 per-tensor
261
+ quantisation to lose low-magnitude directions entirely.
262
+
263
+ Parameter reduction: replaces ``d²`` weights with ``2d``
264
+ (``α`` and ``β``), saving ≈25% of attention parameters per block.
265
+ Requires ``hidden_size`` to be a power of 2 (512 ✓, 1024 ✓,
266
+ 768 ✗).
267
+
268
+ Reference: Aggarwal & Kumar (2026). *Rethinking Attention Output
269
+ Projection: Structured Hadamard Transforms for Efficient
270
+ Transformers.* arXiv:2603.08343.
271
+
272
 
273
  Constraints:
274
  - ``use_jtokm=True`` requires ``use_token_generator=True``.
 
375
  jtokm_num_modes=4,
376
  jtokm_aux_loss_weight=1e-4,
377
  jtokm_norm_eps=1e-6,
378
+ # ── Hadamard output projection (Aggarwal & Kumar, 2026) ───────────
379
+ use_hadamard_o_proj=False,
380
  **kwargs,
381
  ):
382
  # ── Generator / tying consistency ─────────────────────────────────
 
487
  self.jtokm_aux_loss_weight = jtokm_aux_loss_weight
488
  self.jtokm_norm_eps = jtokm_norm_eps
489
 
490
+ # ── Hadamard output projection (Aggarwal & Kumar, 2026) ───────────
491
+ self.use_hadamard_o_proj = use_hadamard_o_proj
492
+
493
  self.auto_map = {
494
  "AutoConfig": "configuration_neollm.NeoLLMConfig",
495
  "AutoModel": "modeling_neollm.NeoLLMModel",