Update modeling_neollm.py
Browse files- modeling_neollm.py +158 -11
modeling_neollm.py
CHANGED
|
@@ -148,6 +148,23 @@ class PolyNormAnalysis:
|
|
| 148 |
output: Optional[torch.Tensor] = None # final PolyNorm output
|
| 149 |
|
| 150 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
@dataclass
|
| 152 |
class AttentionAnalysis:
|
| 153 |
"""
|
|
@@ -218,6 +235,9 @@ class AttentionAnalysis:
|
|
| 218 |
attn_output_pre_gate: Optional[torch.Tensor] = None # pre gate multiply [B,S,H,d]
|
| 219 |
attn_output_final: Optional[torch.Tensor] = None # after o_proj [B,S,D]
|
| 220 |
|
|
|
|
|
|
|
|
|
|
| 221 |
|
| 222 |
@dataclass
|
| 223 |
class MLPAnalysis:
|
|
@@ -1617,15 +1637,110 @@ def affine_scaled_flash_attention_forward(
|
|
| 1617 |
# ── Combine and apply dropout to the full affine output ───────────────
|
| 1618 |
output = alpha_t * flash_out + beta_t * v_cumsum_t # [B, S, H_q, d_head]
|
| 1619 |
|
| 1620 |
-
# Apply output dropout on the combined affine result.
|
| 1621 |
-
# This regularises the full [α·flash + β·V_cumsum] output consistently.
|
| 1622 |
-
if dropout > 0.0 and module.training:
|
| 1623 |
-
output = nn.functional.dropout(output, p=dropout, training=True)
|
| 1624 |
-
|
| 1625 |
# attn_weights is None — flash never exposes the softmax weight matrix.
|
| 1626 |
return output, None
|
| 1627 |
|
| 1628 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1629 |
class NeoLLMAttention(nn.Module):
|
| 1630 |
"""
|
| 1631 |
Full attention with FANformer, SeeDNorm, ResFormer, Learnable Multipliers,
|
|
@@ -1644,8 +1759,16 @@ class NeoLLMAttention(nn.Module):
|
|
| 1644 |
→ MEAHeadSeeDNorm → XSA → Directional Routing → reshape
|
| 1645 |
→ o_proj · sigmoid(gate) → dropout
|
| 1646 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1647 |
References:
|
| 1648 |
Directional Routing: Taylor (2026). arXiv:2603.14923.
|
|
|
|
| 1649 |
"""
|
| 1650 |
|
| 1651 |
def __init__(self, config: NeoLLMConfig, layer_idx: int):
|
|
@@ -1674,6 +1797,7 @@ class NeoLLMAttention(nn.Module):
|
|
| 1674 |
self.lucid_attention_eps = float(
|
| 1675 |
getattr(config, "lucid_attention_eps", config.rms_norm_eps)
|
| 1676 |
)
|
|
|
|
| 1677 |
|
| 1678 |
self.fan_layer = FANLayer(
|
| 1679 |
hidden_size=config.hidden_size,
|
|
@@ -1699,10 +1823,23 @@ class NeoLLMAttention(nn.Module):
|
|
| 1699 |
fan_output_dim, self.num_mea_component_heads * self.head_dim,
|
| 1700 |
bias=config.attention_bias,
|
| 1701 |
)
|
| 1702 |
-
|
| 1703 |
-
|
| 1704 |
-
|
| 1705 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1706 |
|
| 1707 |
self.q_norm = SeeDNorm(self.head_dim, eps=config.rms_norm_eps)
|
| 1708 |
self.k_norm = SeeDNorm(self.head_dim, eps=config.rms_norm_eps)
|
|
@@ -2089,7 +2226,12 @@ class NeoLLMAttention(nn.Module):
|
|
| 2089 |
attn_analysis.attn_output_pre_gate = attn_out_flat.detach()
|
| 2090 |
gate_sig = torch.sigmoid(gate)
|
| 2091 |
attn_analysis.gate_sigmoid = gate_sig.detach()
|
| 2092 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2093 |
else:
|
| 2094 |
attn_out_gated = self.o_proj(attn_out_flat * torch.sigmoid(gate))
|
| 2095 |
|
|
@@ -2644,7 +2786,10 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
|
|
| 2644 |
return LayerAnalysis(
|
| 2645 |
seednorm_pre_attn = SeeDNormAnalysis(),
|
| 2646 |
seednorm_post_attn = SeeDNormAnalysis(),
|
| 2647 |
-
attention = AttentionAnalysis(
|
|
|
|
|
|
|
|
|
|
| 2648 |
mlp = MLPAnalysis(
|
| 2649 |
fan = FANAnalysis(),
|
| 2650 |
polynorm = PolyNormAnalysis(),
|
|
@@ -3098,6 +3243,7 @@ __all__ = [
|
|
| 3098 |
"VectorMultiplier",
|
| 3099 |
"LinearWithMultipliers",
|
| 3100 |
"MEAHeadSeeDNorm",
|
|
|
|
| 3101 |
# Analysis dataclasses — exported so external tools can type-hint against them
|
| 3102 |
"AnalysisState",
|
| 3103 |
"LayerAnalysis",
|
|
@@ -3107,6 +3253,7 @@ __all__ = [
|
|
| 3107 |
"SeeDNormAnalysis",
|
| 3108 |
"GPASAnalysis",
|
| 3109 |
"PolyNormAnalysis",
|
|
|
|
| 3110 |
"JTokMAnalysis",
|
| 3111 |
"AttnResAnalysis",
|
| 3112 |
"GeneratorAnalysis",
|
|
|
|
| 148 |
output: Optional[torch.Tensor] = None # final PolyNorm output
|
| 149 |
|
| 150 |
|
| 151 |
@dataclass
class HadamardAnalysis:
    """
    Internals of a HadamardOProj forward pass.
    Only populated when use_hadamard_o_proj=True.

    Reference: Aggarwal & Kumar (2026). arXiv:2603.08343.

    post_fwht: WHT output before α scaling [..., D] — useful to verify
        that the transform is truly norm-preserving (κ=1 sanity check).
    alpha_snapshot: detached copy of the learnable α vector [D] — tracks how
        per-channel scaling evolves during training analysis.
    """
    # FWHT(x)/√d captured before the α multiply — shape [B, S, D].
    post_fwht: Optional[torch.Tensor] = None
    # Detached view of the learned per-channel scale α — shape [D].
    alpha_snapshot: Optional[torch.Tensor] = None
|
| 166 |
+
|
| 167 |
+
|
| 168 |
@dataclass
|
| 169 |
class AttentionAnalysis:
|
| 170 |
"""
|
|
|
|
| 235 |
attn_output_pre_gate: Optional[torch.Tensor] = None # pre gate multiply [B,S,H,d]
|
| 236 |
attn_output_final: Optional[torch.Tensor] = None # after o_proj [B,S,D]
|
| 237 |
|
| 238 |
+
# ── HadamardOProj internals (conditional on use_hadamard_o_proj) ──
|
| 239 |
+
hadamard: Optional["HadamardAnalysis"] = None # None when dense o_proj active
|
| 240 |
+
|
| 241 |
|
| 242 |
@dataclass
|
| 243 |
class MLPAnalysis:
|
|
|
|
| 1637 |
# ── Combine and apply dropout to the full affine output ───────────────
|
| 1638 |
output = alpha_t * flash_out + beta_t * v_cumsum_t # [B, S, H_q, d_head]
|
| 1639 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1640 |
# attn_weights is None — flash never exposes the softmax weight matrix.
|
| 1641 |
return output, None
|
| 1642 |
|
| 1643 |
|
| 1644 |
class HadamardOProj(nn.Module):
    """
    Parameter-free Walsh–Hadamard output projection with a learnable
    per-channel affine rescaling:  output = α ⊙ (FWHT(x) / √d) + β.

    Replaces the dense W_O ∈ R^{d×d} in multi-head attention with a fixed
    orthogonal Walsh–Hadamard Transform followed by a learnable affine.

    Motivation (Aggarwal & Kumar, 2026, arXiv:2603.08343):
        The standard dense o_proj develops extreme condition numbers during
        training (κ up to 10^5 observed in practice) because the optimiser
        has no incentive to keep singular values balanced — some directions
        are amplified while others collapse toward zero. This makes the
        layer hostile to FP8 quantisation, which uses a single per-tensor
        scale and therefore loses the low-magnitude directions entirely.

        The Walsh–Hadamard Transform is a fixed orthogonal matrix whose
        singular values are all identically 1, making κ = 1 by
        construction. It cannot develop condition-number pathology because
        it has no parameters. The learnable α/β restore per-channel
        expressivity at a cost of 2·d parameters instead of d².

    Properties:
        - Condition number: κ = 1 (exact, permanent, by construction)
        - Parameters: 2·d vs d² for dense (~25% attention params saved)
        - Forward FLOPs: O(d log d) vs O(d²) for dense
        - Norm preservation: FWHT is isometric — ‖FWHT(x)/√d‖₂ = ‖x‖₂
        - FP8 friendliness: single per-tensor scale covers all directions
        - Requires: d must be a power of 2

    The FWHT is an iterative butterfly (Cooley–Tukey pattern over
    additions/subtractions) followed by 1/√d normalisation to produce an
    orthonormal transform (H^T H = I). No external dependency.

    Reference:
        Aggarwal, S. & Kumar, L. (2026). "Rethinking Attention Output
        Projection: Structured Hadamard Transforms for Efficient
        Transformers." arXiv:2603.08343.
    """

    def __init__(self, dim: int, bias: bool = True):
        """
        Args:
            dim:  input/output width; must be a power of 2.
            bias: when True, adds the learnable per-channel β term.

        Raises:
            ValueError: if dim is not a positive power of 2.
        """
        super().__init__()
        # Raise (not assert) so the guard survives `python -O`,
        # which strips assert statements.
        if dim <= 0 or (dim & (dim - 1)) != 0:
            raise ValueError(
                f"HadamardOProj requires dim to be a power of 2, got {dim}"
            )
        self.dim = dim
        self.norm = dim ** -0.5  # 1/√d — makes H^T H = I

        # Learnable affine rescaling: α ⊙ FWHT(x) + β.
        # Initialised to α=1, β=0 so the layer starts as a pure WHT,
        # identical to an orthonormal projection with unit gain.
        self.alpha = nn.Parameter(torch.ones(dim))
        self.beta = nn.Parameter(torch.zeros(dim)) if bias else None

    def _fwht(self, x: torch.Tensor) -> torch.Tensor:
        """
        Iterative Fast Walsh–Hadamard Transform over the last dimension
        (unnormalised — the caller applies the 1/√d factor).

        Butterfly pattern: log₂(d) stages, each pairing elements at stride h.
        Cost: d·log₂(d) additions/subtractions, zero multiplications.
        The Python `while` unrolls into a fixed graph once `dim` is known,
        so the op remains torch.compile friendly.
        """
        h = 1
        while h < self.dim:
            # Reshape to expose pairs at the current stride.
            x = x.reshape(*x.shape[:-1], -1, 2 * h)
            a, b = x[..., :h], x[..., h:]
            # Butterfly: (a+b, a-b) — only additions and subtractions.
            x = torch.cat([a + b, a - b], dim=-1)
            x = x.reshape(*x.shape[:-2], self.dim)
            h *= 2
        return x

    def forward(
        self,
        x: torch.Tensor,
        analysis: Optional["HadamardAnalysis"] = None,
    ) -> torch.Tensor:
        """
        Args:
            x: [..., dim] — concatenated multi-head attention outputs.
            analysis: HadamardAnalysis container populated when analysis
                mode is active (eval + model.enable_analysis()). None
                otherwise.

        Returns:
            α ⊙ (FWHT(x) / √dim) + β of shape [..., dim].
        """
        out = self._fwht(x) * self.norm  # normalise: H^T H = I

        if analysis is not None:
            analysis.post_fwht = out.detach()
            # clone() so the snapshot is a true copy — detach() alone
            # shares storage with the parameter and would silently mutate
            # on every optimiser step.
            analysis.alpha_snapshot = self.alpha.detach().clone()

        out = out * self.alpha  # per-channel learnable scale
        if self.beta is not None:
            out = out + self.beta  # per-channel learnable bias
        return out
|
| 1742 |
+
|
| 1743 |
+
|
| 1744 |
class NeoLLMAttention(nn.Module):
|
| 1745 |
"""
|
| 1746 |
Full attention with FANformer, SeeDNorm, ResFormer, Learnable Multipliers,
|
|
|
|
| 1759 |
→ MEAHeadSeeDNorm → XSA → Directional Routing → reshape
|
| 1760 |
→ o_proj · sigmoid(gate) → dropout
|
| 1761 |
|
| 1762 |
+
o_proj variants (controlled by config.use_hadamard_o_proj):
|
| 1763 |
+
False (default): dense LinearWithMultipliers — full expressivity,
|
| 1764 |
+
develops high κ during training (FP8 risk).
|
| 1765 |
+
True: HadamardOProj — fixed WHT + learnable α/β,
|
| 1766 |
+
κ = 1 by construction, 25% fewer attention params,
|
| 1767 |
+
FP8-friendly (Aggarwal & Kumar, 2026, arXiv:2603.08343).
|
| 1768 |
+
|
| 1769 |
References:
|
| 1770 |
Directional Routing: Taylor (2026). arXiv:2603.14923.
|
| 1771 |
+
Hadamard o_proj: Aggarwal & Kumar (2026). arXiv:2603.08343.
|
| 1772 |
"""
|
| 1773 |
|
| 1774 |
def __init__(self, config: NeoLLMConfig, layer_idx: int):
|
|
|
|
| 1797 |
self.lucid_attention_eps = float(
|
| 1798 |
getattr(config, "lucid_attention_eps", config.rms_norm_eps)
|
| 1799 |
)
|
| 1800 |
+
self.use_hadamard_o_proj = getattr(config, "use_hadamard_o_proj", False)
|
| 1801 |
|
| 1802 |
self.fan_layer = FANLayer(
|
| 1803 |
hidden_size=config.hidden_size,
|
|
|
|
| 1823 |
fan_output_dim, self.num_mea_component_heads * self.head_dim,
|
| 1824 |
bias=config.attention_bias,
|
| 1825 |
)
|
| 1826 |
+
# ── Output projection (Aggarwal & Kumar, 2026, arXiv:2603.08343) ────
|
| 1827 |
+
# use_hadamard_o_proj=False (default): dense LinearWithMultipliers.
|
| 1828 |
+
# use_hadamard_o_proj=True: HadamardOProj — fixed WHT + learnable α/β.
|
| 1829 |
+
# κ = 1 by construction, 25% fewer attention params, FP8-friendly.
|
| 1830 |
+
# Requires hidden_size to be a power of 2 (512 ✓, 1024 ✓, 768 ✗).
|
| 1831 |
+
_o_in = config.num_attention_heads * self.head_dim
|
| 1832 |
+
if self.use_hadamard_o_proj:
|
| 1833 |
+
assert _o_in == config.hidden_size, (
|
| 1834 |
+
f"HadamardOProj requires in_dim == out_dim, "
|
| 1835 |
+
f"got {_o_in} vs {config.hidden_size}"
|
| 1836 |
+
)
|
| 1837 |
+
self.o_proj = HadamardOProj(config.hidden_size, bias=config.attention_bias)
|
| 1838 |
+
else:
|
| 1839 |
+
self.o_proj = LinearWithMultipliers(
|
| 1840 |
+
_o_in, config.hidden_size,
|
| 1841 |
+
bias=config.attention_bias, use_row_multiplier=True, use_column_multiplier=True,
|
| 1842 |
+
)
|
| 1843 |
|
| 1844 |
self.q_norm = SeeDNorm(self.head_dim, eps=config.rms_norm_eps)
|
| 1845 |
self.k_norm = SeeDNorm(self.head_dim, eps=config.rms_norm_eps)
|
|
|
|
| 2226 |
attn_analysis.attn_output_pre_gate = attn_out_flat.detach()
|
| 2227 |
gate_sig = torch.sigmoid(gate)
|
| 2228 |
attn_analysis.gate_sigmoid = gate_sig.detach()
|
| 2229 |
+
gated = attn_out_flat * gate_sig
|
| 2230 |
+
if self.use_hadamard_o_proj:
|
| 2231 |
+
# Pass HadamardAnalysis sub-object so post_fwht and alpha are captured
|
| 2232 |
+
attn_out_gated = self.o_proj(gated, analysis=attn_analysis.hadamard)
|
| 2233 |
+
else:
|
| 2234 |
+
attn_out_gated = self.o_proj(gated)
|
| 2235 |
else:
|
| 2236 |
attn_out_gated = self.o_proj(attn_out_flat * torch.sigmoid(gate))
|
| 2237 |
|
|
|
|
| 2786 |
return LayerAnalysis(
|
| 2787 |
seednorm_pre_attn = SeeDNormAnalysis(),
|
| 2788 |
seednorm_post_attn = SeeDNormAnalysis(),
|
| 2789 |
+
attention = AttentionAnalysis(
|
| 2790 |
+
fan = FANAnalysis(),
|
| 2791 |
+
hadamard = HadamardAnalysis() if getattr(cfg, "use_hadamard_o_proj", False) else None,
|
| 2792 |
+
),
|
| 2793 |
mlp = MLPAnalysis(
|
| 2794 |
fan = FANAnalysis(),
|
| 2795 |
polynorm = PolyNormAnalysis(),
|
|
|
|
| 3243 |
"VectorMultiplier",
|
| 3244 |
"LinearWithMultipliers",
|
| 3245 |
"MEAHeadSeeDNorm",
|
| 3246 |
+
"HadamardOProj",
|
| 3247 |
# Analysis dataclasses — exported so external tools can type-hint against them
|
| 3248 |
"AnalysisState",
|
| 3249 |
"LayerAnalysis",
|
|
|
|
| 3253 |
"SeeDNormAnalysis",
|
| 3254 |
"GPASAnalysis",
|
| 3255 |
"PolyNormAnalysis",
|
| 3256 |
+
"HadamardAnalysis",
|
| 3257 |
"JTokMAnalysis",
|
| 3258 |
"AttnResAnalysis",
|
| 3259 |
"GeneratorAnalysis",
|