KitsuVp
/

NeoLLM

@@ -77,10 +77,34 @@ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_u
 from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from transformers.processing_utils import Unpack
 from transformers.utils import TransformersKwargs, logging
-from .configuration_neollm import NeoLLMConfig
 from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
 logger = logging.get_logger(__name__)
@@ -339,6 +363,22 @@ class JTokMAnalysis:
     lns_scale:        Optional[float] = None          # 1/√(2ℓ) scaling factor
 @dataclass
 class AttnResAnalysis:
     """
@@ -350,6 +390,44 @@ class AttnResAnalysis:
     sources_count:    Optional[int] = None            # number of sources including partial
 @dataclass
 class LayerAnalysis:
     """
@@ -378,8 +456,12 @@ class LayerAnalysis:
     gpas_mlp:           Optional[GPASAnalysis] = None      # GPAS after MLP residual
     # Optional components (None when inactive)
-    jtokm:    Optional[JTokMAnalysis] = None    # if use_jtokm
-    attn_res: Optional[AttnResAnalysis] = None  # if use_attn_res
 @dataclass
@@ -444,6 +526,7 @@ class AnalysisState:
     layers:                 Optional[List[LayerAnalysis]] = None
     jtokm_aux_stats:        Optional[list] = None
     attn_res_sources_final: Optional[list] = None
     logits:                 Optional[torch.Tensor] = None
 class ScalarMultiplier(nn.Module):
@@ -2363,6 +2446,8 @@ class NeoLLMAttention(nn.Module):
         first_layer_fan: Optional[torch.Tensor] = None,
         attn_analysis: Optional[AttentionAnalysis] = None,
         repo_rope_args: Optional[Tuple[torch.Tensor, float]] = None,
         **kwargs: Unpack[FlashAttentionKwargs],
     ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
         input_shape = hidden_states.shape[:-1]
@@ -2373,6 +2458,14 @@ class NeoLLMAttention(nn.Module):
             h_fan = self.lambda_1 * first_layer_fan + self.lambda_2 * h_fan
         current_layer_fan = h_fan.clone()
         query_shape = (*input_shape, self.config.num_attention_heads, self.head_dim)
         kv_shape    = (*input_shape, self.num_mea_component_heads, self.head_dim)
@@ -2387,8 +2480,8 @@ class NeoLLMAttention(nn.Module):
             attn_analysis.gate_raw = gate.detach()
         q = self.q_norm(q_raw.view(query_shape)).transpose(1, 2)
-        k = self.k_norm(self.k_proj(h_fan).view(kv_shape)).transpose(1, 2)
-        v = self.v_proj(h_fan).view(kv_shape).transpose(1, 2)
         if attn_analysis is not None:
             attn_analysis.q_post_norm = q.detach()
@@ -3065,6 +3158,1036 @@ class NeoLLMMLP(nn.Module):
         return result
 class NeoLLMDecoderLayer(GradientCheckpointingLayer):
     """
     Decoder layer with standard residual connections, optional JTok-M injection.
@@ -3087,7 +4210,23 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         self.layer_idx     = layer_idx
         self.use_jtokm     = config.use_jtokm
-        self.self_attn                = NeoLLMAttention(config, layer_idx)
         self.mlp                      = (
             VersatileFFN(config)
             if getattr(config, "use_versatile_ffn", False)
@@ -3120,10 +4259,78 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
             self.attn_res_query_attn = nn.Parameter(torch.zeros(config.hidden_size))
             self.attn_res_query_mlp  = nn.Parameter(torch.zeros(config.hidden_size))
             self.attn_res_norm       = nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         else:
             self.attn_res_query_attn = None
             self.attn_res_query_mlp  = None
             self.attn_res_norm       = None
     def _attn_res(
         self,
@@ -3173,6 +4380,10 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         B_vals: Optional[torch.Tensor] = None,
         attn_res_sources: Optional[list] = None,
         attn_res_partial: Optional[torch.Tensor] = None,
         layer_analysis: Optional[LayerAnalysis] = None,
         output_attentions: Optional[bool] = False,
         repo_rope_args: Optional[Tuple[torch.Tensor, float]] = None,
@@ -3182,6 +4393,63 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         if layer_analysis is not None:
             layer_analysis.hidden_states_input = hidden_states.detach()
         # ── Attention Residuals: compute pre-attention input ──────────────
         # When active, the input to the attention sublayer is no longer the
         # raw hidden_states (accumulated residual) but a softmax-weighted
@@ -3195,10 +4463,19 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
                 attn_res_sources, attn_res_partial, self.attn_res_query_attn,
                 ar_analysis, "attn",
             )
             residual_attn = attn_res_partial
         else:
-            h_attn = hidden_states
-            residual_attn = hidden_states
         # ── Attention block ───────────────────────────────────────────────
         sn_pre = layer_analysis.seednorm_pre_attn if layer_analysis is not None else None
@@ -3207,21 +4484,49 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         if layer_analysis is not None:
             layer_analysis.lns_attn_output = h_lns.detach()
-        hidden_states, attn_weights, self.current_layer_fan = self.self_attn(
-            hidden_states=h_lns,
-            attention_mask=attention_mask,
-            position_embeddings=position_embeddings,
-            first_layer_fan=first_layer_fan,
-            attn_analysis=layer_analysis.attention if layer_analysis is not None else None,
-            repo_rope_args=repo_rope_args,
-            **kwargs,
-        )
         if layer_analysis is not None:
             layer_analysis.attn_contribution = hidden_states.detach()
         gpas_attn_a = layer_analysis.gpas_attn if layer_analysis is not None else None
-        h_tilde = self.gpas_attn(residual_attn + hidden_states, analysis=gpas_attn_a)
         if layer_analysis is not None:
             layer_analysis.h_tilde = h_tilde.detach()
@@ -3257,8 +4562,11 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         if layer_analysis is not None:
             layer_analysis.mlp_contribution = delta_m.detach()
-        # ── JTok-M injection (additive alongside MLP residual) ────────────
-        aux_stats = None
         if self.use_jtokm and z_tilde is not None and B_vals is not None:
             orig_shape = h_tilde.shape
             h_flat     = h_tilde.reshape(-1, self.hidden_size)
@@ -3269,11 +4577,21 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
             delta_r, aux_stats = self.jtokm(h_flat, z_flat, B_flat, analysis=jtokm_a)
             delta_r = delta_r.reshape(orig_shape)
-            gpas_mlp_a    = layer_analysis.gpas_mlp if layer_analysis is not None else None
-            hidden_states = self.gpas_mlp(residual_mlp + delta_m + delta_r, analysis=gpas_mlp_a)
         else:
-            gpas_mlp_a    = layer_analysis.gpas_mlp if layer_analysis is not None else None
-            hidden_states = self.gpas_mlp(residual_mlp + delta_m, analysis=gpas_mlp_a)
         if layer_analysis is not None:
             layer_analysis.hidden_states_output = hidden_states.detach()
@@ -3285,6 +4603,9 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
             outputs += (aux_stats,)
         if versatile_aux is not None:
             outputs += (versatile_aux,)
         return outputs
@@ -3612,6 +4933,9 @@ class NeoLLMPreTrainedModel(PreTrainedModel):
             if hasattr(module, "alpha_ma"):
                 module.alpha_ma.zero_()
         elif isinstance(module, GPAS):
             module.alpha.data.fill_(0.0)
@@ -3668,6 +4992,45 @@ class NeoLLMPreTrainedModel(PreTrainedModel):
                 module.attn_res_query_attn.data.zero_()
                 module.attn_res_query_mlp.data.zero_()
         elif isinstance(module, SpellingBeeEmbedding):
             # byte_emb initialised identically to token embeddings: std=1/√d.
             # Ensures E[‖e_byte‖²] ≈ 1 at init, matching etok, so the
@@ -3737,8 +5100,99 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
         self.gradient_checkpointing = False
         self.first_layer_fan        = None
         self.post_init()
     def get_input_embeddings(self):
         if self.config.use_token_generator:
             return self.token_generator
@@ -3764,7 +5218,9 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
             getattr(cfg, "use_repo", False)
             and layer_idx >= getattr(cfg, "repo_start_layer", cfg.num_hidden_layers // 3)
         )
-        _versatile = getattr(cfg, "use_versatile_ffn", False)
         return LayerAnalysis(
             seednorm_pre_attn  = SeeDNormAnalysis(),
             seednorm_post_attn = SeeDNormAnalysis(),
@@ -3778,10 +5234,14 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
                 polynorm = PolyNormAnalysis() if not _versatile else None,
                 versatile = VersatileFFNAnalysis() if _versatile else None,
             ),
-            gpas_attn = GPASAnalysis(),
-            gpas_mlp  = GPASAnalysis(),
-            jtokm     = JTokMAnalysis() if cfg.use_jtokm else None,
-            attn_res  = AttnResAnalysis() if getattr(cfg, "use_attn_res", False) else None,
         )
     def forward(
@@ -3883,6 +5343,10 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
             if getattr(self.config, "use_repo", False) else None
         )
         # ── Attention Residuals state ──────────────────────────────────────
         # Full AttnRes (attn_res_num_blocks=0): sources grows by one entry per
         # decoder layer — all previous outputs are kept, max N=num_layers+1.
@@ -3897,13 +5361,57 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
         if use_attn_res:
             attn_res_sources = [hidden_states]   # b_0 = token embedding
             attn_res_partial = hidden_states     # initial partial sum
-            num_blocks  = getattr(self.config, 'attn_res_num_blocks', 0)
-            block_size  = (
-                max(self.config.num_hidden_layers // num_blocks, 1)
-                if num_blocks > 0
-                else 1   # Full AttnRes: every layer is its own "block"
-            )
         # Pre-allocate per-layer analysis list when analysis is active
         if analysis_state is not None:
@@ -3913,17 +5421,13 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
             if output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
-            # ── Block AttnRes: boundary handling ──────────────────────────
-            # At each block boundary (excluding layer 0): append the current
-            # partial sum to sources as a completed block summary, then reset
-            # partial to None so the new block builds from scratch — matching
-            # the paper's pseudocode exactly.
-            # For Full AttnRes (block_size=1): every layer is a boundary, so
-            # partial is appended and reset after every layer. The partial is
-            # re-seeded from the previous hidden_states below.
-            if use_attn_res and layer_idx > 0 and layer_idx % block_size == 0:
-                attn_res_sources = attn_res_sources + [attn_res_partial]
-                attn_res_partial = hidden_states  # start new block from current output
             # Build per-layer analysis container (only in eval + analysis mode)
             layer_analysis = None
@@ -3932,15 +5436,26 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
                 layer_analysis.layer_idx = layer_idx
                 analysis_state.layers.append(layer_analysis)
             layer_outputs = decoder_layer(
                 hidden_states,
                 position_embeddings=position_embeddings,
-                attention_mask=causal_mask,
                 first_layer_fan=self.first_layer_fan,
                 z_tilde=z_tilde,
                 B_vals=B_vals,
                 attn_res_sources=attn_res_sources,
                 attn_res_partial=attn_res_partial if use_attn_res else None,
                 layer_analysis=layer_analysis,
                 output_attentions=output_attentions,
                 repo_rope_args=repo_rope_args,
@@ -3948,23 +5463,76 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
             )
             hidden_states = layer_outputs[0]
             # Update AttnRes partial sum — the new partial is the layer output
             if use_attn_res:
                 attn_res_partial = hidden_states
             if output_attentions:
                 all_attentions = all_attentions + (layer_outputs[1],)
-            # Collect JTok-M aux stats (last element if present)
-            if self.config.use_jtokm and len(layer_outputs) > (2 if output_attentions else 1):
-                all_aux_stats.append(layer_outputs[-1])
-            # Collect VersatileFFN aux stats (second-to-last if jtokm also present,
-            # or last if jtokm is absent). Only non-None during training.
             if getattr(self.config, "use_versatile_ffn", False):
-                for item in layer_outputs[1:]:
                     if isinstance(item, tuple) and len(item) == 3:
-                        # (p_sum, f_sum, N_tokens) signature
                         all_aux_stats.append(("versatile", item))
                         break
@@ -3972,6 +5540,16 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
                     and hasattr(decoder_layer, "current_layer_fan")):
                 self.first_layer_fan = decoder_layer.current_layer_fan
         hidden_states = self.norm(hidden_states)
         if output_hidden_states:
@@ -3984,6 +5562,9 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
             analysis_state.attn_res_sources_final  = (
                 attn_res_sources if use_attn_res else None
             )
         if not return_dict:
             return tuple(
@@ -4124,6 +5705,7 @@ class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
             layers                 = None,   # filled by NeoLLMModel.forward
             jtokm_aux_stats        = [] if cfg.use_jtokm else None,
             attn_res_sources_final = [] if getattr(cfg, "use_attn_res", False) else None,
         )
     # ── Standard model API ────────────────────────────────────────────────
@@ -4261,6 +5843,12 @@ class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
 # ==================== AUTOMODEL REGISTRATION ====================
 __all__ = [
     "NeoLLMForCausalLM",
     "NeoLLMModel",
     "NeoLLMPreTrainedModel",
@@ -4278,7 +5866,7 @@ __all__ = [
     "REPOModule",
     "VersatileFFN",
     "compute_versatile_aux_loss",
-    # Analysis dataclasses — exported so external tools can type-hint against them
     "AnalysisState",
     "LayerAnalysis",
     "AttentionAnalysis",
@@ -4292,6 +5880,9 @@ __all__ = [
     "VersatileFFNAnalysis",
     "JTokMAnalysis",
     "AttnResAnalysis",
     "GeneratorAnalysis",
 ]

 from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from transformers.processing_utils import Unpack
 from transformers.utils import TransformersKwargs, logging
+from configuration_neollm import NeoLLMConfig
 from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
+# ── Optional fast-path dependencies (GatedDeltaNet linear attention) ─────────
+try:
+    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update as _causal_conv1d_update
+    _causal_conv1d_available = True
+except ImportError:
+    causal_conv1d_fn = None
+    _causal_conv1d_update = None
+    _causal_conv1d_available = False
+try:
+    from fla.modules import FusedRMSNormGated
+    from fla.ops.gated_delta_rule import (
+        chunk_gated_delta_rule,
+        fused_recurrent_gated_delta_rule,
+    )
+    _fla_available = True
+except ImportError:
+    FusedRMSNormGated = None
+    chunk_gated_delta_rule = None
+    fused_recurrent_gated_delta_rule = None
+    _fla_available = False
+is_linear_attn_fast_path = _causal_conv1d_available and _fla_available
 logger = logging.get_logger(__name__)
     lns_scale:        Optional[float] = None          # 1/√(2ℓ) scaling factor
+@dataclass
+class DCAAnalysis:
+    """
+    Container for the GRN-v3 depth-wise aggregate weights of a
+    DeepCrossAttention layer.
+    Only populated when use_dca=True.
+    grn_depth_weights: softmax-free aggregate scalars used to weight each
+        source layer, shape [3, y, B, S] where 3 = Q/K/V streams,
+        y = selected stack depth (at most 2*dca_k), B = batch, S = seq.
+        More generally the leading axis equals the ``num_outputs`` of the
+        NeoLLMGRN that filled this field (3 for Q/K/V), because
+        NeoLLMGRN.forward stores its permuted dynamic tensor here.
+        These are the per-position, per-layer scalars *before* adding the
+        static bias — useful to see which layers the dynamic component
+        selectively suppresses (ReLU zeros out negative entries, so every
+        stored value is >= 0). The tensor is stored detached.
+    """
+    grn_depth_weights: Optional[torch.Tensor] = None  # [num_outputs, y, B, S]; None until a GRN forward fills it
 @dataclass
 class AttnResAnalysis:
     """
     sources_count:    Optional[int] = None            # number of sources including partial
+@dataclass
+class StackMemoryAnalysis:
+    """
+    Internals of a StackMemory forward pass.
+    Only populated when use_stacktrans=True AND model is in eval + analysis mode.
+    Every field defaults to None, so an unfilled instance is safe to create.
+    NOTE(review): the code that writes these fields is outside this excerpt;
+    the shapes below are the author's stated contract — confirm against
+    StackMemory.forward before relying on them.
+    Reference: Zhang, K. et al. (NeurIPS 2025). "Recursive Transformer:
+    Boosting Reasoning Ability with State Stack."
+    Shape legend: B = batch, S = sequence length, H = number of stack heads,
+    slots = stack depth, ds = per-head stack dimension.
+    action_probs:   softmax distribution [push, pop, no-op] per head and
+                    token position. Shape [B, S, H, 3]. Visualising this
+                    across layers reveals the push-heavy early layers and
+                    pop-heavy later layers described in the paper (§B.2).
+    stack_in:       stack state entering this layer (the output of the
+                    previous layer's StackMemory). Shape [B, H, slots, ds].
+                    None for layer 0 (starts as all-zeros).
+    stack_out:      updated stack state after processing this sequence.
+                    Shape [B, H, slots, ds]. This is new_stack[:, -1] —
+                    the stack at the final sequence position, passed to
+                    the next layer as stack_in.
+    mask_out:       validity mask for stack_out. Shape [B, H, slots].
+                    Values near 1 indicate active slots; near 0 = empty.
+    gate_weights:   softmax attention weights used for global reading.
+                    Shape [B, S, H, slots]. High weight on slot i at
+                    position t means the model retrieved from slot i there.
+    memory_output:  weighted stack readout before up_proj.
+                    Shape [B, S, stack_d_model].
+    residual_scale: value of the learnable res_weight scalar at this step.
+    """
+    action_probs:   Optional[torch.Tensor] = None  # [B,S,H,3]
+    stack_in:       Optional[torch.Tensor] = None  # [B,H,slots,ds] entering layer
+    stack_out:      Optional[torch.Tensor] = None  # [B,H,slots,ds] leaving layer
+    mask_out:       Optional[torch.Tensor] = None  # [B,H,slots]
+    gate_weights:   Optional[torch.Tensor] = None  # [B,S,H,slots]
+    memory_output:  Optional[torch.Tensor] = None  # [B,S,stack_d_model]
+    residual_scale: Optional[float]        = None  # res_weight scalar
 @dataclass
 class LayerAnalysis:
     """
     gpas_mlp:           Optional[GPASAnalysis] = None      # GPAS after MLP residual
     # Optional components (None when inactive)
+    jtokm:       Optional[JTokMAnalysis]       = None  # if use_jtokm
+    attn_res:    Optional[AttnResAnalysis]     = None  # if use_attn_res
+    dca:         Optional[DCAAnalysis]         = None  # if use_dca
+    stack:       Optional[StackMemoryAnalysis] = None  # if use_stacktrans
+    laurel_attn: Optional["LAuReLAnalysis"]    = None  # if use_laurel (attention residual)
+    laurel_mlp:  Optional["LAuReLAnalysis"]    = None  # if use_laurel (MLP residual)
 @dataclass
     layers:                 Optional[List[LayerAnalysis]] = None
     jtokm_aux_stats:        Optional[list] = None
     attn_res_sources_final: Optional[list] = None
+    dca_all_tokens_final:   Optional[list] = None
     logits:                 Optional[torch.Tensor] = None
 class ScalarMultiplier(nn.Module):
         first_layer_fan: Optional[torch.Tensor] = None,
         attn_analysis: Optional[AttentionAnalysis] = None,
         repo_rope_args: Optional[Tuple[torch.Tensor, float]] = None,
+        mudd_xk: Optional[torch.Tensor] = None,
+        mudd_xv: Optional[torch.Tensor] = None,
         **kwargs: Unpack[FlashAttentionKwargs],
     ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
         input_shape = hidden_states.shape[:-1]
             h_fan = self.lambda_1 * first_layer_fan + self.lambda_2 * h_fan
         current_layer_fan = h_fan.clone()
+        # ── MUDD: separate K/V FAN paths ─────────────────────────────────
+        # When mudd_xk/mudd_xv are provided (MUDD qkvr mode), they have already
+        # been normalized by the decoder layer's K/V norm chain. Here they go
+        # through their own FAN transform before k_proj/v_proj, keeping the
+        # FANformer periodicity modeling orthogonally intact per stream.
+        h_fan_k = self.fan_layer(mudd_xk) if mudd_xk is not None else h_fan
+        h_fan_v = self.fan_layer(mudd_xv) if mudd_xv is not None else h_fan
         query_shape = (*input_shape, self.config.num_attention_heads, self.head_dim)
         kv_shape    = (*input_shape, self.num_mea_component_heads, self.head_dim)
             attn_analysis.gate_raw = gate.detach()
         q = self.q_norm(q_raw.view(query_shape)).transpose(1, 2)
+        k = self.k_norm(self.k_proj(h_fan_k).view(kv_shape)).transpose(1, 2)
+        v = self.v_proj(h_fan_v).view(kv_shape).transpose(1, 2)
         if attn_analysis is not None:
             attn_analysis.q_post_norm = q.detach()
         return result
+class NeoLLMMUDDModule(nn.Module):
+    """
+    Depth-wise aggregation block for Multiway Dynamic Dense (MUDD) connections.
+    For every token position the module predicts one mixing weight per
+    (stream, history entry) pair and blends the stored layer outputs (plus
+    the token embedding) into ``num_ways`` separate streams — Q, K, V, R
+    when ``num_ways=4``.
+    Computation (Xiao et al., 2025, arXiv:2502.12170), with L = lidx + 2:
+        dw       = GELU(RMSNorm(x) @ W1) @ W2 + static_bias   # [B, T, C*L]
+        dw       = dw viewed as [B, T, C, L], permuted to [C, B, T, L]
+        stream_c = Σ_j dw[c, :, :, j] * hiddens[j]            for c in range(C)
+    Initial state: W2 is all zeros, so the dynamic term vanishes and the
+    mixing weights equal ``static_bias`` alone. The bias is not owned by this
+    module; the caller passes it to ``forward``.
+    Args:
+        hidden_size: model dimension D.
+        lidx:        layer index (0-based); the history holds lidx+2 entries.
+        num_ways:    C, number of output streams (4 for "qkvr", 1 for "l").
+        is_last:     marks the final layer; only matters with expand_last.
+        expand_last: when is_last is also set, multiply hid_dim by 4.
+        round64:     move hid_dim to the next multiple of 64. A value that is
+                     already a multiple of 64 still grows by 64.
+    """
+    def __init__(
+        self,
+        hidden_size: int,
+        lidx: int,
+        num_ways: int = 4,
+        is_last: bool = False,
+        expand_last: bool = False,
+        round64: bool = False,
+    ) -> None:
+        super().__init__()
+        self.lidx     = lidx
+        self.num_ways = num_ways
+        # One weight per (stream, history entry); history = embedding + layers.
+        history_len = lidx + 2
+        out_dim     = history_len * num_ways
+        # The bottleneck width starts equal to the output width.
+        hid_dim     = out_dim * 4 if (is_last and expand_last) else out_dim
+        if round64:
+            # Same result as (hid_dim // 64 + 1) * 64: always a strict increase.
+            hid_dim += 64 - hid_dim % 64
+        # RMSNorm without learnable scale (paper uses RMSnormNoscale)
+        self.norm = nn.RMSNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.w1   = nn.Linear(hidden_size, hid_dim, bias=False)
+        self.act  = nn.GELU()
+        self.w2   = nn.Linear(hid_dim, out_dim, bias=False)
+        self._reset_mudd_parameters(hidden_size)
+    def _reset_mudd_parameters(self, D: int) -> None:
+        """Zero W2 and draw W1 from a normal with mean 0 and std 1/D."""
+        # A zero W2 makes the dynamic part of the weights start at zero.
+        nn.init.zeros_(self.w2.weight)
+        nn.init.normal_(self.w1.weight, mean=0.0, std=1.0 / D)
+    def forward(
+        self,
+        x: torch.Tensor,                # [B, T, D] — current layer output (Xi)
+        hiddens: list,                   # list of lidx+2 tensors [B, T, D]
+        static_bias: torch.Tensor,       # [C, lidx+2] — learnable static prior
+    ) -> tuple:
+        """
+        Blend the history into one tensor per stream.
+        Args:
+            x:           [B, T, D] input the dynamic weights are predicted from.
+            hiddens:     history tensors, stacked along a new leading axis.
+            static_bias: [C, L] additive prior, broadcast over batch and time.
+        Returns:
+            Tuple of C tensors, each [B, T, D] — the aggregated streams.
+        """
+        batch, seq_len, _ = x.shape
+        # Dynamic part of the mixing weights: [B, T, C*L]
+        mix = self.w2(self.act(self.w1(self.norm(x))))
+        # Flatten the [C, L] prior to [1, 1, C*L] so it lines up with `mix`.
+        n_streams, n_hist = static_bias.shape
+        flat_bias = static_bias.reshape(1, 1, n_streams * n_hist)
+        mix = mix + flat_bias.to(mix.dtype)
+        # [B, T, C*L] → [B, T, C, L] → [C, B, T, L]
+        mix = mix.view(batch, seq_len, n_streams, n_hist)
+        mix = mix.permute(2, 0, 1, 3)
+        # History as a single tensor: [L, B, T, D]
+        history = torch.stack(hiddens, dim=0)
+        # Contract over the history axis: Σ_j mix[c, :, :, j] * hiddens[j]
+        blended = torch.einsum('cbtl,lbtd->cbtd', mix, history)   # [C, B, T, D]
+        return tuple(blended[idx] for idx in range(n_streams))
+def dca_select_layers(stacked: torch.Tensor, k: int) -> torch.Tensor:
+    """
+    k-DCA layer selection (Heddes et al., 2025, §3.1).
+    Keeps only the first k and last k tensors from the depth stack,
+    capping memory at 2k layer representations regardless of depth.
+    When the stack has <= 2k entries it is returned unchanged (same tensor
+    object, no copy); this is the case for early layers.
+    Args:
+        stacked: [y, B, S, D] — stack of all layer outputs so far.
+        k:       number of first/last layers to retain; must be >= 0.
+    Returns:
+        [min(y, 2k), B, S, D]. For k == 0 and a non-empty stack this is an
+        empty selection with leading dimension 0.
+    Raises:
+        ValueError: if k is negative.
+    """
+    if k < 0:
+        raise ValueError(f"k must be non-negative, got {k}")
+    y = stacked.shape[0]
+    if y <= k * 2:
+        return stacked
+    # Slice the tail as [y - k:] rather than [-k:]: for k == 0 the latter is
+    # stacked[0:], which would return the whole stack instead of nothing.
+    return torch.cat([stacked[:k], stacked[y - k:]], dim=0)
+class NeoLLMGRN(nn.Module):
+    """
+    Generalized Residual Network v3 (GRN-v3) from DeepCrossAttention
+    (Heddes et al., 2025, arXiv:2502.06785, §3.1).
+    Mixes a depth-wise stack of layer representations into ``num_outputs``
+    streams. Each mixing weight is the sum of two parts:
+    - **Dynamic (input-dependent)**: ``ReLU(RMSNorm(stack) @ W)`` gives one
+      non-negative scalar per (output stream, depth position, batch, token).
+      ``W`` is zero-initialised, so this part is exactly zero at construction.
+    - **Static (dimension-dependent)**: a learnable ``bias`` of shape
+      ``[num_outputs, num_stack_layers, hidden_size]`` initialised to ones.
+    The combined weight for output stream ``o``, stack position ``y``,
+    batch ``b``, token ``n``, feature ``d`` is::
+        weight[o, y, b, n, d] = ReLU(dynamic[y, b, n, o]) + bias[o, y, d]
+    Output ``o`` is then the weighted sum over depth::
+        out[o, b, n, d] = Σ_y  stack[y, b, n, d] * weight[o, y, b, n, d]
+    At construction every weight equals 1, so each output is the plain,
+    unnormalised sum of the stack entries (no division by the depth).
+    Reference:
+        Heddes, M. et al. (2025). *DeepCrossAttention: Supercharging
+        Transformer Residual Connections.* arXiv:2502.06785.
+    Args:
+        hidden_size:      model dimension D.
+        num_stack_layers: exact number of depth entries ``forward`` accepts
+                          (= min(layer_idx+1, 2*dca_k) per the author).
+        num_outputs:      number of output streams (3 for DCA Q/K/V,
+                          1 for the final aggregation GRN).
+        eps:              epsilon for the internal RMSNorm.
+    """
+    def __init__(
+        self,
+        hidden_size: int,
+        num_stack_layers: int,
+        num_outputs: int = 3,
+        eps: float = 1e-6,
+    ) -> None:
+        super().__init__()
+        self.num_outputs      = num_outputs
+        self.num_stack_layers = num_stack_layers
+        # Dynamic branch: scale-free RMSNorm → Linear(D → num_outputs) → ReLU.
+        # The projection is zeroed so the branch contributes nothing at step 0.
+        projection = nn.Linear(hidden_size, num_outputs, bias=False)
+        nn.init.zeros_(projection.weight)
+        self.norm_noscale = nn.RMSNorm(hidden_size, eps=eps, elementwise_affine=False)
+        self.to_dynamic   = nn.Sequential(projection, nn.ReLU())
+        # Static branch: one weight per (output, depth entry, feature), all ones.
+        ones = torch.ones(num_outputs, num_stack_layers, hidden_size)
+        self.bias = nn.Parameter(ones)
+    def forward(
+        self,
+        stack: torch.Tensor,
+        analysis: Optional["DCAAnalysis"] = None,
+    ) -> tuple:
+        """
+        Aggregate the depth stack into the configured number of streams.
+        Args:
+            stack:    [y, B, S, D] — selected depth stack; y must equal
+                      ``num_stack_layers``.
+            analysis: optional container; when given, its
+                      ``grn_depth_weights`` receives the detached post-ReLU
+                      dynamic weights, shape [num_outputs, y, B, S].
+        Returns:
+            Tuple of num_outputs tensors each [B, S, D].
+            When num_outputs=1 returns a single [B, S, D] tensor directly.
+        Raises:
+            AssertionError: if the stack depth differs from ``num_stack_layers``.
+        """
+        y, _, _, _ = stack.shape
+        assert y == self.num_stack_layers, (
+            f"NeoLLMGRN expected stack depth {self.num_stack_layers}, got {y}"
+        )
+        # [y, B, S, D] → normalise → project + ReLU → [y, B, S, o] → [o, y, B, S]
+        gates = self.to_dynamic(self.norm_noscale(stack)).permute(3, 0, 1, 2)
+        if analysis is not None:
+            analysis.grn_depth_weights = gates.detach()
+        # Broadcast both parts to [o, y, B, S, D] before adding:
+        #   gates [o, y, B, S] gains a trailing feature axis,
+        #   bias  [o, y, D]    gains batch and sequence axes.
+        static_part = self.bias[:, :, None, None, :]
+        mixing = gates[..., None] + static_part
+        # Weight every depth entry, then collapse the depth axis: [o, B, S, D].
+        # NOTE: the [o, y, B, S, D] intermediate is materialised in full.
+        mixed = (stack[None] * mixing).sum(dim=1)
+        if self.num_outputs == 1:
+            return mixed.squeeze(0)                           # [B, S, D]
+        return tuple(mixed[idx] for idx in range(self.num_outputs))
+class StackMemory(nn.Module):
+    """
+    Differentiable multi-head hidden-state stack for NeoLLM.
+    Implements the StackTrans module from Zhang et al. (NeurIPS 2025):
+    "Recursive Transformer: Boosting Reasoning Ability with State Stack."
+    Architecture (one forward call, covering the full sequence in parallel):
+        1. down_proj  : [B,S,D] → [B,S,stack_d_model]
+        2. action_head: → [B,S,H,3] softmax (push / pop / no-op)
+        3. k_values   : reshape to [B,S,H,ds]
+        4. _vectorized_update: applies soft push/pop/no-op to each
+           (batch, head) stack in parallel across the sequence dim.
+           Every token sees the *same* initial stack (the paper §3.3
+           training-parallelism approximation), so the state at
+           position t depends only on token t and the incoming stack,
+           never on tokens before t.
+        5. gate_proj  : global read — softmax over all stack slots
+           (paper §3.1: "query-over-stack attention") with
+           (1 - mask) * -1e9 added to the scores. Returns the weighted
+           sum of the stack.
+        6. up_proj    : [B,S,stack_d_model] → [B,S,D]
+        7. residual   : output = up_proj_out * res_weight + hidden_states
+    Vertical passing (layer-to-layer):
+        forward() returns new_stack[:, -1] and new_mask[:, -1] — the stack
+        state at the last sequence position — meant to be fed as the
+        initial stack of the next decoder layer.
+    Generation cache:
+        While enable_cache is True, every forward()/step() call appends the
+        detached per-token pushed values and action probabilities to the
+        k_cache / action_cache buffers and advances cache_position.
+        The buffers are bookkeeping only: because the update in step 4 is
+        position-independent, replaying cached history cannot change the
+        state computed for the current token, so step() does not read them.
+    Args:
+        config: NeoLLMConfig instance. Reads:
+            hidden_size              (D)
+            stacktrans_num_heads     (H, number of stack heads)
+            stacktrans_stack_slots   (stack depth)
+            stacktrans_stack_d_model (H×ds, low-rank dimension)
+            stacktrans_forward_bs    (batch size for cache buffers, default 1)
+            cache_size               (cache length in tokens, default 2048)
+    Raises:
+        ValueError: if stacktrans_stack_d_model is not divisible by
+            stacktrans_num_heads.
+    """
+    def __init__(self, config: NeoLLMConfig):
+        super().__init__()
+        self.num_stack_heads  = config.stacktrans_num_heads
+        self.stack_slots      = config.stacktrans_stack_slots
+        self.stack_d_model    = config.stacktrans_stack_d_model
+        # Fail at construction instead of at the first .view() in forward().
+        if self.stack_d_model % self.num_stack_heads != 0:
+            raise ValueError(
+                f"stacktrans_stack_d_model ({self.stack_d_model}) must be divisible "
+                f"by stacktrans_num_heads ({self.num_stack_heads})"
+            )
+        self.head_dim         = self.stack_d_model // self.num_stack_heads
+        # Dimension reduction / expansion (standard nn.Linear, no multipliers —
+        # StackMemory is architecturally independent per the paper §A)
+        self.down_proj   = nn.Linear(config.hidden_size, self.stack_d_model, bias=True)
+        self.up_proj     = nn.Linear(self.stack_d_model, config.hidden_size, bias=True)
+        # Action prediction: push / pop / no-op probabilities, one triple per head
+        self.action_head = nn.Linear(self.stack_d_model, 3 * self.num_stack_heads, bias=True)
+        # Global read: maps each slot vector (head_dim) to one scalar score
+        self.gate_proj   = nn.Linear(self.head_dim, 1, bias=True)
+        # Learnable scalar applied to the projected memory read before the residual sum
+        self.res_weight  = nn.Parameter(torch.ones(1))
+        # ── Autoregressive generation cache ──────────────────────────────
+        # Per-token pushed values and action probabilities; written only
+        # while enable_cache is True.
+        cache_batch = getattr(config, "stacktrans_forward_bs", 1)
+        cache_len   = getattr(config, "cache_size", 2048)
+        self.register_buffer(
+            "k_cache",
+            torch.zeros(cache_batch, cache_len, self.num_stack_heads, self.head_dim),
+        )
+        self.register_buffer(
+            "action_cache",
+            torch.zeros(cache_batch, cache_len, self.num_stack_heads, 3),
+        )
+        self.cache_position = 0
+        self.enable_cache   = False
+    # ── Cache helpers ─────────────────────────────────────────────────────
+    def reset_cache(self) -> None:
+        """Rewind the cache write pointer; buffer contents are left in place."""
+        self.cache_position = 0
+    def _update_cache(
+        self,
+        k_values: torch.Tensor,   # [B,S,H,ds] detached
+        actions:  torch.Tensor,   # [B,S,H,3]  detached
+    ) -> None:
+        """
+        Append S tokens to the cache buffers for the first B batch rows.
+        Only rows [:B] are written so that batches smaller than the buffer
+        batch are accepted. B larger than the buffer batch still raises
+        from the slice assignment.
+        If the tokens do not fit in the remaining cache length, the write
+        pointer is reset to 0 and this chunk is NOT stored.
+        """
+        batch, seq_len = k_values.shape[:2]
+        start = self.cache_position
+        end   = start + seq_len
+        if end > self.k_cache.shape[1]:
+            self.reset_cache()
+            return
+        self.k_cache[:batch, start:end]      = k_values
+        self.action_cache[:batch, start:end] = actions
+        self.cache_position = end
+    # ── Core stack update ─────────────────────────────────────────────────
+    def _vectorized_update(
+        self,
+        stack:    torch.Tensor,  # [B,   H, slots, ds]  (4-D) or [B,S,H,slots,ds] (5-D)
+        mask:     torch.Tensor,  # [B,   H, slots]      (3-D) or [B,S,H,slots]    (4-D)
+        actions:  torch.Tensor,  # [B, S, H, 3]
+        k_values: torch.Tensor,  # [B, S, H, ds]
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Vectorized soft push/pop/no-op stack update.
+        A 4-D stack (and its 3-D mask) is broadcast along the sequence
+        dimension, so every token position starts from the same initial
+        stack and the operations are applied independently per position.
+        Returns:
+            new_stack [B, S, H, slots, ds]
+            new_mask  [B, S, H, slots]
+        """
+        seq_len = actions.shape[1]
+        # Broadcast 4-D initial state along the sequence dimension
+        if stack.dim() == 4:
+            stack = stack.unsqueeze(1).expand(-1, seq_len, -1, -1, -1)
+            mask  = mask.unsqueeze(1).expand(-1, seq_len, -1, -1)
+        # Push: new value at top, shift everything down (overflow discarded)
+        push_stack = torch.cat([k_values.unsqueeze(3), stack[:, :, :, :-1]], dim=3)
+        push_mask  = torch.cat([torch.ones_like(mask[:, :, :, :1]),
+                                 mask[:, :, :, :-1]], dim=3)
+        # Pop: shift everything up, zero at bottom
+        pop_stack = torch.cat([stack[:, :, :, 1:],
+                                torch.zeros_like(stack[:, :, :, :1])], dim=3)
+        pop_mask  = torch.cat([mask[:, :, :, 1:],
+                                torch.zeros_like(mask[:, :, :, :1])], dim=3)
+        # Soft combination weighted by action probabilities
+        # actions: [B,S,H,3] → [B,S,H,3,1,1] so it broadcasts over (slots, ds)
+        aw     = actions.unsqueeze(-1).unsqueeze(-1)            # [B,S,H,3,1,1]
+        stacks = torch.stack([push_stack, pop_stack, stack], dim=3)  # [B,S,H,3,slots,ds]
+        masks  = torch.stack([push_mask,  pop_mask,  mask],  dim=3)  # [B,S,H,3,slots]
+        new_stack = (stacks * aw).sum(dim=3)                   # [B,S,H,slots,ds]
+        new_mask  = (masks  * aw.squeeze(-1)).sum(dim=3)       # [B,S,H,slots]
+        return new_stack, new_mask
+    # ── Training forward (full sequence) ─────────────────────────────────
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        stack: Optional[torch.Tensor] = None,
+        mask:  Optional[torch.Tensor] = None,
+        analysis: Optional[StackMemoryAnalysis] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Full-sequence forward pass (training and prefill).
+        Args:
+            hidden_states: [B, S, D]
+            stack:         [B, H, slots, ds] — incoming stack state,
+                           or None (initialised to zeros).
+            mask:          [B, H, slots]     — validity mask for stack,
+                           or None (initialised to zeros).
+            analysis:      optional StackMemoryAnalysis container; when given,
+                           its fields are filled with detached intermediates.
+        Returns:
+            (output, new_stack, new_mask)
+            output    [B, S, D]
+            new_stack [B, H, slots, ds]  — stack at final sequence position
+            new_mask  [B, H, slots]
+        """
+        batch_size, seq_len, _ = hidden_states.shape
+        device = hidden_states.device
+        # Capture incoming stack for analysis before it is updated
+        if analysis is not None:
+            analysis.stack_in = stack.detach() if stack is not None else None
+        # Initialise empty stack / mask when none is supplied
+        if stack is None:
+            stack = torch.zeros(
+                batch_size, self.num_stack_heads, self.stack_slots, self.head_dim,
+                device=device, dtype=hidden_states.dtype,
+            )
+        if mask is None:
+            mask = torch.zeros(
+                batch_size, self.num_stack_heads, self.stack_slots,
+                device=device, dtype=hidden_states.dtype,
+            )
+        # 1. Project down
+        h_proj = self.down_proj(hidden_states)                 # [B,S,stack_d_model]
+        # 2. Action probabilities (logits scaled by 1/sqrt(head_dim))
+        action_logits = self.action_head(h_proj) / math.sqrt(self.head_dim)
+        actions = F.softmax(
+            action_logits.view(batch_size, seq_len, self.num_stack_heads, 3), dim=-1
+        )                                                       # [B,S,H,3]
+        # 3. Values to push
+        k_values = h_proj.view(batch_size, seq_len, self.num_stack_heads, self.head_dim)
+        # 4. Vectorized stack update
+        new_stack, new_mask = self._vectorized_update(stack, mask, actions, k_values)
+        # new_stack: [B,S,H,slots,ds],  new_mask: [B,S,H,slots]
+        # 5. Global read: softmax over slots, low-validity slots pushed towards -1e9
+        gate_scores  = self.gate_proj(new_stack).squeeze(-1)   # [B,S,H,slots]
+        gate_weights = F.softmax(gate_scores + (1 - new_mask) * -1e9, dim=-1)
+        memory_out   = (new_stack * gate_weights.unsqueeze(-1)).sum(dim=3)
+        # memory_out: [B,S,H,ds] → [B,S,stack_d_model]
+        memory_out   = memory_out.view(batch_size, seq_len, self.stack_d_model)
+        # 6. Project back up
+        memory_out_proj = self.up_proj(memory_out)             # [B,S,D]
+        # 7. Residual
+        output = memory_out_proj * self.res_weight + hidden_states
+        # 8. Update generation cache (skipped unless enable_cache is set)
+        if self.enable_cache:
+            self._update_cache(k_values.detach(), actions.detach())
+        # Populate analysis fields
+        if analysis is not None:
+            analysis.action_probs   = actions.detach()
+            analysis.stack_out      = new_stack[:, -1].detach()
+            analysis.mask_out       = new_mask[:, -1].detach()
+            analysis.gate_weights   = gate_weights.detach()
+            analysis.memory_output  = memory_out.detach()
+            analysis.residual_scale = self.res_weight.item()
+        # Return output + last-position stack state for next layer
+        return output, new_stack[:, -1], new_mask[:, -1]
+    # ── Autoregressive single-token forward ──────────────────────────────
+    def step(
+        self,
+        hidden_state: torch.Tensor,   # [B, D]
+        stack: torch.Tensor,          # [B, H, slots, ds]
+        mask:  torch.Tensor,          # [B, H, slots]
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Single-token forward for autoregressive generation.
+        Runs forward() on a length-1 sequence and squeezes the sequence
+        dimension again. The same path is used whether or not enable_cache
+        is set: _vectorized_update gives every position the same initial
+        stack, so the state for the current token depends only on that
+        token and on the `stack` / `mask` passed in. Replaying cached
+        history would cost O(T) per step without changing the result.
+        When enable_cache is True, forward() still appends the detached
+        current-token values to k_cache / action_cache.
+        Returns:
+            (output, new_stack, new_mask)
+            output    [B, D]
+            new_stack [B, H, slots, ds]
+            new_mask  [B, H, slots]
+        """
+        out, new_stack, new_mask = self.forward(
+            hidden_state.unsqueeze(1), stack, mask
+        )
+        return out.squeeze(1), new_stack, new_mask
+@dataclass
+class LAuReLAnalysis:
+    """
+    Record of what one LAuReL residual junction computed in a forward pass.
+    LAuReLLayer.forward fills the fields when it is handed an instance; per the
+    original notes this happens only with use_laurel=True in eval + analysis
+    mode, and two instances exist per layer (attention residual, MLP residual).
+    Reference: Menghani, G., Kumar, R. & Kumar, S. (ICML 2025).
+    *LAuReL: Learned Augmented Residual Layer.* arXiv:2411.07501.
+    Math (combined RW+LR, both sub-variants active):
+        x_{i+1} = α · f(x_i) + β · (A·(B·x_i) + x_i)
+    with [α, β] = softmax([a, b]) for learnable scalars a, b (RW component),
+    B ∈ ℝ^{r×D} the down-projection and A ∈ ℝ^{D×r} the up-projection
+    (LR component).
+    With A = 0 the low-rank term vanishes, giving
+        RW active : x_{i+1} = α·f(x_i) + β·x_i   (α = β = 0.5 for a = b)
+        LR only   : x_{i+1} = f(x_i) + x_i        (standard residual)
+    Fields (all default to None):
+        alpha_rw: α, the softmax weight on f(x_i), as a Python float.
+                  Stays None when the RW component is disabled.
+        beta_rw:  β, the softmax weight on the (enriched) residual, as a float.
+                  Stays None when the RW component is disabled.
+        lr_term:  A·(B·x_res), the low-rank augmentation, shape [B, S, D].
+                  Stays None when the LR component is disabled.
+        output:   the combined tensor returned by the layer, shape [B, S, D].
+    """
+    # softmax weight applied to f(x)
+    alpha_rw: Optional[float] = None
+    # softmax weight applied to the residual branch g(x)
+    beta_rw: Optional[float] = None
+    # detached low-rank term A(Bx), [B,S,D]
+    lr_term: Optional[torch.Tensor] = None
+    # detached combined output (pre-GPAS), [B,S,D]
+    output: Optional[torch.Tensor] = None
+class LAuReLLayer(nn.Module):
+    """
+    LAuReL: Learned Augmented Residual Layer.
+    A lightweight replacement for the canonical residual connection
+    that learns to blend the nonlinear sub-layer output f(x) with a
+    richer linear function of the residual x, optionally augmented by a
+    low-rank transformation.
+    Reference: Menghani, G., Kumar, R. & Kumar, S. (ICML 2025).
+    *LAuReL: Learned Augmented Residual Layer.* arXiv:2411.07501.
+    ── Sub-variants ────────────────────────────────────────────────────
+    Controlled by config flags; any combination is valid:
+    **RW only** (use_laurel_rw=True, use_laurel_lr=False):
+        x_{i+1} = α · f(x_i) + β · x_i
+        [α, β]  = softmax([a, b]),  a,b ∈ ℝ  (2 params)
+    **LR only** (use_laurel_rw=False, use_laurel_lr=True):
+        x_{i+1} = f(x_i) + A·(B·x_i) + x_i
+        B ∈ ℝ^{r×D}  orthogonal init  (down-projection, lr_down)
+        A ∈ ℝ^{D×r}  zero init        (up-projection, lr_up)
+        Params: 2·r·D per layer.
+    **RW + LR** (both True):
+        x_{i+1} = α · f(x_i) + β · (A·(B·x_i) + x_i)
+    **Neither** (both False): plain residual x_{i+1} = f(x_i) + x_i.
+    ── Initialisation ──────────────────────────────────────────────────
+    RW: raw logits [a, b] = [0, 0] → α = β = 0.5 at construction.
+    LR: lr_up.weight = 0 → lr_term = 0 at construction, so the LR branch
+    contributes nothing until lr_up is trained; lr_down.weight gets
+    nn.init.orthogonal_.
+    Consequently a freshly built layer computes f(x) + x when RW is off
+    and 0.5·f(x) + 0.5·x when RW is on.
+    NOTE(review): a model-wide weight-init pass that runs after
+    construction (not visible from this block) could overwrite these
+    values — confirm it preserves the zero init of lr_up.
+    ── Integration in NeoLLM ───────────────────────────────────────────
+    Per the original notes, applied immediately before GPAS at both
+    residual sums per layer:
+        h_tilde = GPAS( LAuReL(attn_out, residual_attn) )
+        output  = GPAS( LAuReL(delta_m,  residual_mlp)  )
+    Args:
+        config: NeoLLMConfig. Reads hidden_size, and with defaults
+                use_laurel_rw (True), use_laurel_lr (True),
+                laurel_lr_rank (32).
+    """
+    def __init__(self, config: NeoLLMConfig):
+        super().__init__()
+        self.use_rw   = getattr(config, "use_laurel_rw", True)
+        self.use_lr   = getattr(config, "use_laurel_lr", True)
+        D             = config.hidden_size
+        r             = getattr(config, "laurel_lr_rank", 32)
+        if self.use_rw:
+            # Raw logits for softmax([α, β]), stored as one 2-vector so the
+            # softmax is a single op. Zero init → α = β = 0.5.
+            self.rw_logits = nn.Parameter(torch.zeros(2))
+        if self.use_lr:
+            # down: B ∈ ℝ^{r×D};  up: A ∈ ℝ^{D×r}
+            self.lr_down = nn.Linear(D, r, bias=False)
+            self.lr_up   = nn.Linear(r, D, bias=False)
+            # Apply the documented init explicitly: nn.Linear's default
+            # init would make lr_term non-zero at construction.
+            nn.init.orthogonal_(self.lr_down.weight)
+            nn.init.zeros_(self.lr_up.weight)
+    def forward(
+        self,
+        f_out:    torch.Tensor,           # output of f(x): attn or MLP  [B,S,D]
+        x_res:    torch.Tensor,           # residual (skip connection)    [B,S,D]
+        analysis: Optional[LAuReLAnalysis] = None,
+    ) -> torch.Tensor:
+        """
+        Combine a sub-layer output with its residual input.
+        Args:
+            f_out:    Output of f(x) — attention output or MLP delta.
+            x_res:    Residual tensor — accumulated hidden state.
+            analysis: Optional container; when given, receives alpha_rw /
+                      beta_rw (floats, RW active), lr_term (detached, LR
+                      active) and output (detached).
+        Returns:
+            Combined tensor with the shape of f_out / x_res.
+        """
+        # ── LR component: A·(B·x_res) ────────────────────────────────────
+        lr_term = None
+        if self.use_lr:
+            lr_term = self.lr_up(self.lr_down(x_res))     # [B,S,D]
+            g_res   = lr_term + x_res                      # enriched residual
+        else:
+            g_res   = x_res
+        # ── RW component: α·f + β·g ──────────────────────────────────────
+        if self.use_rw:
+            weights = torch.softmax(self.rw_logits, dim=0)  # [2]
+            alpha   = weights[0]
+            beta    = weights[1]
+            out     = alpha * f_out + beta * g_res
+        else:
+            # No re-weighting: plain sum with the (possibly enriched) residual
+            out = f_out + g_res
+        if analysis is not None:
+            if self.use_rw:
+                analysis.alpha_rw = alpha.item()
+                analysis.beta_rw  = beta.item()
+            if self.use_lr:
+                analysis.lr_term  = lr_term.detach()
+            analysis.output   = out.detach()
+        return out
+# ==================== GATED DELTA NET (LINEAR ATTENTION) ====================
+# Active when use_linear_attention=True. Replaces NeoLLMAttention every
+# `linear_attention_every_n` layers (pattern 0-indexed: layers 2, 5, 8 …).
+#
+# References:
+#   Yang et al. (2024). "Gated Delta Networks." arXiv:2412.06464.
+#   Li et al. (2026). "REPO." arXiv:2512.14391.
+def _apply_mask_to_padding_states(
+    hidden_states: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+) -> torch.Tensor:
+    """
+    Zero out hidden states at masked positions.
+    The mask is applied only when it is given and both of its first two
+    dimensions are larger than 1; otherwise `hidden_states` is returned
+    untouched (same object).
+    Args:
+        hidden_states:  tensor indexed as [dim0, dim1, ...features].
+        attention_mask: 2-D mask aligned with the first two dims, or None.
+    Returns:
+        hidden_states * attention_mask[:, :, None], cast back to the input dtype.
+    """
+    if attention_mask is None:
+        return hidden_states
+    # Single-column or single-row masks are skipped.
+    if not (attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1):
+        return hidden_states
+    in_dtype = hidden_states.dtype
+    masked = hidden_states * attention_mask[:, :, None]
+    return masked.to(in_dtype)
+def _l2norm(x: torch.Tensor, dim: int = -1, eps: float = 1e-6) -> torch.Tensor:
+    """Scale `x` to unit L2 norm along `dim`; `eps` is added under the square root."""
+    squared_norm = (x * x).sum(dim=dim, keepdim=True)
+    denom = torch.sqrt(squared_norm + eps)
+    return x / denom
+def _torch_causal_conv1d_update(
+    hidden_states, conv_state, weight, bias=None, activation=None
+):
+    """
+    Pure-PyTorch depthwise causal conv step with a rolling state.
+    Concatenates `conv_state` and `hidden_states` along the last axis,
+    overwrites `conv_state` IN PLACE with the trailing `conv_state.shape[-1]`
+    columns, runs a grouped conv1d (one group per channel) and returns
+    SiLU of the last `seq_len` outputs in the dtype of `hidden_states`.
+    Args:
+        hidden_states: 3-D tensor [batch, channels, seq_len].
+        conv_state:    tensor whose last axis is the state length; mutated.
+        weight:        2-D per-channel kernel; unsqueezed to [channels, 1, k].
+        bias:          optional conv bias.
+        activation:    accepted but ignored — SiLU is always applied.
+    """
+    out_dtype = hidden_states.dtype
+    _, channels, n_new = hidden_states.shape
+    window = torch.cat([conv_state, hidden_states], dim=-1).to(weight.dtype)
+    # Roll the state forward: keep the most recent columns.
+    n_keep = conv_state.shape[-1]
+    conv_state.copy_(window[:, :, -n_keep:])
+    conv_out = F.conv1d(window, weight.unsqueeze(1), bias, padding=0, groups=channels)
+    recent = conv_out[:, :, -n_new:]
+    return F.silu(recent).to(out_dtype)
+def _torch_chunk_gated_delta_rule(
+    query, key, value, g, beta,
+    chunk_size=64, initial_state=None, output_final_state=False,
+    use_qk_l2norm_in_kernel=False,
+):
+    """
+    Pure-PyTorch chunked gated delta rule (fallback for the fused kernel).
+    Inputs are transposed on dims (1, 2) before use, so callers pass
+    query/key as [B, S, H, dk], value as [B, S, H, dv] and g/beta as
+    [B, S, H]. All math runs in float32; the output is cast back to the
+    dtype of `query`.
+    Args:
+        query, key, value: see above.
+        g:     per-token log-decay (exponentiated after a per-chunk cumsum).
+        beta:  per-token write strength multiplying key and value.
+        chunk_size: sequence is zero-padded to a multiple of this.
+        initial_state: optional recurrent state [B, H, dk, dv]; zeros if None.
+        output_final_state: when False the returned state is None.
+        use_qk_l2norm_in_kernel: L2-normalise query and key first.
+    Returns:
+        (out, state): out is [B, S, H, dv]; state is [B, H, dk, dv] or None.
+    """
+    initial_dtype = query.dtype
+    if use_qk_l2norm_in_kernel:
+        query, key = _l2norm(query), _l2norm(key)
+    # [B, S, H, ...] → [B, H, S, ...] in float32
+    query, key, value, beta, g = [
+        x.transpose(1, 2).contiguous().to(torch.float32)
+        for x in (query, key, value, beta, g)
+    ]
+    batch, num_heads, seq_len, k_dim = key.shape
+    v_dim = value.shape[-1]
+    # Zero-pad the sequence axis up to a whole number of chunks
+    pad    = (chunk_size - seq_len % chunk_size) % chunk_size
+    query  = F.pad(query,  (0, 0, 0, pad))
+    key    = F.pad(key,    (0, 0, 0, pad))
+    value  = F.pad(value,  (0, 0, 0, pad))
+    beta   = F.pad(beta,   (0, pad))
+    g      = F.pad(g,      (0, pad))
+    padded_len = seq_len + pad
+    scale  = query.shape[-1] ** -0.5
+    query  = query * scale
+    vb     = value * beta.unsqueeze(-1)
+    kb     = key   * beta.unsqueeze(-1)
+    # Split the sequence axis into [num_chunks, chunk_size]
+    query, key, value, kb, vb = [
+        x.reshape(x.shape[0], x.shape[1], -1, chunk_size, x.shape[-1])
+        for x in (query, key, value, kb, vb)
+    ]
+    g    = g.reshape(g.shape[0], g.shape[1], -1, chunk_size)
+    # Mask covering the diagonal and everything above it
+    triu = torch.triu(torch.ones(chunk_size, chunk_size, dtype=torch.bool, device=query.device), 0)
+    # Within-chunk cumulative log-decay and pairwise decay factors (lower-triangular)
+    g    = g.cumsum(-1)
+    dm   = ((g.unsqueeze(-1) - g.unsqueeze(-2)).tril().exp()).tril()
+    attn = -((kb @ key.transpose(-1, -2)) * dm).masked_fill(triu, 0)
+    # Row-by-row forward substitution on the strictly-lower-triangular part;
+    # each row reads the rows already updated, so the order matters.
+    for i in range(1, chunk_size):
+        row = attn[..., i, :i].clone()
+        sub = attn[..., :i, :i].clone()
+        attn[..., i, :i] = row + (row.unsqueeze(-1) * sub).sum(-2)
+    eye   = torch.eye(chunk_size, dtype=attn.dtype, device=attn.device)
+    attn  = attn + eye
+    value = attn @ vb
+    kcd   = attn @ (kb * g.exp().unsqueeze(-1))
+    st    = torch.zeros(batch, num_heads, k_dim, v_dim, dtype=value.dtype, device=value.device) if initial_state is None else initial_state.to(value)
+    out   = torch.zeros_like(value)
+    # Mask covering only the strict upper triangle (diagonal kept)
+    triu2 = torch.triu(torch.ones(chunk_size, chunk_size, dtype=torch.bool, device=query.device), 1)
+    # Sequential pass over chunks carrying the recurrent state
+    for i in range(padded_len // chunk_size):
+        qi, ki, vi = query[:,:,i], key[:,:,i], value[:,:,i]
+        a   = (qi @ ki.transpose(-1,-2) * dm[:,:,i]).masked_fill_(triu2, 0)
+        vp  = kcd[:,:,i] @ st
+        vn  = vi - vp
+        out[:,:,i] = (qi * g[:,:,i,:,None].exp()) @ st + a @ vn
+        st  = st * g[:,:,i,-1,None,None].exp() + (ki * (g[:,:,i,-1,None]-g[:,:,i]).exp()[...,None]).transpose(-1,-2) @ vn
+    if not output_final_state:
+        st = None
+    # Merge chunks back and drop the padded tail
+    out = out.reshape(out.shape[0], out.shape[1], -1, out.shape[-1])[:,:,:seq_len]
+    return out.transpose(1,2).contiguous().to(initial_dtype), st
+def _torch_recurrent_gated_delta_rule(
+    query, key, value, g, beta, initial_state, output_final_state,
+    use_qk_l2norm_in_kernel=False,
+):
+    """
+    Token-by-token gated delta rule in plain PyTorch.
+    Inputs are transposed on dims (1, 2) before use, so callers pass
+    query/key as [B, S, H, dk], value as [B, S, H, dv] and g/beta as
+    [B, S, H]. Computation is float32; the output is cast back to the
+    dtype of `query`.
+    Per token: decay the state by exp(g), read the value currently stored
+    under the key, write beta × (value − read) back under the key, then
+    read the output with the scaled query.
+    Returns:
+        (out, state): out is [B, S, H, dv]; state is the final
+        [B, H, dk, dv] recurrent state, or None unless output_final_state.
+    """
+    out_dtype = query.dtype
+    if use_qk_l2norm_in_kernel:
+        query, key = _l2norm(query), _l2norm(key)
+    # Move heads in front of time and switch to float32
+    query, key, value, beta, g = [
+        t.transpose(1, 2).contiguous().to(torch.float32)
+        for t in (query, key, value, beta, g)
+    ]
+    batch, num_heads, seq_len, k_dim = key.shape
+    v_dim = value.shape[-1]
+    query = query * (query.shape[-1] ** -0.5)
+    outputs = torch.zeros(batch, num_heads, seq_len, v_dim, dtype=value.dtype, device=value.device)
+    if initial_state is None:
+        state = torch.zeros(batch, num_heads, k_dim, v_dim, dtype=value.dtype, device=value.device)
+    else:
+        state = initial_state.to(value)
+    for t in range(seq_len):
+        q_t = query[:, :, t]
+        k_t = key[:, :, t]
+        v_t = value[:, :, t]
+        decay_t = g[:, :, t].exp().unsqueeze(-1).unsqueeze(-1)
+        beta_t = beta[:, :, t].unsqueeze(-1)
+        state = state * decay_t
+        # Value currently associated with k_t in the decayed state
+        recalled = (state * k_t.unsqueeze(-1)).sum(-2)
+        delta = (v_t - recalled) * beta_t
+        state = state + k_t.unsqueeze(-1) * delta.unsqueeze(-2)
+        outputs[:, :, t] = (state * q_t.unsqueeze(-1)).sum(-2)
+    final_state = state if output_final_state else None
+    return outputs.transpose(1, 2).contiguous().to(out_dtype), final_state
+class _NeoLLMRMSNormGated(nn.Module):
+    """
+    Gated RMSNorm used when the fused FLA implementation is unavailable.
+    forward(x, gate) RMS-normalises `x` over its last dimension in float32,
+    casts back to the input dtype, scales by the learned `weight`, multiplies
+    by SiLU(gate) computed in float32, and returns the input dtype.
+    Extra keyword arguments to the constructor are accepted and ignored.
+    """
+    def __init__(self, hidden_size, eps=1e-6, **kwargs):
+        super().__init__()
+        self.eps    = eps
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+    def forward(self, x, gate):
+        in_dtype = x.dtype
+        x32      = x.float()
+        inv_rms  = torch.rsqrt(x32.pow(2).mean(-1, keepdim=True) + self.eps)
+        # Normalise first, then gate
+        normed   = (x32 * inv_rms).to(in_dtype)
+        gated    = self.weight * normed * F.silu(gate.float())
+        return gated.to(in_dtype)
+class NeoLLMGatedDeltaNet(nn.Module):
+    """
+    GatedDeltaNet linear attention with FANformer integration.
+    Replaces NeoLLMAttention on every ``linear_attention_every_n``-th layer
+    (0-indexed: layers 2, 5, 8 … for every_n=3).
+    REPO (use_repo=True AND use_repo_in_linear_attn=True):
+        Applies continuous per-head positions to Q and K via _apply_repo_rope,
+        matching the full-attention REPO path identically.
+    Without REPO the gated delta rule operates without explicit positional
+    encoding (its recurrent state is implicitly position-aware).
+    References:
+        Yang et al. (2024). arXiv:2412.06464.
+        Li et al. (2026). arXiv:2512.14391.
+    """
+    def __init__(self, config: NeoLLMConfig, layer_idx: int):
+        """
+        Build the projections, depthwise conv, gating parameters, output norm
+        and optional REPO module for one linear-attention layer.
+        Args:
+            config:    NeoLLMConfig. Reads hidden_size, linear_num_value_heads,
+                       linear_num_key_heads, linear_key_head_dim,
+                       linear_value_head_dim, linear_conv_kernel_dim,
+                       rms_norm_eps, dropout_rate, num_hidden_layers, and
+                       optionally fan_ratio, dtype, use_repo,
+                       use_repo_in_linear_attn, repo_start_layer, repo_d_p.
+            layer_idx: index of this layer in the decoder stack.
+        Raises:
+            ValueError: if linear_num_value_heads is not a positive multiple
+                of linear_num_key_heads (the fused-projection split in
+                _fix_qkvz relies on an integer value/key head ratio).
+        """
+        super().__init__()
+        self.hidden_size      = config.hidden_size
+        self.num_v_heads      = config.linear_num_value_heads
+        self.num_k_heads      = config.linear_num_key_heads
+        self.head_k_dim       = config.linear_key_head_dim
+        self.head_v_dim       = config.linear_value_head_dim
+        self.key_dim          = self.head_k_dim * self.num_k_heads
+        self.value_dim        = self.head_v_dim * self.num_v_heads
+        self.conv_kernel_size = config.linear_conv_kernel_dim
+        self.layer_idx        = layer_idx
+        if self.num_v_heads < self.num_k_heads or self.num_v_heads % self.num_k_heads != 0:
+            raise ValueError(
+                f"linear_num_value_heads ({self.num_v_heads}) must be a positive "
+                f"multiple of linear_num_key_heads ({self.num_k_heads})"
+            )
+        # ── FANformer (same ratio as full-attention layers) ────────────────
+        fan_ratio = getattr(config, "fan_ratio", 0.125)
+        self.fan_layer = FANLayer(
+            hidden_size=config.hidden_size,
+            fan_ratio=fan_ratio,
+        )
+        # Width of the FAN-transformed features fed to the input projections
+        _fan_dim = config.hidden_size + int(config.hidden_size * fan_ratio)
+        # ── Causal conv1d on concatenated Q/K/V ──────────────────────────
+        # Depthwise (groups == channels); padding of kernel-1 lets forward()
+        # keep the first S outputs as the causal result.
+        self.conv_dim = self.key_dim * 2 + self.value_dim
+        self.conv1d   = nn.Conv1d(
+            self.conv_dim, self.conv_dim, bias=False,
+            kernel_size=self.conv_kernel_size,
+            groups=self.conv_dim,
+            padding=self.conv_kernel_size - 1,
+        )
+        # ── QKVz + ba projections (all from FAN-transformed features) ─────
+        self.in_proj_qkvz = nn.Linear(
+            _fan_dim, self.key_dim * 2 + self.value_dim * 2, bias=False
+        )
+        self.in_proj_ba = nn.Linear(
+            _fan_dim, self.num_v_heads * 2, bias=False
+        )
+        # ── Delta-rule gating parameters ──────────────────────────────────
+        self.dt_bias = nn.Parameter(torch.ones(self.num_v_heads))
+        A            = torch.empty(self.num_v_heads).uniform_(0, 16)
+        self.A_log   = nn.Parameter(torch.log(A))
+        # ── Output normalisation ───────────────────────────────────────────
+        # Fused norm when available (needs device/dtype/activation kwargs),
+        # otherwise the pure-PyTorch gated RMSNorm with no extra kwargs.
+        _NormCls = FusedRMSNormGated if FusedRMSNormGated is not None else _NeoLLMRMSNormGated
+        _norm_kw = (
+            dict(activation="silu",
+                 device=torch.cuda.current_device(),
+                 dtype=getattr(config, "dtype", None) or torch.get_default_dtype())
+            if FusedRMSNormGated is not None else {}
+        )
+        self.norm     = _NormCls(self.head_v_dim, eps=config.rms_norm_eps, **_norm_kw)
+        self.out_proj = nn.Linear(self.value_dim, self.hidden_size, bias=False)
+        self.dropout  = nn.Dropout(config.dropout_rate)
+        # ── Kernel dispatch (fast → fallback) ─────────────────────────────
+        self._conv1d_fn = causal_conv1d_fn          # None if not installed
+        self._chunk_fn  = (chunk_gated_delta_rule
+                           if chunk_gated_delta_rule is not None
+                           else _torch_chunk_gated_delta_rule)
+        self._recur_fn  = (fused_recurrent_gated_delta_rule
+                           if fused_recurrent_gated_delta_rule is not None
+                           else _torch_recurrent_gated_delta_rule)
+        if not is_linear_attn_fast_path:
+            logger.warning_once(
+                "NeoLLMGatedDeltaNet: causal_conv1d / flash-linear-attention "
+                "not installed — using pure-PyTorch fallbacks. "
+                "Install both packages for full performance."
+            )
+        # ── REPO: continuous per-head positions on Q and K ─────────────────
+        # Controlled by use_repo AND use_repo_in_linear_attn flags.
+        # Only active for layers at or above repo_start_layer
+        # (default: num_hidden_layers // 3).
+        self.use_repo = (
+            getattr(config, "use_repo", False)
+            and getattr(config, "use_repo_in_linear_attn", False)
+            and layer_idx >= getattr(config, "repo_start_layer",
+                                     config.num_hidden_layers // 3)
+        )
+        if self.use_repo:
+            _d_p = getattr(config, "repo_d_p", config.hidden_size // 8)
+            self.repo_module = REPOModule(
+                hidden_size=config.hidden_size,
+                d_p=_d_p,
+                num_heads=self.num_v_heads,
+            )
+        else:
+            self.repo_module = None
+    def _fix_qkvz(
+        self,
+        mixed_qkvz: torch.Tensor,
+        mixed_ba:   torch.Tensor,
+    ) -> Tuple[torch.Tensor, ...]:
+        """
+        Un-interleave the fused projections into (q, k, v, z, b, a).
+        Both inputs are first viewed with a trailing [num_k_heads, per_head]
+        layout; each key head owns one q and k slice plus `group` value heads
+        worth of v/z (and `group` entries of b/a), where
+        group = num_v_heads // num_k_heads.
+        The split runs on dim 3, so inputs are expected to be 3-D
+        ([batch, seq, features]).
+        Returns:
+            q, k: [B, S, num_k_heads, head_k_dim]
+            v, z: [B, S, num_v_heads, head_v_dim]
+            b, a: [B, S, num_v_heads]
+        """
+        group = self.num_v_heads // self.num_k_heads
+        k_width = self.head_k_dim
+        v_width = group * self.head_v_dim
+        per_head_qkvz = mixed_qkvz.view(
+            *mixed_qkvz.shape[:-1], self.num_k_heads, 2 * self.head_k_dim + 2 * group * self.head_v_dim
+        )
+        per_head_ba = mixed_ba.view(*mixed_ba.shape[:-1], self.num_k_heads, 2 * group)
+        q, k, v, z = torch.split(per_head_qkvz, [k_width, k_width, v_width, v_width], dim=3)
+        b, a = torch.split(per_head_ba, group, dim=3)
+        # Flatten (key head, group) into the value-head axis
+        v = v.reshape(v.shape[0], v.shape[1], -1, self.head_v_dim)
+        z = z.reshape(z.shape[0], z.shape[1], -1, self.head_v_dim)
+        b = b.reshape(b.shape[0], b.shape[1], self.num_v_heads)
+        a = a.reshape(a.shape[0], a.shape[1], self.num_v_heads)
+        return q, k, v, z, b, a
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        repo_rope_args: Optional[Tuple[torch.Tensor, float]] = None,
+    ) -> torch.Tensor:
+        """
+        Run the gated-delta-rule token mixer over a full sequence.
+
+        Pipeline: mask padding → FAN layer → fused QKVZ / BA projections →
+        conv1d over the concatenated Q/K/V channels → optional REPO rotary
+        on Q/K → head expansion → chunked gated delta rule → gated norm →
+        output projection → dropout.
+
+        Args:
+            hidden_states: Layer input; unpacked below as a rank-3 tensor
+                whose first two axes are named ``B`` and ``S``.
+            attention_mask: Forwarded to ``_apply_mask_to_padding_states``.
+            position_embeddings: Not read anywhere in this method.
+            repo_rope_args: ``(inv_freq, attn_scaling)`` pair. REPO is applied
+                only when this is given, ``use_repo`` is set and the layer
+                owns a ``repo_module``.
+
+        Returns:
+            The output of ``out_proj`` after dropout.
+        """
+        hidden_states = _apply_mask_to_padding_states(hidden_states, attention_mask)
+        B, S, _       = hidden_states.shape
+        # ── FANformer ─────────────────────────────────────────────────────
+        h_fan = self.fan_layer(hidden_states)
+        # ── QKVz and ba projections ───────────────────────────────────────
+        q, k, v, z, b, a = self._fix_qkvz(
+            self.in_proj_qkvz(h_fan), self.in_proj_ba(h_fan)
+        )
+        # ── Causal conv1d on flattened Q/K/V ─────────────────────────────
+        # Heads are flattened back into channels and Q/K/V are concatenated so
+        # a single conv1d call covers all three.
+        qkv = torch.cat(
+            [q.reshape(B, S, -1), k.reshape(B, S, -1), v.reshape(B, S, -1)], dim=-1
+        ).transpose(1, 2)                                        # [B, conv_dim, S]
+        if self._conv1d_fn is not None:
+            # Fused kernel path: SiLU is applied inside the kernel.
+            qkv = self._conv1d_fn(
+                x=qkv, weight=self.conv1d.weight.squeeze(1),
+                bias=self.conv1d.bias, activation="silu", seq_idx=None,
+            )
+        else:
+            # Fallback: keep only the first S output steps of the nn conv, then
+            # apply SiLU (presumably the conv pads its input — TODO confirm).
+            qkv = F.silu(self.conv1d(qkv)[:, :, :S])
+        qkv = qkv.transpose(1, 2)                               # [B, S, conv_dim]
+        q_f, k_f, v_f = torch.split(
+            qkv, [self.key_dim, self.key_dim, self.value_dim], dim=-1
+        )
+        # Restore the per-head layout after the convolution.
+        q = q_f.reshape(B, S, -1, self.head_k_dim)
+        k = k_f.reshape(B, S, -1, self.head_k_dim)
+        v = v_f.reshape(B, S, -1, self.head_v_dim)
+        # ── REPO: continuous per-head positions ───────────────────────────
+        # Transpose to [B, H, S, dk] for _apply_repo_rope, then back.
+        # NOTE(review): repo_module was built with num_heads=num_v_heads, yet
+        # q/k still carry the pre-expansion head count here (expansion happens
+        # below). If z_pos really is [B, num_v_heads, S] this looks like a
+        # shape mismatch whenever num_v_heads != num_k_heads — confirm.
+        if self.use_repo and self.repo_module is not None and repo_rope_args is not None:
+            inv_freq, attn_scaling = repo_rope_args
+            z_pos  = self.repo_module(hidden_states)             # [B, H, S]
+            q_t, k_t = q.transpose(1, 2), k.transpose(1, 2)
+            q_t, k_t = _apply_repo_rope(q_t, k_t, z_pos, inv_freq, attn_scaling)
+            q, k   = q_t.transpose(1, 2), k_t.transpose(1, 2)
+        # ── GQA-like head expansion ────────────────────────────────────────
+        # Repeat each Q/K head `ratio` times along the head axis (dim 2).
+        ratio = self.num_v_heads // self.num_k_heads
+        if ratio > 1:
+            q = q.repeat_interleave(ratio, dim=2)
+            k = k.repeat_interleave(ratio, dim=2)
+        # Per-head gates derived from the b / a projections.
+        beta = b.sigmoid()
+        g    = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias)
+        # ── Chunk gated delta rule (fused or fallback) ────────────────────
+        # No initial state is supplied and no final state is requested; the
+        # second return value is discarded.
+        core_out, _ = self._chunk_fn(
+            q, k, v, g=g, beta=beta,
+            initial_state=None, output_final_state=False,
+            use_qk_l2norm_in_kernel=True,
+        )
+        # ── Gated RMSNorm + output projection ─────────────────────────────
+        # The norm is called on 2-D views (rows × last dim) with z as the gate,
+        # then the result is restored to z's shape and flattened to [B, S, -1].
+        z_shape  = z.shape
+        core_out = core_out.reshape(-1, core_out.shape[-1])
+        core_out = self.norm(core_out, z.reshape(-1, z.shape[-1]))
+        core_out = core_out.reshape(z_shape).reshape(B, S, -1)
+        return self.dropout(self.out_proj(core_out))
+# ==================== DECODER LAYER ==========================================
 class NeoLLMDecoderLayer(GradientCheckpointingLayer):
     """
     Decoder layer with standard residual connections, optional JTok-M injection.
         self.layer_idx     = layer_idx
         self.use_jtokm     = config.use_jtokm
+        # ── Token-mixer selection ─────────────────────────────────────────
+        # use_linear_attention=True: replace full attention every
+        # `linear_attention_every_n` layers (0-indexed pattern:
+        #   e.g. every_n=3 → layers 2, 5, 8, 11 …).
+        # All other layers keep NeoLLMAttention unchanged.
+        _every_n = getattr(config, "linear_attention_every_n", 3)
+        self.is_linear_attn = (
+            getattr(config, "use_linear_attention", False)
+            and (layer_idx + 1) % _every_n == 0
+        )
+        if self.is_linear_attn:
+            self.linear_attn = NeoLLMGatedDeltaNet(config, layer_idx)
+            self.self_attn   = None
+        else:
+            self.self_attn   = NeoLLMAttention(config, layer_idx)
+            self.linear_attn = None
         self.mlp                      = (
             VersatileFFN(config)
             if getattr(config, "use_versatile_ffn", False)
             self.attn_res_query_attn = nn.Parameter(torch.zeros(config.hidden_size))
             self.attn_res_query_mlp  = nn.Parameter(torch.zeros(config.hidden_size))
             self.attn_res_norm       = nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+            _num_blocks = getattr(config, 'attn_res_num_blocks', 0)
+            self.attn_res_block_size = (
+                max(config.num_hidden_layers // _num_blocks, 1) if _num_blocks > 0 else 1
+            )
         else:
             self.attn_res_query_attn = None
             self.attn_res_query_mlp  = None
             self.attn_res_norm       = None
+            self.attn_res_block_size = None
+        # ── MUDD: separate K/V LayerNorms for qkvr+sepln mode ──────────────
+        # Only instantiated when both mudd_dense_type='qkvr' AND mudd_sepln=True.
+        # The existing input_layernorm handles the Q stream (unchanged).
+        # Separate norms for K and V allow each stream to rescale independently.
+        _use_mudd  = getattr(config, 'use_mudd', False)
+        _mudd_qkvr = getattr(config, 'mudd_dense_type', 'qkvr') == 'qkvr'
+        _mudd_sepln = getattr(config, 'mudd_sepln', False)
+        if _use_mudd and _mudd_qkvr and _mudd_sepln:
+            self.mudd_k_norm = SeeDNorm(config.hidden_size, eps=config.rms_norm_eps)
+            self.mudd_v_norm = SeeDNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.mudd_k_norm = None
+            self.mudd_v_norm = None
+        # ── DCA (Heddes et al., 2025, arXiv:2502.06785) ───────────────────
+        # GRN-v3 module that aggregates the k-selected depth stack into 3
+        # independent streams (Q, K, V). Each stream has its own dimension-
+        # and input-dependent weights, enabling richer cross-layer interactions.
+        # K and V get their own SeeDNorm + LNS norm chain (same scheme as
+        # MUDD sepln) since they now arrive from a different aggregation path.
+        # The residual connection uses the Q stream output (xq) as its base,
+        # matching the DCA paper's decoder block design (residual = q_input).
+        self.use_dca = getattr(config, 'use_dca', False)
+        if self.use_dca:
+            _dca_k           = getattr(config, 'dca_k', 2)
+            _num_stack       = min(layer_idx + 1, 2 * _dca_k)
+            self.dca_grn     = NeoLLMGRN(
+                hidden_size      = config.hidden_size,
+                num_stack_layers = _num_stack,
+                num_outputs      = 3,
+                eps              = config.rms_norm_eps,
+            )
+            self.dca_k_norm  = SeeDNorm(config.hidden_size, eps=config.rms_norm_eps)
+            self.dca_v_norm  = SeeDNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.dca_grn    = None
+            self.dca_k_norm = None
+            self.dca_v_norm = None
+        # ── StackTrans (Zhang et al., NeurIPS 2025) ───────────────────────
+        # Differentiable multi-head hidden-state stack inserted at the very
+        # beginning of the layer forward, before the attention sublayer.
+        # Mutually exclusive with use_attn_res, use_mudd, use_dca.
+        self.use_stacktrans = getattr(config, 'use_stacktrans', False)
+        if self.use_stacktrans:
+            self.stack_memory = StackMemory(config)
+        else:
+            self.stack_memory = None
+        # ── LAuReL (Menghani, Kumar & Kumar, ICML 2025) ───────────────────
+        # Learned augmented residual connection replacing f(x)+x at both
+        # the attention and MLP residual sums. Applied immediately before
+        # GPAS, so GPAS still controls magnitude via stop-gradient scaling.
+        # Two independent instances per layer (attention and MLP).
+        # Compatible with use_stacktrans. Incompatible with MUDD/DCA/AttnRes.
+        self.use_laurel = getattr(config, 'use_laurel', False)
+        if self.use_laurel:
+            self.laurel_attn = LAuReLLayer(config)
+            self.laurel_mlp  = LAuReLLayer(config)
+        else:
+            self.laurel_attn = None
+            self.laurel_mlp  = None
     def _attn_res(
         self,
         B_vals: Optional[torch.Tensor] = None,
         attn_res_sources: Optional[list] = None,
         attn_res_partial: Optional[torch.Tensor] = None,
+        mudd_streams: Optional[tuple] = None,
+        dca_stack: Optional[torch.Tensor] = None,
+        stack_state: Optional[torch.Tensor] = None,
+        stack_mask: Optional[torch.Tensor] = None,
         layer_analysis: Optional[LayerAnalysis] = None,
         output_attentions: Optional[bool] = False,
         repo_rope_args: Optional[Tuple[torch.Tensor, float]] = None,
         if layer_analysis is not None:
             layer_analysis.hidden_states_input = hidden_states.detach()
+        # ── StackTrans: hidden-state stack (pre-attention, pre-norm) ─────
+        # Executed first so attention sees the stack-enriched representation.
+        # stack_state / stack_mask carry the stack from the previous layer;
+        # both are None for layer 0 (StackMemory initialises to zeros then).
+        # Mutually exclusive with MUDD / DCA / AttnRes — those branches are
+        # all skipped when use_stacktrans=True (enforced in NeoLLMConfig).
+        if self.use_stacktrans and self.stack_memory is not None:
+            st_analysis = layer_analysis.stack if layer_analysis is not None else None
+            hidden_states, stack_state, stack_mask = self.stack_memory(
+                hidden_states, stack_state, stack_mask, analysis=st_analysis
+            )
+        # ── MUDD: unpack streams for Q/K/V/R (layer > 0 only) ────────────
+        # mudd_streams is a 4-tuple (xq, xk, xv, xr) when use_mudd=True and
+        # layer_idx > 0; None for layer 0 (standard residual there).
+        # xr replaces hidden_states as the residual throughout this layer.
+        # xq/xk/xv are the aggregated inputs for Q, K, V projections.
+        # When mudd_dense_type='l' (single stream), all four are equal.
+        # When mudd_sepln=True each stream has its own norm applied below.
+        mudd_xk = None
+        mudd_xv = None
+        if mudd_streams is not None:
+            xq_mudd, xk_mudd, xv_mudd, xr_mudd = mudd_streams
+            # Replace hidden_states with xr for residual connections
+            hidden_states = xr_mudd
+            # Norm K and V streams — use separate SeeDNorm if sepln, else
+            # they will share the main input_layernorm path via h_attn below
+            if self.mudd_k_norm is not None:
+                mudd_xk = self.lns_attn(self.mudd_k_norm(xk_mudd))
+                mudd_xv = self.lns_attn(self.mudd_v_norm(xv_mudd))
+            else:
+                # No sepln: K/V also go through the Q-path norm chain
+                mudd_xk = self.lns_attn(self.input_layernorm(xk_mudd))
+                mudd_xv = self.lns_attn(self.input_layernorm(xv_mudd))
+            # Override hidden_states for the Q path
+            hidden_states_for_attn = xq_mudd
+        else:
+            hidden_states_for_attn = hidden_states
+        # ── DCA: GRN-v3 depth-wise aggregation ───────────────────────────
+        # When active, runs the per-layer GRN on the k-selected depth stack
+        # to produce three independent aggregated streams (Q, K, V).
+        # xq replaces hidden_states as both the Q projection input AND the
+        # post-attention residual (DCA paper: residual = q_input).
+        # xk and xv go through separate SeeDNorm+LNS chains and are injected
+        # into NeoLLMAttention via the existing mudd_xk/mudd_xv parameters.
+        dca_residual = None
+        dca_a = layer_analysis.dca if layer_analysis is not None else None
+        if self.use_dca and dca_stack is not None:
+            xq, xk, xv = self.dca_grn(dca_stack, analysis=dca_a)
+            dca_residual             = xq
+            hidden_states_for_attn   = xq
+            # K and V streams: SeeDNorm + LNS before k_proj / v_proj
+            # (reuses the mudd_xk/mudd_xv injection path in NeoLLMAttention)
+            mudd_xk = self.lns_attn(self.dca_k_norm(xk))
+            mudd_xv = self.lns_attn(self.dca_v_norm(xv))
         # ── Attention Residuals: compute pre-attention input ──────────────
         # When active, the input to the attention sublayer is no longer the
         # raw hidden_states (accumulated residual) but a softmax-weighted
                 attn_res_sources, attn_res_partial, self.attn_res_query_attn,
                 ar_analysis, "attn",
             )
+            # ── Block boundary fires HERE — after pre-attn, before attn sublayer ──
+            # Paper pseudocode (Fig. 2) timing: the completed partial of the previous
+            # block is pushed to sources AFTER the pre-attn AttnRes call, so the first
+            # layer of a new block still sees the old partial as an intra-block source
+            # (no duplicate) and the new intra-block accumulation starts from zeros.
+            if self.layer_idx > 0 and self.layer_idx % self.attn_res_block_size == 0:
+                attn_res_sources.append(attn_res_partial)   # in-place; outer loop sees this
+                attn_res_partial = torch.zeros_like(attn_res_partial)  # fresh delta start
             residual_attn = attn_res_partial
         else:
+            h_attn = hidden_states_for_attn   # MUDD/DCA: xq stream or unchanged
+            # DCA: residual is xq (the GRN Q-stream output), not raw hidden_states
+            residual_attn = dca_residual if dca_residual is not None else hidden_states
         # ── Attention block ───────────────────────────────────────────────
         sn_pre = layer_analysis.seednorm_pre_attn if layer_analysis is not None else None
         if layer_analysis is not None:
             layer_analysis.lns_attn_output = h_lns.detach()
+        if self.is_linear_attn:
+            # ── GatedDeltaNet linear attention path ───────────────────────
+            # Does not use: first_layer_fan, mudd_xk/xv, attn_analysis.
+            # attention_mask here is already the linear_attn_mask (no causal
+            # bias, just padding) — NeoLLMModel.forward selects it per layer.
+            hidden_states = self.linear_attn(
+                hidden_states=h_lns,
+                attention_mask=attention_mask,
+                position_embeddings=position_embeddings,
+                repo_rope_args=repo_rope_args,
+            )
+            attn_weights          = None
+            self.current_layer_fan = None
+        else:
+            # ── Standard full attention path ──────────────────────────────
+            hidden_states, attn_weights, self.current_layer_fan = self.self_attn(
+                hidden_states=h_lns,
+                attention_mask=attention_mask,
+                position_embeddings=position_embeddings,
+                first_layer_fan=first_layer_fan,
+                attn_analysis=layer_analysis.attention if layer_analysis is not None else None,
+                repo_rope_args=repo_rope_args,
+                mudd_xk=mudd_xk,
+                mudd_xv=mudd_xv,
+                **kwargs,
+            )
         if layer_analysis is not None:
             layer_analysis.attn_contribution = hidden_states.detach()
         gpas_attn_a = layer_analysis.gpas_attn if layer_analysis is not None else None
+        # ── Attention residual sum ────────────────────────────────────────
+        # Standard: GPAS(residual_attn + hidden_states)
+        # LAuReL:   GPAS(LAuReL(f_out=hidden_states, x_res=residual_attn))
+        # Both paths feed into GPAS which applies stop-gradient scaling.
+        if self.use_laurel and self.laurel_attn is not None:
+            la_attn_a = layer_analysis.laurel_attn if layer_analysis is not None else None
+            combined_attn = self.laurel_attn(hidden_states, residual_attn, analysis=la_attn_a)
+        else:
+            combined_attn = residual_attn + hidden_states
+        h_tilde = self.gpas_attn(combined_attn, analysis=gpas_attn_a)
         if layer_analysis is not None:
             layer_analysis.h_tilde = h_tilde.detach()
         if layer_analysis is not None:
             layer_analysis.mlp_contribution = delta_m.detach()
+        # ── MLP residual sum ──────────────────────────────────────────────
+        # LAuReL treats f(x) = delta_m [+ delta_r when JTok-M active] and
+        # x_res = residual_mlp. JTok-M delta_r is additive alongside delta_m,
+        # so the nonlinear component is delta_m + delta_r in that path.
+        gpas_mlp_a = layer_analysis.gpas_mlp if layer_analysis is not None else None
         if self.use_jtokm and z_tilde is not None and B_vals is not None:
             orig_shape = h_tilde.shape
             h_flat     = h_tilde.reshape(-1, self.hidden_size)
             delta_r, aux_stats = self.jtokm(h_flat, z_flat, B_flat, analysis=jtokm_a)
             delta_r = delta_r.reshape(orig_shape)
+            f_mlp = delta_m + delta_r                          # combined nonlinear term
+            if self.use_laurel and self.laurel_mlp is not None:
+                la_mlp_a = layer_analysis.laurel_mlp if layer_analysis is not None else None
+                combined_mlp = self.laurel_mlp(f_mlp, residual_mlp, analysis=la_mlp_a)
+            else:
+                combined_mlp = residual_mlp + f_mlp
+            hidden_states = self.gpas_mlp(combined_mlp, analysis=gpas_mlp_a)
         else:
+            aux_stats = None
+            if self.use_laurel and self.laurel_mlp is not None:
+                la_mlp_a = layer_analysis.laurel_mlp if layer_analysis is not None else None
+                combined_mlp = self.laurel_mlp(delta_m, residual_mlp, analysis=la_mlp_a)
+            else:
+                combined_mlp = residual_mlp + delta_m
+            hidden_states = self.gpas_mlp(combined_mlp, analysis=gpas_mlp_a)
         if layer_analysis is not None:
             layer_analysis.hidden_states_output = hidden_states.detach()
             outputs += (aux_stats,)
         if versatile_aux is not None:
             outputs += (versatile_aux,)
+        # StackTrans: always append stack state (None, None when inactive)
+        # so NeoLLMModel.forward can extract them by position -2 and -1.
+        outputs += (stack_state, stack_mask)
         return outputs
             if hasattr(module, "alpha_ma"):
                 module.alpha_ma.zero_()
+        elif isinstance(module, NeoLLMGatedDeltaNet):
+            module.dt_bias.data.fill_(1.0)
+            module.A_log.data.uniform_(0, 16).log_()
         elif isinstance(module, GPAS):
             module.alpha.data.fill_(0.0)
                 module.attn_res_query_attn.data.zero_()
                 module.attn_res_query_mlp.data.zero_()
+        elif isinstance(module, StackMemory):
+            # Truncated-normal for all Linear weights (matches NeoLLM convention).
+            # Biases zeroed. res_weight starts at 1.0 so the stack readout
+            # contributes equally to the residual from step 0.
+            std      = getattr(self.config, "initializer_range", 0.02)
+            cutoff   = getattr(self.config, "init_cutoff_factor", 3.0) * std
+            for attr in ("down_proj", "up_proj", "action_head", "gate_proj"):
+                layer = getattr(module, attr, None)
+                if layer is not None and hasattr(layer, "weight"):
+                    nn.init.trunc_normal_(
+                        layer.weight, mean=0.0, std=std, a=-cutoff, b=cutoff
+                    )
+                    if layer.bias is not None:
+                        nn.init.zeros_(layer.bias)
+            if hasattr(module, "res_weight"):
+                module.res_weight.data.fill_(1.0)
+        elif isinstance(module, LAuReLLayer):
+            # RW: raw logits initialised to zero → softmax([0,0]) = [0.5, 0.5].
+            #     The model quickly learns the optimal α,β weighting.
+            # LR: lr_down (B, down-projection) — column orthogonal init,
+            #         as recommended by the LAuReL paper §3.3 for LLMs.
+            #         Column orthogonal preserves the L2 norm of the projected
+            #         representation, ensuring stable gradient magnitudes
+            #         through the low-rank bottleneck at init.
+            #     lr_up  (A, up-projection) — zero init → lr_term = A·Bx = 0
+            #         at step 0, so the module starts as a standard residual.
+            #         Gradient flows back through lr_down immediately via
+            #         chain rule; A learns from step 1 onward.
+            if hasattr(module, "rw_logits"):
+                nn.init.zeros_(module.rw_logits)
+            if hasattr(module, "lr_down"):
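+                # Column-orthogonal: each column of weight^T is orthonormal.
+                # For a weight with fewer rows than columns, nn.init.orthogonal_
+                # makes the rows orthonormal, so weight^T is already
+                # column-orthogonal — no explicit transpose is applied here.
+                # NOTE(review): assumes lr_down.weight is [rank, hidden] with
+                # rank < hidden — confirm against LAuReLLayer.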
+                nn.init.orthogonal_(module.lr_down.weight)
+            if hasattr(module, "lr_up"):
+                nn.init.zeros_(module.lr_up.weight)
         elif isinstance(module, SpellingBeeEmbedding):
             # byte_emb initialised identically to token embeddings: std=1/√d.
             # Ensures E[‖e_byte‖²] ≈ 1 at init, matching etok, so the
         self.gradient_checkpointing = False
         self.first_layer_fan        = None
+        # ── StackTrans state flag ─────────────────────────────────────────
+        self.use_stacktrans = getattr(config, 'use_stacktrans', False)
+        # ── Residual-replacement mutex ────────────────────────────────────
+        # AttnRes, MUDD, and DCA all replace the residual aggregation
+        # mechanism — at most one can be active at a time.
+        _use_mudd     = getattr(config, 'use_mudd', False)
+        _use_attn_res = getattr(config, 'use_attn_res', False)
+        _use_dca      = getattr(config, 'use_dca',      False)
+        _active_count = sum([_use_mudd, _use_attn_res, _use_dca])
+        if _active_count > 1:
+            active = [n for n, f in [('use_mudd', _use_mudd),
+                                     ('use_attn_res', _use_attn_res),
+                                     ('use_dca', _use_dca)] if f]
+            raise ValueError(
+                f"use_mudd, use_attn_res, and use_dca are mutually exclusive — "
+                f"got {active} simultaneously active. Set exactly one to True."
+            )
+        if _use_mudd:
+            _mudd_dense_type  = getattr(config, 'mudd_dense_type',  'qkvr')
+            _mudd_dynamic     = getattr(config, 'mudd_dynamic_dense', True)
+            _mudd_round64     = getattr(config, 'mudd_round64',      False)
+            _mudd_expand_last = getattr(config, 'mudd_expand_last',  False)
+            _C = 4 if _mudd_dense_type == 'qkvr' else 1
+            # Static bias: one [C, lidx+2] parameter per layer.
+            # Initialized with 1 at index [c, lidx+1] (identity on Xi) so that
+            # at init (W2=0) each DA output = Xi — reducing to standard Transformer.
+            _static_list = []
+            for lidx in range(config.num_hidden_layers):
+                # Last layer always uses C=1: its DA output is the final
+                # model representation fed to the norm and lm_head, collapsing
+                # all history into a single stream (paper code, both files).
+                _c = 1 if lidx == config.num_hidden_layers - 1 else _C
+                a = torch.zeros(_c, lidx + 2)
+                a[:, lidx + 1] = 1.0        # last entry = current layer = identity
+                _static_list.append(nn.Parameter(a))
+            self.mudd_static = nn.ParameterList(_static_list)
+            # Dynamic DA modules (one per layer)
+            if _mudd_dynamic:
+                self.mudd_dynamic = nn.ModuleList([
+                    NeoLLMMUDDModule(
+                        hidden_size = config.hidden_size,
+                        lidx        = lidx,
+                        # Last layer: C=1 — collapses to single final repr
+                        num_ways    = 1 if lidx == config.num_hidden_layers - 1 else _C,
+                        is_last     = (lidx == config.num_hidden_layers - 1),
+                        expand_last = _mudd_expand_last,
+                        round64     = _mudd_round64,
+                    )
+                    for lidx in range(config.num_hidden_layers)
+                ])
+            else:
+                self.mudd_dynamic = None
+        else:
+            self.mudd_static  = None
+            self.mudd_dynamic = None
+        # ── DCA final GRN (Heddes et al., 2025) ───────────────────────────
+        # Applied once after all decoder layers to aggregate the full depth
+        # stack into the final hidden representation before the output norm.
+        # num_stack_layers = min(2*k, L+1) — same cap as per-layer GRNs.
+        # num_outputs=1 collapses to a single [B, S, D] tensor.
+        if _use_dca and getattr(config, 'dca_use_final_grn', True):
+            _dca_k   = getattr(config, 'dca_k', 2)
+            _dca_eps = getattr(config, 'dca_grn_eps', config.rms_norm_eps)
+            self.dca_final_grn = NeoLLMGRN(
+                hidden_size      = config.hidden_size,
+                num_stack_layers = min(2 * _dca_k, config.num_hidden_layers + 1),
+                num_outputs      = 1,
+                eps              = _dca_eps,
+            )
+        else:
+            self.dca_final_grn = None
         self.post_init()
+    def _update_linear_attn_mask(
+        self,
+        attention_mask: Optional[torch.Tensor],
+    ) -> Optional[torch.Tensor]:
+        """
+        Return mask for GatedDeltaNet layers (no causal bias, padding only).
+        Returns None when all tokens are valid (GatedDeltaNet handles via
+        _apply_mask_to_padding_states internally).
+        """
+        if attention_mask is None:
+            return None
+        if torch.all(attention_mask == 1):
+            return None
+        return attention_mask
     def get_input_embeddings(self):
         if self.config.use_token_generator:
             return self.token_generator
             getattr(cfg, "use_repo", False)
             and layer_idx >= getattr(cfg, "repo_start_layer", cfg.num_hidden_layers // 3)
         )
+        _versatile      = getattr(cfg, "use_versatile_ffn", False)
+        _use_stacktrans = getattr(cfg, "use_stacktrans", False)
+        _use_laurel     = getattr(cfg, "use_laurel", False)
         return LayerAnalysis(
             seednorm_pre_attn  = SeeDNormAnalysis(),
             seednorm_post_attn = SeeDNormAnalysis(),
                 polynorm = PolyNormAnalysis() if not _versatile else None,
                 versatile = VersatileFFNAnalysis() if _versatile else None,
             ),
+            gpas_attn    = GPASAnalysis(),
+            gpas_mlp     = GPASAnalysis(),
+            jtokm        = JTokMAnalysis() if cfg.use_jtokm else None,
+            attn_res     = AttnResAnalysis() if getattr(cfg, "use_attn_res", False) else None,
+            dca          = DCAAnalysis()      if getattr(cfg, "use_dca",      False) else None,
+            stack        = StackMemoryAnalysis() if _use_stacktrans else None,
+            laurel_attn  = LAuReLAnalysis()   if _use_laurel else None,
+            laurel_mlp   = LAuReLAnalysis()   if _use_laurel else None,
         )
     def forward(
             if getattr(self.config, "use_repo", False) else None
         )
+        # ── Linear attention mask ──────────────────────────────────────────
+        # Computed once; each layer picks the appropriate mask below.
+        linear_attn_mask = self._update_linear_attn_mask(attention_mask)
         # ── Attention Residuals state ──────────────────────────────────────
         # Full AttnRes (attn_res_num_blocks=0): sources grows by one entry per
         # decoder layer — all previous outputs are kept, max N=num_layers+1.
         if use_attn_res:
             attn_res_sources = [hidden_states]   # b_0 = token embedding
             attn_res_partial = hidden_states     # initial partial sum
+            # Block boundary handling now lives inside NeoLLMDecoderLayer.forward(),
+            # firing after the pre-attn AttnRes call (paper Fig. 2 timing).
+        # ── MUDD state ────────────────────────────────────────────────────
+        # hiddens[0] = token embedding; hiddens[i] = output of layer i-1.
+        # After each layer, its output is appended, so mudd_hiddens holds i+1
+        # entries when layer i runs (embedding + i preceding layer outputs)
+        # and i+2 once layer i's own output has been appended.
+        # mudd_streams is None for layer 0 (standard residual path there)
+        # and a C-tuple of [B,T,D] tensors for layers 1…L.
+        use_mudd = getattr(self.config, 'use_mudd', False)
+        mudd_hiddens = None
+        mudd_streams = None
+        if use_mudd:
+            mudd_hiddens = [hidden_states]   # b_0 = token embedding
+        # ── DCA state ─────────────────────────────────────────────────────
+        # all_tokens[0] = token embedding; grows by one per decoder layer.
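+        # Before each layer, the stack is built and k-DCA selection applied,
+        # capping the stack handed to the layer at 2*dca_k tensors regardless
+        # of depth (dca_all_tokens itself still keeps every layer output).
+        # When use_dca=True, dca_stack is non-None for every layer (even layer
+        # 0 gets [embedding]); when use_dca=False it stays None.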
+        use_dca   = getattr(self.config, 'use_dca', False)
+        _dca_k    = getattr(self.config, 'dca_k', 2)
+        dca_all_tokens = None
+        dca_stack      = None
+        if use_dca:
+            dca_all_tokens = [hidden_states]   # [embedding]
+        # ── StackTrans state ──────────────────────────────────────────────
+        # stack_state / stack_mask start as None for the first layer;
+        # StackMemory initialises them to zeros internally on first call.
+        # After each layer, the returned (new_stack, new_mask) are passed
+        # to the next layer as its initial stack — this is "vertical" state
+        # propagation: information flows depth-wise through the stack.
+        #
+        # Temporal accumulation across generation steps is handled by the
+        # StackMemory internal k_cache / action_cache mechanism:
+        #   - enable_cache is set True when use_cache=True (inference)
+        #   - reset_cache() is called when past_key_values is None
+        #     (new sequence, not a continuation step)
+        # This matches the OLMo reference implementation exactly.
+        use_stacktrans = self.use_stacktrans
+        stack_state    = None
+        stack_mask     = None
+        if use_stacktrans:
+            use_cache_flag = kwargs.get("use_cache", False)
+            past_kv_flag   = kwargs.get("past_key_values", None)
+            for layer in self.layers:
+                if layer.stack_memory is not None:
+                    layer.stack_memory.enable_cache = bool(use_cache_flag)
+                    if past_kv_flag is None:
+                        layer.stack_memory.reset_cache()
         # Pre-allocate per-layer analysis list when analysis is active
         if analysis_state is not None:
             if output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
+            # ── DCA: build k-selected stack for this layer ───────────────
+            # Stack has layer_idx+1 entries before selection; after k-DCA
+            # selection it has at most 2*dca_k entries (first k + last k).
+            if use_dca:
+                dca_stack = dca_select_layers(
+                    torch.stack(dca_all_tokens, dim=0), k=_dca_k
+                )
             # Build per-layer analysis container (only in eval + analysis mode)
             layer_analysis = None
                 layer_analysis.layer_idx = layer_idx
                 analysis_state.layers.append(layer_analysis)
+            # Select the appropriate mask: causal for full attention,
+            # padding-only for GatedDeltaNet linear attention.
+            _layer_mask = (
+                linear_attn_mask
+                if getattr(decoder_layer, "is_linear_attn", False)
+                else causal_mask
+            )
             layer_outputs = decoder_layer(
                 hidden_states,
                 position_embeddings=position_embeddings,
+                attention_mask=_layer_mask,
                 first_layer_fan=self.first_layer_fan,
                 z_tilde=z_tilde,
                 B_vals=B_vals,
                 attn_res_sources=attn_res_sources,
                 attn_res_partial=attn_res_partial if use_attn_res else None,
+                mudd_streams=mudd_streams,
+                dca_stack=dca_stack,
+                stack_state=stack_state,
+                stack_mask=stack_mask,
                 layer_analysis=layer_analysis,
                 output_attentions=output_attentions,
                 repo_rope_args=repo_rope_args,
             )
             hidden_states = layer_outputs[0]
+            # ── StackTrans: extract updated stack state for next layer ─────
+            # layer_outputs always ends with (stack_state, stack_mask) —
+            # both are None when use_stacktrans=False (zero cost).
+            stack_state = layer_outputs[-2]
+            stack_mask  = layer_outputs[-1]
             # Update AttnRes partial sum — the new partial is the layer output
             if use_attn_res:
                 attn_res_partial = hidden_states
+            # Append layer output to DCA history for next layer's stack
+            if use_dca:
+                dca_all_tokens.append(hidden_states)
+            # ── MUDD: append current output and compute DA for next layer ──
+            # mudd_hiddens grows by 1 each iteration; after layer i's append it has
+            # i+2 entries (embedding + i+1 outputs). The DA for layer i+1 takes this
+            # full history and produces C streams via dynamic + static weights.
+            # mudd_streams is passed to layer i+1 as its input streams.
+            if use_mudd:
+                mudd_hiddens.append(hidden_states)
+                # Compute DA module output using the just-appended history
+                # (mudd_hiddens now has layer_idx+2 entries)
+                is_last_layer = (layer_idx == self.config.num_hidden_layers - 1)
+                mudd_da_module = self.mudd_dynamic[layer_idx] if self.mudd_dynamic is not None else None
+                if mudd_da_module is not None:
+                    raw_streams = mudd_da_module(
+                        hidden_states,
+                        mudd_hiddens,
+                        self.mudd_static[layer_idx],
+                    )
+                else:
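+                    # Static-only: apply weighted sum with learnable bias —
+                    # stack the history to [L, B, T, D] and weight it by mudd_static.
+                    # NOTE(review): the einsum below already contracts both `c` and
+                    # `l` and yields [B, T, D]; the trailing .squeeze(0) would then
+                    # drop the batch dim whenever B == 1 — confirm and remove.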
+                    stacked = torch.stack(mudd_hiddens, dim=0)    # [L, B, T, D]
+                    a = self.mudd_static[layer_idx].to(hidden_states.dtype)  # [C, L]
+                    raw_streams = tuple(
+                        torch.einsum('cl,lbtd->btd', a[c:c+1], stacked).squeeze(0)
+                        for c in range(a.shape[0])
+                    )
+                if is_last_layer:
+                    # Last layer DA always produces C=1 → single final repr.
+                    # This is the MUDD-aggregated combination of all layer
+                    # histories weighted by the last layer's output as query.
+                    # Replace hidden_states so the final norm and lm_head
+                    # receive this aggregated representation, not the raw
+                    # last-layer output (paper forward loop: x = x[0] after loop).
+                    hidden_states = raw_streams[0]
+                    mudd_streams = None   # no next layer
+                elif len(raw_streams) == 1:
+                    # dense_type='l': broadcast to 4-tuple
+                    mudd_streams = (raw_streams[0],) * 4
+                else:
+                    # 'qkvr': 4 streams → (xq, xk, xv, xr)
+                    mudd_streams = raw_streams
             if output_attentions:
                 all_attentions = all_attentions + (layer_outputs[1],)
+            # Collect JTok-M / VersatileFFN aux stats.
+            # layer_outputs always ends with (stack_state, stack_mask) —
+            # slice [1:-2] to skip layer_outputs[0] (hidden_states) and the two stack slots.
+            inner_outputs = layer_outputs[1:-2]
+            if self.config.use_jtokm and len(inner_outputs) > (1 if output_attentions else 0):
+                all_aux_stats.append(inner_outputs[-1])
             if getattr(self.config, "use_versatile_ffn", False):
+                for item in inner_outputs:
                     if isinstance(item, tuple) and len(item) == 3:
                         all_aux_stats.append(("versatile", item))
                         break
                     and hasattr(decoder_layer, "current_layer_fan")):
                 self.first_layer_fan = decoder_layer.current_layer_fan
+        # ── DCA final GRN ──────────────────────────────────────────────────
+        # Aggregates the full depth history (k-selected) into the final
+        # hidden representation, matching the DCAGPT forward loop which
+        # applies final_grn(stack(all_tokens)) before norm → lm_head.
+        if use_dca and self.dca_final_grn is not None:
+            final_stack   = dca_select_layers(
+                torch.stack(dca_all_tokens, dim=0), k=_dca_k
+            )
+            hidden_states = self.dca_final_grn(final_stack)
         hidden_states = self.norm(hidden_states)
         if output_hidden_states:
             analysis_state.attn_res_sources_final  = (
                 attn_res_sources if use_attn_res else None
             )
+            analysis_state.dca_all_tokens_final    = (
+                dca_all_tokens if use_dca else None
+            )
         if not return_dict:
             return tuple(
             layers                 = None,   # filled by NeoLLMModel.forward
             jtokm_aux_stats        = [] if cfg.use_jtokm else None,
             attn_res_sources_final = [] if getattr(cfg, "use_attn_res", False) else None,
+            dca_all_tokens_final   = [] if getattr(cfg, "use_dca",      False) else None,
         )
     # ── Standard model API ────────────────────────────────────────────────
 # ==================== AUTOMODEL REGISTRATION ====================
 # Public API of this module: the names exported by a star-import.
 # Every name listed here must be defined at module level; a missing
 # definition makes `from <module> import *` raise AttributeError.
 __all__ = [
+    "NeoLLMGatedDeltaNet",
+    "StackMemory",
+    "LAuReLLayer",
+    "NeoLLMMUDDModule",
+    "NeoLLMGRN",
+    "dca_select_layers",
     "NeoLLMForCausalLM",
     "NeoLLMModel",
     "NeoLLMPreTrainedModel",
     "REPOModule",
     "VersatileFFN",
     "compute_versatile_aux_loss",
+    # Analysis dataclasses
     "AnalysisState",
     "LayerAnalysis",
     "AttentionAnalysis",
     "VersatileFFNAnalysis",
     "JTokMAnalysis",
     "AttnResAnalysis",
+    "DCAAnalysis",
+    "StackMemoryAnalysis",
+    "LAuReLAnalysis",
     "GeneratorAnalysis",
 ]