KitsuVp
/

NeoLLM

@@ -80,7 +80,7 @@ from transformers.utils import TransformersKwargs, logging
 from configuration_neollm import NeoLLMConfig
 from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
 logger = logging.get_logger(__name__)
@@ -339,22 +339,6 @@ class JTokMAnalysis:
     lns_scale:        Optional[float] = None          # 1/√(2ℓ) scaling factor
-@dataclass
-class DCAAnalysis:
-    """
-    GRN-v3 depth-wise aggregate weights from a DeepCrossAttention layer.
-    Only populated when use_dca=True.
-    grn_depth_weights: softmax-free aggregate scalars used to weight each
-        source layer, shape [3, y, B, S] where 3 = Q/K/V streams,
-        y = selected stack depth (at most 2*dca_k), B = batch, S = seq.
-        These are the per-position, per-layer scalars *before* adding the
-        static bias — useful to see which layers the dynamic component
-        selectively suppresses (ReLU zeros out negative entries).
-    """
-    grn_depth_weights: Optional[torch.Tensor] = None  # [3, y, B, S]
 @dataclass
 class AttnResAnalysis:
     """
@@ -366,78 +350,6 @@ class AttnResAnalysis:
     sources_count:    Optional[int] = None            # number of sources including partial
-@dataclass
-class StackMemoryAnalysis:
-    """
-    Internals of a StackMemory forward pass.
-    Only populated when use_stacktrans=True AND model is in eval + analysis mode.
-    Reference: Zhang, K. et al. (NeurIPS 2025). "Recursive Transformer:
-    Boosting Reasoning Ability with State Stack."
-    action_probs:   softmax distribution [push, pop, no-op] per head and
-                    token position. Shape [B, S, H, 3]. Visualising this
-                    across layers reveals the push-heavy early layers and
-                    pop-heavy later layers described in the paper (§B.2).
-    stack_in:       stack state entering this layer (the output of the
-                    previous layer's StackMemory). Shape [B, H, slots, ds].
-                    None for layer 0 (starts as all-zeros).
-    stack_out:      updated stack state after processing this sequence.
-                    Shape [B, H, slots, ds]. This is new_stack[:, -1] —
-                    the stack at the final sequence position, passed to
-                    the next layer as stack_in.
-    mask_out:       validity mask for stack_out. Shape [B, H, slots].
-                    Values near 1 indicate active slots; near 0 = empty.
-    gate_weights:   softmax attention weights used for global reading.
-                    Shape [B, S, H, slots]. High weight on slot i at
-                    position t means the model retrieved from slot i there.
-    memory_output:  weighted stack readout before up_proj.
-                    Shape [B, S, stack_d_model].
-    residual_scale: value of the learnable res_weight scalar at this step.
-    """
-    action_probs:   Optional[torch.Tensor] = None  # [B,S,H,3]
-    stack_in:       Optional[torch.Tensor] = None  # [B,H,slots,ds] entering layer
-    stack_out:      Optional[torch.Tensor] = None  # [B,H,slots,ds] leaving layer
-    mask_out:       Optional[torch.Tensor] = None  # [B,H,slots]
-    gate_weights:   Optional[torch.Tensor] = None  # [B,S,H,slots]
-    memory_output:  Optional[torch.Tensor] = None  # [B,S,stack_d_model]
-    residual_scale: Optional[float]        = None  # res_weight scalar
-@dataclass
-class LAuReLAnalysis:
-    """
-    Internals of one LAuReL residual connection forward pass.
-    Only populated when use_laurel=True AND model is in eval + analysis mode.
-    Instantiated twice per layer: once for the attention residual, once for MLP.
-    Reference: Menghani, G., Kumar, R. & Kumar, S. (ICML 2025).
-    *LAuReL: Learned Augmented Residual Layer.* arXiv:2411.07501.
-    Math (combined RW+LR, both sub-variants active):
-        x_{i+1} = α · f(x_i) + β · (A·(B·x_i) + x_i)
-    where [α, β] = softmax([a, b]), a,b ∈ ℝ learnable (RW component),
-    B ∈ ℝ^{r×D} column-orthogonal init, A ∈ ℝ^{D×r} zero init (LR component).
-    At step 0: A=0 → lr_term=0, so x_{i+1} = 0.5·f(x) + 0.5·x_i (RW only)
-    or x_{i+1} = f(x_i) + x_i (LR only, standard residual).
-    Fields:
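-        alpha_rw:    α = softmax([a, b])[0] — weight on f(x_i). [scalar float]
-                     None when use_laurel_rw=False.
-        beta_rw:     β = softmax([a, b])[1] — weight on the residual branch
-                     g(x_i), i.e. x_i plus A·(B·x_i) when the LR component is
-                     active. [scalar float]
-                     None when use_laurel_rw=False.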
-        lr_term:     A·(B·x_res) — the low-rank residual augmentation.
-                     Shape [B, S, D]. Zero at init. None when use_laurel_lr=False.
-        output:      Final combined tensor before GPAS. Shape [B, S, D].
-    """
-    alpha_rw: Optional[float]        = None  # softmax weight on f(x)
-    beta_rw:  Optional[float]        = None  # softmax weight on g(x)
-    lr_term:  Optional[torch.Tensor] = None  # A(Bx) low-rank augmentation [B,S,D]
-    output:   Optional[torch.Tensor] = None  # combined pre-GPAS [B,S,D]
 @dataclass
 class LayerAnalysis:
     """
@@ -466,12 +378,8 @@ class LayerAnalysis:
     gpas_mlp:           Optional[GPASAnalysis] = None      # GPAS after MLP residual
     # Optional components (None when inactive)
-    jtokm:       Optional[JTokMAnalysis]       = None  # if use_jtokm
-    attn_res:    Optional[AttnResAnalysis]     = None  # if use_attn_res
-    dca:         Optional[DCAAnalysis]         = None  # if use_dca
-    stack:       Optional[StackMemoryAnalysis] = None  # if use_stacktrans
-    laurel_attn: Optional[LAuReLAnalysis]      = None  # if use_laurel (attention residual)
-    laurel_mlp:  Optional[LAuReLAnalysis]      = None  # if use_laurel (MLP residual)
 @dataclass
@@ -536,7 +444,6 @@ class AnalysisState:
     layers:                 Optional[List[LayerAnalysis]] = None
     jtokm_aux_stats:        Optional[list] = None
     attn_res_sources_final: Optional[list] = None
-    dca_all_tokens_final:   Optional[list] = None
     logits:                 Optional[torch.Tensor] = None
 class ScalarMultiplier(nn.Module):
@@ -2456,8 +2363,6 @@ class NeoLLMAttention(nn.Module):
         first_layer_fan: Optional[torch.Tensor] = None,
         attn_analysis: Optional[AttentionAnalysis] = None,
         repo_rope_args: Optional[Tuple[torch.Tensor, float]] = None,
-        mudd_xk: Optional[torch.Tensor] = None,
-        mudd_xv: Optional[torch.Tensor] = None,
         **kwargs: Unpack[FlashAttentionKwargs],
     ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
         input_shape = hidden_states.shape[:-1]
@@ -2468,14 +2373,6 @@ class NeoLLMAttention(nn.Module):
             h_fan = self.lambda_1 * first_layer_fan + self.lambda_2 * h_fan
         current_layer_fan = h_fan.clone()
-        # ── MUDD: separate K/V FAN paths ─────────────────────────────────
-        # When mudd_xk/mudd_xv are provided (MUDD qkvr mode), they have already
-        # been normalized by the decoder layer's K/V norm chain. Here they go
-        # through their own FAN transform before k_proj/v_proj, keeping the
-        # FANformer periodicity modeling orthogonally intact per stream.
-        h_fan_k = self.fan_layer(mudd_xk) if mudd_xk is not None else h_fan
-        h_fan_v = self.fan_layer(mudd_xv) if mudd_xv is not None else h_fan
         query_shape = (*input_shape, self.config.num_attention_heads, self.head_dim)
         kv_shape    = (*input_shape, self.num_mea_component_heads, self.head_dim)
@@ -2490,8 +2387,8 @@ class NeoLLMAttention(nn.Module):
             attn_analysis.gate_raw = gate.detach()
         q = self.q_norm(q_raw.view(query_shape)).transpose(1, 2)
-        k = self.k_norm(self.k_proj(h_fan_k).view(kv_shape)).transpose(1, 2)
-        v = self.v_proj(h_fan_v).view(kv_shape).transpose(1, 2)
         if attn_analysis is not None:
             attn_analysis.q_post_norm = q.detach()
@@ -3168,651 +3065,6 @@ class NeoLLMMLP(nn.Module):
         return result
-class NeoLLMMUDDModule(nn.Module):
-    """
-    Multiway Dynamic Dense (MUDD) Depth-wise Aggregate module.
-    Generates per-position, per-stream connection weights over all preceding
-    layer outputs (and the token embedding) and produces up to C=4 aggregated
-    streams (Q, K, V, R) for the next Transformer block.
-    Architecture (Xiao et al., 2025, arXiv:2502.12170):
-        dw = GELU(RMSNorm(x) @ W1) @ W2 + a   # [B, T, C*(lidx+2)]
-        dw = reshape to [C, B, T, (lidx+2)]
-        stream_c = Σ_j dw[c, :, :, j] * hiddens[j]   for c in range(C)
-    W1 ~ N(0, 1/D), W2 = 0, a = identity on last index → reduces to standard
-    Transformer at init (dynamic part is zero, static bias selects Xi).
-    Args:
-        hidden_size: model dimension D
-        lidx:        layer index (0-based); history has lidx+2 entries
-        num_ways:    C, number of output streams (4 for "qkvr", 1 for "l")
-        is_last:     whether this is the last layer (controls expand_last)
-        expand_last: multiply hid_dim by 4 for the final layer's DA module
-        round64:     round hid_dim up to the next multiple of 64 (a value that
-                     is already a multiple of 64 is still bumped by 64)
-    """
-    def __init__(
-        self,
-        hidden_size: int,
-        lidx: int,
-        num_ways: int = 4,
-        is_last: bool = False,
-        expand_last: bool = False,
-        round64: bool = False,
-    ) -> None:
-        super().__init__()
-        self.lidx     = lidx
-        self.num_ways = num_ways
-        l             = lidx + 2           # history length: embedding + (lidx + 1) layer outputs
-        hid_dim       = l * num_ways
-        out_dim       = l * num_ways
-        if is_last and expand_last:
-            hid_dim *= 4
-        if round64:
-            hid_dim = (hid_dim // 64 + 1) * 64
-        # RMSNorm without learnable scale (paper uses RMSnormNoscale)
-        self.norm = nn.RMSNorm(hidden_size, elementwise_affine=False,
-                               eps=1e-6)
-        self.w1   = nn.Linear(hidden_size, hid_dim, bias=False)
-        self.act  = nn.GELU()
-        self.w2   = nn.Linear(hid_dim, out_dim, bias=False)
-        self._reset_mudd_parameters(hidden_size)
-    def _reset_mudd_parameters(self, D: int) -> None:
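-        # W1 is drawn with std = 1/D; W2 = 0 → dynamic part starts at zero.
-        # NOTE(review): if the intended N(0, 1/D) denotes a variance, std should be D ** -0.5 — confirm.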
-        nn.init.normal_(self.w1.weight, mean=0.0, std=1.0 / D)
-        nn.init.zeros_(self.w2.weight)
-    def forward(
-        self,
-        x: torch.Tensor,                # [B, T, D] — current layer output (Xi)
-        hiddens: list,                   # list of lidx+2 tensors [B, T, D]
-        static_bias: torch.Tensor,       # [C, lidx+2] — learnable static prior
-    ) -> tuple:
-        """
-        Returns:
-            Tuple of num_ways tensors, each [B, T, D] — the aggregated streams.
-        """
-        B, T, D = x.shape
-        # Dynamic weight generation: [B, T, C*(lidx+2)]
-        dw = self.w2(self.act(self.w1(self.norm(x))))
-        # Add static bias (broadcast over B and T)
-        # static_bias: [C, L] → [1, 1, C*L] via reshape
-        C, L = static_bias.shape
-        dw = dw + static_bias.reshape(1, 1, C * L).to(dw.dtype)
-        # Reshape to [C, B, T, L]
-        dw = dw.view(B, T, C, L).permute(2, 0, 1, 3)   # [C, B, T, L]
-        # Stack history: [L, B, T, D]
-        stacked = torch.stack(hiddens, dim=0)            # [L, B, T, D]
-        # Aggregate: Σ_j dw[c, :, :, j] * hiddens[j]
-        # einsum "cbtl, lbtd -> cbtd"
-        streams = torch.einsum('cbtl,lbtd->cbtd', dw, stacked)  # [C, B, T, D]
-        return tuple(streams[c] for c in range(C))
-def dca_select_layers(stacked: torch.Tensor, k: int) -> torch.Tensor:
-    """
-    k-DCA layer selection (Heddes et al., 2025, §3.1).
-    Keeps only the first k and last k tensors from the depth stack,
-    capping memory at 2k layer representations regardless of depth.
-    When the stack has <= 2k entries all are kept (early layers).
-    Args:
-        stacked: [y, B, S, D] — stack of all layer outputs so far.
-        k:       number of first/last layers to retain.
-    Returns:
-        [min(y, 2k), B, S, D]
-    """
-    y = stacked.shape[0]
-    if y <= k * 2:
-        return stacked
-    return torch.cat([stacked[:k], stacked[-k:]], dim=0)
-class NeoLLMGRN(nn.Module):
-    """
-    Generalized Residual Network v3 (GRN-v3) from DeepCrossAttention
-    (Heddes et al., 2025, arXiv:2502.06785, §3.1).
-    Produces `num_outputs` aggregated streams from a depth-wise stack of
-    layer representations. Weights are simultaneously:
-    - **Input-dependent** (dynamic): a two-layer mapping
-      ``w̄ = ReLU(RMSNorm(G) @ W)`` produces one scalar per
-      (output-stream, depth-position, batch-token). ``W`` is initialized
-      to zero so the dynamic contribution starts neutral.
-    - **Dimension-dependent** (static): a learnable bias ``b`` of shape
-      ``[num_outputs, num_stack_layers, hidden_size]`` initialized to ones
-      provides a per-dimension, per-layer prior. At initialization the
-      dynamic part is zero and every static weight is 1, so each output is
-      the equal-weight (unnormalised) sum over all stack entries, not a mean.
-    The combined weight for output stream ``o``, stack position ``y``,
-    batch ``b``, token ``n``, feature ``d`` is::
-        weight[o, y, b, n, d] = ReLU(dynamic[y, b, n, o]) + bias[o, y, d]
-    Output ``o`` is then the weighted sum over depth::
-        out[o, b, n, d] = Σ_y  stack[y, b, n, d] * weight[o, y, b, n, d]
-    Reference:
-        Heddes, M. et al. (2025). *DeepCrossAttention: Supercharging
-        Transformer Residual Connections.* arXiv:2502.06785.
-    Args:
-        hidden_size:      model dimension D.
-        num_stack_layers: number of depth entries this GRN will receive
-                          (= min(layer_idx+1, 2*dca_k)).
-        num_outputs:      number of output streams (3 for DCA Q/K/V,
-                          1 for the final aggregation GRN).
-        eps:              epsilon for the internal RMSNorm.
-    """
-    def __init__(
-        self,
-        hidden_size: int,
-        num_stack_layers: int,
-        num_outputs: int = 3,
-        eps: float = 1e-6,
-    ) -> None:
-        super().__init__()
-        self.num_outputs      = num_outputs
-        self.num_stack_layers = num_stack_layers
-        # Dynamic component: RMSNorm(no scale) → Linear → ReLU
-        # Linear maps D → num_outputs; init zeros so dynamic part = 0 at step 0.
-        _linear = nn.Linear(hidden_size, num_outputs, bias=False)
-        nn.init.zeros_(_linear.weight)
-        self.norm_noscale = nn.RMSNorm(
-            hidden_size, eps=eps, elementwise_affine=False
-        )
-        self.to_dynamic   = nn.Sequential(_linear, nn.ReLU())
-        # Static bias: [num_outputs, num_stack_layers, hidden_size], init ones.
-        # At init: weight = 0 + bias = 1 per entry → equal-weight (unnormalised) sum over depth.
-        self.bias = nn.Parameter(
-            torch.ones(num_outputs, num_stack_layers, hidden_size)
-        )
-    def forward(
-        self,
-        stack: torch.Tensor,
-        analysis: Optional["DCAAnalysis"] = None,
-    ) -> tuple:
-        """
-        Args:
-            stack:    [y, B, S, D] — selected depth stack (y ≤ 2*dca_k).
-            analysis: optional DCAAnalysis to deposit grn_depth_weights.
-        Returns:
-            Tuple of num_outputs tensors each [B, S, D].
-            When num_outputs=1 returns a single [B, S, D] tensor directly.
-        """
-        y, B, S, D = stack.shape
-        assert y == self.num_stack_layers, (
-            f"NeoLLMGRN expected stack depth {self.num_stack_layers}, got {y}"
-        )
-        # Dynamic aggregate: [y, B, S, D] → norm → [y, B, S, D]
-        #   → to_dynamic → [y, B, S, num_outputs]
-        #   → permute → [num_outputs, y, B, S]
-        normed   = self.norm_noscale(stack)                   # [y, B, S, D]
-        dynamic  = self.to_dynamic(normed)                    # [y, B, S, num_outputs]
-        dynamic  = dynamic.permute(3, 0, 1, 2)               # [o, y, B, S]
-        if analysis is not None:
-            analysis.grn_depth_weights = dynamic.detach()
-        # Combined weight: dynamic scalar + static bias per dimension
-        # dynamic:  [o, y, B, S]     → [o, y, B, S, 1]
-        # bias:     [o, y, D]        → [o, y, 1, 1, D]
-        weights = dynamic.unsqueeze(-1) + self.bias.unsqueeze(2).unsqueeze(3)
-        # weights: [o, y, B, S, D]
-        # Weighted depth-sum: Σ_y stack[y] * weights[o, y]
-        # stack: [y, B, S, D] → [1, y, B, S, D]
-        output = (stack.unsqueeze(0) * weights).sum(dim=1)   # [o, B, S, D]
-        if self.num_outputs == 1:
-            return output.squeeze(0)                          # [B, S, D]
-        return tuple(output[i] for i in range(self.num_outputs))
-class StackMemory(nn.Module):
-    """
-    Differentiable multi-head hidden-state stack for NeoLLM.
-    Implements the StackTrans module from Zhang et al. (NeurIPS 2025):
-    "Recursive Transformer: Boosting Reasoning Ability with State Stack."
-    Architecture (one forward call, covering the full sequence in parallel):
-        1. down_proj  : [B,S,D] → [B,S,stack_d_model]
-        2. action_head: → [B,S,H,3] softmax (push / pop / no-op)
-        3. k_values   : reshape to [B,S,H,ds]
-        4. _vectorized_update: applies soft push/pop/no-op to each
-           (batch, head) stack in parallel across the sequence dim.
-           This is the training-parallelism approximation from §3.3:
-           every token sees the *same* initial stack, breaking strict
-           temporal ordering within a sequence in exchange for full
-           data-parallelism. Cross-token memory is recovered during
-           autoregressive generation via the step() / enable_cache path.
-        5. gate_proj  : global read — softmax over all stack slots
-           (paper §3.1: "query-over-stack attention"), masked by the
-           validity mask. Returns weighted sum of the stack.
-        6. up_proj    : [B,S,stack_d_model] → [B,S,D]
-        7. residual   : output = up_proj_out * res_weight + hidden_states
-    Vertical passing (layer-to-layer):
-        Returns new_stack[:, -1] and new_mask[:, -1] — the stack state
-        at the last sequence position — which becomes the initial stack
-        for the next decoder layer. This propagates hierarchical context
-        depth-wise through the network.
-    Temporal accumulation (generation):
-        During autoregressive decoding, enable_cache=True and step() is
-        used: k_cache and action_cache store previous-token values so the
-        update equation integrates the full generated history rather than
-        starting from zeros each step.
-    Args:
-        config: NeoLLMConfig instance. Reads:
-            stacktrans_num_heads     (H, number of stack heads)
-            stacktrans_stack_slots   (slots, stack depth)
-            stacktrans_stack_d_model (H×ds, low-rank dimension)
-            stacktrans_forward_bs    (batch size for cache buffers)
-    """
-    def __init__(self, config: NeoLLMConfig):
-        super().__init__()
-        self.num_stack_heads  = config.stacktrans_num_heads
-        self.stack_slots      = config.stacktrans_stack_slots
-        self.stack_d_model    = config.stacktrans_stack_d_model
-        self.head_dim         = self.stack_d_model // self.num_stack_heads
-        # Dimension reduction / expansion (standard nn.Linear, no multipliers —
-        # StackMemory is architecturally independent per the paper §A)
-        self.down_proj   = nn.Linear(config.hidden_size, self.stack_d_model, bias=True)
-        self.up_proj     = nn.Linear(self.stack_d_model, config.hidden_size, bias=True)
-        # Action prediction: push / pop / no-op probabilities, one triple per head
-        self.action_head = nn.Linear(self.stack_d_model, 3 * self.num_stack_heads, bias=True)
-        # Global read query: one scalar gate per stack slot per head
-        self.gate_proj   = nn.Linear(self.head_dim, 1, bias=True)
-        # Learnable scalar gate applied to the projected stack readout
-        # (paper writes h'_t = g_h·h_t + R_t; here the scalar multiplies R_t instead)
-        self.res_weight  = nn.Parameter(torch.ones(1))
-        # ── Autoregressive generation cache ──────────────────────────────
-        # k_cache and action_cache hold per-token values from previous steps
-        # so step() can reconstruct the full sequence history. Only used when
-        # enable_cache=True (set by NeoLLMModel.forward when use_cache=True).
-        _fbs = getattr(config, "stacktrans_forward_bs", 1)
-        _cs  = getattr(config, "cache_size", 2048)
-        self.register_buffer(
-            "k_cache",
-            torch.zeros(_fbs, _cs, self.num_stack_heads, self.head_dim),
-        )
-        self.register_buffer(
-            "action_cache",
-            torch.zeros(_fbs, _cs, self.num_stack_heads, 3),
-        )
-        self.cache_position = 0
-        self.enable_cache   = False
-    # ── Cache helpers ─────────────────────────────────────────────────────
-    def reset_cache(self) -> None:
-        self.cache_position = 0
-    def _update_cache(
-        self,
-        k_values: torch.Tensor,   # [B,S,H,ds] detached
-        actions:  torch.Tensor,   # [B,S,H,3]  detached
-    ) -> None:
-        seq_len = k_values.shape[1]
-        if self.cache_position + seq_len <= self.k_cache.shape[1]:
-            self.k_cache  [:, self.cache_position:self.cache_position + seq_len] = k_values
-            self.action_cache[:, self.cache_position:self.cache_position + seq_len] = actions
-            self.cache_position += seq_len
-        else:
-            self.reset_cache()
-    # ── Core stack update ─────────────────────────────────────────────────
-    def _vectorized_update(
-        self,
-        stack:    torch.Tensor,  # [B,   H, slots, ds]  (4-D) or [B,S,H,slots,ds] (5-D)
-        mask:     torch.Tensor,  # [B,   H, slots]      (3-D) or [B,S,H,slots]    (4-D)
-        actions:  torch.Tensor,  # [B, S, H, 3]
-        k_values: torch.Tensor,  # [B, S, H, ds]
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        Vectorized soft push/pop/no-op stack update.
-        Every token position receives the *same* initial stack (the one
-        passed in from the previous layer), and operations are applied in
-        parallel across S.  This is the §3.3 training-parallelism
-        approximation: strict sequential dependency within a sequence is
-        broken intentionally to allow full batch processing.
-        Returns:
-            new_stack [B, S, H, slots, ds]
-            new_mask  [B, S, H, slots]
-        """
-        batch_size, seq_len = actions.shape[:2]
-        # Broadcast 4-D initial state along the sequence dimension
-        if stack.dim() == 4:
-            stack = stack.unsqueeze(1).expand(-1, seq_len, -1, -1, -1)
-            mask  = mask.unsqueeze(1).expand(-1, seq_len, -1, -1)
-        # Push: new value at top, shift everything down (overflow discarded)
-        push_stack = torch.cat([k_values.unsqueeze(3), stack[:, :, :, :-1]], dim=3)
-        push_mask  = torch.cat([torch.ones_like(mask[:, :, :, :1]),
-                                 mask[:, :, :, :-1]], dim=3)
-        # Pop: shift everything up, zero at bottom
-        pop_stack = torch.cat([stack[:, :, :, 1:],
-                                torch.zeros_like(stack[:, :, :, :1])], dim=3)
-        pop_mask  = torch.cat([mask[:, :, :, 1:],
-                                torch.zeros_like(mask[:, :, :, :1])], dim=3)
-        # Soft combination weighted by action probabilities
-        # actions: [B,S,H,3] → unsqueeze to [B,S,H,3,1,1] for stack broadcast
-        aw     = actions.unsqueeze(-1).unsqueeze(-1)            # [B,S,H,3,1,1]
-        stacks = torch.stack([push_stack, pop_stack, stack], dim=3)  # [B,S,H,3,slots,ds]
-        masks  = torch.stack([push_mask,  pop_mask,  mask],  dim=3)  # [B,S,H,3,slots]
-        new_stack = (stacks * aw).sum(dim=3)                   # [B,S,H,slots,ds]
-        new_mask  = (masks  * aw.squeeze(-1)).sum(dim=3)       # [B,S,H,slots]
-        return new_stack, new_mask
-    # ── Training forward (full sequence) ─────────────────────────────────
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        stack: Optional[torch.Tensor] = None,
-        mask:  Optional[torch.Tensor] = None,
-        analysis: Optional[StackMemoryAnalysis] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """
-        Full-sequence forward pass (training and prefill).
-        Args:
-            hidden_states: [B, S, D]
-            stack:         [B, H, slots, ds] — previous layer's stack state,
-                           or None (initialised to zeros for layer 0).
-            mask:          [B, H, slots]     — validity mask for stack,
-                           or None (initialised to zeros for layer 0).
-            analysis:      StackMemoryAnalysis container; populated when
-                           model is in eval + analysis mode.
-        Returns:
-            (output, new_stack, new_mask)
-            output    [B, S, D]
-            new_stack [B, H, slots, ds]  — stack at final sequence position
-            new_mask  [B, H, slots]
-        """
-        batch_size, seq_len, _ = hidden_states.shape
-        device = hidden_states.device
-        # Capture incoming stack for analysis before it is updated
-        if analysis is not None:
-            analysis.stack_in = stack.detach() if stack is not None else None
-        # Initialise empty stack / mask for layer 0
-        if stack is None:
-            stack = torch.zeros(
-                batch_size, self.num_stack_heads, self.stack_slots, self.head_dim,
-                device=device, dtype=hidden_states.dtype,
-            )
-        if mask is None:
-            mask = torch.zeros(
-                batch_size, self.num_stack_heads, self.stack_slots,
-                device=device, dtype=hidden_states.dtype,
-            )
-        # 1. Project down
-        h_proj = self.down_proj(hidden_states)                 # [B,S,stack_d_model]
-        # 2. Action probabilities
-        action_logits = self.action_head(h_proj) / math.sqrt(self.head_dim)
-        actions = F.softmax(
-            action_logits.view(batch_size, seq_len, self.num_stack_heads, 3), dim=-1
-        )                                                       # [B,S,H,3]
-        # 3. Values to push
-        k_values = h_proj.view(batch_size, seq_len, self.num_stack_heads, self.head_dim)
-        # 4. Vectorized stack update
-        new_stack, new_mask = self._vectorized_update(stack, mask, actions, k_values)
-        # new_stack: [B,S,H,slots,ds],  new_mask: [B,S,H,slots]
-        # 5. Global read (query-over-stack attention, paper §3.1)
-        gate_scores  = self.gate_proj(new_stack).squeeze(-1)   # [B,S,H,slots]
-        gate_weights = F.softmax(gate_scores + (1 - new_mask) * -1e9, dim=-1)
-        memory_out   = (new_stack * gate_weights.unsqueeze(-1)).sum(dim=3)
-        # memory_out: [B,S,H,ds] → [B,S,stack_d_model]
-        memory_out   = memory_out.view(batch_size, seq_len, self.stack_d_model)
-        # 6. Project back up
-        memory_out_proj = self.up_proj(memory_out)             # [B,S,D]
-        # 7. Residual
-        output = memory_out_proj * self.res_weight + hidden_states
-        # 8. Update generation cache (skipped unless enable_cache is True)
-        if self.enable_cache:
-            self._update_cache(k_values.detach(), actions.detach())
-        # Populate analysis fields
-        if analysis is not None:
-            analysis.action_probs   = actions.detach()
-            analysis.stack_out      = new_stack[:, -1].detach()
-            analysis.mask_out       = new_mask[:, -1].detach()
-            analysis.gate_weights   = gate_weights.detach()
-            analysis.memory_output  = memory_out.detach()
-            analysis.residual_scale = self.res_weight.item()
-        # Return output + last-position stack state for next layer
-        return output, new_stack[:, -1], new_mask[:, -1]
-    # ── Autoregressive single-token forward ──────────────────────────────
-    def step(
-        self,
-        hidden_state: torch.Tensor,   # [B, D]
-        stack: torch.Tensor,          # [B, H, slots, ds]
-        mask:  torch.Tensor,          # [B, H, slots]
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """
-        Single-token forward for autoregressive generation.
-        When enable_cache=False (simple path used by NeoLLM generation):
-            Calls forward() with a length-1 sequence and unpacks the result.
-            The stack state passed in carries all history from previous tokens
-            (propagated by NeoLLMModel.forward across generation steps).
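-        When enable_cache=True (full-history reconstruction path):
-            Concatenates the current token with cached previous-token values
-            and replays the full vectorized update, extracting only the last
-            position. This is intended to give a more accurate stack that sees
-            full history at the cost of O(T) computation per step.
-            NOTE(review): _vectorized_update applies every position to the same
-            incoming stack, so the last position depends only on the current
-            token and the cached history does not alter the result — confirm intent.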
-        Returns:
-            (output, new_stack, new_mask)
-            output    [B, D]
-            new_stack [B, H, slots, ds]
-            new_mask  [B, H, slots]
-        """
-        if not self.enable_cache:
-            # Simple path: forward with seq_len=1, squeeze the sequence dim
-            out, new_stack, new_mask = self.forward(
-                hidden_state.unsqueeze(1), stack, mask
-            )
-            return out.squeeze(1), new_stack, new_mask
-        batch_size = hidden_state.shape[0]
-        # Compute features for the current token
-        h_proj    = self.down_proj(hidden_state)               # [B, stack_d_model]
-        a_logits  = self.action_head(h_proj) / math.sqrt(self.head_dim)
-        cur_act   = F.softmax(
-            a_logits.view(batch_size, 1, self.num_stack_heads, 3), dim=-1
-        )                                                       # [B,1,H,3]
-        cur_k     = h_proj.view(batch_size, 1, self.num_stack_heads, self.head_dim)
-        # Prepend cached history (all previous tokens in this generation)
-        if self.cache_position > 0:
-            k_values = torch.cat([self.k_cache[:batch_size, :self.cache_position], cur_k],   dim=1)
-            actions  = torch.cat([self.action_cache[:batch_size, :self.cache_position], cur_act], dim=1)
-        else:
-            k_values = cur_k
-            actions  = cur_act
-        # Full vectorized update over history + current token; take last position
-        new_stack_seq, new_mask_seq = self._vectorized_update(stack, mask, actions, k_values)
-        new_stack = new_stack_seq[:, -1]                       # [B,H,slots,ds]
-        new_mask  = new_mask_seq[:, -1]                        # [B,H,slots]
-        # Global read on the new stack state
-        gate_scores  = self.gate_proj(new_stack).squeeze(-1)   # [B,H,slots]
-        gate_weights = F.softmax(gate_scores + (1 - new_mask) * -1e9, dim=-1)
-        memory_out   = (new_stack * gate_weights.unsqueeze(-1)).sum(dim=2)
-        memory_out   = memory_out.view(batch_size, self.stack_d_model)
-        memory_out_proj = self.up_proj(memory_out)             # [B,D]
-        output = memory_out_proj * self.res_weight + hidden_state
-        self._update_cache(cur_k, cur_act)
-        return output, new_stack, new_mask
-# NOTE: this class must NOT be decorated with @dataclass. With the default
-# eq=True the decorator sets __hash__ = None and generates a field-less
-# __eq__, which makes every instance unhashable and breaks nn.Module
-# traversal (children() / modules() / apply() keep modules in a set).
-class LAuReLLayer(nn.Module):
-    """
-    LAuReL: Learned Augmented Residual Layer.
-    A lightweight replacement for the canonical residual connection
-    that learns to blend the nonlinear sub-layer output f(x) with a
-    richer linear function of the residual x, optionally augmented by a
-    low-rank transformation.
-    Reference: Menghani, G., Kumar, R. & Kumar, S. (ICML 2025).
-    *LAuReL: Learned Augmented Residual Layer.* arXiv:2411.07501.
-    ── Sub-variants ────────────────────────────────────────────────────
-    Controlled by config flags; any combination is valid:
-    **RW only** (use_laurel_rw=True, use_laurel_lr=False):
-        x_{i+1} = α · f(x_i) + β · x_i
-        [α, β]  = softmax([a, b]),  a,b ∈ ℝ  (2 params)
-    **LR only** (use_laurel_rw=False, use_laurel_lr=True):
-        x_{i+1} = f(x_i) + A·(B·x_i) + x_i
-        B ∈ ℝ^{r×D}  column-orthogonal init  (down-projection)
-        A ∈ ℝ^{D×r}  zero init               (up-projection)
-        Params: 2·r·D per layer.
-    **RW + LR** (both True, paper recommendation):
-        x_{i+1} = α · f(x_i) + β · (A·(B·x_i) + x_i)
-    **Neither** (both False):
-        x_{i+1} = f(x_i) + x_i   (plain residual sum, no parameters)
-    ── Initialisation ──────────────────────────────────────────────────
-    RW: raw logits [a, b] = [0, 0] → α=β=0.5 at step 0.
-    LR: A (up) = zeros → lr_term = 0 at step 0, so g(x) = x at init.
-    The orthogonal / zero initialisation of the LR matrices is applied
-    by the model-level ``_init_weights`` hook, not by this constructor;
-    a layer built standalone keeps the default ``nn.Linear`` init.
-    ── Integration in NeoLLM ───────────────────────────────────────────
-    Applied immediately before GPAS at both residual sums per layer:
-        h_tilde = GPAS( LAuReL(attn_out, residual_attn) )
-        output  = GPAS( LAuReL(delta_m,  residual_mlp)  )
-    GPAS then applies its stop-gradient scaling on the combined stream,
-    preserving gradient magnitudes across the depth of the network.
-    The two techniques are structurally orthogonal: LAuReL controls the
-    *mixing ratio* of f(x) and x at each residual junction; GPAS
-    controls the *magnitude* of the combined stream with a learned gate
-    and a stop-gradient operator that prevents gradient vanishing.
-    Args:
-        config: NeoLLMConfig. Reads use_laurel_rw (default True),
-                use_laurel_lr (default True), laurel_lr_rank
-                (default 32) and hidden_size.
-    """
-    def __init__(self, config: NeoLLMConfig):
-        super().__init__()
-        self.use_rw   = getattr(config, "use_laurel_rw", True)
-        self.use_lr   = getattr(config, "use_laurel_lr", True)
-        D             = config.hidden_size
-        r             = getattr(config, "laurel_lr_rank", 32)
-        if self.use_rw:
-            # Raw logits for softmax([α, β]).
-            # Stored as a single 2-vector so softmax is one op.
-            # Init to zero → α=β=0.5 at step 0.
-            self.rw_logits = nn.Parameter(torch.zeros(2))
-        if self.use_lr:
-            # down: B ∈ ℝ^{r×D} (weight shape [r, D])
-            # up:   A ∈ ℝ^{D×r} (weight shape [D, r])
-            # Both are bias-free so that A·(B·x) is a pure low-rank map.
-            self.lr_down = nn.Linear(D, r, bias=False)
-            self.lr_up   = nn.Linear(r, D, bias=False)
-    def forward(
-        self,
-        f_out:    torch.Tensor,           # output of f(x): attn or MLP  [B,S,D]
-        x_res:    torch.Tensor,           # residual (skip connection)    [B,S,D]
-        analysis: Optional[LAuReLAnalysis] = None,
-    ) -> torch.Tensor:
-        """
-        Combine a sub-layer output with its residual stream.
-        Args:
-            f_out:    Output of f(x) — attention output or MLP delta.
-            x_res:    Residual tensor — accumulated hidden state.
-            analysis: Optional analysis container. When given, it receives
-                      alpha_rw / beta_rw (Python floats, RW mode only),
-                      lr_term (detached, LR mode only) and output (detached).
-        Returns:
-            Combined tensor with the same shape as the inputs, to be fed
-            into GPAS.
-        """
-        # ── LR component: g(x) = A·(B·x_res) + x_res ─────────────────────
-        lr_term = None
-        if self.use_lr:
-            lr_term = self.lr_up(self.lr_down(x_res))     # [B,S,D]
-            g_res   = lr_term + x_res                      # enriched residual
-        else:
-            g_res   = x_res
-        # ── RW component: α·f + β·g ──────────────────────────────────────
-        if self.use_rw:
-            weights = torch.softmax(self.rw_logits, dim=0)  # [2], sums to 1
-            alpha   = weights[0]
-            beta    = weights[1]
-            out     = alpha * f_out + beta * g_res
-        else:
-            # No learned weighting: plain sum with the (possibly enriched) residual
-            out = f_out + g_res
-        if analysis is not None:
-            if self.use_rw:
-                # .item() copies the scalars to host; only done on the analysis path
-                analysis.alpha_rw = alpha.item()
-                analysis.beta_rw  = beta.item()
-            if self.use_lr:
-                analysis.lr_term  = lr_term.detach()
-            analysis.output   = out.detach()
-        return out
 class NeoLLMDecoderLayer(GradientCheckpointingLayer):
     """
     Decoder layer with standard residual connections, optional JTok-M injection.
@@ -3868,78 +3120,10 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
             self.attn_res_query_attn = nn.Parameter(torch.zeros(config.hidden_size))
             self.attn_res_query_mlp  = nn.Parameter(torch.zeros(config.hidden_size))
             self.attn_res_norm       = nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-            _num_blocks = getattr(config, 'attn_res_num_blocks', 0)
-            self.attn_res_block_size = (
-                max(config.num_hidden_layers // _num_blocks, 1) if _num_blocks > 0 else 1
-            )
         else:
             self.attn_res_query_attn = None
             self.attn_res_query_mlp  = None
             self.attn_res_norm       = None
-            self.attn_res_block_size = None
-        # ── MUDD: separate K/V LayerNorms for qkvr+sepln mode ──────────────
-        # Only instantiated when both mudd_dense_type='qkvr' AND mudd_sepln=True.
-        # The existing input_layernorm handles the Q stream (unchanged).
-        # Separate norms for K and V allow each stream to rescale independently.
-        _use_mudd  = getattr(config, 'use_mudd', False)
-        _mudd_qkvr = getattr(config, 'mudd_dense_type', 'qkvr') == 'qkvr'
-        _mudd_sepln = getattr(config, 'mudd_sepln', False)
-        if _use_mudd and _mudd_qkvr and _mudd_sepln:
-            self.mudd_k_norm = SeeDNorm(config.hidden_size, eps=config.rms_norm_eps)
-            self.mudd_v_norm = SeeDNorm(config.hidden_size, eps=config.rms_norm_eps)
-        else:
-            self.mudd_k_norm = None
-            self.mudd_v_norm = None
-        # ── DCA (Heddes et al., 2025, arXiv:2502.06785) ───────────────────
-        # GRN-v3 module that aggregates the k-selected depth stack into 3
-        # independent streams (Q, K, V). Each stream has its own dimension-
-        # and input-dependent weights, enabling richer cross-layer interactions.
-        # K and V get their own SeeDNorm + LNS norm chain (same scheme as
-        # MUDD sepln) since they now arrive from a different aggregation path.
-        # The residual connection uses the Q stream output (xq) as its base,
-        # matching the DCA paper's decoder block design (residual = q_input).
-        self.use_dca = getattr(config, 'use_dca', False)
-        if self.use_dca:
-            _dca_k           = getattr(config, 'dca_k', 2)
-            _num_stack       = min(layer_idx + 1, 2 * _dca_k)
-            self.dca_grn     = NeoLLMGRN(
-                hidden_size      = config.hidden_size,
-                num_stack_layers = _num_stack,
-                num_outputs      = 3,
-                eps              = config.rms_norm_eps,
-            )
-            self.dca_k_norm  = SeeDNorm(config.hidden_size, eps=config.rms_norm_eps)
-            self.dca_v_norm  = SeeDNorm(config.hidden_size, eps=config.rms_norm_eps)
-        else:
-            self.dca_grn    = None
-            self.dca_k_norm = None
-            self.dca_v_norm = None
-        # ── StackTrans (Zhang et al., NeurIPS 2025) ───────────────────────
-        # Differentiable multi-head hidden-state stack inserted at the very
-        # beginning of the layer forward, before the attention sublayer.
-        # Mutually exclusive with use_attn_res, use_mudd, use_dca.
-        self.use_stacktrans = getattr(config, 'use_stacktrans', False)
-        if self.use_stacktrans:
-            self.stack_memory = StackMemory(config)
-        else:
-            self.stack_memory = None
-        # ── LAuReL (Menghani, Kumar & Kumar, ICML 2025) ───────────────────
-        # Learned augmented residual connection replacing f(x)+x at both
-        # the attention and MLP residual sums. Applied immediately before
-        # GPAS, so GPAS still controls magnitude via stop-gradient scaling.
-        # Two independent instances per layer (attention and MLP).
-        # Compatible with use_stacktrans. Incompatible with MUDD/DCA/AttnRes.
-        self.use_laurel = getattr(config, 'use_laurel', False)
-        if self.use_laurel:
-            self.laurel_attn = LAuReLLayer(config)
-            self.laurel_mlp  = LAuReLLayer(config)
-        else:
-            self.laurel_attn = None
-            self.laurel_mlp  = None
     def _attn_res(
         self,
@@ -3989,10 +3173,6 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         B_vals: Optional[torch.Tensor] = None,
         attn_res_sources: Optional[list] = None,
         attn_res_partial: Optional[torch.Tensor] = None,
-        mudd_streams: Optional[tuple] = None,
-        dca_stack: Optional[torch.Tensor] = None,
-        stack_state: Optional[torch.Tensor] = None,
-        stack_mask: Optional[torch.Tensor] = None,
         layer_analysis: Optional[LayerAnalysis] = None,
         output_attentions: Optional[bool] = False,
         repo_rope_args: Optional[Tuple[torch.Tensor, float]] = None,
@@ -4002,63 +3182,6 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         if layer_analysis is not None:
             layer_analysis.hidden_states_input = hidden_states.detach()
-        # ── StackTrans: hidden-state stack (pre-attention, pre-norm) ─────
-        # Executed first so attention sees the stack-enriched representation.
-        # stack_state / stack_mask carry the stack from the previous layer;
-        # both are None for layer 0 (StackMemory initialises to zeros then).
-        # Mutually exclusive with MUDD / DCA / AttnRes — those branches are
-        # all skipped when use_stacktrans=True (enforced in NeoLLMConfig).
-        if self.use_stacktrans and self.stack_memory is not None:
-            st_analysis = layer_analysis.stack if layer_analysis is not None else None
-            hidden_states, stack_state, stack_mask = self.stack_memory(
-                hidden_states, stack_state, stack_mask, analysis=st_analysis
-            )
-        # ── MUDD: unpack streams for Q/K/V/R (layer > 0 only) ────────────
-        # mudd_streams is a 4-tuple (xq, xk, xv, xr) when use_mudd=True and
-        # layer_idx > 0; None for layer 0 (standard residual there).
-        # xr replaces hidden_states as the residual throughout this layer.
-        # xq/xk/xv are the aggregated inputs for Q, K, V projections.
-        # When mudd_dense_type='l' (single stream), all four are equal.
-        # When mudd_sepln=True each stream has its own norm applied below.
-        mudd_xk = None
-        mudd_xv = None
-        if mudd_streams is not None:
-            xq_mudd, xk_mudd, xv_mudd, xr_mudd = mudd_streams
-            # Replace hidden_states with xr for residual connections
-            hidden_states = xr_mudd
-            # Norm K and V streams — use separate SeeDNorm if sepln, else
-            # they will share the main input_layernorm path via h_attn below
-            if self.mudd_k_norm is not None:
-                mudd_xk = self.lns_attn(self.mudd_k_norm(xk_mudd))
-                mudd_xv = self.lns_attn(self.mudd_v_norm(xv_mudd))
-            else:
-                # No sepln: K/V also go through the Q-path norm chain
-                mudd_xk = self.lns_attn(self.input_layernorm(xk_mudd))
-                mudd_xv = self.lns_attn(self.input_layernorm(xv_mudd))
-            # Override hidden_states for the Q path
-            hidden_states_for_attn = xq_mudd
-        else:
-            hidden_states_for_attn = hidden_states
-        # ── DCA: GRN-v3 depth-wise aggregation ───────────────────────────
-        # When active, runs the per-layer GRN on the k-selected depth stack
-        # to produce three independent aggregated streams (Q, K, V).
-        # xq replaces hidden_states as both the Q projection input AND the
-        # post-attention residual (DCA paper: residual = q_input).
-        # xk and xv go through separate SeeDNorm+LNS chains and are injected
-        # into NeoLLMAttention via the existing mudd_xk/mudd_xv parameters.
-        dca_residual = None
-        dca_a = layer_analysis.dca if layer_analysis is not None else None
-        if self.use_dca and dca_stack is not None:
-            xq, xk, xv = self.dca_grn(dca_stack, analysis=dca_a)
-            dca_residual             = xq
-            hidden_states_for_attn   = xq
-            # K and V streams: SeeDNorm + LNS before k_proj / v_proj
-            # (reuses the mudd_xk/mudd_xv injection path in NeoLLMAttention)
-            mudd_xk = self.lns_attn(self.dca_k_norm(xk))
-            mudd_xv = self.lns_attn(self.dca_v_norm(xv))
         # ── Attention Residuals: compute pre-attention input ──────────────
         # When active, the input to the attention sublayer is no longer the
         # raw hidden_states (accumulated residual) but a softmax-weighted
@@ -4072,19 +3195,10 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
                 attn_res_sources, attn_res_partial, self.attn_res_query_attn,
                 ar_analysis, "attn",
             )
-            # ── Block boundary fires HERE — after pre-attn, before attn sublayer ──
-            # Paper pseudocode (Fig. 2) timing: the completed partial of the previous
-            # block is pushed to sources AFTER the pre-attn AttnRes call, so the first
-            # layer of a new block still sees the old partial as an intra-block source
-            # (no duplicate) and the new intra-block accumulation starts from zeros.
-            if self.layer_idx > 0 and self.layer_idx % self.attn_res_block_size == 0:
-                attn_res_sources.append(attn_res_partial)   # in-place; outer loop sees this
-                attn_res_partial = torch.zeros_like(attn_res_partial)  # fresh delta start
             residual_attn = attn_res_partial
         else:
-            h_attn = hidden_states_for_attn   # MUDD/DCA: xq stream or unchanged
-            # DCA: residual is xq (the GRN Q-stream output), not raw hidden_states
-            residual_attn = dca_residual if dca_residual is not None else hidden_states
         # ── Attention block ───────────────────────────────────────────────
         sn_pre = layer_analysis.seednorm_pre_attn if layer_analysis is not None else None
@@ -4100,8 +3214,6 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
             first_layer_fan=first_layer_fan,
             attn_analysis=layer_analysis.attention if layer_analysis is not None else None,
             repo_rope_args=repo_rope_args,
-            mudd_xk=mudd_xk,
-            mudd_xv=mudd_xv,
             **kwargs,
         )
@@ -4109,18 +3221,7 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
             layer_analysis.attn_contribution = hidden_states.detach()
         gpas_attn_a = layer_analysis.gpas_attn if layer_analysis is not None else None
-        # ── Attention residual sum ────────────────────────────────────────
-        # Standard: GPAS(residual_attn + hidden_states)
-        # LAuReL:   GPAS(LAuReL(f_out=hidden_states, x_res=residual_attn))
-        # Both paths feed into GPAS which applies stop-gradient scaling.
-        if self.use_laurel and self.laurel_attn is not None:
-            la_attn_a = layer_analysis.laurel_attn if layer_analysis is not None else None
-            combined_attn = self.laurel_attn(hidden_states, residual_attn, analysis=la_attn_a)
-        else:
-            combined_attn = residual_attn + hidden_states
-        h_tilde = self.gpas_attn(combined_attn, analysis=gpas_attn_a)
         if layer_analysis is not None:
             layer_analysis.h_tilde = h_tilde.detach()
@@ -4156,11 +3257,8 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         if layer_analysis is not None:
             layer_analysis.mlp_contribution = delta_m.detach()
-        # ── MLP residual sum ──────────────────────────────────────────────
-        # LAuReL treats f(x) = delta_m [+ delta_r when JTok-M active] and
-        # x_res = residual_mlp. JTok-M delta_r is additive alongside delta_m,
-        # so the nonlinear component is delta_m + delta_r in that path.
-        gpas_mlp_a = layer_analysis.gpas_mlp if layer_analysis is not None else None
         if self.use_jtokm and z_tilde is not None and B_vals is not None:
             orig_shape = h_tilde.shape
             h_flat     = h_tilde.reshape(-1, self.hidden_size)
@@ -4171,21 +3269,11 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
             delta_r, aux_stats = self.jtokm(h_flat, z_flat, B_flat, analysis=jtokm_a)
             delta_r = delta_r.reshape(orig_shape)
-            f_mlp = delta_m + delta_r                          # combined nonlinear term
-            if self.use_laurel and self.laurel_mlp is not None:
-                la_mlp_a = layer_analysis.laurel_mlp if layer_analysis is not None else None
-                combined_mlp = self.laurel_mlp(f_mlp, residual_mlp, analysis=la_mlp_a)
-            else:
-                combined_mlp = residual_mlp + f_mlp
-            hidden_states = self.gpas_mlp(combined_mlp, analysis=gpas_mlp_a)
         else:
-            aux_stats = None
-            if self.use_laurel and self.laurel_mlp is not None:
-                la_mlp_a = layer_analysis.laurel_mlp if layer_analysis is not None else None
-                combined_mlp = self.laurel_mlp(delta_m, residual_mlp, analysis=la_mlp_a)
-            else:
-                combined_mlp = residual_mlp + delta_m
-            hidden_states = self.gpas_mlp(combined_mlp, analysis=gpas_mlp_a)
         if layer_analysis is not None:
             layer_analysis.hidden_states_output = hidden_states.detach()
@@ -4197,9 +3285,6 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
             outputs += (aux_stats,)
         if versatile_aux is not None:
             outputs += (versatile_aux,)
-        # StackTrans: always append stack state (None, None when inactive)
-        # so NeoLLMModel.forward can extract them by position -2 and -1.
-        outputs += (stack_state, stack_mask)
         return outputs
@@ -4583,45 +3668,6 @@ class NeoLLMPreTrainedModel(PreTrainedModel):
                 module.attn_res_query_attn.data.zero_()
                 module.attn_res_query_mlp.data.zero_()
-        elif isinstance(module, StackMemory):
-            # Truncated-normal for all Linear weights (matches NeoLLM convention).
-            # Biases zeroed. res_weight starts at 1.0 so the stack readout
-            # contributes equally to the residual from step 0.
-            std      = getattr(self.config, "initializer_range", 0.02)
-            cutoff   = getattr(self.config, "init_cutoff_factor", 3.0) * std
-            for attr in ("down_proj", "up_proj", "action_head", "gate_proj"):
-                layer = getattr(module, attr, None)
-                if layer is not None and hasattr(layer, "weight"):
-                    nn.init.trunc_normal_(
-                        layer.weight, mean=0.0, std=std, a=-cutoff, b=cutoff
-                    )
-                    if layer.bias is not None:
-                        nn.init.zeros_(layer.bias)
-            if hasattr(module, "res_weight"):
-                module.res_weight.data.fill_(1.0)
-        elif isinstance(module, LAuReLLayer):
-            # RW: raw logits initialised to zero → softmax([0,0]) = [0.5, 0.5].
-            #     The model quickly learns the optimal α,β weighting.
-            # LR: lr_down (B, down-projection) — column orthogonal init,
-            #         as recommended by the LAuReL paper §3.3 for LLMs.
-            #         Column orthogonal preserves the L2 norm of the projected
-            #         representation, ensuring stable gradient magnitudes
-            #         through the low-rank bottleneck at init.
-            #     lr_up  (A, up-projection) — zero init → lr_term = A·Bx = 0
-            #         at step 0, so the module starts as a standard residual.
-            #         Gradient flows back through lr_down immediately via
-            #         chain rule; A learns from step 1 onward.
-            if hasattr(module, "rw_logits"):
-                nn.init.zeros_(module.rw_logits)
-            if hasattr(module, "lr_down"):
-                # Column-orthogonal: each column of weight^T is orthonormal.
-                # nn.init.orthogonal_ produces a row-orthogonal matrix (rows
-                # are orthonormal). Transposing gives column-orthogonal.
-                nn.init.orthogonal_(module.lr_down.weight)
-            if hasattr(module, "lr_up"):
-                nn.init.zeros_(module.lr_up.weight)
         elif isinstance(module, SpellingBeeEmbedding):
             # byte_emb initialised identically to token embeddings: std=1/√d.
             # Ensures E[‖e_byte‖²] ≈ 1 at init, matching etok, so the
@@ -4691,82 +3737,6 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
         self.gradient_checkpointing = False
         self.first_layer_fan        = None
-        # ── StackTrans state flag ─────────────────────────────────────────
-        self.use_stacktrans = getattr(config, 'use_stacktrans', False)
-        # ── Residual-replacement mutex ────────────────────────────────────
-        # AttnRes, MUDD, and DCA all replace the residual aggregation
-        # mechanism — at most one can be active at a time.
-        _use_mudd     = getattr(config, 'use_mudd', False)
-        _use_attn_res = getattr(config, 'use_attn_res', False)
-        _use_dca      = getattr(config, 'use_dca',      False)
-        _active_count = sum([_use_mudd, _use_attn_res, _use_dca])
-        if _active_count > 1:
-            active = [n for n, f in [('use_mudd', _use_mudd),
-                                     ('use_attn_res', _use_attn_res),
-                                     ('use_dca', _use_dca)] if f]
-            raise ValueError(
-                f"use_mudd, use_attn_res, and use_dca are mutually exclusive — "
-                f"got {active} simultaneously active. Set exactly one to True."
-            )
-        if _use_mudd:
-            _mudd_dense_type  = getattr(config, 'mudd_dense_type',  'qkvr')
-            _mudd_dynamic     = getattr(config, 'mudd_dynamic_dense', True)
-            _mudd_round64     = getattr(config, 'mudd_round64',      False)
-            _mudd_expand_last = getattr(config, 'mudd_expand_last',  False)
-            _C = 4 if _mudd_dense_type == 'qkvr' else 1
-            # Static bias: one [C, lidx+2] parameter per layer.
-            # Initialized with 1 at index [c, lidx+1] (identity on Xi) so that
-            # at init (W2=0) each DA output = Xi — reducing to standard Transformer.
-            _static_list = []
-            for lidx in range(config.num_hidden_layers):
-                # Last layer always uses C=1: its DA output is the final
-                # model representation fed to the norm and lm_head, collapsing
-                # all history into a single stream (paper code, both files).
-                _c = 1 if lidx == config.num_hidden_layers - 1 else _C
-                a = torch.zeros(_c, lidx + 2)
-                a[:, lidx + 1] = 1.0        # last entry = current layer = identity
-                _static_list.append(nn.Parameter(a))
-            self.mudd_static = nn.ParameterList(_static_list)
-            # Dynamic DA modules (one per layer)
-            if _mudd_dynamic:
-                self.mudd_dynamic = nn.ModuleList([
-                    NeoLLMMUDDModule(
-                        hidden_size = config.hidden_size,
-                        lidx        = lidx,
-                        # Last layer: C=1 — collapses to single final repr
-                        num_ways    = 1 if lidx == config.num_hidden_layers - 1 else _C,
-                        is_last     = (lidx == config.num_hidden_layers - 1),
-                        expand_last = _mudd_expand_last,
-                        round64     = _mudd_round64,
-                    )
-                    for lidx in range(config.num_hidden_layers)
-                ])
-            else:
-                self.mudd_dynamic = None
-        else:
-            self.mudd_static  = None
-            self.mudd_dynamic = None
-        # ── DCA final GRN (Heddes et al., 2025) ───────────────────────────
-        # Applied once after all decoder layers to aggregate the full depth
-        # stack into the final hidden representation before the output norm.
-        # num_stack_layers = min(2*k, L+1) — same cap as per-layer GRNs.
-        # num_outputs=1 collapses to a single [B, S, D] tensor.
-        if _use_dca and getattr(config, 'dca_use_final_grn', True):
-            _dca_k   = getattr(config, 'dca_k', 2)
-            _dca_eps = getattr(config, 'dca_grn_eps', config.rms_norm_eps)
-            self.dca_final_grn = NeoLLMGRN(
-                hidden_size      = config.hidden_size,
-                num_stack_layers = min(2 * _dca_k, config.num_hidden_layers + 1),
-                num_outputs      = 1,
-                eps              = _dca_eps,
-            )
-        else:
-            self.dca_final_grn = None
         self.post_init()
     def get_input_embeddings(self):
@@ -4794,9 +3764,7 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
             getattr(cfg, "use_repo", False)
             and layer_idx >= getattr(cfg, "repo_start_layer", cfg.num_hidden_layers // 3)
         )
-        _versatile      = getattr(cfg, "use_versatile_ffn", False)
-        _use_stacktrans = getattr(cfg, "use_stacktrans", False)
-        _use_laurel     = getattr(cfg, "use_laurel", False)
         return LayerAnalysis(
             seednorm_pre_attn  = SeeDNormAnalysis(),
             seednorm_post_attn = SeeDNormAnalysis(),
@@ -4810,14 +3778,10 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
                 polynorm = PolyNormAnalysis() if not _versatile else None,
                 versatile = VersatileFFNAnalysis() if _versatile else None,
             ),
-            gpas_attn    = GPASAnalysis(),
-            gpas_mlp     = GPASAnalysis(),
-            jtokm        = JTokMAnalysis() if cfg.use_jtokm else None,
-            attn_res     = AttnResAnalysis() if getattr(cfg, "use_attn_res", False) else None,
-            dca          = DCAAnalysis()      if getattr(cfg, "use_dca",      False) else None,
-            stack        = StackMemoryAnalysis() if _use_stacktrans else None,
-            laurel_attn  = LAuReLAnalysis()   if _use_laurel else None,
-            laurel_mlp   = LAuReLAnalysis()   if _use_laurel else None,
         )
     def forward(
@@ -4933,57 +3897,13 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
         if use_attn_res:
             attn_res_sources = [hidden_states]   # b_0 = token embedding
             attn_res_partial = hidden_states     # initial partial sum
-            # Block boundary handling now lives inside NeoLLMDecoderLayer.forward(),
-            # firing after the pre-attn AttnRes call (paper Fig. 2 timing).
-        # ── MUDD state ────────────────────────────────────────────────────
-        # hiddens[0] = token embedding; hiddens[i] = output of layer i-1.
-        # After each layer, its output is appended so layer i receives a
-        # history of length i+2 (embedding + i preceding layer outputs).
-        # mudd_streams is None for layer 0 (standard residual path there)
-        # and a C-tuple of [B,T,D] tensors for layers 1…L.
-        use_mudd = getattr(self.config, 'use_mudd', False)
-        mudd_hiddens = None
-        mudd_streams = None
-        if use_mudd:
-            mudd_hiddens = [hidden_states]   # b_0 = token embedding
-        # ── DCA state ─────────────────────────────────────────────────────
-        # all_tokens[0] = token embedding; grows by one per decoder layer.
-        # Before each layer, the stack is built and k-DCA selection applied,
-        # capping memory at 2*dca_k stored tensors regardless of depth.
-        # dca_stack is always non-None (even layer 0 gets [embedding]).
-        use_dca   = getattr(self.config, 'use_dca', False)
-        _dca_k    = getattr(self.config, 'dca_k', 2)
-        dca_all_tokens = None
-        dca_stack      = None
-        if use_dca:
-            dca_all_tokens = [hidden_states]   # [embedding]
-        # ── StackTrans state ──────────────────────────────────────────────
-        # stack_state / stack_mask start as None for the first layer;
-        # StackMemory initialises them to zeros internally on first call.
-        # After each layer, the returned (new_stack, new_mask) are passed
-        # to the next layer as its initial stack — this is "vertical" state
-        # propagation: information flows depth-wise through the stack.
-        #
-        # Temporal accumulation across generation steps is handled by the
-        # StackMemory internal k_cache / action_cache mechanism:
-        #   - enable_cache is set True when use_cache=True (inference)
-        #   - reset_cache() is called when past_key_values is None
-        #     (new sequence, not a continuation step)
-        # This matches the OLMo reference implementation exactly.
-        use_stacktrans = self.use_stacktrans
-        stack_state    = None
-        stack_mask     = None
-        if use_stacktrans:
-            use_cache_flag = kwargs.get("use_cache", False)
-            past_kv_flag   = kwargs.get("past_key_values", None)
-            for layer in self.layers:
-                if layer.stack_memory is not None:
-                    layer.stack_memory.enable_cache = bool(use_cache_flag)
-                    if past_kv_flag is None:
-                        layer.stack_memory.reset_cache()
         # Pre-allocate per-layer analysis list when analysis is active
         if analysis_state is not None:
@@ -4993,13 +3913,17 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
             if output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
-            # ── DCA: build k-selected stack for this layer ───────────────
-            # Stack has layer_idx+1 entries before selection; after k-DCA
-            # selection it has at most 2*dca_k entries (first k + last k).
-            if use_dca:
-                dca_stack = dca_select_layers(
-                    torch.stack(dca_all_tokens, dim=0), k=_dca_k
-                )
             # Build per-layer analysis container (only in eval + analysis mode)
             layer_analysis = None
@@ -5017,10 +3941,6 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
                 B_vals=B_vals,
                 attn_res_sources=attn_res_sources,
                 attn_res_partial=attn_res_partial if use_attn_res else None,
-                mudd_streams=mudd_streams,
-                dca_stack=dca_stack,
-                stack_state=stack_state,
-                stack_mask=stack_mask,
                 layer_analysis=layer_analysis,
                 output_attentions=output_attentions,
                 repo_rope_args=repo_rope_args,
@@ -5028,76 +3948,23 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
             )
             hidden_states = layer_outputs[0]
-            # ── StackTrans: extract updated stack state for next layer ─────
-            # layer_outputs always ends with (stack_state, stack_mask) —
-            # both are None when use_stacktrans=False (zero cost).
-            stack_state = layer_outputs[-2]
-            stack_mask  = layer_outputs[-1]
             # Update AttnRes partial sum — the new partial is the layer output
             if use_attn_res:
                 attn_res_partial = hidden_states
-            # Append layer output to DCA history for next layer's stack
-            if use_dca:
-                dca_all_tokens.append(hidden_states)
-            # ── MUDD: append current output and compute DA for next layer ──
-            # mudd_hiddens grows by 1 each iteration; at layer i it has i+2
-            # entries (embedding + i outputs). The DA for layer i+1 takes this
-            # full history and produces C streams via dynamic + static weights.
-            # mudd_streams is passed to layer i+1 as its input streams.
-            if use_mudd:
-                mudd_hiddens.append(hidden_states)
-                # Compute DA module output using the just-appended history
-                # (mudd_hiddens now has layer_idx+2 entries)
-                is_last_layer = (layer_idx == self.config.num_hidden_layers - 1)
-                mudd_da_module = self.mudd_dynamic[layer_idx] if self.mudd_dynamic is not None else None
-                if mudd_da_module is not None:
-                    raw_streams = mudd_da_module(
-                        hidden_states,
-                        mudd_hiddens,
-                        self.mudd_static[layer_idx],
-                    )
-                else:
-                    # Static-only: apply weighted sum with learnable bias
-                    # stack history [L, B, T, D], weight by mudd_static
-                    stacked = torch.stack(mudd_hiddens, dim=0)    # [L, B, T, D]
-                    a = self.mudd_static[layer_idx].to(hidden_states.dtype)  # [C, L]
-                    raw_streams = tuple(
-                        torch.einsum('cl,lbtd->btd', a[c:c+1], stacked).squeeze(0)
-                        for c in range(a.shape[0])
-                    )
-                if is_last_layer:
-                    # Last layer DA always produces C=1 → single final repr.
-                    # This is the MUDD-aggregated combination of all layer
-                    # histories weighted by the last layer's output as query.
-                    # Replace hidden_states so the final norm and lm_head
-                    # receive this aggregated representation, not the raw
-                    # last-layer output (paper forward loop: x = x[0] after loop).
-                    hidden_states = raw_streams[0]
-                    mudd_streams = None   # no next layer
-                elif len(raw_streams) == 1:
-                    # dense_type='l': broadcast to 4-tuple
-                    mudd_streams = (raw_streams[0],) * 4
-                else:
-                    # 'qkvr': 4 streams → (xq, xk, xv, xr)
-                    mudd_streams = raw_streams
             if output_attentions:
                 all_attentions = all_attentions + (layer_outputs[1],)
-            # Collect JTok-M / VersatileFFN aux stats.
-            # layer_outputs always ends with (stack_state, stack_mask) —
-            # slice [1:-2] to skip hidden_states[0] and the two stack slots.
-            inner_outputs = layer_outputs[1:-2]
-            if self.config.use_jtokm and len(inner_outputs) > (1 if output_attentions else 0):
-                all_aux_stats.append(inner_outputs[-1])
             if getattr(self.config, "use_versatile_ffn", False):
-                for item in inner_outputs:
                     if isinstance(item, tuple) and len(item) == 3:
                         all_aux_stats.append(("versatile", item))
                         break
@@ -5105,16 +3972,6 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
                     and hasattr(decoder_layer, "current_layer_fan")):
                 self.first_layer_fan = decoder_layer.current_layer_fan
-        # ── DCA final GRN ──────────────────────────────────────────────────
-        # Aggregates the full depth history (k-selected) into the final
-        # hidden representation, matching the DCAGPT forward loop which
-        # applies final_grn(stack(all_tokens)) before norm → lm_head.
-        if use_dca and self.dca_final_grn is not None:
-            final_stack   = dca_select_layers(
-                torch.stack(dca_all_tokens, dim=0), k=_dca_k
-            )
-            hidden_states = self.dca_final_grn(final_stack)
         hidden_states = self.norm(hidden_states)
         if output_hidden_states:
@@ -5127,9 +3984,6 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
             analysis_state.attn_res_sources_final  = (
                 attn_res_sources if use_attn_res else None
             )
-            analysis_state.dca_all_tokens_final    = (
-                dca_all_tokens if use_dca else None
-            )
         if not return_dict:
             return tuple(
@@ -5270,7 +4124,6 @@ class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
             layers                 = None,   # filled by NeoLLMModel.forward
             jtokm_aux_stats        = [] if cfg.use_jtokm else None,
             attn_res_sources_final = [] if getattr(cfg, "use_attn_res", False) else None,
-            dca_all_tokens_final   = [] if getattr(cfg, "use_dca",      False) else None,
         )
     # ── Standard model API ────────────────────────────────────────────────
@@ -5408,11 +4261,6 @@ class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
 # ==================== AUTOMODEL REGISTRATION ====================
 __all__ = [
-    "StackMemory",
-    "LAuReLLayer",
-    "NeoLLMMUDDModule",
-    "NeoLLMGRN",
-    "dca_select_layers",
     "NeoLLMForCausalLM",
     "NeoLLMModel",
     "NeoLLMPreTrainedModel",
@@ -5430,7 +4278,7 @@ __all__ = [
     "REPOModule",
     "VersatileFFN",
     "compute_versatile_aux_loss",
-    # Analysis dataclasses
     "AnalysisState",
     "LayerAnalysis",
     "AttentionAnalysis",
@@ -5444,9 +4292,6 @@ __all__ = [
     "VersatileFFNAnalysis",
     "JTokMAnalysis",
     "AttnResAnalysis",
-    "DCAAnalysis",
-    "StackMemoryAnalysis",
-    "LAuReLAnalysis",
     "GeneratorAnalysis",
 ]

 from configuration_neollm import NeoLLMConfig
 from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
+torch._dynamo.config.capture_scalar_outputs = True
 logger = logging.get_logger(__name__)
     lns_scale:        Optional[float] = None          # 1/√(2ℓ) scaling factor
 @dataclass
 class AttnResAnalysis:
     """
     sources_count:    Optional[int] = None            # number of sources including partial
 @dataclass
 class LayerAnalysis:
     """
     gpas_mlp:           Optional[GPASAnalysis] = None      # GPAS after MLP residual
     # Optional components (None when inactive)
+    jtokm:    Optional[JTokMAnalysis] = None    # if use_jtokm
+    attn_res: Optional[AttnResAnalysis] = None  # if use_attn_res
 @dataclass
     layers:                 Optional[List[LayerAnalysis]] = None
     jtokm_aux_stats:        Optional[list] = None
     attn_res_sources_final: Optional[list] = None
     logits:                 Optional[torch.Tensor] = None
 class ScalarMultiplier(nn.Module):
         first_layer_fan: Optional[torch.Tensor] = None,
         attn_analysis: Optional[AttentionAnalysis] = None,
         repo_rope_args: Optional[Tuple[torch.Tensor, float]] = None,
         **kwargs: Unpack[FlashAttentionKwargs],
     ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
         input_shape = hidden_states.shape[:-1]
             h_fan = self.lambda_1 * first_layer_fan + self.lambda_2 * h_fan
         current_layer_fan = h_fan.clone()
         query_shape = (*input_shape, self.config.num_attention_heads, self.head_dim)
         kv_shape    = (*input_shape, self.num_mea_component_heads, self.head_dim)
             attn_analysis.gate_raw = gate.detach()
         q = self.q_norm(q_raw.view(query_shape)).transpose(1, 2)
+        k = self.k_norm(self.k_proj(h_fan).view(kv_shape)).transpose(1, 2)
+        v = self.v_proj(h_fan).view(kv_shape).transpose(1, 2)
         if attn_analysis is not None:
             attn_analysis.q_post_norm = q.detach()
         return result
 class NeoLLMDecoderLayer(GradientCheckpointingLayer):
     """
     Decoder layer with standard residual connections, optional JTok-M injection.
             self.attn_res_query_attn = nn.Parameter(torch.zeros(config.hidden_size))
             self.attn_res_query_mlp  = nn.Parameter(torch.zeros(config.hidden_size))
             self.attn_res_norm       = nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         else:
             self.attn_res_query_attn = None
             self.attn_res_query_mlp  = None
             self.attn_res_norm       = None
     def _attn_res(
         self,
         B_vals: Optional[torch.Tensor] = None,
         attn_res_sources: Optional[list] = None,
         attn_res_partial: Optional[torch.Tensor] = None,
         layer_analysis: Optional[LayerAnalysis] = None,
         output_attentions: Optional[bool] = False,
         repo_rope_args: Optional[Tuple[torch.Tensor, float]] = None,
         if layer_analysis is not None:
             layer_analysis.hidden_states_input = hidden_states.detach()
         # ── Attention Residuals: compute pre-attention input ──────────────
         # When active, the input to the attention sublayer is no longer the
         # raw hidden_states (accumulated residual) but a softmax-weighted
                 attn_res_sources, attn_res_partial, self.attn_res_query_attn,
                 ar_analysis, "attn",
             )
             residual_attn = attn_res_partial
         else:
+            h_attn = hidden_states
+            residual_attn = hidden_states
         # ── Attention block ───────────────────────────────────────────────
         sn_pre = layer_analysis.seednorm_pre_attn if layer_analysis is not None else None
             first_layer_fan=first_layer_fan,
             attn_analysis=layer_analysis.attention if layer_analysis is not None else None,
             repo_rope_args=repo_rope_args,
             **kwargs,
         )
             layer_analysis.attn_contribution = hidden_states.detach()
         gpas_attn_a = layer_analysis.gpas_attn if layer_analysis is not None else None
+        h_tilde = self.gpas_attn(residual_attn + hidden_states, analysis=gpas_attn_a)
         if layer_analysis is not None:
             layer_analysis.h_tilde = h_tilde.detach()
         if layer_analysis is not None:
             layer_analysis.mlp_contribution = delta_m.detach()
+        # ── JTok-M injection (additive alongside MLP residual) ────────────
+        aux_stats = None
         if self.use_jtokm and z_tilde is not None and B_vals is not None:
             orig_shape = h_tilde.shape
             h_flat     = h_tilde.reshape(-1, self.hidden_size)
             delta_r, aux_stats = self.jtokm(h_flat, z_flat, B_flat, analysis=jtokm_a)
             delta_r = delta_r.reshape(orig_shape)
+            gpas_mlp_a    = layer_analysis.gpas_mlp if layer_analysis is not None else None
+            hidden_states = self.gpas_mlp(residual_mlp + delta_m + delta_r, analysis=gpas_mlp_a)
         else:
+            gpas_mlp_a    = layer_analysis.gpas_mlp if layer_analysis is not None else None
+            hidden_states = self.gpas_mlp(residual_mlp + delta_m, analysis=gpas_mlp_a)
         if layer_analysis is not None:
             layer_analysis.hidden_states_output = hidden_states.detach()
             outputs += (aux_stats,)
         if versatile_aux is not None:
             outputs += (versatile_aux,)
         return outputs
                 module.attn_res_query_attn.data.zero_()
                 module.attn_res_query_mlp.data.zero_()
         elif isinstance(module, SpellingBeeEmbedding):
             # byte_emb initialised identically to token embeddings: std=1/√d.
             # Ensures E[‖e_byte‖²] ≈ 1 at init, matching etok, so the
         self.gradient_checkpointing = False
         self.first_layer_fan        = None
         self.post_init()
     def get_input_embeddings(self):
             getattr(cfg, "use_repo", False)
             and layer_idx >= getattr(cfg, "repo_start_layer", cfg.num_hidden_layers // 3)
         )
+        _versatile = getattr(cfg, "use_versatile_ffn", False)
         return LayerAnalysis(
             seednorm_pre_attn  = SeeDNormAnalysis(),
             seednorm_post_attn = SeeDNormAnalysis(),
                 polynorm = PolyNormAnalysis() if not _versatile else None,
                 versatile = VersatileFFNAnalysis() if _versatile else None,
             ),
+            gpas_attn = GPASAnalysis(),
+            gpas_mlp  = GPASAnalysis(),
+            jtokm     = JTokMAnalysis() if cfg.use_jtokm else None,
+            attn_res  = AttnResAnalysis() if getattr(cfg, "use_attn_res", False) else None,
         )
     def forward(
         if use_attn_res:
             attn_res_sources = [hidden_states]   # b_0 = token embedding
             attn_res_partial = hidden_states     # initial partial sum
+            num_blocks  = getattr(self.config, 'attn_res_num_blocks', 0)
+            block_size  = (
+                max(self.config.num_hidden_layers // num_blocks, 1)
+                if num_blocks > 0
+                else 1   # Full AttnRes: every layer is its own "block"
+            )
         # Pre-allocate per-layer analysis list when analysis is active
         if analysis_state is not None:
             if output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
+            # ── Block AttnRes: boundary handling ──────────────────────────
+            # At each block boundary (excluding layer 0): append the current
+            # partial sum to sources as a completed block summary, then re-seed
+            # partial with the current hidden_states (the code below does not
+            # set it to None) so the new block starts from the latest output.
+            # NOTE(review): intended to match the paper's pseudocode — confirm
+            # that re-seeding instead of resetting to None is the desired form.
+            # For Full AttnRes (block_size=1): every layer after the first is a
+            # boundary, so partial is appended and re-seeded at the start of
+            # each of those layers.
+            if use_attn_res and layer_idx > 0 and layer_idx % block_size == 0:
+                attn_res_sources = attn_res_sources + [attn_res_partial]
+                attn_res_partial = hidden_states  # start new block from current output
             # Build per-layer analysis container (only in eval + analysis mode)
             layer_analysis = None
                 B_vals=B_vals,
                 attn_res_sources=attn_res_sources,
                 attn_res_partial=attn_res_partial if use_attn_res else None,
                 layer_analysis=layer_analysis,
                 output_attentions=output_attentions,
                 repo_rope_args=repo_rope_args,
             )
             hidden_states = layer_outputs[0]
             # Update AttnRes partial sum — the new partial is the layer output
             if use_attn_res:
                 attn_res_partial = hidden_states
             if output_attentions:
                 all_attentions = all_attentions + (layer_outputs[1],)
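+            # Collect JTok-M aux stats (read from the last element of layer_outputs).
+            # NOTE(review): the decoder layer appends versatile_aux after aux_stats,
+            # so when VersatileFFN also emits stats the last element may not be the
+            # JTok-M entry — confirm the two options are never active together.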
+            if self.config.use_jtokm and len(layer_outputs) > (2 if output_attentions else 1):
+                all_aux_stats.append(layer_outputs[-1])
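+            # Collect VersatileFFN aux stats. The entry is located by scanning
+            # layer_outputs[1:] for the first 3-tuple rather than by position
+            # (the decoder layer appends it after the JTok-M aux stats, so it is
+            # the last element when present). Only non-None during training.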
             if getattr(self.config, "use_versatile_ffn", False):
+                for item in layer_outputs[1:]:
                     if isinstance(item, tuple) and len(item) == 3:
+                        # (p_sum, f_sum, N_tokens) signature
                         all_aux_stats.append(("versatile", item))
                         break
                     and hasattr(decoder_layer, "current_layer_fan")):
                 self.first_layer_fan = decoder_layer.current_layer_fan
         hidden_states = self.norm(hidden_states)
         if output_hidden_states:
             analysis_state.attn_res_sources_final  = (
                 attn_res_sources if use_attn_res else None
             )
         if not return_dict:
             return tuple(
             layers                 = None,   # filled by NeoLLMModel.forward
             jtokm_aux_stats        = [] if cfg.use_jtokm else None,
             attn_res_sources_final = [] if getattr(cfg, "use_attn_res", False) else None,
         )
     # ── Standard model API ────────────────────────────────────────────────
 # ==================== AUTOMODEL REGISTRATION ====================
 __all__ = [
     "NeoLLMForCausalLM",
     "NeoLLMModel",
     "NeoLLMPreTrainedModel",
     "REPOModule",
     "VersatileFFN",
     "compute_versatile_aux_loss",
+    # Analysis dataclasses — exported so external tools can type-hint against them
     "AnalysisState",
     "LayerAnalysis",
     "AttentionAnalysis",
     "VersatileFFNAnalysis",
     "JTokMAnalysis",
     "AttnResAnalysis",
     "GeneratorAnalysis",
 ]