KitsuVp
/

NeoLLM

@@ -81,30 +81,6 @@ from configuration_neollm import NeoLLMConfig
 from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
-# ── Optional fast-path dependencies (GatedDeltaNet linear attention) ─────────
-try:
-    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update as _causal_conv1d_update
-    _causal_conv1d_available = True
-except ImportError:
-    causal_conv1d_fn = None
-    _causal_conv1d_update = None
-    _causal_conv1d_available = False
-try:
-    from fla.modules import FusedRMSNormGated
-    from fla.ops.gated_delta_rule import (
-        chunk_gated_delta_rule,
-        fused_recurrent_gated_delta_rule,
-    )
-    _fla_available = True
-except ImportError:
-    FusedRMSNormGated = None
-    chunk_gated_delta_rule = None
-    fused_recurrent_gated_delta_rule = None
-    _fla_available = False
-is_linear_attn_fast_path = _causal_conv1d_available and _fla_available
 logger = logging.get_logger(__name__)
@@ -428,6 +404,40 @@ class StackMemoryAnalysis:
     residual_scale: Optional[float]        = None  # res_weight scalar
 @dataclass
 class LayerAnalysis:
     """
@@ -460,8 +470,8 @@ class LayerAnalysis:
     attn_res:    Optional[AttnResAnalysis]     = None  # if use_attn_res
     dca:         Optional[DCAAnalysis]         = None  # if use_dca
     stack:       Optional[StackMemoryAnalysis] = None  # if use_stacktrans
-    laurel_attn: Optional["LAuReLAnalysis"]    = None  # if use_laurel (attention residual)
-    laurel_mlp:  Optional["LAuReLAnalysis"]    = None  # if use_laurel (MLP residual)
 @dataclass
@@ -3685,39 +3695,6 @@ class StackMemory(nn.Module):
 @dataclass
-class LAuReLAnalysis:
-    """
-    Internals of one LAuReL residual connection forward pass.
-    Only populated when use_laurel=True AND model is in eval + analysis mode.
-    Instantiated twice per layer: once for the attention residual, once for MLP.
-    Reference: Menghani, G., Kumar, R. & Kumar, S. (ICML 2025).
-    *LAuReL: Learned Augmented Residual Layer.* arXiv:2411.07501.
-    Math (combined RW+LR, both sub-variants active):
-        x_{i+1} = α · f(x_i) + β · (A·(B·x_i) + x_i)
-    where [α, β] = softmax([a, b]), a,b ∈ ℝ learnable (RW component),
-    B ∈ ℝ^{r×D} column-orthogonal init, A ∈ ℝ^{D×r} zero init (LR component).
-    At step 0: A=0 → lr_term=0, so x_{i+1} = 0.5·f(x) + 0.5·x_i (RW only)
-    or x_{i+1} = f(x_i) + x_i (LR only, standard residual).
-    Fields:
-        alpha_rw:    softmax(a) — weight on f(x_i). [scalar float]
-                     None when use_laurel_rw=False.
-        beta_rw:     softmax(b) — weight on g(x_i). [scalar float]
-                     None when use_laurel_rw=False.
-        lr_term:     A·(B·x_res) — the low-rank residual augmentation.
-                     Shape [B, S, D]. Zero at init. None when use_laurel_lr=False.
-        output:      Final combined tensor before GPAS. Shape [B, S, D].
-    """
-    alpha_rw: Optional[float]        = None  # softmax weight on f(x)
-    beta_rw:  Optional[float]        = None  # softmax weight on g(x)
-    lr_term:  Optional[torch.Tensor] = None  # A(Bx) low-rank augmentation [B,S,D]
-    output:   Optional[torch.Tensor] = None  # combined pre-GPAS [B,S,D]
 class LAuReLLayer(nn.Module):
     """
     LAuReL: Learned Augmented Residual Layer.
@@ -3836,358 +3813,6 @@ class LAuReLLayer(nn.Module):
         return out
-# ==================== GATED DELTA NET (LINEAR ATTENTION) ====================
-# Active when use_linear_attention=True. Replaces NeoLLMAttention every
-# `linear_attention_every_n` layers (pattern 0-indexed: layers 2, 5, 8 …).
-#
-# References:
-#   Yang et al. (2024). "Gated Delta Networks." arXiv:2412.06464.
-#   Li et al. (2026). "REPO." arXiv:2512.14391.
-def _apply_mask_to_padding_states(
-    hidden_states: torch.Tensor,
-    attention_mask: Optional[torch.Tensor],
-) -> torch.Tensor:
-    if (
-        attention_mask is not None
-        and attention_mask.shape[1] > 1
-        and attention_mask.shape[0] > 1
-    ):
-        hidden_states = (
-            hidden_states * attention_mask[:, :, None]
-        ).to(hidden_states.dtype)
-    return hidden_states
-def _l2norm(x: torch.Tensor, dim: int = -1, eps: float = 1e-6) -> torch.Tensor:
-    return x / torch.sqrt((x * x).sum(dim=dim, keepdim=True) + eps)
-def _torch_causal_conv1d_update(
-    hidden_states, conv_state, weight, bias=None, activation=None
-):
-    _, hidden_size, seq_len = hidden_states.shape
-    state_len = conv_state.shape[-1]
-    combined  = torch.cat([conv_state, hidden_states], dim=-1).to(weight.dtype)
-    conv_state.copy_(combined[:, :, -state_len:])
-    out = F.conv1d(combined, weight.unsqueeze(1), bias, padding=0, groups=hidden_size)
-    return F.silu(out[:, :, -seq_len:]).to(hidden_states.dtype)
-def _torch_chunk_gated_delta_rule(
-    query, key, value, g, beta,
-    chunk_size=64, initial_state=None, output_final_state=False,
-    use_qk_l2norm_in_kernel=False,
-):
-    initial_dtype = query.dtype
-    if use_qk_l2norm_in_kernel:
-        query, key = _l2norm(query), _l2norm(key)
-    query, key, value, beta, g = [
-        x.transpose(1, 2).contiguous().to(torch.float32)
-        for x in (query, key, value, beta, g)
-    ]
-    bs, seq, nh, kdim = key.shape
-    vdim = value.shape[-1]
-    pad  = (chunk_size - nh % chunk_size) % chunk_size
-    for t in (query, key, value):
-        t = F.pad(t, (0, 0, 0, pad))
-    query  = F.pad(query,  (0, 0, 0, pad))
-    key    = F.pad(key,    (0, 0, 0, pad))
-    value  = F.pad(value,  (0, 0, 0, pad))
-    beta   = F.pad(beta,   (0, pad))
-    g      = F.pad(g,      (0, pad))
-    tot    = nh + pad
-    scale  = query.shape[-1] ** -0.5
-    query  = query * scale
-    vb     = value * beta.unsqueeze(-1)
-    kb     = key   * beta.unsqueeze(-1)
-    query, key, value, kb, vb = [
-        x.reshape(x.shape[0], x.shape[1], -1, chunk_size, x.shape[-1])
-        for x in (query, key, value, kb, vb)
-    ]
-    g    = g.reshape(g.shape[0], g.shape[1], -1, chunk_size)
-    triu = torch.triu(torch.ones(chunk_size, chunk_size, dtype=torch.bool, device=query.device), 0)
-    g    = g.cumsum(-1)
-    dm   = ((g.unsqueeze(-1) - g.unsqueeze(-2)).tril().exp()).tril()
-    attn = -((kb @ key.transpose(-1, -2)) * dm).masked_fill(triu, 0)
-    for i in range(1, chunk_size):
-        r = attn[..., i, :i].clone(); s = attn[..., :i, :i].clone()
-        attn[..., i, :i] = r + (r.unsqueeze(-1) * s).sum(-2)
-    eye   = torch.eye(chunk_size, dtype=attn.dtype, device=attn.device)
-    attn  = attn + eye
-    value = attn @ vb
-    kcd   = attn @ (kb * g.exp().unsqueeze(-1))
-    st    = torch.zeros(bs, seq, kdim, vdim, dtype=value.dtype, device=value.device) if initial_state is None else initial_state.to(value)
-    out   = torch.zeros_like(value)
-    triu2 = torch.triu(torch.ones(chunk_size, chunk_size, dtype=torch.bool, device=query.device), 1)
-    for i in range(tot // chunk_size):
-        qi, ki, vi = query[:,:,i], key[:,:,i], value[:,:,i]
-        a   = (qi @ ki.transpose(-1,-2) * dm[:,:,i]).masked_fill_(triu2, 0)
-        vp  = kcd[:,:,i] @ st
-        vn  = vi - vp
-        out[:,:,i] = (qi * g[:,:,i,:,None].exp()) @ st + a @ vn
-        st  = st * g[:,:,i,-1,None,None].exp() + (ki * (g[:,:,i,-1,None]-g[:,:,i]).exp()[...,None]).transpose(-1,-2) @ vn
-    if not output_final_state: st = None
-    out = out.reshape(out.shape[0], out.shape[1], -1, out.shape[-1])[:,:,:nh]
-    return out.transpose(1,2).contiguous().to(initial_dtype), st
-def _torch_recurrent_gated_delta_rule(
-    query, key, value, g, beta, initial_state, output_final_state,
-    use_qk_l2norm_in_kernel=False,
-):
-    initial_dtype = query.dtype
-    if use_qk_l2norm_in_kernel:
-        query, key = _l2norm(query), _l2norm(key)
-    query, key, value, beta, g = [
-        x.transpose(1,2).contiguous().to(torch.float32)
-        for x in (query, key, value, beta, g)
-    ]
-    bs, seq, nh, kdim = key.shape
-    vdim   = value.shape[-1]
-    query  = query * (query.shape[-1] ** -0.5)
-    out    = torch.zeros(bs, seq, nh, vdim, dtype=value.dtype, device=value.device)
-    st     = torch.zeros(bs, seq, kdim, vdim, dtype=value.dtype, device=value.device) if initial_state is None else initial_state.to(value)
-    for i in range(nh):
-        qt, kt, vt = query[:,:,i], key[:,:,i], value[:,:,i]
-        gt, bt     = g[:,:,i].exp().unsqueeze(-1).unsqueeze(-1), beta[:,:,i].unsqueeze(-1)
-        st         = st * gt
-        delta       = (vt - (st * kt.unsqueeze(-1)).sum(-2)) * bt
-        st         = st + kt.unsqueeze(-1) * delta.unsqueeze(-2)
-        out[:,:,i] = (st * qt.unsqueeze(-1)).sum(-2)
-    if not output_final_state: st = None
-    return out.transpose(1,2).contiguous().to(initial_dtype), st
-class _NeoLLMRMSNormGated(nn.Module):
-    """Gated RMSNorm fallback when FLA unavailable."""
-    def __init__(self, hidden_size, eps=1e-6, **kwargs):
-        super().__init__()
-        self.weight = nn.Parameter(torch.ones(hidden_size))
-        self.eps    = eps
-    def forward(self, x, gate):
-        dtype = x.dtype
-        x     = x.float()
-        x     = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
-        return (self.weight * x.to(dtype) * F.silu(gate.float())).to(dtype)
-class NeoLLMGatedDeltaNet(nn.Module):
-    """
-    GatedDeltaNet linear attention with FANformer integration.
-    Replaces NeoLLMAttention on every ``linear_attention_every_n``-th layer
-    (0-indexed: layers 2, 5, 8 … for every_n=3).
-    REPO (use_repo=True AND use_repo_in_linear_attn=True):
-        Applies continuous per-head positions to Q and K via _apply_repo_rope,
-        matching the full-attention REPO path identically.
-    Without REPO the gated delta rule operates without explicit positional
-    encoding (its recurrent state is implicitly position-aware).
-    References:
-        Yang et al. (2024). arXiv:2412.06464.
-        Li et al. (2026). arXiv:2512.14391.
-    """
-    def __init__(self, config: NeoLLMConfig, layer_idx: int):
-        super().__init__()
-        self.hidden_size      = config.hidden_size
-        self.num_v_heads      = config.linear_num_value_heads
-        self.num_k_heads      = config.linear_num_key_heads
-        self.head_k_dim       = config.linear_key_head_dim
-        self.head_v_dim       = config.linear_value_head_dim
-        self.key_dim          = self.head_k_dim * self.num_k_heads
-        self.value_dim        = self.head_v_dim * self.num_v_heads
-        self.conv_kernel_size = config.linear_conv_kernel_dim
-        self.layer_idx        = layer_idx
-        # ── FANformer (same ratio as full-attention layers) ────────────────
-        self.fan_layer = FANLayer(
-            hidden_size=config.hidden_size,
-            fan_ratio=getattr(config, "fan_ratio", 0.125),
-        )
-        _fan_dim = config.hidden_size + int(
-            config.hidden_size * getattr(config, "fan_ratio", 0.125)
-        )
-        # ── Causal conv1d on concatenated Q/K/V ──────────────────────────
-        self.conv_dim = self.key_dim * 2 + self.value_dim
-        self.conv1d   = nn.Conv1d(
-            self.conv_dim, self.conv_dim, bias=False,
-            kernel_size=self.conv_kernel_size,
-            groups=self.conv_dim,
-            padding=self.conv_kernel_size - 1,
-        )
-        # ── QKVz + ba projections (all from FAN-transformed features) ─────
-        _ratio = self.num_v_heads // self.num_k_heads
-        self.in_proj_qkvz = nn.Linear(
-            _fan_dim, self.key_dim * 2 + self.value_dim * 2, bias=False
-        )
-        self.in_proj_ba = nn.Linear(
-            _fan_dim, self.num_v_heads * 2, bias=False
-        )
-        # ── Delta-rule gating parameters ──────────────────────────────────
-        self.dt_bias = nn.Parameter(torch.ones(self.num_v_heads))
-        A            = torch.empty(self.num_v_heads).uniform_(0, 16)
-        self.A_log   = nn.Parameter(torch.log(A))
-        # ── Output normalisation ───────────────────────────────────────────
-        _NormCls = FusedRMSNormGated if FusedRMSNormGated is not None else _NeoLLMRMSNormGated
-        _norm_kw = (
-            dict(activation="silu",
-                 device=torch.cuda.current_device(),
-                 dtype=getattr(config, "dtype", None) or torch.get_default_dtype())
-            if FusedRMSNormGated is not None else {}
-        )
-        self.norm     = _NormCls(self.head_v_dim, eps=config.rms_norm_eps, **_norm_kw)
-        self.out_proj = nn.Linear(self.value_dim, self.hidden_size, bias=False)
-        self.dropout  = nn.Dropout(config.dropout_rate)
-        # ── Kernel dispatch (fast → fallback) ─────────────────────────────
-        self._conv1d_fn = causal_conv1d_fn          # None if not installed
-        self._chunk_fn  = (chunk_gated_delta_rule
-                           if chunk_gated_delta_rule is not None
-                           else _torch_chunk_gated_delta_rule)
-        self._recur_fn  = (fused_recurrent_gated_delta_rule
-                           if fused_recurrent_gated_delta_rule is not None
-                           else _torch_recurrent_gated_delta_rule)
-        if not is_linear_attn_fast_path:
-            logger.warning_once(
-                "NeoLLMGatedDeltaNet: causal_conv1d / flash-linear-attention "
-                "not installed — using pure-PyTorch fallbacks. "
-                "Install both packages for full performance."
-            )
-        # ── REPO: continuous per-head positions on Q and K ─────────────────
-        # Controlled by use_repo AND use_repo_in_linear_attn flags.
-        # Only active for layers at or above repo_start_layer.
-        self.use_repo = (
-            getattr(config, "use_repo", False)
-            and getattr(config, "use_repo_in_linear_attn", False)
-            and layer_idx >= getattr(config, "repo_start_layer",
-                                     config.num_hidden_layers // 3)
-        )
-        if self.use_repo:
-            _d_p = getattr(config, "repo_d_p", config.hidden_size // 8)
-            self.repo_module = REPOModule(
-                hidden_size=config.hidden_size,
-                d_p=_d_p,
-                num_heads=self.num_v_heads,
-            )
-        else:
-            self.repo_module = None
-    def _fix_qkvz(
-        self,
-        mixed_qkvz: torch.Tensor,
-        mixed_ba:   torch.Tensor,
-    ) -> Tuple[torch.Tensor, ...]:
-        """Split fused projection into (q, k, v, z, b, a)."""
-        ratio = self.num_v_heads // self.num_k_heads
-        mixed_qkvz = mixed_qkvz.view(
-            *mixed_qkvz.shape[:-1],
-            self.num_k_heads,
-            2 * self.head_k_dim + 2 * ratio * self.head_v_dim,
-        )
-        mixed_ba = mixed_ba.view(
-            *mixed_ba.shape[:-1],
-            self.num_k_heads,
-            2 * ratio,
-        )
-        q, k, v, z = torch.split(
-            mixed_qkvz,
-            [self.head_k_dim, self.head_k_dim,
-             ratio * self.head_v_dim, ratio * self.head_v_dim],
-            dim=3,
-        )
-        b, a = torch.split(mixed_ba, ratio, dim=3)
-        v = v.reshape(v.shape[0], v.shape[1], -1, self.head_v_dim)
-        z = z.reshape(z.shape[0], z.shape[1], -1, self.head_v_dim)
-        b = b.reshape(b.shape[0], b.shape[1], self.num_v_heads)
-        a = a.reshape(a.shape[0], a.shape[1], self.num_v_heads)
-        return q, k, v, z, b, a
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-        repo_rope_args: Optional[Tuple[torch.Tensor, float]] = None,
-    ) -> torch.Tensor:
-        hidden_states = _apply_mask_to_padding_states(hidden_states, attention_mask)
-        B, S, _       = hidden_states.shape
-        # ── FANformer ─────────────────────────────────────────────────────
-        h_fan = self.fan_layer(hidden_states)
-        # ── QKVz and ba projections ───────────────────────────────────────
-        q, k, v, z, b, a = self._fix_qkvz(
-            self.in_proj_qkvz(h_fan), self.in_proj_ba(h_fan)
-        )
-        # ── Causal conv1d on flattened Q/K/V ─────────────────────────────
-        qkv = torch.cat(
-            [q.reshape(B, S, -1), k.reshape(B, S, -1), v.reshape(B, S, -1)], dim=-1
-        ).transpose(1, 2)                                        # [B, conv_dim, S]
-        if self._conv1d_fn is not None:
-            qkv = self._conv1d_fn(
-                x=qkv, weight=self.conv1d.weight.squeeze(1),
-                bias=self.conv1d.bias, activation="silu", seq_idx=None,
-            )
-        else:
-            qkv = F.silu(self.conv1d(qkv)[:, :, :S])
-        qkv = qkv.transpose(1, 2)                               # [B, S, conv_dim]
-        q_f, k_f, v_f = torch.split(
-            qkv, [self.key_dim, self.key_dim, self.value_dim], dim=-1
-        )
-        q = q_f.reshape(B, S, -1, self.head_k_dim)
-        k = k_f.reshape(B, S, -1, self.head_k_dim)
-        v = v_f.reshape(B, S, -1, self.head_v_dim)
-        # ── REPO: continuous per-head positions ───────────────────────────
-        # Transpose to [B, H, S, dk] for _apply_repo_rope, then back.
-        if self.use_repo and self.repo_module is not None and repo_rope_args is not None:
-            inv_freq, attn_scaling = repo_rope_args
-            z_pos  = self.repo_module(hidden_states)             # [B, H, S]
-            q_t, k_t = q.transpose(1, 2), k.transpose(1, 2)
-            q_t, k_t = _apply_repo_rope(q_t, k_t, z_pos, inv_freq, attn_scaling)
-            q, k   = q_t.transpose(1, 2), k_t.transpose(1, 2)
-        # ── GQA-like head expansion ────────────────────────────────────────
-        ratio = self.num_v_heads // self.num_k_heads
-        if ratio > 1:
-            q = q.repeat_interleave(ratio, dim=2)
-            k = k.repeat_interleave(ratio, dim=2)
-        beta = b.sigmoid()
-        g    = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias)
-        # ── Chunk gated delta rule (fused or fallback) ────────────────────
-        core_out, _ = self._chunk_fn(
-            q, k, v, g=g, beta=beta,
-            initial_state=None, output_final_state=False,
-            use_qk_l2norm_in_kernel=True,
-        )
-        # ── Gated RMSNorm + output projection ─────────────────────────────
-        z_shape  = z.shape
-        core_out = core_out.reshape(-1, core_out.shape[-1])
-        core_out = self.norm(core_out, z.reshape(-1, z.shape[-1]))
-        core_out = core_out.reshape(z_shape).reshape(B, S, -1)
-        return self.dropout(self.out_proj(core_out))
-# ==================== DECODER LAYER ==========================================
 class NeoLLMDecoderLayer(GradientCheckpointingLayer):
     """
     Decoder layer with standard residual connections, optional JTok-M injection.
@@ -4210,23 +3835,7 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         self.layer_idx     = layer_idx
         self.use_jtokm     = config.use_jtokm
-        # ── Token-mixer selection ─────────────────────────────────────────
-        # use_linear_attention=True: replace full attention every
-        # `linear_attention_every_n` layers (0-indexed pattern:
-        #   e.g. every_n=3 → layers 2, 5, 8, 11 …).
-        # All other layers keep NeoLLMAttention unchanged.
-        _every_n = getattr(config, "linear_attention_every_n", 3)
-        self.is_linear_attn = (
-            getattr(config, "use_linear_attention", False)
-            and (layer_idx + 1) % _every_n == 0
-        )
-        if self.is_linear_attn:
-            self.linear_attn = NeoLLMGatedDeltaNet(config, layer_idx)
-            self.self_attn   = None
-        else:
-            self.self_attn   = NeoLLMAttention(config, layer_idx)
-            self.linear_attn = None
         self.mlp                      = (
             VersatileFFN(config)
             if getattr(config, "use_versatile_ffn", False)
@@ -4484,32 +4093,17 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         if layer_analysis is not None:
             layer_analysis.lns_attn_output = h_lns.detach()
-        if self.is_linear_attn:
-            # ── GatedDeltaNet linear attention path ───────────────────────
-            # Does not use: first_layer_fan, mudd_xk/xv, attn_analysis.
-            # attention_mask here is already the linear_attn_mask (no causal
-            # bias, just padding) — NeoLLMModel.forward selects it per layer.
-            hidden_states = self.linear_attn(
-                hidden_states=h_lns,
-                attention_mask=attention_mask,
-                position_embeddings=position_embeddings,
-                repo_rope_args=repo_rope_args,
-            )
-            attn_weights          = None
-            self.current_layer_fan = None
-        else:
-            # ── Standard full attention path ──────────────────────────────
-            hidden_states, attn_weights, self.current_layer_fan = self.self_attn(
-                hidden_states=h_lns,
-                attention_mask=attention_mask,
-                position_embeddings=position_embeddings,
-                first_layer_fan=first_layer_fan,
-                attn_analysis=layer_analysis.attention if layer_analysis is not None else None,
-                repo_rope_args=repo_rope_args,
-                mudd_xk=mudd_xk,
-                mudd_xv=mudd_xv,
-                **kwargs,
-            )
         if layer_analysis is not None:
             layer_analysis.attn_contribution = hidden_states.detach()
@@ -4933,9 +4527,6 @@ class NeoLLMPreTrainedModel(PreTrainedModel):
             if hasattr(module, "alpha_ma"):
                 module.alpha_ma.zero_()
-        elif isinstance(module, NeoLLMGatedDeltaNet):
-            module.dt_bias.data.fill_(1.0)
-            module.A_log.data.uniform_(0, 16).log_()
         elif isinstance(module, GPAS):
             module.alpha.data.fill_(0.0)
@@ -5178,21 +4769,6 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
         self.post_init()
-    def _update_linear_attn_mask(
-        self,
-        attention_mask: Optional[torch.Tensor],
-    ) -> Optional[torch.Tensor]:
-        """
-        Return mask for GatedDeltaNet layers (no causal bias, padding only).
-        Returns None when all tokens are valid (GatedDeltaNet handles via
-        _apply_mask_to_padding_states internally).
-        """
-        if attention_mask is None:
-            return None
-        if torch.all(attention_mask == 1):
-            return None
-        return attention_mask
     def get_input_embeddings(self):
         if self.config.use_token_generator:
             return self.token_generator
@@ -5343,10 +4919,6 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
             if getattr(self.config, "use_repo", False) else None
         )
-        # ── Linear attention mask ──────────────────────────────────────────
-        # Computed once; each layer picks the appropriate mask below.
-        linear_attn_mask = self._update_linear_attn_mask(attention_mask)
         # ── Attention Residuals state ──────────────────────────────────────
         # Full AttnRes (attn_res_num_blocks=0): sources grows by one entry per
         # decoder layer — all previous outputs are kept, max N=num_layers+1.
@@ -5436,17 +5008,10 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
                 layer_analysis.layer_idx = layer_idx
                 analysis_state.layers.append(layer_analysis)
-            # Select the appropriate mask: causal for full attention,
-            # padding-only for GatedDeltaNet linear attention.
-            _layer_mask = (
-                linear_attn_mask
-                if getattr(decoder_layer, "is_linear_attn", False)
-                else causal_mask
-            )
             layer_outputs = decoder_layer(
                 hidden_states,
                 position_embeddings=position_embeddings,
-                attention_mask=_layer_mask,
                 first_layer_fan=self.first_layer_fan,
                 z_tilde=z_tilde,
                 B_vals=B_vals,
@@ -5843,7 +5408,6 @@ class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
 # ==================== AUTOMODEL REGISTRATION ====================
 __all__ = [
-    "NeoLLMGatedDeltaNet",
     "StackMemory",
     "LAuReLLayer",
     "NeoLLMMUDDModule",

 from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
 logger = logging.get_logger(__name__)
     residual_scale: Optional[float]        = None  # res_weight scalar
+@dataclass
+class LAuReLAnalysis:
+    """
+    Internals of one LAuReL residual connection forward pass.
+    Only populated when use_laurel=True AND model is in eval + analysis mode.
+    Instantiated twice per layer: once for the attention residual, once for MLP.
+    Reference: Menghani, G., Kumar, R. & Kumar, S. (ICML 2025).
+    *LAuReL: Learned Augmented Residual Layer.* arXiv:2411.07501.
+    Math (combined RW+LR, both sub-variants active):
+        x_{i+1} = α · f(x_i) + β · (A·(B·x_i) + x_i)
+    where [α, β] = softmax([a, b]), a,b ∈ ℝ learnable (RW component),
+    B ∈ ℝ^{r×D} column-orthogonal init, A ∈ ℝ^{D×r} zero init (LR component).
+    At step 0: A=0 → lr_term=0, so x_{i+1} = 0.5·f(x) + 0.5·x_i (RW only)
+    or x_{i+1} = f(x_i) + x_i (LR only, standard residual).
+    Fields:
+        alpha_rw:    softmax(a) — weight on f(x_i). [scalar float]
+                     None when use_laurel_rw=False.
+        beta_rw:     softmax(b) — weight on g(x_i). [scalar float]
+                     None when use_laurel_rw=False.
+        lr_term:     A·(B·x_res) — the low-rank residual augmentation.
+                     Shape [B, S, D]. Zero at init. None when use_laurel_lr=False.
+        output:      Final combined tensor before GPAS. Shape [B, S, D].
+    """
+    alpha_rw: Optional[float]        = None  # softmax weight on f(x)
+    beta_rw:  Optional[float]        = None  # softmax weight on g(x)
+    lr_term:  Optional[torch.Tensor] = None  # A(Bx) low-rank augmentation [B,S,D]
+    output:   Optional[torch.Tensor] = None  # combined pre-GPAS [B,S,D]
 @dataclass
 class LayerAnalysis:
     """
     attn_res:    Optional[AttnResAnalysis]     = None  # if use_attn_res
     dca:         Optional[DCAAnalysis]         = None  # if use_dca
     stack:       Optional[StackMemoryAnalysis] = None  # if use_stacktrans
+    laurel_attn: Optional[LAuReLAnalysis]      = None  # if use_laurel (attention residual)
+    laurel_mlp:  Optional[LAuReLAnalysis]      = None  # if use_laurel (MLP residual)
 @dataclass
 @dataclass
 class LAuReLLayer(nn.Module):
     """
     LAuReL: Learned Augmented Residual Layer.
         return out
 class NeoLLMDecoderLayer(GradientCheckpointingLayer):
     """
     Decoder layer with standard residual connections, optional JTok-M injection.
         self.layer_idx     = layer_idx
         self.use_jtokm     = config.use_jtokm
+        self.self_attn                = NeoLLMAttention(config, layer_idx)
         self.mlp                      = (
             VersatileFFN(config)
             if getattr(config, "use_versatile_ffn", False)
         if layer_analysis is not None:
             layer_analysis.lns_attn_output = h_lns.detach()
+        hidden_states, attn_weights, self.current_layer_fan = self.self_attn(
+            hidden_states=h_lns,
+            attention_mask=attention_mask,
+            position_embeddings=position_embeddings,
+            first_layer_fan=first_layer_fan,
+            attn_analysis=layer_analysis.attention if layer_analysis is not None else None,
+            repo_rope_args=repo_rope_args,
+            mudd_xk=mudd_xk,
+            mudd_xv=mudd_xv,
+            **kwargs,
+        )
         if layer_analysis is not None:
             layer_analysis.attn_contribution = hidden_states.detach()
             if hasattr(module, "alpha_ma"):
                 module.alpha_ma.zero_()
         elif isinstance(module, GPAS):
             module.alpha.data.fill_(0.0)
         self.post_init()
     def get_input_embeddings(self):
         if self.config.use_token_generator:
             return self.token_generator
             if getattr(self.config, "use_repo", False) else None
         )
         # ── Attention Residuals state ──────────────────────────────────────
         # Full AttnRes (attn_res_num_blocks=0): sources grows by one entry per
         # decoder layer — all previous outputs are kept, max N=num_layers+1.
                 layer_analysis.layer_idx = layer_idx
                 analysis_state.layers.append(layer_analysis)
             layer_outputs = decoder_layer(
                 hidden_states,
                 position_embeddings=position_embeddings,
+                attention_mask=causal_mask,
                 first_layer_fan=self.first_layer_fan,
                 z_tilde=z_tilde,
                 B_vals=B_vals,
 # ==================== AUTOMODEL REGISTRATION ====================
 __all__ = [
     "StackMemory",
     "LAuReLLayer",
     "NeoLLMMUDDModule",