KitsuVp
/

NeoLLM

@@ -5458,7 +5458,13 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         self.hidden_size   = config.hidden_size
         self.layer_idx     = layer_idx
         self.use_jtokm     = config.use_jtokm
-        self.use_seednorm  = bool(getattr(config, "use_seednorm", True))
         # Controls only the first pre-attention normalisation applied directly
         # to the embedding stream. Defaults to True for checkpoint/config
         # backward compatibility.  When False, layer 0 does not instantiate
@@ -5467,7 +5473,7 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         self.use_embedding_input_norm = bool(
             getattr(config, "use_embedding_input_norm", True)
         )
-        self.has_input_layernorm = not (
             self.layer_idx == 0 and not self.use_embedding_input_norm
         )
@@ -5478,24 +5484,55 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
             else NeoLLMMLP(config)
         )
         self.use_versatile_ffn        = getattr(config, "use_versatile_ffn", False)
-        self.input_layernorm          = (
-            _make_norm(
                 config.hidden_size,
                 eps=config.rms_norm_eps,
                 use_seednorm=self.use_seednorm,
             )
-            if self.has_input_layernorm
-            else None
-        )
-        self.post_attention_layernorm = _make_norm(
-            config.hidden_size,
-            eps=config.rms_norm_eps,
-            use_seednorm=self.use_seednorm,
-        )
-        self.lns_attn                 = LNS(layer_idx)
-        self.lns_mlp                  = LNS(layer_idx)
-        self.gpas_attn                = GPAS(config.hidden_size)
-        self.gpas_mlp                 = GPAS(config.hidden_size)
         self.current_layer_fan        = None
         # ── StackMemory / STACKTRANS (Zhang et al., NeurIPS 2025) ────────
@@ -5750,6 +5787,129 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
             # LAUREL-LR (paper eq. 3): f(x) + BAx + x
             return delta + lr_delta + residual
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -5830,12 +5990,16 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
                 k_normed = _apply_norm(self.input_layernorm, dca_k_input, analysis=None)
                 v_normed = _apply_norm(self.input_layernorm, dca_v_input, analysis=None)
-        h_lns    = self.lns_attn(h_normed)
         if dca_k_input is not None:
-            k_lns = self.lns_attn(k_normed)
-            v_lns = self.lns_attn(v_normed)
             dca_key_value_states = (k_lns, v_lns)
-        if layer_analysis is not None:
             layer_analysis.lns_attn_output = h_lns.detach()
         hidden_states, attn_weights, self.current_layer_fan = self.self_attn(
@@ -5870,7 +6034,11 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         else:
             attn_aug = residual_attn + hidden_states
             dca_final_residual = hidden_states if self.use_dca else None
-        h_tilde = self.gpas_attn(attn_aug, analysis=gpas_attn_a)
         if layer_analysis is not None:
             layer_analysis.h_tilde = h_tilde.detach()
@@ -5892,8 +6060,8 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         # ── MLP block ─────────────────────────────────────────────────────
         sn_post = layer_analysis.seednorm_post_attn if layer_analysis is not None else None
         h_normed2 = _apply_norm(self.post_attention_layernorm, h_mlp, analysis=sn_post)
-        h_lns2    = self.lns_mlp(h_normed2)
-        if layer_analysis is not None:
             layer_analysis.lns_mlp_output = h_lns2.detach()
         mlp_a   = layer_analysis.mlp if layer_analysis is not None else None
@@ -5934,11 +6102,19 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
             delta_r, aux_stats = self.jtokm(h_flat, z_flat, B_flat, analysis=jtokm_a)
             delta_r = delta_r.reshape(orig_shape)
-            gpas_mlp_a    = layer_analysis.gpas_mlp if layer_analysis is not None else None
-            hidden_states = self.gpas_mlp(mlp_aug + delta_r, analysis=gpas_mlp_a)
         else:
-            gpas_mlp_a    = layer_analysis.gpas_mlp if layer_analysis is not None else None
-            hidden_states = self.gpas_mlp(mlp_aug, analysis=gpas_mlp_a)
         if layer_analysis is not None:
             layer_analysis.hidden_states_output = hidden_states.detach()
@@ -6378,6 +6554,8 @@ class NeoLLMPreTrainedModel(PreTrainedModel):
             module.res_weight.data.fill_(1.0)
         elif isinstance(module, NeoLLMDecoderLayer):
             # AttnRes pseudo-queries: MUST be initialized to zero.
             # Zero initialization ensures uniform attention weights at step 0
             # (softmax of zeros is uniform), making AttnRes equivalent to a
@@ -6468,11 +6646,21 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
             [NeoLLMDecoderLayer(config, layer_idx)
              for layer_idx in range(config.num_hidden_layers)]
         )
-        self.norm        = _make_norm(
-            config.hidden_size,
-            eps=config.rms_norm_eps,
-            use_seednorm=bool(getattr(config, "use_seednorm", True)),
-        )
         self.rotary_emb  = NeoLLMRotaryEmbedding(config=config)
         self.gradient_checkpointing = False
         self.first_layer_fan        = None if getattr(config, "use_fan_residual", True) else False
@@ -6564,8 +6752,8 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
                 polynorm = PolyNormAnalysis() if not _versatile else None,
                 versatile = VersatileFFNAnalysis() if _versatile else None,
             ),
-            gpas_attn = GPASAnalysis(),
-            gpas_mlp  = GPASAnalysis(),
             jtokm     = JTokMAnalysis() if cfg.use_jtokm else None,
             dca       = DCAAnalysis() if getattr(cfg, "use_dca", False) else None,
             attn_res  = AttnResAnalysis() if getattr(cfg, "use_attn_res", False) else None,
@@ -6658,6 +6846,8 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
         )
         hidden_states     = inputs_embeds
         all_hidden_states = () if output_hidden_states else None
         all_attentions    = () if output_attentions    else None
         all_aux_stats     = []
@@ -6770,23 +6960,43 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
             dca_layer_sources = self._select_dca_sources(dca_sources) if use_dca else None
-            layer_outputs = decoder_layer(
-                hidden_states,
-                position_embeddings=position_embeddings,
-                attention_mask=causal_mask,
-                first_layer_fan=self.first_layer_fan,
-                z_tilde=z_tilde,
-                B_vals=B_vals,
-                dca_sources=dca_layer_sources,
-                attn_res_sources=attn_res_sources,
-                attn_res_partial=attn_res_partial if use_attn_res else None,
-                layer_analysis=layer_analysis,
-                output_attentions=output_attentions,
-                repo_rope_args=repo_rope_args,
-                position_ids=position_ids,
-                **kwargs,
-            )
-            hidden_states = layer_outputs[0]
             # Update AttnRes partial sum — the new partial is the layer output
             if use_attn_res:
@@ -6797,18 +7007,20 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
                 dca_sources = dca_sources + [hidden_states]
             if output_attentions:
-                all_attentions = all_attentions + (layer_outputs[1],)
-            # Collect JTok-M aux stats (last element if present)
-            if self.config.use_jtokm and len(layer_outputs) > (2 if output_attentions else 1):
-                all_aux_stats.append(layer_outputs[-1])
-            # Collect VersatileFFN aux stats (second-to-last if jtokm also present,
-            # or last if jtokm is absent). Only non-None during training.
             if getattr(self.config, "use_versatile_ffn", False):
-                for item in layer_outputs[1:]:
                     if isinstance(item, tuple) and len(item) == 3:
-                        # (p_sum, f_sum, N_tokens) signature
                         all_aux_stats.append(("versatile", item))
                         break
@@ -6817,7 +7029,13 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
                     and hasattr(decoder_layer, "current_layer_fan")):
                 self.first_layer_fan = decoder_layer.current_layer_fan
-        hidden_states = self.norm(hidden_states)
         if output_hidden_states:
             all_hidden_states = all_hidden_states + (hidden_states,)

         self.hidden_size   = config.hidden_size
         self.layer_idx     = layer_idx
         self.use_jtokm     = config.use_jtokm
+        self.use_seednorm    = bool(getattr(config, "use_seednorm", True))
+        self.use_lns         = bool(getattr(config, "use_lns", False))
+        self.use_gpas        = bool(getattr(config, "use_gpas", False))
+        self.use_siamesenorm = bool(getattr(config, "use_siamesenorm", False))
+        self.siamese_normalized_input = bool(getattr(config, "siamese_normalized_input", True))
+        self.siamese_depth_scaling    = bool(getattr(config, "siamese_depth_scaling", True))
+        self.siamese_attn_x_scale_init = float(getattr(config, "siamese_attn_x_scale_init", 1.0))
         # Controls only the first pre-attention normalisation applied directly
         # to the embedding stream. Defaults to True for checkpoint/config
         # backward compatibility.  When False, layer 0 does not instantiate
         self.use_embedding_input_norm = bool(
             getattr(config, "use_embedding_input_norm", True)
         )
+        self.has_input_layernorm = (not self.use_siamesenorm) and not (
             self.layer_idx == 0 and not self.use_embedding_input_norm
         )
             else NeoLLMMLP(config)
         )
         self.use_versatile_ffn        = getattr(config, "use_versatile_ffn", False)
+        if self.use_siamesenorm:
+            self.input_layernorm = None
+            self.post_attention_layernorm = None
+            # SiameseNorm is RMS-only by config validation.  These modules are
+            # constructed only when the Siamese topology is active, so no
+            # inactive SeeDNorm/RMSNorm pre-norm modules remain in the graph.
+            self.siamese_attn_x_norm = nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+            self.siamese_attn_y_norm = nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+            self.siamese_mlp_x_norm  = nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+            self.siamese_mlp_y_norm  = nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+            self.siamese_attn_input_norm = (
+                nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+                if self.siamese_normalized_input else None
+            )
+            self.siamese_mlp_input_norm = (
+                nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+                if self.siamese_normalized_input else None
+            )
+            self.siamese_attn_x_scale = nn.Parameter(
+                torch.full((config.hidden_size,), self.siamese_attn_x_scale_init, dtype=torch.float32)
+            )
+        else:
+            self.input_layernorm          = (
+                _make_norm(
+                    config.hidden_size,
+                    eps=config.rms_norm_eps,
+                    use_seednorm=self.use_seednorm,
+                )
+                if self.has_input_layernorm
+                else None
+            )
+            self.post_attention_layernorm = _make_norm(
                 config.hidden_size,
                 eps=config.rms_norm_eps,
                 use_seednorm=self.use_seednorm,
             )
+            self.siamese_attn_x_norm = None
+            self.siamese_attn_y_norm = None
+            self.siamese_mlp_x_norm  = None
+            self.siamese_mlp_y_norm  = None
+            self.siamese_attn_input_norm = None
+            self.siamese_mlp_input_norm  = None
+            self.siamese_attn_x_scale    = None
+        self.lns_attn                 = LNS(layer_idx) if self.use_lns else None
+        self.lns_mlp                  = LNS(layer_idx) if self.use_lns else None
+        self.gpas_attn                = GPAS(config.hidden_size) if self.use_gpas else None
+        self.gpas_mlp                 = GPAS(config.hidden_size) if self.use_gpas else None
         self.current_layer_fan        = None
         # ── StackMemory / STACKTRANS (Zhang et al., NeurIPS 2025) ────────
             # LAUREL-LR (paper eq. 3): f(x) + BAx + x
             return delta + lr_delta + residual
+    def _siamese_stream_scale(self, ref: torch.Tensor) -> torch.Tensor:
+        # Return the scalar multiplier that forward_siamesenorm applies to the
+        # attention / MLP updates before adding them to the x stream.
+        #
+        # The value is created with ``ref.new_tensor`` so that it follows the
+        # dtype and device of ``ref`` (the tensor it will be multiplied with).
+        #   * siamese_depth_scaling disabled -> 1.0 (update added unscaled)
+        #   * siamese_depth_scaling enabled  -> 1 / sqrt(2 * (layer_idx + 1))
+        if not self.siamese_depth_scaling:
+            return ref.new_tensor(1.0)
+        # layer_idx is shifted by one here, so layer_idx == 0 yields 1/sqrt(2)
+        # and the factor shrinks as layer_idx grows.
+        return ref.new_tensor(1.0 / math.sqrt(2.0 * float(self.layer_idx + 1)))
+    def forward_siamesenorm(
+        self,
+        x_states: torch.Tensor,
+        y_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        first_layer_fan: Optional[torch.Tensor] = None,
+        z_tilde: Optional[torch.Tensor] = None,
+        B_vals: Optional[torch.Tensor] = None,
+        layer_analysis: Optional[LayerAnalysis] = None,
+        output_attentions: Optional[bool] = False,
+        repo_rope_args: Optional[Tuple[torch.Tensor, float]] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Tuple:
+        # Two-stream (SiameseNorm) variant of the decoder-layer forward pass.
+        #
+        # Inputs
+        #   x_states, y_states : the two residual streams carried between layers.
+        #   position_embeddings, attention_mask, first_layer_fan,
+        #   repo_rope_args, position_ids, **kwargs : forwarded unchanged to
+        #       self.self_attn.
+        #   z_tilde, B_vals    : JTok-M inputs; the JTok-M branch runs only when
+        #       self.use_jtokm is set and both are not None.
+        #   layer_analysis     : optional container; when given, detached
+        #       intermediate tensors are stored on it as the pass proceeds.
+        #   output_attentions  : when truthy, attn_weights is added to the result.
+        #
+        # Returns a tuple whose first two items are always (x_next, y_next),
+        # followed, in this order and only when present, by attn_weights,
+        # the JTok-M aux_stats, and the VersatileFFN aux value.
+        #
+        # Side effect: self.current_layer_fan is overwritten with the third
+        # value returned by self.self_attn.
+        #
+        # Unlike forward(), this signature takes no DCA / AttnRes inputs.
+        # The siamese_* modules used below are only constructed when the layer
+        # was built with use_siamesenorm enabled; otherwise they are None and
+        # this method cannot be called.
+        # NOTE(review): the model loop invokes this method directly instead of
+        # going through the module's __call__ — confirm that nothing attached
+        # at __call__ level (gradient checkpointing, hooks) is required here.
+        #
+        # SiameseNorm keeps two coupled streams with shared Attention/MLP
+        # parameters.  All Siamese normalization modules are RMSNorm by
+        # construction; SeeDNorm is rejected at config validation time.
+        if layer_analysis is not None:
+            layer_analysis.hidden_states_input = x_states.detach()
+        # ── Attention shared block ────────────────────────────────────────
+        # Each stream has its own pre-attention norm module.  The existing
+        # seednorm_pre_attn analysis slot is reused to record the x-stream
+        # norm output.
+        sn_pre = layer_analysis.seednorm_pre_attn if layer_analysis is not None else None
+        x_attn_norm = self.siamese_attn_x_norm(x_states)
+        y_attn_norm = self.siamese_attn_y_norm(y_states)
+        if sn_pre is not None:
+            sn_pre.output = x_attn_norm.detach()
+        # The learned x-scale parameter is cast to the activation dtype/device
+        # before use; only the x branch is scaled, the y branch is added as is.
+        x_scale = self.siamese_attn_x_scale.to(dtype=x_attn_norm.dtype, device=x_attn_norm.device)
+        h_attn = x_scale * x_attn_norm + y_attn_norm
+        # Optional extra norm on the fused input (module is None when
+        # siamese_normalized_input is disabled).
+        if self.siamese_attn_input_norm is not None:
+            h_attn = self.siamese_attn_input_norm(h_attn)
+        h_lns = self.lns_attn(h_attn) if self.use_lns else h_attn
+        if layer_analysis is not None and self.use_lns:
+            layer_analysis.lns_attn_output = h_lns.detach()
+        # A single attention call serves both streams; key_value_states is
+        # always None on this path.
+        attn_out, attn_weights, self.current_layer_fan = self.self_attn(
+            hidden_states=h_lns,
+            key_value_states=None,
+            attention_mask=attention_mask,
+            position_embeddings=position_embeddings,
+            first_layer_fan=first_layer_fan,
+            attn_analysis=layer_analysis.attention if layer_analysis is not None else None,
+            repo_rope_args=repo_rope_args,
+            position_ids=position_ids,
+            **kwargs,
+        )
+        if layer_analysis is not None:
+            layer_analysis.attn_contribution = attn_out.detach()
+        # The same attention output is added to both streams: scaled by
+        # stream_scale on x, unscaled on y.
+        stream_scale = self._siamese_stream_scale(attn_out)
+        x_after_attn = x_states + stream_scale * attn_out
+        y_after_attn = y_states + attn_out
+        # GPAS, when enabled, is applied to the x stream only.
+        gpas_attn_a = layer_analysis.gpas_attn if layer_analysis is not None else None
+        if self.use_gpas:
+            x_after_attn = self.gpas_attn(x_after_attn, analysis=gpas_attn_a)
+        if layer_analysis is not None:
+            layer_analysis.h_tilde = x_after_attn.detach()
+        # ── MLP shared block ──────────────────────────────────────────────
+        # Same pattern as the attention block, except that the two normalised
+        # streams are summed without a learned x-scale.
+        sn_post = layer_analysis.seednorm_post_attn if layer_analysis is not None else None
+        x_mlp_norm = self.siamese_mlp_x_norm(x_after_attn)
+        y_mlp_norm = self.siamese_mlp_y_norm(y_after_attn)
+        if sn_post is not None:
+            sn_post.output = x_mlp_norm.detach()
+        h_mlp = x_mlp_norm + y_mlp_norm
+        if self.siamese_mlp_input_norm is not None:
+            h_mlp = self.siamese_mlp_input_norm(h_mlp)
+        h_lns2 = self.lns_mlp(h_mlp) if self.use_lns else h_mlp
+        if layer_analysis is not None and self.use_lns:
+            layer_analysis.lns_mlp_output = h_lns2.detach()
+        mlp_a = layer_analysis.mlp if layer_analysis is not None else None
+        # With use_versatile_ffn the MLP returns a (delta, aux) pair; otherwise
+        # it returns the delta alone.
+        if self.use_versatile_ffn:
+            delta_m, versatile_aux = self.mlp(h_lns2, analysis=mlp_a)
+        else:
+            delta_m = self.mlp(h_lns2, analysis=mlp_a)
+            versatile_aux = None
+        if layer_analysis is not None:
+            layer_analysis.mlp_contribution = delta_m.detach()
+        shared_update = delta_m
+        aux_stats = None
+        # Optional JTok-M residual: computed from the post-attention x stream
+        # (flattened to rows of hidden_size) and folded into the shared update,
+        # so it reaches both streams.
+        if self.use_jtokm and z_tilde is not None and B_vals is not None:
+            orig_shape = x_after_attn.shape
+            h_flat = x_after_attn.reshape(-1, self.hidden_size)
+            z_flat = z_tilde.reshape(-1, z_tilde.shape[-1])
+            B_flat = B_vals.reshape(-1, B_vals.shape[-2], B_vals.shape[-1])
+            jtokm_a = layer_analysis.jtokm if layer_analysis is not None else None
+            delta_r, aux_stats = self.jtokm(h_flat, z_flat, B_flat, analysis=jtokm_a)
+            shared_update = shared_update + delta_r.reshape(orig_shape)
+        # As after attention: scaled update on x, unscaled update on y.
+        x_next = x_after_attn + stream_scale * shared_update
+        y_next = y_after_attn + shared_update
+        gpas_mlp_a = layer_analysis.gpas_mlp if layer_analysis is not None else None
+        if self.use_gpas:
+            x_next = self.gpas_mlp(x_next, analysis=gpas_mlp_a)
+        if layer_analysis is not None:
+            layer_analysis.hidden_states_output = x_next.detach()
+        # Optional extras are appended only when present, so the caller must
+        # locate them by position/type rather than by a fixed index.
+        outputs = (x_next, y_next)
+        if output_attentions:
+            outputs += (attn_weights,)
+        if aux_stats is not None:
+            outputs += (aux_stats,)
+        if versatile_aux is not None:
+            outputs += (versatile_aux,)
+        return outputs
     def forward(
         self,
         hidden_states: torch.Tensor,
                 k_normed = _apply_norm(self.input_layernorm, dca_k_input, analysis=None)
                 v_normed = _apply_norm(self.input_layernorm, dca_v_input, analysis=None)
+        h_lns = self.lns_attn(h_normed) if self.use_lns else h_normed
         if dca_k_input is not None:
+            if self.use_lns:
+                k_lns = self.lns_attn(k_normed)
+                v_lns = self.lns_attn(v_normed)
+            else:
+                k_lns = k_normed
+                v_lns = v_normed
             dca_key_value_states = (k_lns, v_lns)
+        if layer_analysis is not None and self.use_lns:
             layer_analysis.lns_attn_output = h_lns.detach()
         hidden_states, attn_weights, self.current_layer_fan = self.self_attn(
         else:
             attn_aug = residual_attn + hidden_states
             dca_final_residual = hidden_states if self.use_dca else None
+        h_tilde = (
+            self.gpas_attn(attn_aug, analysis=gpas_attn_a)
+            if self.use_gpas
+            else attn_aug
+        )
         if layer_analysis is not None:
             layer_analysis.h_tilde = h_tilde.detach()
         # ── MLP block ─────────────────────────────────────────────────────
         sn_post = layer_analysis.seednorm_post_attn if layer_analysis is not None else None
         h_normed2 = _apply_norm(self.post_attention_layernorm, h_mlp, analysis=sn_post)
+        h_lns2 = self.lns_mlp(h_normed2) if self.use_lns else h_normed2
+        if layer_analysis is not None and self.use_lns:
             layer_analysis.lns_mlp_output = h_lns2.detach()
         mlp_a   = layer_analysis.mlp if layer_analysis is not None else None
             delta_r, aux_stats = self.jtokm(h_flat, z_flat, B_flat, analysis=jtokm_a)
             delta_r = delta_r.reshape(orig_shape)
+            gpas_mlp_a = layer_analysis.gpas_mlp if layer_analysis is not None else None
+            hidden_states = (
+                self.gpas_mlp(mlp_aug + delta_r, analysis=gpas_mlp_a)
+                if self.use_gpas
+                else mlp_aug + delta_r
+            )
         else:
+            gpas_mlp_a = layer_analysis.gpas_mlp if layer_analysis is not None else None
+            hidden_states = (
+                self.gpas_mlp(mlp_aug, analysis=gpas_mlp_a)
+                if self.use_gpas
+                else mlp_aug
+            )
         if layer_analysis is not None:
             layer_analysis.hidden_states_output = hidden_states.detach()
             module.res_weight.data.fill_(1.0)
         elif isinstance(module, NeoLLMDecoderLayer):
+            if hasattr(module, "siamese_attn_x_scale") and module.siamese_attn_x_scale is not None:
+                module.siamese_attn_x_scale.data.fill_(module.siamese_attn_x_scale_init)
             # AttnRes pseudo-queries: MUST be initialized to zero.
             # Zero initialization ensures uniform attention weights at step 0
             # (softmax of zeros is uniform), making AttnRes equivalent to a
             [NeoLLMDecoderLayer(config, layer_idx)
              for layer_idx in range(config.num_hidden_layers)]
         )
+        self.use_siamesenorm = bool(getattr(config, "use_siamesenorm", False))
+        if self.use_siamesenorm:
+            self.norm = None
+            self.siamese_x_final_norm = nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+            self.siamese_y_final_norm = nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+            self.siamese_final_norm   = nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm        = _make_norm(
+                config.hidden_size,
+                eps=config.rms_norm_eps,
+                use_seednorm=bool(getattr(config, "use_seednorm", True)),
+            )
+            self.siamese_x_final_norm = None
+            self.siamese_y_final_norm = None
+            self.siamese_final_norm   = None
         self.rotary_emb  = NeoLLMRotaryEmbedding(config=config)
         self.gradient_checkpointing = False
         self.first_layer_fan        = None if getattr(config, "use_fan_residual", True) else False
                 polynorm = PolyNormAnalysis() if not _versatile else None,
                 versatile = VersatileFFNAnalysis() if _versatile else None,
             ),
+            gpas_attn = GPASAnalysis() if getattr(cfg, "use_gpas", False) else None,
+            gpas_mlp  = GPASAnalysis() if getattr(cfg, "use_gpas", False) else None,
             jtokm     = JTokMAnalysis() if cfg.use_jtokm else None,
             dca       = DCAAnalysis() if getattr(cfg, "use_dca", False) else None,
             attn_res  = AttnResAnalysis() if getattr(cfg, "use_attn_res", False) else None,
         )
         hidden_states     = inputs_embeds
+        use_siamesenorm   = bool(getattr(self.config, "use_siamesenorm", False))
+        siamese_y_states  = inputs_embeds if use_siamesenorm else None
         all_hidden_states = () if output_hidden_states else None
         all_attentions    = () if output_attentions    else None
         all_aux_stats     = []
             dca_layer_sources = self._select_dca_sources(dca_sources) if use_dca else None
+            if use_siamesenorm:
+                layer_outputs = decoder_layer.forward_siamesenorm(
+                    hidden_states,
+                    siamese_y_states,
+                    position_embeddings=position_embeddings,
+                    attention_mask=causal_mask,
+                    first_layer_fan=self.first_layer_fan,
+                    z_tilde=z_tilde,
+                    B_vals=B_vals,
+                    layer_analysis=layer_analysis,
+                    output_attentions=output_attentions,
+                    repo_rope_args=repo_rope_args,
+                    position_ids=position_ids,
+                    **kwargs,
+                )
+                hidden_states = layer_outputs[0]
+                siamese_y_states = layer_outputs[1]
+                extras_start = 2
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    position_embeddings=position_embeddings,
+                    attention_mask=causal_mask,
+                    first_layer_fan=self.first_layer_fan,
+                    z_tilde=z_tilde,
+                    B_vals=B_vals,
+                    dca_sources=dca_layer_sources,
+                    attn_res_sources=attn_res_sources,
+                    attn_res_partial=attn_res_partial if use_attn_res else None,
+                    layer_analysis=layer_analysis,
+                    output_attentions=output_attentions,
+                    repo_rope_args=repo_rope_args,
+                    position_ids=position_ids,
+                    **kwargs,
+                )
+                hidden_states = layer_outputs[0]
+                extras_start = 1
             # Update AttnRes partial sum — the new partial is the layer output
             if use_attn_res:
                 dca_sources = dca_sources + [hidden_states]
             if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[extras_start],)
+                extras_start += 1
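+            # Collect JTok-M aux stats: take the first 3-tuple found among the
+            # layer outputs that follow the hidden-state / attention entries.
+            # NOTE(review): the VersatileFFN scan below uses the same
+            # "first 3-tuple" test on the same slice — confirm the two scans
+            # cannot pick up the same item when both features are enabled.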
+            if self.config.use_jtokm:
+                for item in layer_outputs[extras_start:]:
+                    if isinstance(item, tuple) and len(item) == 3:
+                        all_aux_stats.append(item)
+                        break
+            # Collect VersatileFFN aux stats. Only non-None during training.
             if getattr(self.config, "use_versatile_ffn", False):
+                for item in layer_outputs[extras_start:]:
                     if isinstance(item, tuple) and len(item) == 3:
                         all_aux_stats.append(("versatile", item))
                         break
                     and hasattr(decoder_layer, "current_layer_fan")):
                 self.first_layer_fan = decoder_layer.current_layer_fan
+        if use_siamesenorm:
+            hidden_states = self.siamese_final_norm(
+                self.siamese_x_final_norm(hidden_states)
+                + self.siamese_y_final_norm(siamese_y_states)
+            )
+        else:
+            hidden_states = self.norm(hidden_states)
         if output_hidden_states:
             all_hidden_states = all_hidden_states + (hidden_states,)