smithblack-0
/

SHRAM-dev

@@ -24,7 +24,7 @@
   "rope_mode": "main_sequence",
   "tie_word_embeddings": false,
   "training_sequence_length": 1024,
-  "transformers_version": "5.10.1",
   "use_cache": true,
   "vocab_size": 50277,
   "window_size": 128

   "rope_mode": "main_sequence",
   "tie_word_embeddings": false,
   "training_sequence_length": 1024,
+  "transformers_version": "5.10.2",
   "use_cache": true,
   "vocab_size": 50277,
   "window_size": 128

huggingface.py CHANGED Viewed

@@ -1458,6 +1458,10 @@ Returns a plain dict with keys:
 - "hidden_states": tuple of per-layer activations if output_hidden_states=True, else None
 - "load_balance_loss": scalar sum of per-layer SHRAM load-balance losses
 - "max_vio": detached scalar maximum routing-imbalance across all decoder layers
 """
@@ -1474,7 +1478,7 @@ Each block applies pre-norm hybrid attention followed by pre-norm MLP, with
 gated residual connections around both sublayers:
     normed_attn = RMSNorm(x)
-    attn_out, load_balance_loss, max_vio = SHRAMHybridLayer(normed_attn, ...)
     h = x + residual_gate * attn_out
     normed_mlp = RMSNorm(h)
@@ -3094,7 +3098,7 @@ class MoSRAHRouter(nn.Module):
         x: torch.Tensor,
         active_mask: torch.Tensor,
         used_capacity: torch.Tensor | None
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
         """Route input tokens to K expert heads each and compute routing probabilities.
         Args:
@@ -3103,17 +3107,23 @@ class MoSRAHRouter(nn.Module):
                 True means the token is semantically live. Dead tokens do not
                 contribute to routing frequencies, load_balance_loss, or max_vio.
             used_capacity: Used for capacity management during inference, missing during training.
         Returns:
             selected_heads: Head indices I of shape (batch, seq_len, num_selected_heads).
                 Each token's K selected head indices, determined by TopK on biased scores.
             routing_probs: Routing probabilities P of shape (batch, seq_len,
                 num_selected_heads). Gathered from unbiased scores at selected_heads
                 indices and renormalized to sum to 1 per token.
-            load_balance_loss: Scalar load balance imbalance loss for this forward pass.
-                Training loop scales this by a weight and adds it to the main loss.
-            max_vio: Detached scalar routing-imbalance summary for this forward pass.
-                Equal to L · max_l(f_l − 1/L). Zero means perfect balance. Not a loss;
-                never contributes gradients.
         """
         B, N, _ = x.shape
         L = self.num_mosrah_heads
@@ -3122,6 +3132,17 @@ class MoSRAHRouter(nn.Module):
         # Unbiased routing scores R = Softmax(xW_r). These are the scores used to
         # compute routing_probs — expert_bias must not influence them.
         logits = self.routing_projection(x)                    # (B, N, L)
         routing_scores = F.softmax(logits, dim=-1)             # R, (B, N, L)
         # Biased routing scores R̂ = Softmax(xW_r + b). Used only for TopK head
@@ -3177,7 +3198,15 @@ class MoSRAHRouter(nn.Module):
         # L · max_l(f_l − 1/L) applied to routing_freqs. Must not contribute gradients.
         max_vio = self._compute_max_vio(routing_freqs, L)
-        return selected_heads, routing_probs, load_balance_loss, max_vio
     @staticmethod
     def _compute_max_vio(routing_freqs: torch.Tensor, num_heads: int) -> torch.Tensor:
@@ -3322,8 +3351,9 @@ class MoSRAHLayer(nn.Module):
     The MoSRAH path consumes model-space hidden states together with
     authoritative per-token positions and returns the model-space sparse-path
-    contribution, the router's load-balance loss, and the router's MaxVio
-    routing-imbalance scalar.
     """
     def __init__(self, config: ShramConfig) -> None:
@@ -3348,7 +3378,7 @@ class MoSRAHLayer(nn.Module):
         position_ids: torch.Tensor,
         active_mask: torch.Tensor,
         cache: MoSRAHCache | None,
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         """Run the full MoSRAH sparse path.
         Args:
@@ -3364,9 +3394,10 @@ class MoSRAHLayer(nn.Module):
         Returns:
             sparse_output: Model-space sparse-path output of shape (B, N, d).
-            load_balance_loss: Scalar router load-balance loss.
-            max_vio: Detached scalar routing-imbalance summary. Passed through
-                unchanged from the router; see MoSRAHRouter for semantics.
         """
         # -------------------------------------------------------------------
@@ -3381,7 +3412,7 @@ class MoSRAHLayer(nn.Module):
         # active_mask is rebound to the packed form after this point.
         # -------------------------------------------------------------------
         used_capacity = cache.get_heads_lengths() if cache is not None else None
-        selected_heads, routing_probs, load_balance_loss, max_vio = self.router(
             hidden_states, active_mask, used_capacity
         )
@@ -3434,7 +3465,7 @@ class MoSRAHLayer(nn.Module):
             token_choice_outputs * routing_probs.unsqueeze(-1)
         ).sum(dim=2)
-        return final_output, load_balance_loss, max_vio
@@ -3463,7 +3494,7 @@ class SHRAMHybridLayer(nn.Module):
         position_ids: torch.Tensor,
         active_mask: torch.Tensor,
         cache: ShramLayerCache | None,
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         """Apply the SHRAM hybrid attention layer.
         Args:
@@ -3478,8 +3509,7 @@ class SHRAMHybridLayer(nn.Module):
         Returns:
             hybrid_output: Model-space hybrid attention output of shape (B, N, d).
-            load_balance_loss: Scalar sparse-path load-balance loss.
-            max_vio: Detached scalar routing-imbalance summary. Passed through
                 unchanged from MoSRAHLayer; see MoSRAHRouter for semantics.
         """
         # -------------------------------------------------------------------
@@ -3507,7 +3537,7 @@ class SHRAMHybridLayer(nn.Module):
             active_mask=active_mask,
             cache=sliding_window_cache,
         )
-        sparse_output, load_balance_loss, max_vio = self.sparse_attention(
             hidden_states=hidden_states,
             position_ids=position_ids,
             active_mask=active_mask,
@@ -3522,7 +3552,7 @@ class SHRAMHybridLayer(nn.Module):
         # -------------------------------------------------------------------
         hybrid_output = local_output + sparse_output
-        return hybrid_output, load_balance_loss, max_vio
 # -----------
@@ -3612,7 +3642,7 @@ class DecoderLayer(nn.Module):
         position_ids: torch.Tensor,
         active_mask: torch.Tensor,
         cache: ShramLayerCache | None = None,
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         """Apply one decoder block to the input.
         Args:
@@ -3626,12 +3656,10 @@ class DecoderLayer(nn.Module):
         Returns:
             output: Tensor of shape (batch, seq_len, hidden_size).
-            load_balance_loss: Scalar sparse-path load-balance loss propagated
-                from SHRAMHybridLayer.
-            max_vio: Detached scalar routing-imbalance summary. Passed through
                 unchanged from SHRAMHybridLayer; see MoSRAHRouter for semantics.
         """
-        attn_out, load_balance_loss, max_vio = self.attention(
             hidden_states=self.attn_norm(x),
             position_ids=position_ids,
             active_mask=active_mask,
@@ -3639,7 +3667,7 @@ class DecoderLayer(nn.Module):
         )
         hidden_states = x + self.residual_gate*attn_out
         output = hidden_states + self.residual_gate*self.mlp(self.mlp_norm(hidden_states))
-        return output, load_balance_loss, max_vio
 class ShramModel(nn.Module):
@@ -3708,27 +3736,51 @@ class ShramModel(nn.Module):
             - ``"max_vio"``: detached scalar maximum routing-imbalance across
               all decoder layers. Zero means perfectly balanced routing across
               every layer; higher values identify the worst-case head imbalance.
         """
         hidden_states = inputs_embeds
         all_hidden_states = (hidden_states,) if output_hidden_states else None
         total_load_balance_loss = inputs_embeds.new_zeros(())
         max_vio = inputs_embeds.new_zeros(())
         for layer_idx, layer in enumerate(self.layers):
             layer_cache = None if cache is None else cache.layers[layer_idx]
-            hidden_states, layer_load_balance_loss, layer_max_vio = layer(
                 hidden_states,
                 position_ids,
                 active_mask,
                 cache=layer_cache,
             )
-            total_load_balance_loss = total_load_balance_loss + layer_load_balance_loss
-            max_vio = torch.maximum(max_vio, layer_max_vio)
             if output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
         hidden_states = self.norm(hidden_states)
         return {
             "last_hidden_state": hidden_states,
@@ -3736,6 +3788,10 @@ class ShramModel(nn.Module):
             "hidden_states": all_hidden_states,
             "load_balance_loss": total_load_balance_loss,
             "max_vio": max_vio,
         }
@@ -3749,10 +3805,20 @@ class ShramCausalLMOutput(CausalLMOutputWithPast):
     only the SHRAM-specific wrapper outputs.
     """
     ce_loss: torch.FloatTensor | None = None
     load_balance_loss: torch.FloatTensor | None = None
     max_vio: torch.FloatTensor | None = None
 class ShramForCausalLM(PreTrainedModel, GenerationMixin):
     """HuggingFace-facing causal language model wrapper for SHRAM.
@@ -4181,6 +4247,9 @@ class ShramForCausalLM(PreTrainedModel, GenerationMixin):
             output_hidden_states: Whether to return backbone hidden states.
                 Defaults to ``config.output_hidden_states``.
             labels: Optional target token IDs of shape ``(batch, seq_len)``.
             return_dict: Must be ``True`` or ``None``.
             ce_weight: Weight applied to the cross-entropy loss when combining with
                 the load-balance loss. Default 1.0.
@@ -4197,7 +4266,10 @@ class ShramForCausalLM(PreTrainedModel, GenerationMixin):
             - ``past_key_values`` as the active ``ShramCache`` or ``None``,
             - ``hidden_states`` when requested,
             - ``load_balance_loss`` — raw unweighted load-balance loss from the backbone,
-            - detached ``max_vio`` from the backbone.
         """
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         output_hidden_states = (
@@ -4304,4 +4376,8 @@ class ShramForCausalLM(PreTrainedModel, GenerationMixin):
             hidden_states=backbone_outputs["hidden_states"],
             load_balance_loss=backbone_outputs["load_balance_loss"],
             max_vio=backbone_outputs["max_vio"],
         )

 - "hidden_states": tuple of per-layer activations if output_hidden_states=True, else None
 - "load_balance_loss": scalar sum of per-layer SHRAM load-balance losses
 - "max_vio": detached scalar maximum routing-imbalance across all decoder layers
+- "bias_std": detached scalar mean per-layer std of the expert bias vector
+- "raw_logit_std": detached scalar mean per-layer per-token routing logit spread
+- "logit_std": detached scalar mean per-layer per-token combined (logit + bias) spread
+- "bias_alignment": detached scalar mean per-layer cosine similarity of bias vs logits
 """
 gated residual connections around both sublayers:
     normed_attn = RMSNorm(x)
+    attn_out, router_diagnostics = SHRAMHybridLayer(normed_attn, ...)
     h = x + residual_gate * attn_out
     normed_mlp = RMSNorm(h)
         x: torch.Tensor,
         active_mask: torch.Tensor,
         used_capacity: torch.Tensor | None
+    ) -> tuple[torch.Tensor, torch.Tensor, dict[str, torch.Tensor]]:
         """Route input tokens to K expert heads each and compute routing probabilities.
         Args:
                 True means the token is semantically live. Dead tokens do not
                 contribute to routing frequencies, load_balance_loss, or max_vio.
             used_capacity: Used for capacity management during inference, missing during training.
         Returns:
             selected_heads: Head indices I of shape (batch, seq_len, num_selected_heads).
                 Each token's K selected head indices, determined by TopK on biased scores.
             routing_probs: Routing probabilities P of shape (batch, seq_len,
                 num_selected_heads). Gathered from unbiased scores at selected_heads
                 indices and renormalized to sum to 1 per token.
+            router_diagnostics: Dict of routing feedback scalars. Keys:
+                - ``load_balance_loss``: scalar load-balance loss with gradient.
+                - ``max_vio``: detached scalar routing-imbalance summary.
+                - ``bias_std``: std of expert_bias; near-zero means corrections have not built up.
+                - ``raw_logit_std``: mean per-token std of unbiased logits; the natural routing scale.
+                - ``logit_std``: mean per-token std of (logits + expert_bias); lower than
+                  raw_logit_std means bias is flattening preferences (healthy correction).
+                - ``bias_alignment``: mean cosine similarity of expert_bias against per-token
+                  logits. Negative means bias opposes routing direction (healthy correction);
+                  positive means runaway reinforcement.
         """
         B, N, _ = x.shape
         L = self.num_mosrah_heads
         # Unbiased routing scores R = Softmax(xW_r). These are the scores used to
         # compute routing_probs — expert_bias must not influence them.
         logits = self.routing_projection(x)                    # (B, N, L)
+        # Diagnostic scalars characterising the load-balance mechanism. Must be
+        # computed here — before balance_capacity injects -1e8 sentinels that
+        # would corrupt std and cosine similarity.
+        bias_std = self.expert_bias.std().detach()
+        raw_logit_std = logits.std(dim=-1).mean().detach()
+        logit_std = (logits + self.expert_bias).std(dim=-1).mean().detach()
+        bias_alignment = F.cosine_similarity(
+            logits, self.expert_bias.expand_as(logits), dim=-1
+        ).mean().detach()
         routing_scores = F.softmax(logits, dim=-1)             # R, (B, N, L)
         # Biased routing scores R̂ = Softmax(xW_r + b). Used only for TopK head
         # L · max_l(f_l − 1/L) applied to routing_freqs. Must not contribute gradients.
         max_vio = self._compute_max_vio(routing_freqs, L)
+        router_diagnostics = {
+            "load_balance_loss": load_balance_loss,
+            "max_vio": max_vio,
+            "bias_std": bias_std,
+            "raw_logit_std": raw_logit_std,
+            "logit_std": logit_std,
+            "bias_alignment": bias_alignment,
+        }
+        return selected_heads, routing_probs, router_diagnostics
     @staticmethod
     def _compute_max_vio(routing_freqs: torch.Tensor, num_heads: int) -> torch.Tensor:
     The MoSRAH path consumes model-space hidden states together with
     authoritative per-token positions and returns the model-space sparse-path
+    contribution and a diagnostics dict from the router containing
+    load-balance loss, routing-imbalance scalar, and load-balance health
+    scalars.
     """
     def __init__(self, config: ShramConfig) -> None:
         position_ids: torch.Tensor,
         active_mask: torch.Tensor,
         cache: MoSRAHCache | None,
+    ) -> tuple[torch.Tensor, dict[str, torch.Tensor]]:
         """Run the full MoSRAH sparse path.
         Args:
         Returns:
             sparse_output: Model-space sparse-path output of shape (B, N, d).
+            router_diagnostics: Dict of router feedback scalars. Keys:
+                ``load_balance_loss`` (has grad), ``max_vio``, ``bias_std``,
+                ``raw_logit_std``, ``logit_std``, ``bias_alignment`` (all
+                detached). See MoSRAHRouter for semantics.
         """
         # -------------------------------------------------------------------
         # active_mask is rebound to the packed form after this point.
         # -------------------------------------------------------------------
         used_capacity = cache.get_heads_lengths() if cache is not None else None
+        selected_heads, routing_probs, router_diagnostics = self.router(
             hidden_states, active_mask, used_capacity
         )
             token_choice_outputs * routing_probs.unsqueeze(-1)
         ).sum(dim=2)
+        return final_output, router_diagnostics
         position_ids: torch.Tensor,
         active_mask: torch.Tensor,
         cache: ShramLayerCache | None,
+    ) -> tuple[torch.Tensor, dict[str, torch.Tensor]]:
         """Apply the SHRAM hybrid attention layer.
         Args:
         Returns:
             hybrid_output: Model-space hybrid attention output of shape (B, N, d).
+            router_diagnostics: Dict of router feedback scalars passed through
                 unchanged from MoSRAHLayer; see MoSRAHRouter for semantics.
         """
         # -------------------------------------------------------------------
             active_mask=active_mask,
             cache=sliding_window_cache,
         )
+        sparse_output, router_diagnostics = self.sparse_attention(
             hidden_states=hidden_states,
             position_ids=position_ids,
             active_mask=active_mask,
         # -------------------------------------------------------------------
         hybrid_output = local_output + sparse_output
+        return hybrid_output, router_diagnostics
 # -----------
         position_ids: torch.Tensor,
         active_mask: torch.Tensor,
         cache: ShramLayerCache | None = None,
+    ) -> tuple[torch.Tensor, dict[str, torch.Tensor]]:
         """Apply one decoder block to the input.
         Args:
         Returns:
             output: Tensor of shape (batch, seq_len, hidden_size).
+            router_diagnostics: Dict of router feedback scalars passed through
                 unchanged from SHRAMHybridLayer; see MoSRAHRouter for semantics.
         """
+        attn_out, router_diagnostics = self.attention(
             hidden_states=self.attn_norm(x),
             position_ids=position_ids,
             active_mask=active_mask,
         )
         hidden_states = x + self.residual_gate*attn_out
         output = hidden_states + self.residual_gate*self.mlp(self.mlp_norm(hidden_states))
+        return output, router_diagnostics
 class ShramModel(nn.Module):
             - ``"max_vio"``: detached scalar maximum routing-imbalance across
               all decoder layers. Zero means perfectly balanced routing across
               every layer; higher values identify the worst-case head imbalance.
+            - ``"bias_std"``: detached scalar — mean across layers of the std
+              of each layer's expert bias vector. Near-zero means corrections
+              have not built up; large relative to ``raw_logit_std`` means the
+              bias dominates routing.
+            - ``"raw_logit_std"``: detached scalar — mean across layers of the
+              per-token routing logit spread before bias addition. Baseline
+              natural routing preference scale.
+            - ``"logit_std"``: detached scalar — mean across layers of the
+              per-token combined (logit + bias) spread. Lower than
+              ``raw_logit_std`` indicates healthy flattening; higher indicates
+              amplification.
+            - ``"bias_alignment"``: detached scalar — mean across layers of the
+              per-token cosine similarity between the expert bias vector and the
+              routing logits. Negative is healthy correction; positive is
+              runaway feedback.
         """
         hidden_states = inputs_embeds
         all_hidden_states = (hidden_states,) if output_hidden_states else None
         total_load_balance_loss = inputs_embeds.new_zeros(())
         max_vio = inputs_embeds.new_zeros(())
+        total_bias_std = inputs_embeds.new_zeros(())
+        total_raw_logit_std = inputs_embeds.new_zeros(())
+        total_logit_std = inputs_embeds.new_zeros(())
+        total_bias_alignment = inputs_embeds.new_zeros(())
         for layer_idx, layer in enumerate(self.layers):
             layer_cache = None if cache is None else cache.layers[layer_idx]
+            hidden_states, layer_diagnostics = layer(
                 hidden_states,
                 position_ids,
                 active_mask,
                 cache=layer_cache,
             )
+            total_load_balance_loss = total_load_balance_loss + layer_diagnostics["load_balance_loss"]
+            max_vio = torch.maximum(max_vio, layer_diagnostics["max_vio"])
+            total_bias_std = total_bias_std + layer_diagnostics["bias_std"]
+            total_raw_logit_std = total_raw_logit_std + layer_diagnostics["raw_logit_std"]
+            total_logit_std = total_logit_std + layer_diagnostics["logit_std"]
+            total_bias_alignment = total_bias_alignment + layer_diagnostics["bias_alignment"]
             if output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
         hidden_states = self.norm(hidden_states)
+        num_layers = len(self.layers)
         return {
             "last_hidden_state": hidden_states,
             "hidden_states": all_hidden_states,
             "load_balance_loss": total_load_balance_loss,
             "max_vio": max_vio,
+            "bias_std": total_bias_std / num_layers,
+            "raw_logit_std": total_raw_logit_std / num_layers,
+            "logit_std": total_logit_std / num_layers,
+            "bias_alignment": total_bias_alignment / num_layers,
         }
     only the SHRAM-specific wrapper outputs.
     """
+    ## Python dataclass inheritance constraint: CausalLMOutputWithPast gives every
+    ## field a default of None, and a dataclass field without a default may not
+    ## follow one that has a default, so every subclass field must carry one too.
+    ## The = None below is a language constraint, not a semantic statement. In
+    ## practice, load_balance_loss, max_vio, bias_std, raw_logit_std, logit_std,
+    ## and bias_alignment are always populated by ShramForCausalLM.forward().
+    ## ce_loss is genuinely optional — present only when labels are supplied.
     ce_loss: torch.FloatTensor | None = None
     load_balance_loss: torch.FloatTensor | None = None
     max_vio: torch.FloatTensor | None = None
+    bias_std: torch.Tensor | None = None
+    raw_logit_std: torch.Tensor | None = None
+    logit_std: torch.Tensor | None = None
+    bias_alignment: torch.Tensor | None = None
 class ShramForCausalLM(PreTrainedModel, GenerationMixin):
     """HuggingFace-facing causal language model wrapper for SHRAM.
             output_hidden_states: Whether to return backbone hidden states.
                 Defaults to ``config.output_hidden_states``.
             labels: Optional target token IDs of shape ``(batch, seq_len)``.
+                Pass unshifted labels (same alignment as ``input_ids``). This
+                wrapper shifts internally: ``logits[:, :-1]`` is compared
+                against ``labels[:, 1:]``. Do not pre-shift labels on the caller side.
             return_dict: Must be ``True`` or ``None``.
             ce_weight: Weight applied to the cross-entropy loss when combining with
                 the load-balance loss. Default 1.0.
             - ``past_key_values`` as the active ``ShramCache`` or ``None``,
             - ``hidden_states`` when requested,
             - ``load_balance_loss`` — raw unweighted load-balance loss from the backbone,
+            - ``max_vio`` — detached worst-case routing imbalance across layers,
+            - ``bias_std``, ``raw_logit_std``, ``logit_std``, ``bias_alignment`` —
+              detached load-balance health scalars averaged across decoder layers;
+              see ``ShramModel`` for interpretation.
         """
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         output_hidden_states = (
             hidden_states=backbone_outputs["hidden_states"],
             load_balance_loss=backbone_outputs["load_balance_loss"],
             max_vio=backbone_outputs["max_vio"],
+            bias_std=backbone_outputs["bias_std"],
+            raw_logit_std=backbone_outputs["raw_logit_std"],
+            logit_std=backbone_outputs["logit_std"],
+            bias_alignment=backbone_outputs["bias_alignment"],
         )