Update architecture and tokenizer

Browse files

Files changed (4) hide show

README.md +2 -2
config.json +2 -2
configuration.py +22 -22
huggingface.py +301 -376

README.md CHANGED Viewed

@@ -82,9 +82,10 @@ contains no weights. All values are overridable via kwargs.
 | `embedding_width` | 512 |
 | `head_dim` | 16 |
 | `inference_sequence_length` | 1024 |
-| `load_balance_loss_type` | ce |
 | `local_rope_theta` | 10000.0 |
 | `max_bid_rounds` | 10 |
 | `mlp_width` | 1366 |
 | `mosrah_overallocation_factor` | 2.0 |
 | `mosrah_rope_theta` | 10000.0 |
@@ -95,7 +96,6 @@ contains no weights. All values are overridable via kwargs.
 | `output_hidden_states` | False |
 | `rms_norm_eps` | 1e-05 |
 | `rope_mode` | main_sequence |
-| `routing_mode` | integral |
 | `tie_word_embeddings` | False |
 | `training_sequence_length` | 1024 |
 | `use_cache` | True |

 | `embedding_width` | 512 |
 | `head_dim` | 16 |
 | `inference_sequence_length` | 1024 |
+| `load_balance_loss_type` | temporal_overcapacity |
 | `local_rope_theta` | 10000.0 |
 | `max_bid_rounds` | 10 |
+| `maximum_expert_overclaim` | 20 |
 | `mlp_width` | 1366 |
 | `mosrah_overallocation_factor` | 2.0 |
 | `mosrah_rope_theta` | 10000.0 |
 | `output_hidden_states` | False |
 | `rms_norm_eps` | 1e-05 |
 | `rope_mode` | main_sequence |
 | `tie_word_embeddings` | False |
 | `training_sequence_length` | 1024 |
 | `use_cache` | True |

config.json CHANGED Viewed

@@ -9,9 +9,10 @@
   "embedding_width": 512,
   "head_dim": 16,
   "inference_sequence_length": 1024,
-  "load_balance_loss_type": "ce",
   "local_rope_theta": 10000.0,
   "max_bid_rounds": 10,
   "mlp_width": 1366,
   "model_type": "shram",
   "mosrah_overallocation_factor": 2.0,
@@ -22,7 +23,6 @@
   "num_sliding_window_heads": 16,
   "rms_norm_eps": 1e-05,
   "rope_mode": "main_sequence",
-  "routing_mode": "integral",
   "tie_word_embeddings": false,
   "training_sequence_length": 1024,
   "transformers_version": "5.10.2",

   "embedding_width": 512,
   "head_dim": 16,
   "inference_sequence_length": 1024,
+  "load_balance_loss_type": "temporal_overcapacity",
   "local_rope_theta": 10000.0,
   "max_bid_rounds": 10,
+  "maximum_expert_overclaim": 20,
   "mlp_width": 1366,
   "model_type": "shram",
   "mosrah_overallocation_factor": 2.0,
   "num_sliding_window_heads": 16,
   "rms_norm_eps": 1e-05,
   "rope_mode": "main_sequence",
   "tie_word_embeddings": false,
   "training_sequence_length": 1024,
   "transformers_version": "5.10.2",

configuration.py CHANGED Viewed

@@ -91,17 +91,18 @@ class ShramConfig(PretrainedConfig):
             correctness guard — exhausting it raises ``RuntimeError``. Must be >= 1.
             Default 10.
         load_balance_loss_type: Formula used for the load-balance auxiliary loss.
-            One of ``"gshard"``, ``"ce"``, or ``"bce"``. ``"ce"`` (cross-entropy)
-            is the default; its log-probability signal scales with violation severity
-            and makes correction magnitude proportional to routing imbalance.
-            Default ``"ce"``.
-        routing_mode: Routing computation mode. ``"integral"`` (default) enables the
-            integral routing extension: the exclusive cumsum of routing logits along
-            the sequence dimension is mapped through two additional (L, L) parameter
-            matrices (``routing_integral_weight`` A' and ``balance_integral_weight``
-            B') and added as corrections to both logit pathways. This gives each
-            token a read on the cumulative routing history so far in the sequence.
-            ``"default"`` disables the extension; A' and B' are not created.
     """
     model_type = "shram"
@@ -136,8 +137,8 @@ class ShramConfig(PretrainedConfig):
         tie_word_embeddings: bool = False,
         mosrah_overallocation_factor: float = 2.0,
         max_bid_rounds: int = 10,
-        load_balance_loss_type: str = "ce",
-        routing_mode: str = "integral",
         **kwargs
     ):
         if head_dim % 2 != 0:
@@ -178,7 +179,13 @@ class ShramConfig(PretrainedConfig):
                 f"max_bid_rounds must be at least 1, got {max_bid_rounds}."
             )
-        _supported_loss_types = {"gshard", "ce", "bce"}
         if load_balance_loss_type not in _supported_loss_types:
             supported = ", ".join(f'"{t}"' for t in sorted(_supported_loss_types))
             raise ValueError(
@@ -186,13 +193,6 @@ class ShramConfig(PretrainedConfig):
                 f"got {load_balance_loss_type!r}."
             )
-        _supported_routing_modes = {"default", "integral"}
-        if routing_mode not in _supported_routing_modes:
-            supported = ", ".join(f'"{m}"' for m in sorted(_supported_routing_modes))
-            raise ValueError(
-                f"routing_mode must be one of {supported}, got {routing_mode!r}."
-            )
         self.vocab_size = vocab_size
         self.embedding_width = embedding_width
         self.mlp_width = mlp_width
@@ -213,7 +213,7 @@ class ShramConfig(PretrainedConfig):
         self.mosrah_overallocation_factor = mosrah_overallocation_factor
         self.max_bid_rounds = max_bid_rounds
         self.load_balance_loss_type = load_balance_loss_type
-        self.routing_mode = routing_mode
         self.attention_dropout = attention_dropout
         self.use_cache = use_cache

             correctness guard — exhausting it raises ``RuntimeError``. Must be >= 1.
             Default 10.
         load_balance_loss_type: Formula used for the load-balance auxiliary loss.
+            One of ``"gshard"``, ``"ce"``, ``"bce"``, or ``"temporal_overcapacity"``.
+            ``"temporal_overcapacity"`` is the default; it fires only when an expert
+            exceeds its allowed trajectory (controlled by ``maximum_expert_overclaim``)
+            and shuts off automatically once routing is balanced. Because it is
+            inactive while routing is balanced, it can be given a strong weight
+            without interfering with task training. Default ``"temporal_overcapacity"``.
+        maximum_expert_overclaim: Maximum number of tokens an expert may receive above
+            its ideal allocation trajectory before the temporal overcapacity loss
+            fires. With a value of 0, any excess over that trajectory triggers the loss.
+            Larger values permit short-lived semantic specialization before correction.
+            Only used when ``load_balance_loss_type="temporal_overcapacity"``.
+            Must be non-negative. Default 20.
     """
     model_type = "shram"
         tie_word_embeddings: bool = False,
         mosrah_overallocation_factor: float = 2.0,
         max_bid_rounds: int = 10,
+        load_balance_loss_type: str = "temporal_overcapacity",
+        maximum_expert_overclaim: int = 20,
         **kwargs
     ):
         if head_dim % 2 != 0:
                 f"max_bid_rounds must be at least 1, got {max_bid_rounds}."
             )
+        if maximum_expert_overclaim < 0:
+            raise ValueError(
+                f"maximum_expert_overclaim must be non-negative, "
+                f"got {maximum_expert_overclaim}."
+            )
+        _supported_loss_types = {"gshard", "ce", "bce", "temporal_overcapacity"}
         if load_balance_loss_type not in _supported_loss_types:
             supported = ", ".join(f'"{t}"' for t in sorted(_supported_loss_types))
             raise ValueError(
                 f"got {load_balance_loss_type!r}."
             )
         self.vocab_size = vocab_size
         self.embedding_width = embedding_width
         self.mlp_width = mlp_width
         self.mosrah_overallocation_factor = mosrah_overallocation_factor
         self.max_bid_rounds = max_bid_rounds
         self.load_balance_loss_type = load_balance_loss_type
+        self.maximum_expert_overclaim = maximum_expert_overclaim
         self.attention_dropout = attention_dropout
         self.use_cache = use_cache

huggingface.py CHANGED Viewed

@@ -178,17 +178,18 @@ class ShramConfig(PretrainedConfig):
             correctness guard — exhausting it raises ``RuntimeError``. Must be >= 1.
             Default 10.
         load_balance_loss_type: Formula used for the load-balance auxiliary loss.
-            One of ``"gshard"``, ``"ce"``, or ``"bce"``. ``"ce"`` (cross-entropy)
-            is the default; its log-probability signal scales with violation severity
-            and makes correction magnitude proportional to routing imbalance.
-            Default ``"ce"``.
-        routing_mode: Routing computation mode. ``"integral"`` (default) enables the
-            integral routing extension: the exclusive cumsum of routing logits along
-            the sequence dimension is mapped through two additional (L, L) parameter
-            matrices (``routing_integral_weight`` A' and ``balance_integral_weight``
-            B') and added as corrections to both logit pathways. This gives each
-            token a read on the cumulative routing history so far in the sequence.
-            ``"default"`` disables the extension; A' and B' are not created.
     """
     model_type = "shram"
@@ -223,8 +224,8 @@ class ShramConfig(PretrainedConfig):
         tie_word_embeddings: bool = False,
         mosrah_overallocation_factor: float = 2.0,
         max_bid_rounds: int = 10,
-        load_balance_loss_type: str = "ce",
-        routing_mode: str = "integral",
         **kwargs
     ):
         if head_dim % 2 != 0:
@@ -265,7 +266,13 @@ class ShramConfig(PretrainedConfig):
                 f"max_bid_rounds must be at least 1, got {max_bid_rounds}."
             )
-        _supported_loss_types = {"gshard", "ce", "bce"}
         if load_balance_loss_type not in _supported_loss_types:
             supported = ", ".join(f'"{t}"' for t in sorted(_supported_loss_types))
             raise ValueError(
@@ -273,13 +280,6 @@ class ShramConfig(PretrainedConfig):
                 f"got {load_balance_loss_type!r}."
             )
-        _supported_routing_modes = {"default", "integral"}
-        if routing_mode not in _supported_routing_modes:
-            supported = ", ".join(f'"{m}"' for m in sorted(_supported_routing_modes))
-            raise ValueError(
-                f"routing_mode must be one of {supported}, got {routing_mode!r}."
-            )
         self.vocab_size = vocab_size
         self.embedding_width = embedding_width
         self.mlp_width = mlp_width
@@ -300,7 +300,7 @@ class ShramConfig(PretrainedConfig):
         self.mosrah_overallocation_factor = mosrah_overallocation_factor
         self.max_bid_rounds = max_bid_rounds
         self.load_balance_loss_type = load_balance_loss_type
-        self.routing_mode = routing_mode
         self.attention_dropout = attention_dropout
         self.use_cache = use_cache
@@ -1478,10 +1478,7 @@ Returns a plain dict with keys:
 - "hidden_states": tuple of per-layer activations if output_hidden_states=True, else None
 - "load_balance_loss": scalar sum of per-layer SHRAM load-balance losses
 - "max_vio": detached scalar maximum routing-imbalance across all decoder layers
-- "bias_std": detached scalar mean per-layer std of the expert bias vector
-- "raw_logit_std": detached scalar mean per-layer per-token routing logit spread
-- "logit_std": detached scalar mean per-layer per-token combined (logit + bias) spread
-- "bias_alignment": detached scalar mean per-layer cosine similarity of bias vs logits
 """
@@ -2725,71 +2722,38 @@ This module implements the routing mechanism described in Appendix A.Routing of
 paper. Given an input hidden state x, the router produces two outputs used downstream:
   - selected_heads (I): which K of the L available expert heads each token routes to,
-    determined by TopK over capacity-balanced semantic routing scores.
   - routing_probs (P): the weights used for the weighted output reduction, gathered from
-    the semantic routing scores at the selected indices and renormalized to sum to 1
-    per token.
-Base routing uses two learnable projection matrices and two gradient-isolated pathways:
-  - routing_weight (A): shape (L, embedding_width). Maps input to per-head routing
-    scores. Receives gradients from task loss; balance_weight is isolated.
-  - balance_weight (B): shape (L, embedding_width). Maps input to per-head load-balance
-    correction scores. Receives gradients from load_balance_loss; routing_weight is
-    isolated.
-The two gradient-isolated base pathways over numerically identical values:
-  - semantic_logits = A·x + (B·x).detach(): task gradients reach routing_weight;
-    balance_weight is isolated from task loss.
-  - load_balancing_logits = (A·x).detach() + B·(x.detach()): load balance gradients
-    reach balance_weight; routing_weight and x are isolated from load balance loss.
-Integral routing extension (routing_mode == "integral"):
-Standard routing is parallel — each token routes based on its own hidden state alone,
-with no direct read on what earlier tokens in the sequence have already selected.
-Integral routing adds a cumulative-sum signal that gives each token a view of the
-prior routing history within the sequence.
-Two additional (L, L) parameter matrices are introduced:
-  - routing_integral_weight (A'): shape (L, L). Maps the cumulative logit history to
-    per-head semantic corrections. Receives gradients from task loss.
-  - balance_integral_weight (B'): shape (L, L). Maps the cumulative logit history to
-    per-head load-balance corrections. Receives gradients from load_balance_loss.
-The cumulative history signal u is the exclusive cumsum of the base logits along the
-sequence dimension: u[n] = sum(logits[0..n-1]), shape (B, N, L). Position 0 receives
-zeros (no prior history). The same gradient isolation pattern as A/B applies:
-  - semantic_logits   += A'·u_semantic + (B'·u_semantic).detach()
-  - lb_logits         += (A'·u_load).detach() + B'·u_load
-Detaching the full B'·u_semantic result (rather than just B') mirrors the
-(B·x).detach() pattern in the base pathway and prevents double-counting the
-cumsum gradient path back to routing_weight.
-Both base matrices and both integral matrices are nn.Parameter so that HuggingFace
-_init_weights does not override their kaiming initialization at construction.
-Assignment probabilities are computed before balance_capacity applies -1e8 sentinels.
-Post-capacity softmax would invert the load balance gradient for over-capacity experts
-(near-zero probability after masking signals "increase corrections" for an already-
-overloaded expert).
-The router also computes and returns the load balance loss via a log-probability auxiliary
-loss (see load_balance_loss.py). The loss formulation is selected by config; the default
-is cross-entropy.
-The router additionally computes and returns MaxVio, a detached scalar summarising
-routing imbalance for the current forward pass:
-    MaxVio = mean_b( L · max_l(f_bl − 1/L) )
-where f_bl is the per-batch-item realised routing frequency of head l and 1/L is the
-perfectly balanced target. MaxVio is averaged over batch items and is a monitoring
-quantity only; it never contributes gradients.
 Paper ref: Appendix A.Routing, Appendix A.Load Balancing, §MaxVio.
 """
@@ -2804,7 +2768,7 @@ Paper ref: Appendix A.Routing, Appendix A.Load Balancing, §MaxVio.
 # -----------
 """Log-probability auxiliary loss functions for MoSRAH load balancing.
-This module provides three load-balance loss formulations, two token-reduction
 helpers, and a factory that selects among the formulations. All formulations
 share the same external contract:
@@ -2814,9 +2778,8 @@ share the same external contract:
         active_mask:     Tensor[B, N],
     ) -> scalar Tensor
-    logits:          Load-balancing logits, shape (B, N, L). These are the raw
-                     pre-softmax scores from logits.detach() + expert_bias.
-                     Gradient flows to expert_bias through this tensor.
     assignment_mask: Per-token head-assignment indicators. assignment_mask[b, n, l]
                      is 1.0 if token (b, n) was assigned to head l. Dead tokens
                      should carry zero entries.
@@ -2826,17 +2789,19 @@ share the same external contract:
 Token reduction is split into two helpers with distinct roles:
     reduce_frequency_tokens — produces per-batch-item routing frequencies f_bl (B, L).
-        Called by all three formulations. Output is detached; f_bl carries no gradient.
     reduce_probability_tokens — produces per-batch-item mean assignment probabilities
-        p_bl (B, L). Called only by gshard and bce. Gradient flows to expert_bias
-        through the internal softmax over logits.
 CE delegates probability computation to F.cross_entropy, which handles its own
 log_softmax and operates directly on the raw (B, N, L) logits.
-The factory is the intended entry point. MoSRAHRouter constructs the loss callable
-once at init and invokes it each forward pass.
 """
@@ -3010,30 +2975,181 @@ def bce_loss(
     """
     f_bl = reduce_frequency_tokens(assignment_mask, active_mask)
     p_bl = reduce_probability_tokens(logits, active_mask)
-    # Clamp p_bl for numerical safety: F.binary_cross_entropy requires input in
-    # (0, 1) and will produce inf for exactly 0 or 1. Softmax outputs are
-    # strictly positive in normal operation; the clamp guards the all-dead-tokens
-    # edge case where the mean defaults to zero.
-    return F.binary_cross_entropy(
-        p_bl.clamp(min=1e-7, max=1.0 - 1e-7),
-        1.0 - f_bl,
-        reduction='mean',
-    )
 # ---------------------------------------------------------------------------
 # Factory
 # ---------------------------------------------------------------------------
-_LOSS_REGISTRY: dict[str, Callable[[torch.Tensor, torch.Tensor, torch.Tensor], torch.Tensor]] = {
-    "gshard": gshard_loss,
-    "ce": ce_loss,
-    "bce": bce_loss,
 }
 def make_load_balance_loss(
     loss_type: str,
 ) -> Callable[[torch.Tensor, torch.Tensor, torch.Tensor], torch.Tensor]:
     """Return a load-balance loss callable for the requested formulation.
@@ -3045,11 +3161,14 @@ def make_load_balance_loss(
             active_mask:     Tensor[B, N],
         ) -> scalar Tensor
-    The caller is responsible for computing logits as logits.detach() + expert_bias
-    to ensure gradient isolation to expert_bias.
     Args:
-        loss_type: One of ``"gshard"``, ``"ce"``, or ``"bce"``.
     Returns:
         Loss callable matching the shared contract.
@@ -3062,34 +3181,29 @@ def make_load_balance_loss(
         raise ValueError(
             f"load_balance_loss_type must be one of {supported}, got {loss_type!r}."
         )
-    return _LOSS_REGISTRY[loss_type]
 class MoSRAHRouter(nn.Module):
     """Token-choice router for MoSRAH sparse attention.
-    Each input token independently selects K of the L available expert heads. Both
-    selection and routing_probs incorporate balance_weight via two gradient-isolated
-    pathways over numerically identical values. See module docstring for the
-    two-pathway architecture and the integral routing extension.
-    All four learnable matrices are nn.Parameter rather than nn.Linear so that
-    HuggingFace _init_weights does not override their kaiming initialization at
-    construction.
     Attributes:
-        routing_weight: A, shape (L, embedding_width). Task-loss pathway.
-        balance_weight: B, shape (L, embedding_width). Load-balance pathway.
-        routing_integral_weight: A', shape (L, L). Integral task-loss pathway.
-            Present only when ``routing_mode == "integral"``.
-        balance_integral_weight: B', shape (L, L). Integral load-balance pathway.
-            Present only when ``routing_mode == "integral"``.
-        routing_mode: ``"integral"`` or ``"default"``, from config.
     Args:
         config: Model configuration. Must expose ``embedding_width``,
-            ``num_mosrah_heads`` (L), ``num_selected_heads`` (K), and
-            ``routing_mode``.
     """
     def __init__(self, config: ShramConfig) -> None:
@@ -3102,40 +3216,19 @@ class MoSRAHRouter(nn.Module):
             self.capacity = config.mosrah_packed_length
         self.max_bid_rounds = config.max_bid_rounds
-        self.routing_mode = config.routing_mode
-        self._load_balance_loss = make_load_balance_loss(config.load_balance_loss_type)
-        # W_r (A): semantic routing matrix. Maps input (B, N, d) to per-head routing
-        # scores (B, N, L) for selection and routing_probs. nn.Parameter ensures
-        # HuggingFace _init_weights does not override kaiming initialization.
-        self.routing_weight = nn.Parameter(
-            torch.empty(config.num_mosrah_heads, config.embedding_width)
         )
-        nn.init.kaiming_uniform_(self.routing_weight)
-        # W_b (B): load-balancing projection matrix. Maps input (B, N, d) to per-head
-        # correction scores (B, N, L). Receives gradients only from load_balance_loss.
         # nn.Parameter ensures HuggingFace _init_weights does not override kaiming init.
-        self.balance_weight = nn.Parameter(
             torch.empty(config.num_mosrah_heads, config.embedding_width)
         )
-        nn.init.kaiming_uniform_(self.balance_weight)
-        if self.routing_mode == "integral":
-            L = config.num_mosrah_heads
-            # A': integral semantic matrix. Maps cumulative logit history (B, N, L) to
-            # per-head semantic corrections (B, N, L). Shape (L, L). Receives gradients
-            # from task loss; balance_integral_weight is isolated from task loss.
-            # Zero-initialized so that corrections start at zero and grow from gradient
-            # updates — kaiming init produces corrections that immediately overwhelm the
-            # base routing signal via the cumsum feedback path.
-            self.routing_integral_weight = nn.Parameter(torch.zeros(L, L))
-            # B': integral load-balance matrix. Maps cumulative logit history (B, N, L)
-            # to per-head load-balance corrections (B, N, L). Shape (L, L). Receives
-            # gradients from load_balance_loss; routing_integral_weight is isolated.
-            # Zero-initialized for the same reason as routing_integral_weight.
-            self.balance_integral_weight = nn.Parameter(torch.zeros(L, L))
     @staticmethod
     def get_best_proposals(
@@ -3380,226 +3473,87 @@ class MoSRAHRouter(nn.Module):
         Returns:
             selected_heads: Head indices I of shape (batch, seq_len, num_selected_heads).
                 Each token's K selected head indices, determined by TopK on
-                capacity-balanced semantic scores.
             routing_probs: Routing probabilities P of shape (batch, seq_len,
-                num_selected_heads). Gathered from pre-capacity semantic softmax at
                 selected_heads indices and renormalized to sum to 1 per token.
             router_diagnostics: Dict of routing feedback scalars. Keys:
                 - ``load_balance_loss``: scalar load-balance loss with gradient.
                 - ``max_vio``: detached scalar routing-imbalance summary.
-                - ``raw_logit_std``: mean per-token std of routing_logits; natural
-                  routing preference scale and baseline for interpreting bias_std.
-                - ``bias_std``: mean per-token std of balance_logits; near-zero
-                  means balance corrections have not built up relative to routing scale.
-                - ``logit_std``: mean per-token std of semantic_logits; lower than
-                  raw_logit_std means balance is flattening preferences (healthy correction).
-                - ``bias_alignment``: mean cosine similarity of routing_logits vs
-                  balance_logits per token. Negative means balance opposes routing direction
-                  (healthy correction); positive means runaway reinforcement.
         """
         B, N, _ = x.shape
         L = self.num_mosrah_heads
         K = self.num_selected_heads
-        logits = self._compute_routing_logits(x, active_mask)
-        # Diagnostic scalars characterising the two routing pathways. Must be computed
-        # before balance_capacity injects -1e8 sentinels that would corrupt std and
-        # cosine similarity. Extracted to _compute_bias_diagnostics to keep the forward
-        # body free of non-(B,N,L) reduction logic.
-        bias_diagnostics = self._compute_bias_diagnostics(
-            logits["routing_logits"], logits["balance_logits"], logits["semantic_logits"]
         )
-        # Pre-capacity semantic softmax for gathering routing_probs. Computed before
-        # balance_capacity so that gathered probabilities reflect genuine preference
-        # magnitudes rather than hard-masked sentinel values.
-        routing_scores = F.softmax(logits["semantic_logits"], dim=-1)          # (B, N, L)
-        # Capacity-balanced semantic logits for selection. Injects -1e8 into positions
-        # that would exceed per-expert token budget, enforcing the packing constraint.
-        balanced_semantic_logits = self.balance_capacity(
-            logits["semantic_logits"],
             used_capacity,
             self.capacity,
             self.num_selected_heads,
             self.max_bid_rounds,
         )
-        selection_scores = F.softmax(balanced_semantic_logits, dim=-1)    # (B, N, L)
-        # selected_heads I = TopK over capacity-balanced semantic scores.
-        selected_heads = selection_scores.topk(K, dim=-1).indices          # (B, N, K)
-        # Routing probabilities P: gathered from pre-capacity semantic softmax at
-        # selected_heads positions, renormalized so they sum to 1 per token.
         gathered      = routing_scores.gather(dim=-1, index=selected_heads)    # (B, N, K)
         routing_probs = gathered / gathered.sum(dim=-1, keepdim=True)          # P, (B, N, K)
-        # assignment_mask: (B, N, L) float — 1.0 at each token's K selected heads, 0 elsewhere.
-        # The discrete routing decision; no gradient flows through it. Passed alongside
-        # load_balancing_logits and active_mask to the loss and max_vio methods, which
-        # own all frequency aggregation and reduction internally.
-        assignment_mask = torch.zeros(B, N, L, device=x.device, dtype=x.dtype)
-        assignment_mask.scatter_(-1, selected_heads, 1.0)
-        load_balance_loss = self._load_balance_loss(
-            logits["load_balancing_logits"], assignment_mask, active_mask
-        )
-        # MaxVio: detached monitoring scalar averaged over batch items. Computed from
-        # the same (B, N, L) assignment_mask so frequencies are consistent with the loss.
-        max_vio = self._compute_max_vio(assignment_mask, active_mask, L)
         router_diagnostics = {
             "load_balance_loss": load_balance_loss,
-            "max_vio": max_vio,
-            **bias_diagnostics,
         }
         return selected_heads, routing_probs, router_diagnostics
-    @staticmethod
-    def exclusive_cumsum(logits: torch.Tensor) -> torch.Tensor:
-        """Compute the exclusive cumulative sum along the sequence dimension.
-        u[n] = sum(logits[0..n-1]): position n receives the accumulated sum of all
-        prior positions, giving it a read on the routing preferences expressed by
-        earlier tokens in the sequence. Position 0 always receives zeros — no prior
-        history exists at the first position.
-        Args:
-            logits: Shape (B, N, L). Any per-head score tensor along a sequence.
-        Returns:
-            Exclusive cumsum, shape (B, N, L). Same dtype and device as input.
-        """
-        shifted = torch.cat(
-            [torch.zeros_like(logits[:, :1, :]), logits[:, :-1, :]], dim=1
-        )
-        return shifted.cumsum(dim=1)
-    def _compute_routing_logits(
-        self, x: torch.Tensor, active_mask: torch.Tensor
-    ) -> dict[str, torch.Tensor]:
-        """Compute the gradient-isolated logit pathways from input hidden states.
-        Base pathways (both modes):
-          Two gradient-isolated pathways over numerically identical values:
-          - semantic_logits = A·x + (B·x).detach(): task gradients reach routing_weight;
-            balance_weight is isolated from task loss.
-          - load_balancing_logits = (A·x).detach() + B·(x.detach()): load balance
-            gradients reach balance_weight; routing_weight and x are isolated.
-        Integral extension (routing_mode == "integral"):
-          Dead tokens are zeroed out of the logits before computing the cumsum, so
-          inactive positions do not contribute to the routing history of downstream
-          live tokens. u_semantic and u_load therefore represent history from live
-          tokens only.
-          u_semantic = exclusive_cumsum(semantic_logits * active_mask)    — (B, N, L)
-          u_load     = exclusive_cumsum(load_balancing_logits * active_mask) — (B, N, L)
-          semantic_logits       += A'·u_semantic + (B'·u_semantic).detach()
-          load_balancing_logits += (A'·u_load).detach() + B'·u_load
-          Detaching the full (B'·u_semantic) result mirrors the (B·x).detach() base
-          pattern: it isolates balance_integral_weight from task loss AND prevents
-          double-counting the cumsum gradient path back to routing_weight.
-          The same reasoning applies to (A'·u_load).detach() in the load-balance
-          pathway — u_load already has no path to routing_weight (routing_logits is
-          detached in load_balancing_logits), and the detach additionally blocks
-          routing_integral_weight.
         Args:
             x: Input hidden states, shape (batch, seq_len, embedding_width).
-            active_mask: Boolean active-token mask, shape (batch, seq_len). Dead tokens
-                are excluded from the cumsum history in integral mode.
         Returns:
-            Dict with keys:
-            - ``routing_logits``:        A·x, shape (B, N, L).
-            - ``balance_logits``:        B·x, shape (B, N, L).
-            - ``semantic_logits``:       combined task-loss pathway, shape (B, N, L).
-            - ``load_balancing_logits``: combined load-balance pathway, shape (B, N, L).
         """
-        routing_logits = F.linear(x, self.routing_weight)                     # (B, N, L)
-        balance_logits = F.linear(x, self.balance_weight)                     # (B, N, L)
-        semantic_logits       = routing_logits + balance_logits.detach()
-        load_balancing_logits = routing_logits.detach() + F.linear(x.detach(), self.balance_weight)
-        if self.routing_mode == "integral":
-            # Zero out dead token positions before cumsum so inactive tokens do not
-            # contaminate the routing history of subsequent live tokens.
-            live = active_mask.unsqueeze(-1)                                   # (B, N, 1)
-            u_semantic = self.exclusive_cumsum(semantic_logits * live)         # (B, N, L)
-            u_load     = self.exclusive_cumsum(load_balancing_logits * live)   # (B, N, L)
-            # Semantic pathway: A' trains on task loss; B' term is fully detached to
-            # isolate balance_integral_weight from task loss and prevent double-counting
-            # the cumsum gradient path back to routing_weight.
-            semantic_logits = (
-                semantic_logits
-                + F.linear(u_semantic, self.routing_integral_weight)
-                + F.linear(u_semantic, self.balance_integral_weight).detach()
-            )
-            # Load-balance pathway: B' trains on load_balance_loss; A' term is fully
-            # detached to isolate routing_integral_weight from load_balance_loss.
-            load_balancing_logits = (
-                load_balancing_logits
-                + F.linear(u_load, self.routing_integral_weight).detach()
-                + F.linear(u_load, self.balance_integral_weight)
-            )
-        return {
-            "routing_logits":        routing_logits,
-            "balance_logits":        balance_logits,
-            "semantic_logits":       semantic_logits,
-            "load_balancing_logits": load_balancing_logits,
-        }
-    @staticmethod
-    def _compute_bias_diagnostics(
-        routing_logits: torch.Tensor,
-        balance_logits: torch.Tensor,
-        semantic_logits: torch.Tensor,
-    ) -> dict[str, torch.Tensor]:
-        """Compute detached diagnostic scalars characterising the two routing pathways.
-        All scalars must be computed from pre-capacity logits; balance_capacity
-        applies -1e8 sentinels that would corrupt std and cosine similarity.
-        Extracted from forward to keep the main body free of reduction logic.
-        Args:
-            routing_logits:  A·x, routing pathway output, shape (B, N, L).
-            balance_logits:  B·x, balance pathway output, shape (B, N, L).
-            semantic_logits: A·x + (B·x).detach(), combined signal, shape (B, N, L).
-        Returns:
-            Dict with keys:
-            - ``raw_logit_std``:  Mean per-token std of routing_logits. Natural
-                                   routing preference scale; reference baseline for
-                                   interpreting bias_std.
-            - ``bias_std``:       Mean per-token std of balance_logits. Near-zero
-                                   means balance corrections have not built up
-                                   relative to the routing scale.
-            - ``logit_std``:      Mean per-token std of semantic_logits. Lower than
-                                   raw_logit_std indicates balance is flattening
-                                   preferences (healthy correction signal).
-            - ``bias_alignment``: Mean cosine similarity of routing_logits vs
-                                   balance_logits per token. Range [-1, 1]. Negative
-                                   means balance opposes routing direction (healthy
-                                   correction); positive means runaway reinforcement.
-        """
-        return {
-            "raw_logit_std":  routing_logits.std(dim=-1).mean().detach(),
-            "bias_std":       balance_logits.std(dim=-1).mean().detach(),
-            "logit_std":      semantic_logits.std(dim=-1).mean().detach(),
-            "bias_alignment": F.cosine_similarity(
-                routing_logits, balance_logits, dim=-1
-            ).mean().detach(),
-        }
     @staticmethod
     def _compute_max_vio(
@@ -4137,30 +4091,15 @@ class ShramModel(nn.Module):
             - ``"max_vio"``: detached scalar maximum routing-imbalance across
               all decoder layers. Zero means perfectly balanced routing across
               every layer; higher values identify the worst-case head imbalance.
-            - ``"bias_std"``: detached scalar — mean across layers of the std
-              of each layer's expert bias vector. Near-zero means corrections
-              have not built up; large relative to ``raw_logit_std`` means the
-              bias dominates routing.
-            - ``"raw_logit_std"``: detached scalar — mean across layers of the
-              per-token routing logit spread before bias addition. Baseline
-              natural routing preference scale.
             - ``"logit_std"``: detached scalar — mean across layers of the
-              per-token combined (logit + bias) spread. Lower than
-              ``raw_logit_std`` indicates healthy flattening; higher indicates
-              amplification.
-            - ``"bias_alignment"``: detached scalar — mean across layers of the
-              per-token cosine similarity between the expert bias vector and the
-              routing logits. Negative is healthy correction; positive is
-              runaway feedback.
         """
         hidden_states = inputs_embeds
         all_hidden_states = (hidden_states,) if output_hidden_states else None
         total_load_balance_loss = inputs_embeds.new_zeros(())
         max_vio = inputs_embeds.new_zeros(())
-        total_bias_std = inputs_embeds.new_zeros(())
-        total_raw_logit_std = inputs_embeds.new_zeros(())
         total_logit_std = inputs_embeds.new_zeros(())
-        total_bias_alignment = inputs_embeds.new_zeros(())
         for layer_idx, layer in enumerate(self.layers):
             layer_cache = None if cache is None else cache.layers[layer_idx]
@@ -4172,10 +4111,7 @@ class ShramModel(nn.Module):
             )
             total_load_balance_loss = total_load_balance_loss + layer_diagnostics["load_balance_loss"]
             max_vio = torch.maximum(max_vio, layer_diagnostics["max_vio"])
-            total_bias_std = total_bias_std + layer_diagnostics["bias_std"]
-            total_raw_logit_std = total_raw_logit_std + layer_diagnostics["raw_logit_std"]
             total_logit_std = total_logit_std + layer_diagnostics["logit_std"]
-            total_bias_alignment = total_bias_alignment + layer_diagnostics["bias_alignment"]
             if output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
@@ -4189,10 +4125,7 @@ class ShramModel(nn.Module):
             "hidden_states": all_hidden_states,
             "load_balance_loss": total_load_balance_loss,
             "max_vio": max_vio,
-            "bias_std": total_bias_std / num_layers,
-            "raw_logit_std": total_raw_logit_std / num_layers,
             "logit_std": total_logit_std / num_layers,
-            "bias_alignment": total_bias_alignment / num_layers,
         }
@@ -4209,17 +4142,14 @@ class ShramCausalLMOutput(CausalLMOutputWithPast):
     ## Python dataclass inheritance violation: CausalLMOutputWithPast defaults all
     ## fields to None, which forces every subclass field to also carry a default.
     ## The = None below is a language constraint, not a semantic statement. In
-    ## practice, load_balance_loss, max_vio, bias_std, raw_logit_std, logit_std,
-    ## and bias_alignment are always populated by ShramForCausalLM.forward().
-    ## ce_loss is genuinely optional — present only when labels are supplied.
     ce_loss: torch.FloatTensor | None = None
     load_balance_loss: torch.FloatTensor | None = None
     max_vio: torch.FloatTensor | None = None
-    bias_std: torch.Tensor | None = None
-    raw_logit_std: torch.Tensor | None = None
     logit_std: torch.Tensor | None = None
-    bias_alignment: torch.Tensor | None = None
 class ShramForCausalLM(PreTrainedModel, GenerationMixin):
     """HuggingFace-facing causal language model wrapper for SHRAM.
@@ -4668,9 +4598,7 @@ class ShramForCausalLM(PreTrainedModel, GenerationMixin):
             - ``hidden_states`` when requested,
             - ``load_balance_loss`` — raw unweighted load-balance loss from the backbone,
             - ``max_vio`` — detached worst-case routing imbalance across layers,
-            - ``bias_std``, ``raw_logit_std``, ``logit_std``, ``bias_alignment`` —
-              detached load-balance health scalars averaged across decoder layers;
-              see ``ShramModel`` for interpretation.
         """
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         output_hidden_states = (
@@ -4777,8 +4705,5 @@ class ShramForCausalLM(PreTrainedModel, GenerationMixin):
             hidden_states=backbone_outputs["hidden_states"],
             load_balance_loss=backbone_outputs["load_balance_loss"],
             max_vio=backbone_outputs["max_vio"],
-            bias_std=backbone_outputs["bias_std"],
-            raw_logit_std=backbone_outputs["raw_logit_std"],
             logit_std=backbone_outputs["logit_std"],
-            bias_alignment=backbone_outputs["bias_alignment"],
         )

             correctness guard — exhausting it raises ``RuntimeError``. Must be >= 1.
             Default 10.
         load_balance_loss_type: Formula used for the load-balance auxiliary loss.
+            One of ``"gshard"``, ``"ce"``, ``"bce"``, or ``"temporal_overcapacity"``.
+            ``"temporal_overcapacity"`` is the default; it fires only when an expert
+            exceeds its allowed trajectory (controlled by ``maximum_expert_overclaim``)
+            and shuts off automatically once routing is balanced, allowing it to be
+            used with a strong weight without interfering with task training during
+            balanced routing. Default ``"temporal_overcapacity"``.
+        maximum_expert_overclaim: Maximum number of tokens an expert may receive above
+            its ideal allocation trajectory before the temporal overcapacity loss
+            fires. A value of 0 means violations trigger immediately at any imbalance.
+            Larger values permit short-lived semantic specialization before correction.
+            Only used when ``load_balance_loss_type="temporal_overcapacity"``.
+            Must be non-negative. Default 20.
     """
     model_type = "shram"
         tie_word_embeddings: bool = False,
         mosrah_overallocation_factor: float = 2.0,
         max_bid_rounds: int = 10,
+        load_balance_loss_type: str = "temporal_overcapacity",
+        maximum_expert_overclaim: int = 20,
         **kwargs
     ):
         if head_dim % 2 != 0:
                 f"max_bid_rounds must be at least 1, got {max_bid_rounds}."
             )
+        if maximum_expert_overclaim < 0:
+            raise ValueError(
+                f"maximum_expert_overclaim must be non-negative, "
+                f"got {maximum_expert_overclaim}."
+            )
+        _supported_loss_types = {"gshard", "ce", "bce", "temporal_overcapacity"}
         if load_balance_loss_type not in _supported_loss_types:
             supported = ", ".join(f'"{t}"' for t in sorted(_supported_loss_types))
             raise ValueError(
                 f"got {load_balance_loss_type!r}."
             )
         self.vocab_size = vocab_size
         self.embedding_width = embedding_width
         self.mlp_width = mlp_width
         self.mosrah_overallocation_factor = mosrah_overallocation_factor
         self.max_bid_rounds = max_bid_rounds
         self.load_balance_loss_type = load_balance_loss_type
+        self.maximum_expert_overclaim = maximum_expert_overclaim
         self.attention_dropout = attention_dropout
         self.use_cache = use_cache
 - "hidden_states": tuple of per-layer activations if output_hidden_states=True, else None
 - "load_balance_loss": scalar sum of per-layer SHRAM load-balance losses
 - "max_vio": detached scalar maximum routing-imbalance across all decoder layers
+- "logit_std": detached scalar mean per-layer per-token routing logit spread
 """
 paper. Given an input hidden state x, the router produces two outputs used downstream:
   - selected_heads (I): which K of the L available expert heads each token routes to,
+    determined by TopK over capacity-balanced routing scores.
   - routing_probs (P): the weights used for the weighted output reduction, gathered from
+    the routing scores at the selected indices and renormalized to sum to 1 per token.
+Routing uses a single learnable projection:
+  - routing_weight: shape (L, embedding_width). Maps input to per-head routing scores.
+    Both task loss and load_balance_loss train this parameter directly — there is no
+    gradient isolation between the two signals.
+This coupled design is intentional. SHRAM has an unusually strong task-level incentive
+to concentrate tokens into the same expert bucket (sparse attention only occurs among
+tokens routed to the same expert), so any indirect balancing pathway will be outlearned.
+Coupling the gradients allows the load balance loss to act with full strength directly
+on the parameter that determines routing.
+routing_weight is nn.Parameter so that HuggingFace _init_weights does not override
+its kaiming initialization at construction.
+routing_probs are computed before balance_capacity applies -1e8 sentinels. Post-capacity
+softmax would corrupt routing_probs for over-capacity experts (near-zero probability
+after masking does not reflect genuine routing preference).
+The router computes and returns:
+  - load_balance_loss: scalar auxiliary loss (see load_balance_loss.py); gradient flows
+    to routing_weight.
+  - max_vio: detached scalar summarising routing imbalance:
+      MaxVio = mean_b( L · max_l(f_bl − 1/L) )
+    where f_bl is the per-batch-item realised routing frequency of head l. Zero means
+    perfect balance; 1.0 means the most loaded head received double its fair share.
+  - logit_std: detached scalar; mean per-token standard deviation of routing logits.
+    Monitoring metric for routing sharpness.
 Paper ref: Appendix A.Routing, Appendix A.Load Balancing, §MaxVio.
 """
 # -----------
 """Log-probability auxiliary loss functions for MoSRAH load balancing.
+This module provides four load-balance loss formulations, two token-reduction
 helpers, and a factory that selects among the formulations. All formulations
 share the same external contract:
         active_mask:     Tensor[B, N],
     ) -> scalar Tensor
+    logits:          Pre-softmax routing scores, shape (B, N, L). Gradient flows
+                     through this tensor.
     assignment_mask: Per-token head-assignment indicators. assignment_mask[b, n, l]
                      is 1.0 if token (b, n) was assigned to head l. Dead tokens
                      should carry zero entries.
 Token reduction is split into two helpers with distinct roles:
     reduce_frequency_tokens — produces per-batch-item routing frequencies f_bl (B, L).
+        Called by gshard, ce, and bce. Output is detached; f_bl carries no gradient.
     reduce_probability_tokens — produces per-batch-item mean assignment probabilities
+        p_bl (B, L). Called only by gshard and bce. Gradient flows through the
+        internal softmax over logits.
 CE delegates probability computation to F.cross_entropy, which handles its own
 log_softmax and operates directly on the raw (B, N, L) logits.
+``make_load_balance_loss`` is the sole public entry point. The individual loss
+functions are internal implementation details; their signatures may change between
+units. Callers and tests must construct loss callables through the factory, not by
+importing or invoking the loss functions directly.
 """
     """
     f_bl = reduce_frequency_tokens(assignment_mask, active_mask)
     p_bl = reduce_probability_tokens(logits, active_mask)
+    # Clamp for numerical safety: softmax outputs are strictly positive in
+    # normal operation; the clamp guards the all-dead-tokens edge case where
+    # the mean defaults to zero. log1p(-p) avoids cancellation near p=1.
+    p = p_bl.clamp(min=1e-7, max=1.0 - 1e-7)
+    target = 1.0 - f_bl
+    return -(target * torch.log(p) + (1.0 - target) * torch.log1p(-p)).mean()
+def _temporal_overcapacity_loss(
+    logits: torch.Tensor,
+    assignment_mask: torch.Tensor,
+    active_mask: torch.Tensor,
+    expected_tokens_rate: float,
+    maximum_expert_overclaim: int,
+) -> torch.Tensor:
+    """Temporal overcapacity loss for MoSRAH load balancing.
+    Penalises routing decisions that select a head already overloaded relative to
+    its ideal allocation trajectory. A head is considered overloaded when the number
+    of active tokens before position n assigned to that head exceeds
+    cumulative_active_tokens * M + C, where M is the expected_tokens_rate (K/L) and
+    C is the maximum_expert_overclaim slack.
+    Loss is exactly zero when no head exceeds its trajectory, making it safe to
+    weight strongly — it stays out of the way when routing is balanced.
+    Args:
+        logits:                   Pre-softmax routing scores, shape (B, N, L).
+        assignment_mask:          Per-token head-assignment indicators, shape (B, N, L).
+                                  1.0 if token (b, n) is assigned to head l.
+        active_mask:              Boolean active-token mask, shape (B, N).
+        expected_tokens_rate (M): Ideal per-head allocation rate K/L. Pre-computed
+                                  by the factory so the division is not repeated each
+                                  forward pass.
+        maximum_expert_overclaim (C): Slack above the ideal trajectory before
+                                  imbalance fires. Larger C tolerates more deviation.
+    Returns:
+        Scalar loss tensor. Exactly 0.0 when no head exceeds its allowed trajectory.
+    """
+    # ── Algorithm overview ──────────────────────────────────────────────────────
+    #
+    # Problem: token routing is stateless — each token's TopK selection is blind to
+    # how many times each expert has already been chosen earlier in the sequence. A
+    # router that develops a strong preference for certain experts will overload them
+    # far beyond their K/L fair share with no correction signal at the moment of
+    # selection.
+    #
+    # Approach: track per-head assignment history as exclusive cumulative counts
+    # (assignments by all active tokens strictly before position n) and compare
+    # against an ideal trajectory S·M, where S is the exclusive cumulative active
+    # token count (active tokens strictly before position n) and M is the rate at
+    # which each head is expected to be selected under ideal balancing (K/L).
+    # A head is overloaded when its prior count exceeds that trajectory
+    # by more than C. When a token selects an already-overloaded head, the loss
+    # moment — mean(violating logits) minus mean(non-overloaded logits) — penalises
+    # the gap and pushes future routing toward underloaded alternatives.
+    # ── Routing history and imbalance threshold ──────────────────────────────────
+    #
+    # prior_assignment_counts is the exclusive routing history at each position:
+    # active assignments to each head by all tokens strictly before position n.
+    # Exclusive because it reflects only what was known when token n was being routed.
+    # cumulative_active_tokens grows by 1 per active token; the ideal per-head
+    # allocation at n is S·M. Exceeding that by more than C triggers imbalance.
+    active_float = active_mask.float()                                              # (B, N)
+    active_assignments = assignment_mask * active_float.unsqueeze(-1)               # (B, N, L)
+    # exclusive cumsums: subtract self to exclude position n
+    prior_assignment_counts = active_assignments.cumsum(dim=1) - active_assignments  # (B, N, L)
+    cumulative_active_tokens = active_float.cumsum(dim=1) - active_float             # (B, N)
+    maximum_supportable_assignments = (
+        cumulative_active_tokens.unsqueeze(-1) * expected_tokens_rate
+        + maximum_expert_overclaim
+    )                                                                                # (B, N, 1) → broadcasts to (B, N, L)
+    # ── Mask construction ────────────────────────────────────────────────────────
+    #
+    # Three derived masks:
+    #   imbalance_mask:           any head exceeding its trajectory.
+    #   violating_selection_mask: selected AND imbalanced — the penalty target.
+    #   non_overloaded_head_mask: NOT imbalanced, regardless of selection.
+    #
+    # Masking is deliberately asymmetric. A violation exists only when a head is
+    # over capacity AND is chosen by TopK, whereas routing mass may be redirected
+    # to any head that is not over capacity, whether or not it was selected.
+    imbalance_mask           = prior_assignment_counts > maximum_supportable_assignments  # (B, N, L)
+    violating_selection_mask = assignment_mask.bool() & imbalance_mask                   # (B, N, L)
+    non_overloaded_head_mask = ~imbalance_mask                                            # (B, N, L)
+    has_violation_mask       = violating_selection_mask.any(dim=-1)                       # (B, N)
+    # ── Loss moment ────────────────────────────────────────────────────────
+    #
+    # clamp(min=1.0) on the count denominators guards against division by zero
+    # when violation_count or non_overloaded_count is zero. has_violation_mask
+    # zeros positions with no violations at the gating step, so the clamped
+    # violation_count never contributes to the loss. When every head is
+    # overloaded, the masked sum is zero and mean_non_overloaded_logit is 0.
+    #
+    # One notable property of this moment is that it keeps the amount of
+    # transferred logit mass constant. That is, the gradient reduces violating
+    # logits and increases non-overloaded logits by equal total magnitude.
+    # Routing is redirected, not suppressed.
+    violation_count           = violating_selection_mask.float().sum(dim=-1).clamp(min=1.0)   # (B, N)
+    non_overloaded_count      = non_overloaded_head_mask.float().sum(dim=-1).clamp(min=1.0)   # (B, N)
+    mean_violating_logit      = (violating_selection_mask.float() * logits).sum(dim=-1) / violation_count      # (B, N)
+    mean_non_overloaded_logit = (non_overloaded_head_mask.float() * logits).sum(dim=-1) / non_overloaded_count  # (B, N)
+    raw_loss                  = mean_violating_logit - mean_non_overloaded_logit                                 # (B, N)
+    # ── Loss reduction ───────────────────────────────────────────────────────────
+    #
+    # Reduction is over active positions only; dead tokens are excluded from both
+    # numerator (gated by active_float) and denominator (active_count_per_seq).
+    # clamp(min=1.0) handles the all-dead-tokens edge case: gated_loss is zero
+    # there since active_float gates it, so the result is 0/1 = 0.
+    #
+    # Exact-zero guarantee: when no head exceeds its trajectory, has_violation_mask
+    # is all-False, gated_loss is zeroed everywhere, and the scalar return is
+    # exactly 0.0. The loss is inert when routing is balanced.
+    gated_loss           = active_float * has_violation_mask.float() * raw_loss           # (B, N)
+    active_count_per_seq = active_float.sum(dim=1).clamp(min=1.0)                         # (B,)
+    sequence_loss        = gated_loss.sum(dim=1) / active_count_per_seq                   # (B,)
+    final_loss           = sequence_loss.mean()
+    return final_loss
 # ---------------------------------------------------------------------------
 # Factory
 # ---------------------------------------------------------------------------
+def _gshard_factory(**kwargs: object) -> Callable[[torch.Tensor, torch.Tensor, torch.Tensor], torch.Tensor]:
+    return gshard_loss
+def _ce_factory(**kwargs: object) -> Callable[[torch.Tensor, torch.Tensor, torch.Tensor], torch.Tensor]:
+    return ce_loss
+def _bce_factory(**kwargs: object) -> Callable[[torch.Tensor, torch.Tensor, torch.Tensor], torch.Tensor]:
+    return bce_loss
+def _temporal_overcapacity_factory(
+    num_selected_heads: int,
+    num_total_heads: int,
+    maximum_expert_overclaim: int,
+    **kwargs: object,
+) -> Callable[[torch.Tensor, torch.Tensor, torch.Tensor], torch.Tensor]:
+    expected_tokens_rate = num_selected_heads / num_total_heads
+    def _runtime(
+        logits: torch.Tensor,
+        assignment_mask: torch.Tensor,
+        active_mask: torch.Tensor,
+    ) -> torch.Tensor:
+        return _temporal_overcapacity_loss(
+            logits, assignment_mask, active_mask,
+            expected_tokens_rate=expected_tokens_rate,
+            maximum_expert_overclaim=maximum_expert_overclaim,
+        )
+    return _runtime
+_LOSS_REGISTRY: dict[str, Callable[..., Callable[[torch.Tensor, torch.Tensor, torch.Tensor], torch.Tensor]]] = {
+    "gshard": _gshard_factory,
+    "ce": _ce_factory,
+    "bce": _bce_factory,
+    "temporal_overcapacity": _temporal_overcapacity_factory,
 }
 def make_load_balance_loss(
     loss_type: str,
+    **loss_parameters: object,
 ) -> Callable[[torch.Tensor, torch.Tensor, torch.Tensor], torch.Tensor]:
     """Return a load-balance loss callable for the requested formulation.
             active_mask:     Tensor[B, N],
         ) -> scalar Tensor
+    Keyword arguments are forwarded to the selected factory. The gshard, ce, and bce
+    factories silently ignore all kwargs; this allows callers to pass loss-type-specific
+    parameters (e.g. for temporal_overcapacity) without branching on loss_type.
     Args:
+        loss_type:        One of ``"gshard"``, ``"ce"``, ``"bce"``, or
+                          ``"temporal_overcapacity"``.
+        **loss_parameters: Construction-time parameters forwarded to the factory.
     Returns:
         Loss callable matching the shared contract.
         raise ValueError(
             f"load_balance_loss_type must be one of {supported}, got {loss_type!r}."
         )
+    return _LOSS_REGISTRY[loss_type](**loss_parameters)
 class MoSRAHRouter(nn.Module):
     """Token-choice router for MoSRAH sparse attention.
+    Each input token independently selects K of the L available expert heads.
+    A single routing projection maps input hidden states to per-head scores; both
+    task loss and load_balance_loss train this projection directly.
+    routing_weight is nn.Parameter rather than nn.Linear so that HuggingFace
+    _init_weights does not override its kaiming initialization at construction.
     Attributes:
+        routing_weight: Shape (L, embedding_width). Maps input hidden states to
+            per-head routing scores. Receives gradients from both task loss and
+            load_balance_loss.
     Args:
         config: Model configuration. Must expose ``embedding_width``,
+            ``num_mosrah_heads`` (L), ``num_selected_heads`` (K),
+            ``load_balance_loss_type``, ``maximum_expert_overclaim``, ``max_bid_rounds``,
+            ``use_cache``, ``mosrah_cache_length``, and ``mosrah_packed_length``.
     """
     def __init__(self, config: ShramConfig) -> None:
             self.capacity = config.mosrah_packed_length
         self.max_bid_rounds = config.max_bid_rounds
+        self._load_balance_loss = make_load_balance_loss(
+            config.load_balance_loss_type,
+            num_selected_heads=config.num_selected_heads,
+            num_total_heads=config.num_mosrah_heads,
+            maximum_expert_overclaim=config.maximum_expert_overclaim,
         )
+        # Routing projection: maps input (B, N, d) to per-head routing scores (B, N, L).
         # nn.Parameter ensures HuggingFace _init_weights does not override kaiming init.
+        self.routing_weight = nn.Parameter(
             torch.empty(config.num_mosrah_heads, config.embedding_width)
         )
+        nn.init.kaiming_normal_(self.routing_weight)
     @staticmethod
     def get_best_proposals(
         Returns:
             selected_heads: Head indices I of shape (batch, seq_len, num_selected_heads).
                 Each token's K selected head indices, determined by TopK on
+                capacity-balanced routing scores.
             routing_probs: Routing probabilities P of shape (batch, seq_len,
+                num_selected_heads). Gathered from pre-capacity routing softmax at
                 selected_heads indices and renormalized to sum to 1 per token.
             router_diagnostics: Dict of routing feedback scalars. Keys:
                 - ``load_balance_loss``: scalar load-balance loss with gradient.
                 - ``max_vio``: detached scalar routing-imbalance summary.
+                - ``logit_std``: detached mean per-token std of routing logits;
+                  monitoring metric for routing sharpness.
         """
         B, N, _ = x.shape
         L = self.num_mosrah_heads
         K = self.num_selected_heads
+        # ── Phase: pre-capacity scoring ───────────────────────────────────────
+        #
+        # Establishes the clean pre-sentinel distribution that all downstream
+        # consumers draw from. logit_std must be captured here — balance_capacity
+        # injects -1e8 sentinels that would corrupt the standard deviation.
+        # routing_scores is the pre-capacity probability distribution; both the
+        # load balance signal and the final routing_probs gather from it.
+        routing_logits = self._compute_routing_logits(x)                       # (B, N, L)
+        logit_std      = routing_logits.std(dim=-1).mean().detach()
+        routing_scores = F.softmax(routing_logits, dim=-1)                     # (B, N, L)
+        # ── Phase: load balance signal ────────────────────────────────────────
+        #
+        # The loss must observe the unconstrained routing decision — the genuine
+        # routing pressure before capacity enforcement masks any imbalance.
+        # pre_cap_heads and assignment_mask exist solely to give the loss this
+        # honest view; nothing downstream uses them.
+        pre_cap_heads   = routing_scores.topk(K, dim=-1).indices               # (B, N, K)
+        assignment_mask = torch.zeros(B, N, L, device=x.device, dtype=x.dtype)
+        assignment_mask.scatter_(-1, pre_cap_heads, 1.0)
+        load_balance_loss = self._load_balance_loss(
+            routing_logits, assignment_mask, active_mask
         )
+        # ── Phase: capacity enforcement and final selection ───────────────────
+        #
+        # Produces the capacity-enforced routing that all downstream consumers
+        # depend on. max_vio is computed here because it measures realized routing
+        # imbalance — the actual post-capacity assignment, not the unconstrained
+        # preference. routing_probs are gathered from the pre-capacity routing_scores
+        # (not the balanced distribution) to avoid sentinel corruption — overloaded
+        # experts would otherwise receive near-zero probability regardless of genuine
+        # routing preference.
+        balanced_logits = self.balance_capacity(
+            routing_logits,
             used_capacity,
             self.capacity,
             self.num_selected_heads,
             self.max_bid_rounds,
         )
+        selected_heads = F.softmax(balanced_logits, dim=-1).topk(K, dim=-1).indices  # (B, N, K)
+        realized_mask = torch.zeros(B, N, L, device=x.device, dtype=x.dtype)
+        realized_mask.scatter_(-1, selected_heads, 1.0)
+        max_vio = self._compute_max_vio(realized_mask, active_mask, L)
         gathered      = routing_scores.gather(dim=-1, index=selected_heads)    # (B, N, K)
         routing_probs = gathered / gathered.sum(dim=-1, keepdim=True)          # P, (B, N, K)
         router_diagnostics = {
             "load_balance_loss": load_balance_loss,
+            "max_vio":           max_vio,
+            "logit_std":         logit_std,
         }
         return selected_heads, routing_probs, router_diagnostics
+    def _compute_routing_logits(self, x: torch.Tensor) -> torch.Tensor:
+        """Compute per-head routing logits from input hidden states.
         Args:
             x: Input hidden states, shape (batch, seq_len, embedding_width).
         Returns:
+            Routing logits, shape (batch, seq_len, num_mosrah_heads).
         """
+        return F.linear(x, self.routing_weight)                                # (B, N, L)
     @staticmethod
     def _compute_max_vio(
             - ``"max_vio"``: detached scalar maximum routing-imbalance across
               all decoder layers. Zero means perfectly balanced routing across
               every layer; higher values identify the worst-case head imbalance.
             - ``"logit_std"``: detached scalar — mean across layers of the
+              per-token routing logit spread. Monitoring metric for routing
+              sharpness.
         """
         hidden_states = inputs_embeds
         all_hidden_states = (hidden_states,) if output_hidden_states else None
         total_load_balance_loss = inputs_embeds.new_zeros(())
         max_vio = inputs_embeds.new_zeros(())
         total_logit_std = inputs_embeds.new_zeros(())
         for layer_idx, layer in enumerate(self.layers):
             layer_cache = None if cache is None else cache.layers[layer_idx]
             )
             total_load_balance_loss = total_load_balance_loss + layer_diagnostics["load_balance_loss"]
             max_vio = torch.maximum(max_vio, layer_diagnostics["max_vio"])
             total_logit_std = total_logit_std + layer_diagnostics["logit_std"]
             if output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
             "hidden_states": all_hidden_states,
             "load_balance_loss": total_load_balance_loss,
             "max_vio": max_vio,
             "logit_std": total_logit_std / num_layers,
         }
     ## Python dataclass inheritance violation: CausalLMOutputWithPast defaults all
     ## fields to None, which forces every subclass field to also carry a default.
     ## The = None below is a language constraint, not a semantic statement. In
+    ## practice, load_balance_loss, max_vio, and logit_std are always populated
+    ## by ShramForCausalLM.forward(). ce_loss is genuinely optional — present
+    ## only when labels are supplied.
     ce_loss: torch.FloatTensor | None = None
     load_balance_loss: torch.FloatTensor | None = None
     max_vio: torch.FloatTensor | None = None
     logit_std: torch.Tensor | None = None
 class ShramForCausalLM(PreTrainedModel, GenerationMixin):
     """HuggingFace-facing causal language model wrapper for SHRAM.
             - ``hidden_states`` when requested,
             - ``load_balance_loss`` — raw unweighted load-balance loss from the backbone,
             - ``max_vio`` — detached worst-case routing imbalance across layers,
+            - ``logit_std`` — detached mean per-token routing logit spread across layers.
         """
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         output_hidden_states = (
             hidden_states=backbone_outputs["hidden_states"],
             load_balance_loss=backbone_outputs["load_balance_loss"],
             max_vio=backbone_outputs["max_vio"],
             logit_std=backbone_outputs["logit_std"],
         )