Update architecture and tokenizer

Browse files

Files changed (4) hide show

README.md +1 -2
config.json +1 -2
configuration.py +13 -23
huggingface.py +474 -208

README.md CHANGED Viewed

@@ -83,7 +83,6 @@ contains no weights. All values are overridable via kwargs.
 | `head_dim` | 16 |
 | `inference_sequence_length` | 1024 |
 | `load_balance_loss_type` | ce |
-| `load_balance_p` | 1.0 |
 | `local_rope_theta` | 10000.0 |
 | `max_bid_rounds` | 10 |
 | `mlp_width` | 1366 |
@@ -96,7 +95,7 @@ contains no weights. All values are overridable via kwargs.
 | `output_hidden_states` | False |
 | `rms_norm_eps` | 1e-05 |
 | `rope_mode` | main_sequence |
-| `router_init_scale` | 0.0001 |
 | `tie_word_embeddings` | False |
 | `training_sequence_length` | 1024 |
 | `use_cache` | True |

 | `head_dim` | 16 |
 | `inference_sequence_length` | 1024 |
 | `load_balance_loss_type` | ce |
 | `local_rope_theta` | 10000.0 |
 | `max_bid_rounds` | 10 |
 | `mlp_width` | 1366 |
 | `output_hidden_states` | False |
 | `rms_norm_eps` | 1e-05 |
 | `rope_mode` | main_sequence |
+| `routing_mode` | integral |
 | `tie_word_embeddings` | False |
 | `training_sequence_length` | 1024 |
 | `use_cache` | True |

config.json CHANGED Viewed

@@ -10,7 +10,6 @@
   "head_dim": 16,
   "inference_sequence_length": 1024,
   "load_balance_loss_type": "ce",
-  "load_balance_p": 1.0,
   "local_rope_theta": 10000.0,
   "max_bid_rounds": 10,
   "mlp_width": 1366,
@@ -23,7 +22,7 @@
   "num_sliding_window_heads": 16,
   "rms_norm_eps": 1e-05,
   "rope_mode": "main_sequence",
-  "router_init_scale": 0.0001,
   "tie_word_embeddings": false,
   "training_sequence_length": 1024,
   "transformers_version": "5.10.2",

   "head_dim": 16,
   "inference_sequence_length": 1024,
   "load_balance_loss_type": "ce",
   "local_rope_theta": 10000.0,
   "max_bid_rounds": 10,
   "mlp_width": 1366,
   "num_sliding_window_heads": 16,
   "rms_norm_eps": 1e-05,
   "rope_mode": "main_sequence",
+  "routing_mode": "integral",
   "tie_word_embeddings": false,
   "training_sequence_length": 1024,
   "transformers_version": "5.10.2",

configuration.py CHANGED Viewed

@@ -84,10 +84,6 @@ class ShramConfig(PretrainedConfig):
             num_selected_heads / num_mosrah_heads * mosrah_overallocation_factor).
             Must be > 1.0 to guarantee a buffer larger than the balanced-routing
             baseline. Default 2.0.
-        load_balance_p: Exponent p for the p-mean aggregation of per-item routing
-            frequencies into the load balance signal. Higher p weights aggregation
-            toward the worst-case batch item, making the correction signal more
-            sensitive to per-item allocation spikes. Must be positive. Default 2.0.
         max_bid_rounds: Maximum bidding rounds for the deferred-acceptance capacity
             solver in ``balance_capacity``. 10 covers convergence at approximately
             the 98th percentile of routing densities; the top 2% of extreme-density
@@ -99,11 +95,13 @@ class ShramConfig(PretrainedConfig):
             is the default; its log-probability signal scales with violation severity
             and makes correction magnitude proportional to routing imbalance.
             Default ``"ce"``.
-        router_init_scale: Initial standard deviation for the ``routing_scale``
-            scalar gate on routing logits. Brings routing logit magnitude to
-            ``expert_bias`` scale at initialisation so load balancing is operative
-            from step one. Must be positive. Default ``1e-4``. Note lower values
-            may require more bidding rounds to converge and more overcapacity to support.
     """
     model_type = "shram"
@@ -137,10 +135,9 @@ class ShramConfig(PretrainedConfig):
         output_hidden_states: bool = False,
         tie_word_embeddings: bool = False,
         mosrah_overallocation_factor: float = 2.0,
-        load_balance_p: float = 1.0,
         max_bid_rounds: int = 10,
         load_balance_loss_type: str = "ce",
-        router_init_scale: float = 1e-4,
         **kwargs
     ):
         if head_dim % 2 != 0:
@@ -176,11 +173,6 @@ class ShramConfig(PretrainedConfig):
                 f"Got {mosrah_overallocation_factor}."
             )
-        if load_balance_p <= 0.0:
-            raise ValueError(
-                f"load_balance_p must be positive, got {load_balance_p}."
-            )
         if max_bid_rounds < 1:
             raise ValueError(
                 f"max_bid_rounds must be at least 1, got {max_bid_rounds}."
@@ -193,13 +185,12 @@ class ShramConfig(PretrainedConfig):
                 f"load_balance_loss_type must be one of {supported}, "
                 f"got {load_balance_loss_type!r}."
             )
-        if load_balance_loss_type == "ce" and load_balance_p != 1.0:
-            raise ValueError("In cross entropy mode, aggregation of "
-                             "frequencies must be with mean 1.0")
-        if router_init_scale <= 0.0:
             raise ValueError(
-                f"router_init_scale must be positive, got {router_init_scale}."
             )
         self.vocab_size = vocab_size
@@ -220,10 +211,9 @@ class ShramConfig(PretrainedConfig):
         self.alpha = alpha
         self.beta = beta
         self.mosrah_overallocation_factor = mosrah_overallocation_factor
-        self.load_balance_p = load_balance_p
         self.max_bid_rounds = max_bid_rounds
         self.load_balance_loss_type = load_balance_loss_type
-        self.router_init_scale = router_init_scale
         self.attention_dropout = attention_dropout
         self.use_cache = use_cache

             num_selected_heads / num_mosrah_heads * mosrah_overallocation_factor).
             Must be > 1.0 to guarantee a buffer larger than the balanced-routing
             baseline. Default 2.0.
         max_bid_rounds: Maximum bidding rounds for the deferred-acceptance capacity
             solver in ``balance_capacity``. 10 covers convergence at approximately
             the 98th percentile of routing densities; the top 2% of extreme-density
             is the default; its log-probability signal scales with violation severity
             and makes correction magnitude proportional to routing imbalance.
             Default ``"ce"``.
+        routing_mode: Routing computation mode; must be ``"integral"`` or
+            ``"default"`` (any other value raises ``ValueError``). ``"integral"``
+            (the default value) enables the integral routing extension: the exclusive
+            cumsum of routing logits along the sequence dimension is mapped through
+            two additional (L, L) parameter matrices (``routing_integral_weight`` A'
+            and ``balance_integral_weight`` B') and added as corrections to both
+            logit pathways. Because the cumsum is exclusive, each token sees the
+            routing logits accumulated over the preceding tokens of the sequence,
+            not its own. ``"default"`` disables the extension; A' and B' are not
+            created. Note that, despite its name, ``"default"`` is not the default
+            value of this argument.
     """
     model_type = "shram"
         output_hidden_states: bool = False,
         tie_word_embeddings: bool = False,
         mosrah_overallocation_factor: float = 2.0,
         max_bid_rounds: int = 10,
         load_balance_loss_type: str = "ce",
+        routing_mode: str = "integral",
         **kwargs
     ):
         if head_dim % 2 != 0:
                 f"Got {mosrah_overallocation_factor}."
             )
         if max_bid_rounds < 1:
             raise ValueError(
                 f"max_bid_rounds must be at least 1, got {max_bid_rounds}."
                 f"load_balance_loss_type must be one of {supported}, "
                 f"got {load_balance_loss_type!r}."
             )
+        _supported_routing_modes = {"default", "integral"}
+        if routing_mode not in _supported_routing_modes:
+            supported = ", ".join(f'"{m}"' for m in sorted(_supported_routing_modes))
             raise ValueError(
+                f"routing_mode must be one of {supported}, got {routing_mode!r}."
             )
         self.vocab_size = vocab_size
         self.alpha = alpha
         self.beta = beta
         self.mosrah_overallocation_factor = mosrah_overallocation_factor
         self.max_bid_rounds = max_bid_rounds
         self.load_balance_loss_type = load_balance_loss_type
+        self.routing_mode = routing_mode
         self.attention_dropout = attention_dropout
         self.use_cache = use_cache

huggingface.py CHANGED Viewed

@@ -45,7 +45,6 @@ from torch.nn.attention.flex_attention import create_block_mask
 from torch.nn.attention.flex_attention import flex_attention
 import torch.nn.functional as F
 from typing import Callable
-from typing import Optional
@@ -172,10 +171,6 @@ class ShramConfig(PretrainedConfig):
             num_selected_heads / num_mosrah_heads * mosrah_overallocation_factor).
             Must be > 1.0 to guarantee a buffer larger than the balanced-routing
             baseline. Default 2.0.
-        load_balance_p: Exponent p for the p-mean aggregation of per-item routing
-            frequencies into the load balance signal. Higher p weights aggregation
-            toward the worst-case batch item, making the correction signal more
-            sensitive to per-item allocation spikes. Must be positive. Default 2.0.
         max_bid_rounds: Maximum bidding rounds for the deferred-acceptance capacity
             solver in ``balance_capacity``. 10 covers convergence at approximately
             the 98th percentile of routing densities; the top 2% of extreme-density
@@ -187,11 +182,13 @@ class ShramConfig(PretrainedConfig):
             is the default; its log-probability signal scales with violation severity
             and makes correction magnitude proportional to routing imbalance.
             Default ``"ce"``.
-        router_init_scale: Initial standard deviation for the ``routing_scale``
-            scalar gate on routing logits. Brings routing logit magnitude to
-            ``expert_bias`` scale at initialisation so load balancing is operative
-            from step one. Must be positive. Default ``1e-4``. Note lower values
-            may require more bidding rounds to converge and more overcapacity to support.
     """
     model_type = "shram"
@@ -225,10 +222,9 @@ class ShramConfig(PretrainedConfig):
         output_hidden_states: bool = False,
         tie_word_embeddings: bool = False,
         mosrah_overallocation_factor: float = 2.0,
-        load_balance_p: float = 1.0,
         max_bid_rounds: int = 10,
         load_balance_loss_type: str = "ce",
-        router_init_scale: float = 1e-4,
         **kwargs
     ):
         if head_dim % 2 != 0:
@@ -264,11 +260,6 @@ class ShramConfig(PretrainedConfig):
                 f"Got {mosrah_overallocation_factor}."
             )
-        if load_balance_p <= 0.0:
-            raise ValueError(
-                f"load_balance_p must be positive, got {load_balance_p}."
-            )
         if max_bid_rounds < 1:
             raise ValueError(
                 f"max_bid_rounds must be at least 1, got {max_bid_rounds}."
@@ -281,13 +272,12 @@ class ShramConfig(PretrainedConfig):
                 f"load_balance_loss_type must be one of {supported}, "
                 f"got {load_balance_loss_type!r}."
             )
-        if load_balance_loss_type == "ce" and load_balance_p != 1.0:
-            raise ValueError("In cross entropy mode, aggregation of "
-                             "frequencies must be with mean 1.0")
-        if router_init_scale <= 0.0:
             raise ValueError(
-                f"router_init_scale must be positive, got {router_init_scale}."
             )
         self.vocab_size = vocab_size
@@ -308,10 +298,9 @@ class ShramConfig(PretrainedConfig):
         self.alpha = alpha
         self.beta = beta
         self.mosrah_overallocation_factor = mosrah_overallocation_factor
-        self.load_balance_p = load_balance_p
         self.max_bid_rounds = max_bid_rounds
         self.load_balance_loss_type = load_balance_loss_type
-        self.router_init_scale = router_init_scale
         self.attention_dropout = attention_dropout
         self.use_cache = use_cache
@@ -2741,24 +2730,53 @@ paper. Given an input hidden state x, the router produces two outputs used downs
     the semantic routing scores at the selected indices and renormalized to sum to 1
     per token.
-Routing computation uses two gradient-isolated pathways over numerically identical
-biased values:
-  - semantic_logits = logits + expert_bias.detach(): drives selection and routing_probs.
-    Task gradients reach routing_projection.weight; expert_bias is isolated from task loss.
-  - load_balancing_logits = logits.detach() + expert_bias: drives assignment_probs.
-    Load balance gradients reach expert_bias; routing_projection.weight is isolated from
-    load balance loss.
-No unbiased routing computation exists. All routing uses biased values. The separation
-of gradient paths replaces the previous biased/unbiased split, closing the loophole where
-a bias-redirected expert could be selected but contribute negligibly to the output because
-its unbiased preference — and thus its routing_prob — remained near zero.
 Assignment probabilities are computed before balance_capacity applies -1e8 sentinels.
 Post-capacity softmax would invert the load balance gradient for over-capacity experts
-(near-zero probability after masking signals "increase bias" for an already-overloaded
-expert).
 The router also computes and returns the load balance loss via a log-probability auxiliary
 loss (see load_balance_loss.py). The loss formulation is selected by config; the default
@@ -2767,10 +2785,11 @@ is cross-entropy.
 The router additionally computes and returns MaxVio, a detached scalar summarising
 routing imbalance for the current forward pass:
-    MaxVio = L · max_l(f_l − 1/L)
-where f_l is the realised routing frequency of head l and 1/L is the perfectly balanced
-target. MaxVio is a monitoring quantity only; it never contributes gradients.
 Paper ref: Appendix A.Routing, Appendix A.Load Balancing, §MaxVio.
 """
@@ -2785,130 +2804,228 @@ Paper ref: Appendix A.Routing, Appendix A.Load Balancing, §MaxVio.
 # -----------
 """Log-probability auxiliary loss functions for MoSRAH load balancing.
-This module provides three load-balance loss formulations and a factory that selects
-among them. All formulations share the same external contract and the same gradient
-isolation property: assignment probabilities are computed from detached logits plus
-expert_bias, so only expert_bias receives gradients from the loss signal. The routing
-projection weights are not reachable from any returned loss.
-The factory is the intended entry point. The caller (MoSRAHRouter) constructs the
-loss callable once at init and invokes it each forward pass.
-Log-probability formulations (ce, bce) are preferred over linear ones (gshard) because
-their gradient magnitude scales with how far the distribution deviates from the target.
-A linear signal can be outrun by routing concentrations that diverge nonlinearly; a
-log-probability signal cannot.
-The external contract for all returned callables is:
-    loss_fn(routing_freqs, assignment_probs) -> scalar Tensor
-    routing_freqs:    (L,) realized routing frequencies f_i, detached.
-    assignment_probs: (L,) soft assignment probabilities p_i with gradient through
-                      expert_bias. Caller must compute these via
-                      softmax(logits.detach() + expert_bias) to preserve isolation.
 """
 # ---------------------------------------------------------------------------
 # Loss functions
 # ---------------------------------------------------------------------------
 def gshard_loss(
-    routing_freqs: torch.Tensor,
-    assignment_probs: torch.Tensor,
 ) -> torch.Tensor:
     """GShard-style linear load-balance loss.
-    Computes (1/L) * Σ_i f_i * p_i, where L is the number of expert heads,
-    f_i is the realized routing frequency for head i, and p_i is the soft
-    assignment probability for head i.
-    The fixed point of this loss under gradient descent is uniform routing:
-    when p_i = 1/L for all i, the loss is minimized at 1/L (independent of f_i).
-    The linear signal is the weakest of the three formulations — gradient magnitude
-    does not grow with deviation from the target. Provided for comparison.
     Args:
-        routing_freqs: Realized routing frequencies f_i, shape (L,). Detached.
-        assignment_probs: Soft assignment probabilities p_i, shape (L,). Gradient
-            flows to expert_bias through this tensor.
     Returns:
         Scalar loss tensor.
     """
-    L = routing_freqs.shape[0]
-    return (routing_freqs * assignment_probs).sum() / L
 def ce_loss(
-    routing_freqs: torch.Tensor,
-    assignment_probs: torch.Tensor,
 ) -> torch.Tensor:
     """Cross-entropy load-balance loss.
-    Computes -(1/(L-1)) * Σ_i (1 - f_i) * log(p_i), where the weight (1 - f_i)
-    suppresses the signal for overloaded heads (high f_i → weight near zero) and
-    amplifies it for underloaded heads (low f_i → weight near 1). This makes the
-    loss push probability mass toward under-utilized experts.
-    The (1/(L-1)) normalization makes the coefficient interpretable as a controller
-    strength independent of expert count. The log-probability signal grows as p_i
-    deviates from the target, providing correction that scales with violation severity.
     Args:
-        routing_freqs: Realized routing frequencies f_i, shape (L,). Detached.
-        assignment_probs: Soft assignment probabilities p_i, shape (L,). Gradient
-            flows to expert_bias through this tensor.
     Returns:
         Scalar loss tensor.
     """
-    L = routing_freqs.shape[0]
-    # Numerical stability: torch.log is safe here because softmax outputs are
-    # strictly positive. The (1 - f_i) weight goes to zero exactly when f_i = 1,
-    # which can only occur with a single head, so the 0 * (-inf) degenerate case
-    # does not arise in practice.
-    return -(((1.0 - routing_freqs) * torch.log(assignment_probs)).sum()) / (L - 1)
 def bce_loss(
-    routing_freqs: torch.Tensor,
-    assignment_probs: torch.Tensor,
 ) -> torch.Tensor:
     """Binary cross-entropy load-balance loss.
-    Computes -(1/L) * Σ_i [(1 - f_i) * log(p_i) + f_i * log(1 - p_i)], where
-    each head is treated as an independent binary target. Unlike CE, BCE maintains
-    a repulsion signal from saturated experts: when f_i → 1, the weight on
-    log(1 - p_i) drives p_i away from 1, preventing runaway concentration.
-    log(1 - p_i) is computed as log1p(-p_i) for numerical safety near p_i = 1.
     Args:
-        routing_freqs: Realized routing frequencies f_i, shape (L,). Detached.
-        assignment_probs: Soft assignment probabilities p_i, shape (L,). Gradient
-            flows to expert_bias through this tensor.
     Returns:
         Scalar loss tensor.
     """
-    L = routing_freqs.shape[0]
-    positive_term = (1.0 - routing_freqs) * torch.log(assignment_probs)
-    # log1p(-p) instead of log(1-p): avoids catastrophic cancellation when p is
-    # close to 1, where (1 - p) loses precision and log produces large errors.
-    negative_term = routing_freqs * torch.log1p(-assignment_probs)
-    return -(positive_term + negative_term).sum() / L
 # ---------------------------------------------------------------------------
 # Factory
 # ---------------------------------------------------------------------------
-_LOSS_REGISTRY: dict[str, Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = {
     "gshard": gshard_loss,
     "ce": ce_loss,
     "bce": bce_loss,
@@ -2917,15 +3034,19 @@ _LOSS_REGISTRY: dict[str, Callable[[torch.Tensor, torch.Tensor], torch.Tensor]]
 def make_load_balance_loss(
     loss_type: str,
-) -> Callable[[torch.Tensor, torch.Tensor], torch.Tensor]:
     """Return a load-balance loss callable for the requested formulation.
-    All returned callables share the same external contract:
-        loss_fn(routing_freqs: Tensor, assignment_probs: Tensor) -> scalar Tensor
-    The caller is responsible for computing assignment_probs via
-    softmax(logits.detach() + expert_bias) to ensure gradient isolation.
     Args:
         loss_type: One of ``"gshard"``, ``"ce"``, or ``"bce"``.
@@ -2944,55 +3065,77 @@ def make_load_balance_loss(
     return _LOSS_REGISTRY[loss_type]
 class MoSRAHRouter(nn.Module):
     """Token-choice router for MoSRAH sparse attention.
     Each input token independently selects K of the L available expert heads. Both
-    selection and routing_probs incorporate expert_bias via two gradient-isolated
-    pathways over numerically identical biased values. See module docstring for the
-    two-pathway architecture.
-    The routing projection W_r has no bias term — the paper specifies xW_r with no
-    additional projection bias. The only bias-like parameter is expert_bias (b), which
-    has an entirely separate role and gradient path.
     Args:
-        config: Model configuration. Must expose ``hidden_size``, ``num_mosrah_heads``
-            (L), and ``num_selected_heads`` (K).
     """
     def __init__(self, config: ShramConfig) -> None:
         super().__init__()
         self.num_mosrah_heads = config.num_mosrah_heads
         self.num_selected_heads = config.num_selected_heads
-        self.load_balance_p = config.load_balance_p
         if config.use_cache:
             self.capacity = config.mosrah_cache_length
         else:
             self.capacity = config.mosrah_packed_length
         self.max_bid_rounds = config.max_bid_rounds
         self._load_balance_loss = make_load_balance_loss(config.load_balance_loss_type)
-        # W_r: routing projection, no bias (paper specifies xW_r, no additional term).
-        self.routing_projection = nn.Linear(
-            config.embedding_width, config.num_mosrah_heads, bias=False
         )
-        # Scalar gate on routing logits. As an nn.Parameter it is exempt from
-        # HuggingFace _init_weights, so its near-zero initial value is preserved
-        # after from_config construction. Near-zero initialization ensures routing
-        # starts near-uniform and expert_bias has leverage over logits from step one.
-        self.routing_scale = nn.Parameter(
-            torch.full((1,), config.router_init_scale)
         )
-        # b: learned per-head bias for load balancing. Initialized to zero so that all
-        # heads start with equal selection probability. Updated by the main optimizer
-        # via gradients from the load balance loss through load_balancing_logits.
-        self.expert_bias = nn.Parameter(torch.zeros(config.num_mosrah_heads))
     @staticmethod
     def get_best_proposals(
@@ -3228,7 +3371,7 @@ class MoSRAHRouter(nn.Module):
         """Route input tokens to K expert heads each and compute routing probabilities.
         Args:
-            x: Input hidden states of shape (batch, seq_len, hidden_size).
             active_mask: Current-chunk active mask of shape (batch, seq_len), where
                 True means the token is semantically live. Dead tokens do not
                 contribute to routing frequencies, load_balance_loss, or max_vio.
@@ -3244,56 +3387,39 @@ class MoSRAHRouter(nn.Module):
             router_diagnostics: Dict of routing feedback scalars. Keys:
                 - ``load_balance_loss``: scalar load-balance loss with gradient.
                 - ``max_vio``: detached scalar routing-imbalance summary.
-                - ``bias_std``: std of expert_bias; near-zero means corrections have not built up.
-                - ``raw_logit_std``: mean per-token std of scaled logits; the natural routing scale.
                 - ``logit_std``: mean per-token std of semantic_logits; lower than
-                  raw_logit_std means bias is flattening preferences (healthy correction).
-                - ``bias_alignment``: mean cosine similarity of expert_bias against per-token
-                  logits. Negative means bias opposes routing direction (healthy correction);
-                  positive means runaway reinforcement.
         """
         B, N, _ = x.shape
         L = self.num_mosrah_heads
         K = self.num_selected_heads
-        # Scaled logits. routing_scale is a near-zero nn.Parameter exempt from
-        # HuggingFace _init_weights, so routing starts near-uniform and expert_bias
-        # has leverage from step one.
-        logits = self.routing_projection(x) * self.routing_scale    # (B, N, L)
-        # Two gradient-isolated pathways over numerically identical biased values.
-        # semantic_logits: task gradients reach routing_projection; expert_bias isolated.
-        # load_balancing_logits: load balance gradients reach expert_bias; routing_projection isolated.
-        semantic_logits       = logits + self.expert_bias.detach()   # (B, N, L)
-        load_balancing_logits = logits.detach() + self.expert_bias   # (B, N, L)
-        # Diagnostic scalars characterising the load-balance mechanism. Must be
-        # computed here — before balance_capacity injects -1e8 sentinels that
-        # would corrupt std and cosine similarity.
-        bias_std       = self.expert_bias.std().detach()
-        raw_logit_std  = logits.std(dim=-1).mean().detach()
-        logit_std      = semantic_logits.std(dim=-1).mean().detach()
-        bias_alignment = F.cosine_similarity(
-            logits, self.expert_bias.expand_as(logits), dim=-1
-        ).mean().detach()
-        # Assignment probabilities for load balance loss. Computed from load_balancing_logits
-        # before balance_capacity so that -1e8 sentinels do not invert the load balance
-        # gradient for over-capacity experts. active_float is reused below for routing freqs.
-        active_float     = active_mask.float().unsqueeze(-1)                          # (B, N, 1)
-        lb_softmax        = F.softmax(load_balancing_logits, dim=-1)                  # (B, N, L)
-        assignment_probs  = (lb_softmax * active_float).sum(dim=(0, 1))               # (L,) unnorm
-        assignment_probs  = assignment_probs / active_mask.float().sum()              # (L,) norm
         # Pre-capacity semantic softmax for gathering routing_probs. Computed before
         # balance_capacity so that gathered probabilities reflect genuine preference
         # magnitudes rather than hard-masked sentinel values.
-        routing_scores = F.softmax(semantic_logits, dim=-1)          # (B, N, L)
         # Capacity-balanced semantic logits for selection. Injects -1e8 into positions
         # that would exceed per-expert token budget, enforcing the packing constraint.
         balanced_semantic_logits = self.balance_capacity(
-            semantic_logits,
             used_capacity,
             self.capacity,
             self.num_selected_heads,
@@ -3309,61 +3435,201 @@ class MoSRAHRouter(nn.Module):
         gathered      = routing_scores.gather(dim=-1, index=selected_heads)    # (B, N, K)
         routing_probs = gathered / gathered.sum(dim=-1, keepdim=True)          # P, (B, N, K)
-        # Per-item routing frequencies f_{b,l}: for each batch item b and head l, what
-        # fraction of that item's active K assignments over all tokens go to head l.
-        # Dead tokens are excluded before reduction. Normalization is per batch item so
-        # each item's frequencies sum to 1 independently of other items in the batch.
         assignment_mask = torch.zeros(B, N, L, device=x.device, dtype=x.dtype)
         assignment_mask.scatter_(-1, selected_heads, 1.0)
-        active_assignments = assignment_mask * active_mask.unsqueeze(-1)
-        per_item_counts = active_assignments.sum(dim=1)              # (B, L)
-        per_item_total  = active_mask.sum(dim=1, keepdim=True) * K  # (B, 1)
-        per_item_freqs  = per_item_counts / per_item_total           # (B, L)
-        # p-mean of per_item_freqs over the batch dimension produces routing_freqs (L,).
-        # p-mean weights aggregation toward the worst-case batch item relative to
-        # arithmetic mean, making the load balance signal sensitive to per-item spikes
-        # that cause packing overflow.
-        p = self.load_balance_p
-        routing_freqs = (per_item_freqs ** p).mean(dim=0) ** (1.0 / p)  # (L,)
-        load_balance_loss = self._load_balance_loss(routing_freqs, assignment_probs)
-        # MaxVio is a detached monitoring scalar following the paper's formula
-        # L · max_l(f_l − 1/L) applied to routing_freqs. Must not contribute gradients.
-        max_vio = self._compute_max_vio(routing_freqs, L)
         router_diagnostics = {
             "load_balance_loss": load_balance_loss,
             "max_vio": max_vio,
-            "bias_std": bias_std,
-            "raw_logit_std": raw_logit_std,
-            "logit_std": logit_std,
-            "bias_alignment": bias_alignment,
         }
         return selected_heads, routing_probs, router_diagnostics
     @staticmethod
-    def _compute_max_vio(routing_freqs: torch.Tensor, num_heads: int) -> torch.Tensor:
         """Compute the MaxVio routing-imbalance scalar.
-        MaxVio = L · max_l(f_l − 1/L), where f_l is the realised routing frequency of
-        head l and 1/L is the perfectly balanced target. Follows the paper's definition
-        (Wang et al.) applied to routing_freqs. A value of zero indicates perfect
-        balance; a value of 0.5 means the most overloaded head received 50% more routed
-        tokens than ideal.
-        The result is detached from the autograd graph — MaxVio is a monitoring scalar
-        and must never contribute gradients to any parameter.
         Args:
-            routing_freqs: Per-head routing frequencies of shape (L,).
-            num_heads: Total number of MoSRAH heads L.
         Returns:
             Detached scalar MaxVio tensor.
         """
-        return (num_heads * (routing_freqs - 1.0 / num_heads).max()).detach()
 # -----------
 # Inlined from: positions_converter.py

 from torch.nn.attention.flex_attention import flex_attention
 import torch.nn.functional as F
 from typing import Callable
             num_selected_heads / num_mosrah_heads * mosrah_overallocation_factor).
             Must be > 1.0 to guarantee a buffer larger than the balanced-routing
             baseline. Default 2.0.
         max_bid_rounds: Maximum bidding rounds for the deferred-acceptance capacity
             solver in ``balance_capacity``. 10 covers convergence at approximately
             the 98th percentile of routing densities; the top 2% of extreme-density
             is the default; its log-probability signal scales with violation severity
             and makes correction magnitude proportional to routing imbalance.
             Default ``"ce"``.
+        routing_mode: Routing computation mode. ``"integral"`` (default) enables the
+            integral routing extension: the exclusive cumsum of routing logits along
+            the sequence dimension is mapped through two additional (L, L) parameter
+            matrices (``routing_integral_weight`` A' and ``balance_integral_weight``
+            B') and added as corrections to both logit pathways. This gives each
+            token a read on the cumulative routing history so far in the sequence.
+            ``"default"`` disables the extension; A' and B' are not created.
     """
     model_type = "shram"
         output_hidden_states: bool = False,
         tie_word_embeddings: bool = False,
         mosrah_overallocation_factor: float = 2.0,
         max_bid_rounds: int = 10,
         load_balance_loss_type: str = "ce",
+        routing_mode: str = "integral",
         **kwargs
     ):
         if head_dim % 2 != 0:
                 f"Got {mosrah_overallocation_factor}."
             )
         if max_bid_rounds < 1:
             raise ValueError(
                 f"max_bid_rounds must be at least 1, got {max_bid_rounds}."
                 f"load_balance_loss_type must be one of {supported}, "
                 f"got {load_balance_loss_type!r}."
             )
+        _supported_routing_modes = {"default", "integral"}
+        if routing_mode not in _supported_routing_modes:
+            supported = ", ".join(f'"{m}"' for m in sorted(_supported_routing_modes))
             raise ValueError(
+                f"routing_mode must be one of {supported}, got {routing_mode!r}."
             )
         self.vocab_size = vocab_size
         self.alpha = alpha
         self.beta = beta
         self.mosrah_overallocation_factor = mosrah_overallocation_factor
         self.max_bid_rounds = max_bid_rounds
         self.load_balance_loss_type = load_balance_loss_type
+        self.routing_mode = routing_mode
         self.attention_dropout = attention_dropout
         self.use_cache = use_cache
     the semantic routing scores at the selected indices and renormalized to sum to 1
     per token.
+Base routing uses two learnable projection matrices and two gradient-isolated pathways:
+  - routing_weight (A): shape (L, embedding_width). Maps input to per-head routing
+    scores. Receives gradients from task loss; balance_weight is isolated.
+  - balance_weight (B): shape (L, embedding_width). Maps input to per-head load-balance
+    correction scores. Receives gradients from load_balance_loss; routing_weight is
+    isolated.
+The two gradient-isolated base pathways operate over numerically identical values:
+  - semantic_logits = A·x + (B·x).detach(): task gradients reach routing_weight;
+    balance_weight is isolated from task loss.
+  - load_balancing_logits = (A·x).detach() + B·(x.detach()): load balance gradients
+    reach balance_weight; routing_weight and x are isolated from load balance loss.
+Integral routing extension (routing_mode == "integral"):
+Standard routing is parallel — each token routes based on its own hidden state alone,
+with no direct read on what earlier tokens in the sequence have already selected.
+Integral routing adds a cumulative-sum signal that gives each token a view of the
+prior routing history within the sequence.
+Two additional (L, L) parameter matrices are introduced:
+  - routing_integral_weight (A'): shape (L, L). Maps the cumulative logit history to
+    per-head semantic corrections. Receives gradients from task loss.
+  - balance_integral_weight (B'): shape (L, L). Maps the cumulative logit history to
+    per-head load-balance corrections. Receives gradients from load_balance_loss.
+The cumulative history signal u is the exclusive cumsum of the base logits along the
+sequence dimension: u[n] = sum(logits[0..n-1]), shape (B, N, L). Position 0 receives
+zeros (no prior history). The same gradient isolation pattern as A/B applies:
+  - semantic_logits       += A'·u_semantic + (B'·u_semantic).detach()
+  - load_balancing_logits += (A'·u_load).detach() + B'·u_load
+Detaching the full B'·u_semantic result (rather than just B') mirrors the
+(B·x).detach() pattern in the base pathway and prevents double-counting the
+cumsum gradient path back to routing_weight.
+Both base matrices and both integral matrices are nn.Parameter so that HuggingFace
+_init_weights does not override their construction-time initialization (kaiming for
+A and B, zeros for A' and B').
 Assignment probabilities are computed before balance_capacity applies -1e8 sentinels.
 Post-capacity softmax would invert the load balance gradient for over-capacity experts
+(near-zero probability after masking signals "increase corrections" for an already-
+overloaded expert).
 The router also computes and returns the load balance loss via a log-probability auxiliary
 loss (see load_balance_loss.py). The loss formulation is selected by config; the default
 The router additionally computes and returns MaxVio, a detached scalar summarising
 routing imbalance for the current forward pass:
+    MaxVio = mean_b( L · max_l(f_bl − 1/L) )
+where f_bl is the per-batch-item realised routing frequency of head l and 1/L is the
+perfectly balanced target. MaxVio is averaged over batch items and is a monitoring
+quantity only; it never contributes gradients.
 Paper ref: Appendix A.Routing, Appendix A.Load Balancing, §MaxVio.
 """
 # -----------
 """Log-probability auxiliary loss functions for MoSRAH load balancing.
+This module provides three load-balance loss formulations, two token-reduction
+helpers, and a factory that selects among the formulations. All formulations
+share the same external contract:
+    loss_fn(
+        logits:          Tensor[B, N, L],
+        assignment_mask: Tensor[B, N, L],
+        active_mask:     Tensor[B, N],
+    ) -> scalar Tensor
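+    logits:          Load-balancing logits, shape (B, N, L). These are the raw
+                     pre-softmax scores of the router's load-balance pathway,
+                     (routing_weight·x).detach() + balance_weight·x.detach(),
+                     plus any integral-mode corrections. Gradient flows to
+                     balance_weight (and, in integral mode,
+                     balance_integral_weight) through this tensor.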
+    assignment_mask: Per-token head-assignment indicators. assignment_mask[b, n, l]
+                     is 1.0 if token (b, n) was assigned to head l. Dead tokens
+                     should carry zero entries.
+    active_mask:     Boolean mask, shape (B, N). True means the token is
+                     semantically live.
+Token reduction is split into two helpers with distinct roles:
+    reduce_frequency_tokens — produces per-batch-item routing frequencies f_bl (B, L).
+        Called by all three formulations. Output is detached; f_bl carries no gradient.
+    reduce_probability_tokens — produces per-batch-item mean assignment probabilities
+        p_bl (B, L). Called only by gshard and bce. Gradient flows to the router's
+        load-balance parameters through the internal softmax over logits.
+CE delegates probability computation to F.cross_entropy, which handles its own
+log_softmax and operates directly on the raw (B, N, L) logits.
+The factory is the intended entry point. MoSRAHRouter constructs the loss callable
+once at init and invokes it each forward pass.
 """
+# ---------------------------------------------------------------------------
+# Token-reduction helpers
+# ---------------------------------------------------------------------------
+def reduce_frequency_tokens(
+    assignment_mask: torch.Tensor,
+    active_mask: torch.Tensor,
+) -> torch.Tensor:
+    """Reduce per-token head assignments to per-batch-item routing frequencies.
+    f_bl[b, l] is the fraction of active-token assignments in batch item b going
+    to head l. Values sum to 1 per batch item when routing is valid.
+    The output is detached from the autograd graph: routing frequencies are
+    derived from discrete TopK selections and must not carry gradients.
+    Denominators are clamped to 1 to handle the all-dead-tokens edge case.
+    Args:
+        assignment_mask: Per-token head-assignment indicators, shape (B, N, L).
+        active_mask:     Boolean active-token mask, shape (B, N).
+    Returns:
+        f_bl: Per-batch-item routing frequencies, shape (B, L). Detached.
+    """
+    active_float = active_mask.float().unsqueeze(-1)                               # (B, N, 1)
+    active_assignments = assignment_mask * active_float                             # (B, N, L)
+    assignment_totals = (
+        active_assignments.sum(dim=(1, 2)).clamp(min=1.0).unsqueeze(-1)            # (B, 1)
+    )
+    return (active_assignments.sum(dim=1) / assignment_totals).detach()            # (B, L)
+def reduce_probability_tokens(
+    logits: torch.Tensor,
+    active_mask: torch.Tensor,
+) -> torch.Tensor:
+    """Reduce per-token load-balancing logits to per-batch-item assignment probabilities.
+    p_bl[b, l] is the mean softmax probability for head l over active tokens in
+    batch item b. Values sum to 1 per batch item (all zeros when the item has no
+    active tokens). Gradient flows to the router's load-balance parameters
+    through the internal softmax.
+    Denominators are clamped to 1 to handle the all-dead-tokens edge case.
+    Args:
+        logits:      Load-balancing logits, shape (B, N, L). Gradient flows through.
+        active_mask: Boolean active-token mask, shape (B, N).
+    Returns:
+        p_bl: Per-batch-item mean assignment probabilities, shape (B, L).
+    """
+    per_token_probs = F.softmax(logits, dim=-1)                                    # (B, N, L)
+    active_float = active_mask.float().unsqueeze(-1)                               # (B, N, 1)
+    active_count = active_mask.float().sum(dim=1, keepdim=True).clamp(min=1.0)     # (B, 1)
+    return (per_token_probs * active_float).sum(dim=1) / active_count              # (B, L)
 # ---------------------------------------------------------------------------
 # Loss functions
 # ---------------------------------------------------------------------------
 def gshard_loss(
+    logits: torch.Tensor,
+    assignment_mask: torch.Tensor,
+    active_mask: torch.Tensor,
 ) -> torch.Tensor:
     """GShard-style linear load-balance loss.
+    Computes (1/L) * Σ_l f_bl * p_bl per batch item, averaged over B, where
+    f_bl comes from reduce_frequency_tokens and p_bl from reduce_probability_tokens.
+    The linear signal is the weakest of the three formulations; gradient magnitude
+    does not grow with violation severity. Provided for comparison.
     Args:
+        logits:          Load-balancing logits, shape (B, N, L).
+        assignment_mask: Per-token head-assignment indicators, shape (B, N, L).
+        active_mask:     Boolean active-token mask, shape (B, N).
     Returns:
         Scalar loss tensor.
     """
+    L = logits.shape[-1]
+    f_bl = reduce_frequency_tokens(assignment_mask, active_mask)
+    p_bl = reduce_probability_tokens(logits, active_mask)
+    return (f_bl * p_bl).sum(dim=-1).mean() / L
 def ce_loss(
+    logits: torch.Tensor,
+    assignment_mask: torch.Tensor,
+    active_mask: torch.Tensor,
 ) -> torch.Tensor:
     """Cross-entropy load-balance loss.
+    Constructs per-batch-item soft target distributions from routing frequencies
+    and delegates to F.cross_entropy operating directly on (B, N, L) logits.
+    Inactive tokens receive all-zero targets, producing zero loss and zero gradient.
+    The soft target for head l in batch item b is (1 - f_bl) / (L - 1). This
+    distribution sums to 1 per batch item (since Σ_l (1 - f_bl) = L - 1) and
+    weights underloaded heads (low f_bl → high target) more strongly than
+    overloaded ones.
+    The total CE over active tokens is normalised by the active token count rather
+    than B*N to avoid dilution from inactive positions.
     Args:
+        logits:          Load-balancing logits, shape (B, N, L).
+        assignment_mask: Per-token head-assignment indicators, shape (B, N, L).
+        active_mask:     Boolean active-token mask, shape (B, N).
     Returns:
         Scalar loss tensor.
     """
+    B, N, L = logits.shape
+    f_bl = reduce_frequency_tokens(assignment_mask, active_mask)               # (B, L)
+    active_count = active_mask.float().sum().clamp(min=1.0)
+    # Soft target: (1 - f_bl) / (L - 1) for active tokens, zeros for inactive.
+    # Zeros give zero CE loss and zero gradient at inactive positions.
+    target = (1.0 - f_bl) / (L - 1)                                           # (B, L)
+    target_per_token = (
+        target.unsqueeze(1).expand(-1, N, -1)                                  # (B, N, L)
+        * active_mask.float().unsqueeze(-1)                                    # zero inactive
+    )
+    # F.cross_entropy requires the class dimension to be dim 1.
+    # Permute (B, N, L) → (B, L, N) to satisfy the (N, C, d) contract.
+    return F.cross_entropy(
+        logits.permute(0, 2, 1),             # (B, L, N)
+        target_per_token.permute(0, 2, 1),   # (B, L, N)
+        reduction='sum',
+    ) / active_count
 def bce_loss(
+    logits: torch.Tensor,
+    assignment_mask: torch.Tensor,
+    active_mask: torch.Tensor,
 ) -> torch.Tensor:
     """Binary cross-entropy load-balance loss.
+    Treats each head as an independent binary target with label (1 - f_bl).
+    Uses reduce_probability_tokens to produce per-batch-item probabilities,
+    then delegates to F.binary_cross_entropy over (B, L) tensors.
+    Unlike CE, BCE maintains a repulsion signal from saturated experts: when
+    f_bl → 1 the target → 0, driving p_bl away from 1 and preventing runaway
+    concentration.
+    Active masking is handled inside reduce_frequency_tokens and
+    reduce_probability_tokens, so the (B, L) output tensors already exclude
+    inactive tokens from both frequencies and probabilities.
     Args:
+        logits:          Load-balancing logits, shape (B, N, L).
+        assignment_mask: Per-token head-assignment indicators, shape (B, N, L).
+        active_mask:     Boolean active-token mask, shape (B, N).
     Returns:
         Scalar loss tensor.
     """
+    f_bl = reduce_frequency_tokens(assignment_mask, active_mask)
+    p_bl = reduce_probability_tokens(logits, active_mask)
+    # Clamp p_bl for numerical safety: F.binary_cross_entropy requires input in
+    # [0, 1], and at exactly 0 or 1 its log terms saturate (PyTorch clamps them
+    # at -100 rather than returning inf). Softmax outputs are strictly positive
+    # in normal operation; the clamp guards the all-dead-tokens edge case where
+    # the mean defaults to zero.
+    return F.binary_cross_entropy(
+        p_bl.clamp(min=1e-7, max=1.0 - 1e-7),
+        1.0 - f_bl,
+        reduction='mean',
+    )
 # ---------------------------------------------------------------------------
 # Factory
 # ---------------------------------------------------------------------------
+_LOSS_REGISTRY: dict[str, Callable[[torch.Tensor, torch.Tensor, torch.Tensor], torch.Tensor]] = {
     "gshard": gshard_loss,
     "ce": ce_loss,
     "bce": bce_loss,
 def make_load_balance_loss(
     loss_type: str,
+) -> Callable[[torch.Tensor, torch.Tensor, torch.Tensor], torch.Tensor]:
     """Return a load-balance loss callable for the requested formulation.
+    All returned callables share the external contract:
+        loss_fn(
+            logits:          Tensor[B, N, L],
+            assignment_mask: Tensor[B, N, L],
+            active_mask:     Tensor[B, N],
+        ) -> scalar Tensor
+    The caller is responsible for computing logits with the routing pathway
+    detached (as MoSRAHRouter does for load_balancing_logits) so that gradients
+    reach only the load-balance parameters.
     Args:
         loss_type: One of ``"gshard"``, ``"ce"``, or ``"bce"``.
     return _LOSS_REGISTRY[loss_type]
 class MoSRAHRouter(nn.Module):
     """Token-choice router for MoSRAH sparse attention.
     Each input token independently selects K of the L available expert heads. Both
+    selection and routing_probs incorporate balance_weight via two gradient-isolated
+    pathways over numerically identical values. See module docstring for the
+    two-pathway architecture and the integral routing extension.
+    All learnable matrices (two in default mode, four in integral mode) are
+    nn.Parameter rather than nn.Linear so that HuggingFace _init_weights does not
+    override their construction-time initialization (kaiming for A and B, zeros
+    for A' and B').
+    Attributes:
+        routing_weight: A, shape (L, embedding_width). Task-loss pathway.
+        balance_weight: B, shape (L, embedding_width). Load-balance pathway.
+        routing_integral_weight: A', shape (L, L). Integral task-loss pathway.
+            Present only when ``routing_mode == "integral"``.
+        balance_integral_weight: B', shape (L, L). Integral load-balance pathway.
+            Present only when ``routing_mode == "integral"``.
+        routing_mode: ``"integral"`` or ``"default"``, from config.
     Args:
+        config: Model configuration. Must expose ``embedding_width``,
+            ``num_mosrah_heads`` (L), ``num_selected_heads`` (K), and
+            ``routing_mode``.
     """
     def __init__(self, config: ShramConfig) -> None:
         super().__init__()
         self.num_mosrah_heads = config.num_mosrah_heads
         self.num_selected_heads = config.num_selected_heads
         if config.use_cache:
             self.capacity = config.mosrah_cache_length
         else:
             self.capacity = config.mosrah_packed_length
         self.max_bid_rounds = config.max_bid_rounds
+        self.routing_mode = config.routing_mode
         self._load_balance_loss = make_load_balance_loss(config.load_balance_loss_type)
+        # W_r (A): semantic routing matrix. Maps input (B, N, d) to per-head routing
+        # scores (B, N, L) for selection and routing_probs. nn.Parameter ensures
+        # HuggingFace _init_weights does not override kaiming initialization.
+        self.routing_weight = nn.Parameter(
+            torch.empty(config.num_mosrah_heads, config.embedding_width)
         )
+        nn.init.kaiming_uniform_(self.routing_weight)
+        # W_b (B): load-balancing projection matrix. Maps input (B, N, d) to per-head
+        # correction scores (B, N, L). Receives gradients only from load_balance_loss.
+        # nn.Parameter ensures HuggingFace _init_weights does not override kaiming init.
+        self.balance_weight = nn.Parameter(
+            torch.empty(config.num_mosrah_heads, config.embedding_width)
         )
+        nn.init.kaiming_uniform_(self.balance_weight)
+        if self.routing_mode == "integral":
+            L = config.num_mosrah_heads
+            # A': integral semantic matrix. Maps cumulative logit history (B, N, L) to
+            # per-head semantic corrections (B, N, L). Shape (L, L). Receives gradients
+            # from task loss; balance_integral_weight is isolated from task loss.
+            # Zero-initialized so that corrections start at zero and grow from gradient
+            # updates — kaiming init produces corrections that immediately overwhelm the
+            # base routing signal via the cumsum feedback path.
+            self.routing_integral_weight = nn.Parameter(torch.zeros(L, L))
+            # B': integral load-balance matrix. Maps cumulative logit history (B, N, L)
+            # to per-head load-balance corrections (B, N, L). Shape (L, L). Receives
+            # gradients from load_balance_loss; routing_integral_weight is isolated.
+            # Zero-initialized for the same reason as routing_integral_weight.
+            self.balance_integral_weight = nn.Parameter(torch.zeros(L, L))
     @staticmethod
     def get_best_proposals(
         """Route input tokens to K expert heads each and compute routing probabilities.
         Args:
+            x: Input hidden states of shape (batch, seq_len, embedding_width).
             active_mask: Current-chunk active mask of shape (batch, seq_len), where
                 True means the token is semantically live. Dead tokens do not
                 contribute to routing frequencies, load_balance_loss, or max_vio.
             router_diagnostics: Dict of routing feedback scalars. Keys:
                 - ``load_balance_loss``: scalar load-balance loss with gradient.
                 - ``max_vio``: detached scalar routing-imbalance summary.
+                - ``raw_logit_std``: mean per-token std of routing_logits; natural
+                  routing preference scale and baseline for interpreting bias_std.
+                - ``bias_std``: mean per-token std of balance_logits; near-zero
+                  means balance corrections have not built up relative to routing scale.
                 - ``logit_std``: mean per-token std of semantic_logits; lower than
+                  raw_logit_std means balance is flattening preferences (healthy correction).
+                - ``bias_alignment``: mean cosine similarity of routing_logits vs
+                  balance_logits per token. Negative means balance opposes routing direction
+                  (healthy correction); positive means runaway reinforcement.
         """
         B, N, _ = x.shape
         L = self.num_mosrah_heads
         K = self.num_selected_heads
+        logits = self._compute_routing_logits(x, active_mask)
+        # Diagnostic scalars characterising the two routing pathways. Must be computed
+        # before balance_capacity injects -1e8 sentinels that would corrupt std and
+        # cosine similarity. Extracted to _compute_bias_diagnostics to keep the forward
+        # body free of non-(B,N,L) reduction logic.
+        bias_diagnostics = self._compute_bias_diagnostics(
+            logits["routing_logits"], logits["balance_logits"], logits["semantic_logits"]
+        )
         # Pre-capacity semantic softmax for gathering routing_probs. Computed before
         # balance_capacity so that gathered probabilities reflect genuine preference
         # magnitudes rather than hard-masked sentinel values.
+        routing_scores = F.softmax(logits["semantic_logits"], dim=-1)          # (B, N, L)
         # Capacity-balanced semantic logits for selection. Injects -1e8 into positions
         # that would exceed per-expert token budget, enforcing the packing constraint.
         balanced_semantic_logits = self.balance_capacity(
+            logits["semantic_logits"],
             used_capacity,
             self.capacity,
             self.num_selected_heads,
         gathered      = routing_scores.gather(dim=-1, index=selected_heads)    # (B, N, K)
         routing_probs = gathered / gathered.sum(dim=-1, keepdim=True)          # P, (B, N, K)
+        # assignment_mask: (B, N, L) float — 1.0 at each token's K selected heads, 0 elsewhere.
+        # The discrete routing decision; no gradient flows through it. Passed alongside
+        # load_balancing_logits and active_mask to the loss and max_vio methods, which
+        # own all frequency aggregation and reduction internally.
         assignment_mask = torch.zeros(B, N, L, device=x.device, dtype=x.dtype)
         assignment_mask.scatter_(-1, selected_heads, 1.0)
+        load_balance_loss = self._load_balance_loss(
+            logits["load_balancing_logits"], assignment_mask, active_mask
+        )
+        # MaxVio: detached monitoring scalar averaged over batch items. Computed from
+        # the same (B, N, L) assignment_mask so frequencies are consistent with the loss.
+        max_vio = self._compute_max_vio(assignment_mask, active_mask, L)
         router_diagnostics = {
             "load_balance_loss": load_balance_loss,
             "max_vio": max_vio,
+            **bias_diagnostics,
         }
         return selected_heads, routing_probs, router_diagnostics
     @staticmethod
+    def exclusive_cumsum(logits: torch.Tensor) -> torch.Tensor:
+        """Compute the exclusive cumulative sum along the sequence dimension.
+        u[n] = sum(logits[0..n-1]): position n receives the accumulated sum of all
+        prior positions, giving it a read on the routing preferences expressed by
+        earlier tokens in the sequence. Position 0 always receives zeros — no prior
+        history exists at the first position.
+        Args:
+            logits: Shape (B, N, L). Any per-head score tensor along a sequence.
+        Returns:
+            Exclusive cumsum, shape (B, N, L). Same dtype and device as input.
+        """
+        shifted = torch.cat(
+            [torch.zeros_like(logits[:, :1, :]), logits[:, :-1, :]], dim=1
+        )
+        return shifted.cumsum(dim=1)
+    def _compute_routing_logits(
+        self, x: torch.Tensor, active_mask: torch.Tensor
+    ) -> dict[str, torch.Tensor]:
+        """Compute the gradient-isolated logit pathways from input hidden states.
+        Base pathways (both modes):
+          Two gradient-isolated pathways over numerically identical values:
+          - semantic_logits = A·x + (B·x).detach(): task gradients reach routing_weight;
+            balance_weight is isolated from task loss.
+          - load_balancing_logits = (A·x).detach() + B·(x.detach()): load balance
+            gradients reach balance_weight; routing_weight and x are isolated.
+        Integral extension (routing_mode == "integral"):
+          Dead tokens are zeroed out of the logits before computing the cumsum, so
+          inactive positions do not contribute to the routing history of downstream
+          live tokens. u_semantic and u_load therefore represent history from live
+          tokens only.
+          u_semantic = exclusive_cumsum(semantic_logits * active_mask)    — (B, N, L)
+          u_load     = exclusive_cumsum(load_balancing_logits * active_mask) — (B, N, L)
+          semantic_logits       += A'·u_semantic + (B'·u_semantic).detach()
+          load_balancing_logits += (A'·u_load).detach() + B'·u_load
+          Detaching the full (B'·u_semantic) result mirrors the (B·x).detach() base
+          pattern: it isolates balance_integral_weight from task loss AND prevents
+          double-counting the cumsum gradient path back to routing_weight.
+          The same reasoning applies to (A'·u_load).detach() in the load-balance
+          pathway — u_load already has no path to routing_weight (routing_logits is
+          detached in load_balancing_logits), and the detach additionally blocks
+          routing_integral_weight.
+        Args:
+            x: Input hidden states, shape (batch, seq_len, embedding_width).
+            active_mask: Boolean active-token mask, shape (batch, seq_len). Dead tokens
+                are excluded from the cumsum history in integral mode.
+        Returns:
+            Dict with keys:
+            - ``routing_logits``:        A·x, shape (B, N, L).
+            - ``balance_logits``:        B·x, shape (B, N, L).
+            - ``semantic_logits``:       combined task-loss pathway, shape (B, N, L).
+            - ``load_balancing_logits``: combined load-balance pathway, shape (B, N, L).
+        """
+        routing_logits = F.linear(x, self.routing_weight)                     # (B, N, L)
+        balance_logits = F.linear(x, self.balance_weight)                     # (B, N, L)
+        semantic_logits       = routing_logits + balance_logits.detach()
+        load_balancing_logits = routing_logits.detach() + F.linear(x.detach(), self.balance_weight)
+        if self.routing_mode == "integral":
+            # Zero out dead token positions before cumsum so inactive tokens do not
+            # contaminate the routing history of subsequent live tokens.
+            live = active_mask.unsqueeze(-1)                                   # (B, N, 1)
+            u_semantic = self.exclusive_cumsum(semantic_logits * live)         # (B, N, L)
+            u_load     = self.exclusive_cumsum(load_balancing_logits * live)   # (B, N, L)
+            # Semantic pathway: A' trains on task loss; B' term is fully detached to
+            # isolate balance_integral_weight from task loss and prevent double-counting
+            # the cumsum gradient path back to routing_weight.
+            semantic_logits = (
+                semantic_logits
+                + F.linear(u_semantic, self.routing_integral_weight)
+                + F.linear(u_semantic, self.balance_integral_weight).detach()
+            )
+            # Load-balance pathway: B' trains on load_balance_loss; A' term is fully
+            # detached to isolate routing_integral_weight from load_balance_loss.
+            load_balancing_logits = (
+                load_balancing_logits
+                + F.linear(u_load, self.routing_integral_weight).detach()
+                + F.linear(u_load, self.balance_integral_weight)
+            )
+        return {
+            "routing_logits":        routing_logits,
+            "balance_logits":        balance_logits,
+            "semantic_logits":       semantic_logits,
+            "load_balancing_logits": load_balancing_logits,
+        }
+    @staticmethod
+    def _compute_bias_diagnostics(
+        routing_logits: torch.Tensor,
+        balance_logits: torch.Tensor,
+        semantic_logits: torch.Tensor,
+    ) -> dict[str, torch.Tensor]:
+        """Compute detached diagnostic scalars characterising the two routing pathways.
+        All scalars must be computed from pre-capacity logits; balance_capacity
+        applies -1e8 sentinels that would corrupt std and cosine similarity.
+        Extracted from forward to keep the main body free of reduction logic.
+        Each statistic is first reduced over the last (head) dimension per token and
+        then averaged over every remaining position into a single scalar. No
+        active-token mask is taken here, so every (batch, position) entry contributes
+        equally to that mean.
+        std uses torch's default (Bessel-corrected) estimator, so a last dimension of
+        size 1 yields NaN for the three std scalars.
+        Args:
+            routing_logits:  A·x, routing pathway output, shape (B, N, L).
+            balance_logits:  B·x, balance pathway output, shape (B, N, L).
+            semantic_logits: A·x + (B·x).detach(), combined signal, shape (B, N, L).
+                             NOTE(review): in "integral" routing mode the combined
+                             signal built upstream also carries the cumsum terms;
+                             confirm which tensor the caller passes here.
+        Returns:
+            Dict with keys (each value is a detached 0-dim tensor):
+            - ``raw_logit_std``:  Mean per-token std of routing_logits. Natural
+                                   routing preference scale; reference baseline for
+                                   interpreting bias_std.
+            - ``bias_std``:       Mean per-token std of balance_logits. Near-zero
+                                   means balance corrections have not built up
+                                   relative to the routing scale.
+            - ``logit_std``:      Mean per-token std of semantic_logits. Lower than
+                                   raw_logit_std indicates balance is flattening
+                                   preferences (healthy correction signal).
+            - ``bias_alignment``: Mean cosine similarity of routing_logits vs
+                                   balance_logits per token. Range [-1, 1]. Negative
+                                   means balance opposes routing direction (healthy
+                                   correction); positive means runaway reinforcement.
+        """
+        return {
+            "raw_logit_std":  routing_logits.std(dim=-1).mean().detach(),
+            "bias_std":       balance_logits.std(dim=-1).mean().detach(),
+            "logit_std":      semantic_logits.std(dim=-1).mean().detach(),
+            "bias_alignment": F.cosine_similarity(
+                routing_logits, balance_logits, dim=-1
+            ).mean().detach(),
+        }
+    @staticmethod
+    def _compute_max_vio(
+        assignment_mask: torch.Tensor,
+        active_mask: torch.Tensor,
+        num_heads: int,
+    ) -> torch.Tensor:
         """Compute the MaxVio routing-imbalance scalar.
+        MaxVio = mean_b( L · max_l(f_bl − 1/L) ), where f_bl is the per-batch-item
+        realised routing frequency of head l. Uses reduce_frequency_tokens for consistent
+        per-batch-item frequency computation with dead tokens excluded, matching how the
+        load balance loss computes frequencies. A value of zero indicates perfect balance;
+        a value of 0.5 means the most overloaded head in the average batch item received
+        50% more routed tokens than ideal.
+        NOTE(review): the 1/L ideal and the "50% more" reading assume each row of f_bl
+        sums to 1 over heads; reduce_frequency_tokens is defined elsewhere, so confirm
+        its normalisation before relying on that interpretation.
+        The result is detached — MaxVio is a monitoring scalar and must not contribute
+        gradients to any parameter.
         Args:
+            assignment_mask: Per-token head-assignment indicators, shape (B, N, L).
+            active_mask:     Boolean active-token mask, shape (B, N).
+            num_heads:       Total number of MoSRAH heads L.
         Returns:
             Detached scalar MaxVio tensor.
+            The per-item maxima over heads are averaged over the batch, so the
+            result is 0-dim.
         """
+        f_bl = reduce_frequency_tokens(assignment_mask, active_mask)                   # (B, L)
+        per_item_max_vio = num_heads * (f_bl - 1.0 / num_heads).max(dim=-1).values    # (B,)
+        return per_item_max_vio.mean().detach()
 # -----------
 # Inlined from: positions_converter.py