Update architecture and tokenizer

Browse files

Files changed (4) hide show

README.md +1 -0
config.json +1 -0
configuration.py +12 -0
huggingface.py +94 -56

README.md CHANGED Viewed

@@ -96,6 +96,7 @@ contains no weights. All values are overridable via kwargs.
 | `output_hidden_states` | False |
 | `rms_norm_eps` | 1e-05 |
 | `rope_mode` | main_sequence |
 | `tie_word_embeddings` | False |
 | `training_sequence_length` | 1024 |
 | `use_cache` | True |

 | `output_hidden_states` | False |
 | `rms_norm_eps` | 1e-05 |
 | `rope_mode` | main_sequence |
+| `router_init_scale` | 0.0001 |
 | `tie_word_embeddings` | False |
 | `training_sequence_length` | 1024 |
 | `use_cache` | True |

config.json CHANGED Viewed

@@ -23,6 +23,7 @@
   "num_sliding_window_heads": 16,
   "rms_norm_eps": 1e-05,
   "rope_mode": "main_sequence",
   "tie_word_embeddings": false,
   "training_sequence_length": 1024,
   "transformers_version": "5.10.2",

   "num_sliding_window_heads": 16,
   "rms_norm_eps": 1e-05,
   "rope_mode": "main_sequence",
+  "router_init_scale": 0.0001,
   "tie_word_embeddings": false,
   "training_sequence_length": 1024,
   "transformers_version": "5.10.2",

configuration.py CHANGED Viewed

@@ -99,6 +99,11 @@ class ShramConfig(PretrainedConfig):
             is the default; its log-probability signal scales with violation severity
             and makes correction magnitude proportional to routing imbalance.
             Default ``"ce"``.
     """
     model_type = "shram"
@@ -135,6 +140,7 @@ class ShramConfig(PretrainedConfig):
         load_balance_p: float = 1.0,
         max_bid_rounds: int = 10,
         load_balance_loss_type: str = "ce",
         **kwargs
     ):
         if head_dim % 2 != 0:
@@ -191,6 +197,11 @@ class ShramConfig(PretrainedConfig):
             raise ValueError("In cross entropy mode, aggregation of "
                              "frequencies must be with mean 1.0")
         self.vocab_size = vocab_size
         self.embedding_width = embedding_width
         self.mlp_width = mlp_width
@@ -212,6 +223,7 @@ class ShramConfig(PretrainedConfig):
         self.load_balance_p = load_balance_p
         self.max_bid_rounds = max_bid_rounds
         self.load_balance_loss_type = load_balance_loss_type
         self.attention_dropout = attention_dropout
         self.use_cache = use_cache

             is the default; its log-probability signal scales with violation severity
             and makes correction magnitude proportional to routing imbalance.
             Default ``"ce"``.
+        router_init_scale: Initial standard deviation for the ``routing_scale``
+            scalar gate on routing logits. Brings routing logit magnitude to
+            ``expert_bias`` scale at initialisation so load balancing is operative
+            from step one. Must be positive. Default ``1e-4``. Note lower values
+            may require more bidding rounds to converge and more overcapacity to support.
     """
     model_type = "shram"
         load_balance_p: float = 1.0,
         max_bid_rounds: int = 10,
         load_balance_loss_type: str = "ce",
+        router_init_scale: float = 1e-4,
         **kwargs
     ):
         if head_dim % 2 != 0:
             raise ValueError("In cross entropy mode, aggregation of "
                              "frequencies must be with mean 1.0")
+        if router_init_scale <= 0.0:
+            raise ValueError(
+                f"router_init_scale must be positive, got {router_init_scale}."
+            )
         self.vocab_size = vocab_size
         self.embedding_width = embedding_width
         self.mlp_width = mlp_width
         self.load_balance_p = load_balance_p
         self.max_bid_rounds = max_bid_rounds
         self.load_balance_loss_type = load_balance_loss_type
+        self.router_init_scale = router_init_scale
         self.attention_dropout = attention_dropout
         self.use_cache = use_cache

huggingface.py CHANGED Viewed

@@ -187,6 +187,11 @@ class ShramConfig(PretrainedConfig):
             is the default; its log-probability signal scales with violation severity
             and makes correction magnitude proportional to routing imbalance.
             Default ``"ce"``.
     """
     model_type = "shram"
@@ -223,6 +228,7 @@ class ShramConfig(PretrainedConfig):
         load_balance_p: float = 1.0,
         max_bid_rounds: int = 10,
         load_balance_loss_type: str = "ce",
         **kwargs
     ):
         if head_dim % 2 != 0:
@@ -279,6 +285,11 @@ class ShramConfig(PretrainedConfig):
             raise ValueError("In cross entropy mode, aggregation of "
                              "frequencies must be with mean 1.0")
         self.vocab_size = vocab_size
         self.embedding_width = embedding_width
         self.mlp_width = mlp_width
@@ -300,6 +311,7 @@ class ShramConfig(PretrainedConfig):
         self.load_balance_p = load_balance_p
         self.max_bid_rounds = max_bid_rounds
         self.load_balance_loss_type = load_balance_loss_type
         self.attention_dropout = attention_dropout
         self.use_cache = use_cache
@@ -2724,19 +2736,33 @@ This module implements the routing mechanism described in Appendix A.Routing of
 paper. Given an input hidden state x, the router produces two outputs used downstream:
   - selected_heads (I): which K of the L available expert heads each token routes to,
-    determined by TopK over biased routing scores.
   - routing_probs (P): the weights used for the weighted output reduction, gathered from
-    *unbiased* routing scores at the selected indices and renormalized. The learned expert
-    bias b must not influence P.
-This separation is architecturally critical: expert_bias drives selection (and thus load
-balancing) but does not corrupt the gradient path from the output through routing_probs
-back to the routing projection weights.
 The router also computes and returns the load balance loss via a log-probability auxiliary
 loss (see load_balance_loss.py). The loss formulation is selected by config; the default
-is cross-entropy. Gradients flow only to expert_bias — routing_projection.weight is
-isolated by detaching logits before computing assignment probabilities.
 The router additionally computes and returns MaxVio, a detached scalar summarising
 routing imbalance for the current forward pass:
@@ -2923,14 +2949,14 @@ def make_load_balance_loss(
 class MoSRAHRouter(nn.Module):
     """Token-choice router for MoSRAH sparse attention.
-    Each input token independently selects K of the L available expert heads. Selection
-    is driven by biased routing scores to enable load balancing, but the routing
-    probabilities used for output reduction are computed from unbiased scores so that
-    the expert bias does not interfere with the gradient path to the router weights.
     The routing projection W_r has no bias term — the paper specifies xW_r with no
     additional projection bias. The only bias-like parameter is expert_bias (b), which
-    has an entirely separate role and update mechanism.
     Args:
         config: Model configuration. Must expose ``hidden_size``, ``num_mosrah_heads``
@@ -2955,9 +2981,17 @@ class MoSRAHRouter(nn.Module):
             config.embedding_width, config.num_mosrah_heads, bias=False
         )
         # b: learned per-head bias for load balancing. Initialized to zero so that all
         # heads start with equal selection probability. Updated by the main optimizer
-        # via the LoadBalanceLoss custom backward.
         self.expert_bias = nn.Parameter(torch.zeros(config.num_mosrah_heads))
     @staticmethod
@@ -3202,16 +3236,17 @@ class MoSRAHRouter(nn.Module):
         Returns:
             selected_heads: Head indices I of shape (batch, seq_len, num_selected_heads).
-                Each token's K selected head indices, determined by TopK on biased scores.
             routing_probs: Routing probabilities P of shape (batch, seq_len,
-                num_selected_heads). Gathered from unbiased scores at selected_heads
-                indices and renormalized to sum to 1 per token.
             router_diagnostics: Dict of routing feedback scalars. Keys:
                 - ``load_balance_loss``: scalar load-balance loss with gradient.
                 - ``max_vio``: detached scalar routing-imbalance summary.
                 - ``bias_std``: std of expert_bias; near-zero means corrections have not built up.
-                - ``raw_logit_std``: mean per-token std of unbiased logits; the natural routing scale.
-                - ``logit_std``: mean per-token std of (logits + expert_bias); lower than
                   raw_logit_std means bias is flattening preferences (healthy correction).
                 - ``bias_alignment``: mean cosine similarity of expert_bias against per-token
                   logits. Negative means bias opposes routing direction (healthy correction);
@@ -3221,48 +3256,58 @@ class MoSRAHRouter(nn.Module):
         L = self.num_mosrah_heads
         K = self.num_selected_heads
-        # Unbiased routing scores R = Softmax(xW_r). These are the scores used to
-        # compute routing_probs — expert_bias must not influence them.
-        logits = self.routing_projection(x)                    # (B, N, L)
         # Diagnostic scalars characterising the load-balance mechanism. Must be
         # computed here — before balance_capacity injects -1e8 sentinels that
         # would corrupt std and cosine similarity.
-        bias_std = self.expert_bias.std().detach()
-        raw_logit_std = logits.std(dim=-1).mean().detach()
-        logit_std = (logits + self.expert_bias).std(dim=-1).mean().detach()
         bias_alignment = F.cosine_similarity(
             logits, self.expert_bias.expand_as(logits), dim=-1
         ).mean().detach()
-        # Routing scores. Direct.
-        routing_scores = F.softmax(logits, dim=-1)             # R, (B, N, L)
-        # Biased routing scores R̂ = Softmax(xW_r + b). Used only for TopK head
-        # selection. expert_bias is added to logits before softmax so that the bias
-        # shifts selection probability without rescaling the unbiased distribution.
-        biased_logits = logits.detach() + self.expert_bias
-        biased_logits = self.balance_capacity(
-            biased_logits,
             used_capacity,
             self.capacity,
             self.num_selected_heads,
             self.max_bid_rounds,
         )
-        biased_routing_scores = F.softmax(                     # R̂, (B, N, L)
-           biased_logits, dim=-1
-        )
-        # selected_heads I = TopK(R̂): K head indices per token, shape (B, N, K).
-        # and routing logits directly
-        selected_heads = biased_routing_scores.topk(K, dim=-1).indices
-        gathered = routing_scores.gather(dim=-1, index=selected_heads)   # V, (B, N, K)
-        # Routing probabilities P: gathered from unbiased R at selected_heads indices,
-        # then renormalized so they sum to 1 per token. Gathering from routing_scores
-        # (not biased_routing_scores) is the invariant that keeps the gradient path from
-        # the output back to the router weights free of expert_bias influence.
-        routing_probs = gathered / gathered.sum(dim=-1, keepdim=True)    # P, (B, N, K)
         # Per-item routing frequencies f_{b,l}: for each batch item b and head l, what
         # fraction of that item's active K assignments over all tokens go to head l.
@@ -3271,9 +3316,9 @@ class MoSRAHRouter(nn.Module):
         assignment_mask = torch.zeros(B, N, L, device=x.device, dtype=x.dtype)
         assignment_mask.scatter_(-1, selected_heads, 1.0)
         active_assignments = assignment_mask * active_mask.unsqueeze(-1)
-        per_item_counts = active_assignments.sum(dim=1)             # (B, L)
-        per_item_total = active_mask.sum(dim=1, keepdim=True) * K   # (B, 1)
-        per_item_freqs = per_item_counts / per_item_total            # (B, L)
         # p-mean of per_item_freqs over the batch dimension produces routing_freqs (L,).
         # p-mean weights aggregation toward the worst-case batch item relative to
@@ -3282,13 +3327,6 @@ class MoSRAHRouter(nn.Module):
         p = self.load_balance_p
         routing_freqs = (per_item_freqs ** p).mean(dim=0) ** (1.0 / p)  # (L,)
-        # Active-token mean softmax probabilities. Detaching logits before softmax
-        # ensures the only differentiable path into p is through expert_bias — the
-        # load balance loss cannot reach routing_projection.weight.
-        biased_probs = biased_routing_scores                                   # (B, N, L)
-        active_float = active_mask.float().unsqueeze(-1)                       # (B, N, 1)
-        assignment_probs = (biased_probs * active_float).sum(dim=(0, 1))       # (L,) unnorm
-        assignment_probs = assignment_probs / active_mask.float().sum()        # (L,) norm
         load_balance_loss = self._load_balance_loss(routing_freqs, assignment_probs)
         # MaxVio is a detached monitoring scalar following the paper's formula

             is the default; its log-probability signal scales with violation severity
             and makes correction magnitude proportional to routing imbalance.
             Default ``"ce"``.
+        router_init_scale: Initial standard deviation for the ``routing_scale``
+            scalar gate on routing logits. Brings routing logit magnitude to
+            ``expert_bias`` scale at initialisation so load balancing is operative
+            from step one. Must be positive. Default ``1e-4``. Note lower values
+            may require more bidding rounds to converge and more overcapacity to support.
     """
     model_type = "shram"
         load_balance_p: float = 1.0,
         max_bid_rounds: int = 10,
         load_balance_loss_type: str = "ce",
+        router_init_scale: float = 1e-4,
         **kwargs
     ):
         if head_dim % 2 != 0:
             raise ValueError("In cross entropy mode, aggregation of "
                              "frequencies must be with mean 1.0")
+        if router_init_scale <= 0.0:
+            raise ValueError(
+                f"router_init_scale must be positive, got {router_init_scale}."
+            )
         self.vocab_size = vocab_size
         self.embedding_width = embedding_width
         self.mlp_width = mlp_width
         self.load_balance_p = load_balance_p
         self.max_bid_rounds = max_bid_rounds
         self.load_balance_loss_type = load_balance_loss_type
+        self.router_init_scale = router_init_scale
         self.attention_dropout = attention_dropout
         self.use_cache = use_cache
 paper. Given an input hidden state x, the router produces two outputs used downstream:
   - selected_heads (I): which K of the L available expert heads each token routes to,
+    determined by TopK over capacity-balanced semantic routing scores.
   - routing_probs (P): the weights used for the weighted output reduction, gathered from
+    the semantic routing scores at the selected indices and renormalized to sum to 1
+    per token.
+Routing computation uses two gradient-isolated pathways over numerically identical
+biased values:
+  - semantic_logits = logits + expert_bias.detach(): drives selection and routing_probs.
+    Task gradients reach routing_projection.weight; expert_bias is isolated from task loss.
+  - load_balancing_logits = logits.detach() + expert_bias: drives assignment_probs.
+    Load balance gradients reach expert_bias; routing_projection.weight is isolated from
+    load balance loss.
+No unbiased routing computation exists. All routing uses biased values. The separation
+of gradient paths replaces the previous biased/unbiased split, closing the loophole where
+a bias-redirected expert could be selected but contribute negligibly to the output because
+its unbiased preference — and thus its routing_prob — remained near zero.
+Assignment probabilities are computed before balance_capacity applies -1e8 sentinels.
+Post-capacity softmax would invert the load balance gradient for over-capacity experts
+(near-zero probability after masking signals "increase bias" for an already-overloaded
+expert).
 The router also computes and returns the load balance loss via a log-probability auxiliary
 loss (see load_balance_loss.py). The loss formulation is selected by config; the default
+is cross-entropy.
 The router additionally computes and returns MaxVio, a detached scalar summarising
 routing imbalance for the current forward pass:
 class MoSRAHRouter(nn.Module):
     """Token-choice router for MoSRAH sparse attention.
+    Each input token independently selects K of the L available expert heads. Both
+    selection and routing_probs incorporate expert_bias via two gradient-isolated
+    pathways over numerically identical biased values. See module docstring for the
+    two-pathway architecture.
     The routing projection W_r has no bias term — the paper specifies xW_r with no
     additional projection bias. The only bias-like parameter is expert_bias (b), which
+    has an entirely separate role and gradient path.
     Args:
         config: Model configuration. Must expose ``hidden_size``, ``num_mosrah_heads``
             config.embedding_width, config.num_mosrah_heads, bias=False
         )
+        # Scalar gate on routing logits. As an nn.Parameter it is exempt from
+        # HuggingFace _init_weights, so its near-zero initial value is preserved
+        # after from_config construction. Near-zero initialization ensures routing
+        # starts near-uniform and expert_bias has leverage over logits from step one.
+        self.routing_scale = nn.Parameter(
+            torch.full((1,), config.router_init_scale)
+        )
         # b: learned per-head bias for load balancing. Initialized to zero so that all
         # heads start with equal selection probability. Updated by the main optimizer
+        # via gradients from the load balance loss through load_balancing_logits.
         self.expert_bias = nn.Parameter(torch.zeros(config.num_mosrah_heads))
     @staticmethod
         Returns:
             selected_heads: Head indices I of shape (batch, seq_len, num_selected_heads).
+                Each token's K selected head indices, determined by TopK on
+                capacity-balanced semantic scores.
             routing_probs: Routing probabilities P of shape (batch, seq_len,
+                num_selected_heads). Gathered from pre-capacity semantic softmax at
+                selected_heads indices and renormalized to sum to 1 per token.
             router_diagnostics: Dict of routing feedback scalars. Keys:
                 - ``load_balance_loss``: scalar load-balance loss with gradient.
                 - ``max_vio``: detached scalar routing-imbalance summary.
                 - ``bias_std``: std of expert_bias; near-zero means corrections have not built up.
+                - ``raw_logit_std``: mean per-token std of scaled logits; the natural routing scale.
+                - ``logit_std``: mean per-token std of semantic_logits; lower than
                   raw_logit_std means bias is flattening preferences (healthy correction).
                 - ``bias_alignment``: mean cosine similarity of expert_bias against per-token
                   logits. Negative means bias opposes routing direction (healthy correction);
         L = self.num_mosrah_heads
         K = self.num_selected_heads
+        # Scaled logits. routing_scale is a near-zero nn.Parameter exempt from
+        # HuggingFace _init_weights, so routing starts near-uniform and expert_bias
+        # has leverage from step one.
+        logits = self.routing_projection(x) * self.routing_scale    # (B, N, L)
+        # Two gradient-isolated pathways over numerically identical biased values.
+        # semantic_logits: task gradients reach routing_projection; expert_bias isolated.
+        # load_balancing_logits: load balance gradients reach expert_bias; routing_projection isolated.
+        semantic_logits       = logits + self.expert_bias.detach()   # (B, N, L)
+        load_balancing_logits = logits.detach() + self.expert_bias   # (B, N, L)
         # Diagnostic scalars characterising the load-balance mechanism. Must be
         # computed here — before balance_capacity injects -1e8 sentinels that
         # would corrupt std and cosine similarity.
+        bias_std       = self.expert_bias.std().detach()
+        raw_logit_std  = logits.std(dim=-1).mean().detach()
+        logit_std      = semantic_logits.std(dim=-1).mean().detach()
         bias_alignment = F.cosine_similarity(
             logits, self.expert_bias.expand_as(logits), dim=-1
         ).mean().detach()
+        # Assignment probabilities for load balance loss. Computed from load_balancing_logits
+        # before balance_capacity so that -1e8 sentinels do not invert the load balance
+        # gradient for over-capacity experts. active_float is reused below for routing freqs.
+        active_float     = active_mask.float().unsqueeze(-1)                          # (B, N, 1)
+        lb_softmax        = F.softmax(load_balancing_logits, dim=-1)                  # (B, N, L)
+        assignment_probs  = (lb_softmax * active_float).sum(dim=(0, 1))               # (L,) unnorm
+        assignment_probs  = assignment_probs / active_mask.float().sum()              # (L,) norm
+        # Pre-capacity semantic softmax for gathering routing_probs. Computed before
+        # balance_capacity so that gathered probabilities reflect genuine preference
+        # magnitudes rather than hard-masked sentinel values.
+        routing_scores = F.softmax(semantic_logits, dim=-1)          # (B, N, L)
+        # Capacity-balanced semantic logits for selection. Injects -1e8 into positions
+        # that would exceed per-expert token budget, enforcing the packing constraint.
+        balanced_semantic_logits = self.balance_capacity(
+            semantic_logits,
             used_capacity,
             self.capacity,
             self.num_selected_heads,
             self.max_bid_rounds,
         )
+        selection_scores = F.softmax(balanced_semantic_logits, dim=-1)    # (B, N, L)
+        # selected_heads I = TopK over capacity-balanced semantic scores.
+        selected_heads = selection_scores.topk(K, dim=-1).indices          # (B, N, K)
+        # Routing probabilities P: gathered from pre-capacity semantic softmax at
+        # selected_heads positions, renormalized so they sum to 1 per token.
+        gathered      = routing_scores.gather(dim=-1, index=selected_heads)    # (B, N, K)
+        routing_probs = gathered / gathered.sum(dim=-1, keepdim=True)          # P, (B, N, K)
         # Per-item routing frequencies f_{b,l}: for each batch item b and head l, what
         # fraction of that item's active K assignments over all tokens go to head l.
         assignment_mask = torch.zeros(B, N, L, device=x.device, dtype=x.dtype)
         assignment_mask.scatter_(-1, selected_heads, 1.0)
         active_assignments = assignment_mask * active_mask.unsqueeze(-1)
+        per_item_counts = active_assignments.sum(dim=1)              # (B, L)
+        per_item_total  = active_mask.sum(dim=1, keepdim=True) * K  # (B, 1)
+        per_item_freqs  = per_item_counts / per_item_total           # (B, L)
         # p-mean of per_item_freqs over the batch dimension produces routing_freqs (L,).
         # p-mean weights aggregation toward the worst-case batch item relative to
         p = self.load_balance_p
         routing_freqs = (per_item_freqs ** p).mean(dim=0) ** (1.0 / p)  # (L,)
         load_balance_loss = self._load_balance_loss(routing_freqs, assignment_probs)
         # MaxVio is a detached monitoring scalar following the paper's formula