smithblack-0
/

SHRAM-dev

@@ -44,6 +44,7 @@ from torch import nn
 from torch.nn.attention.flex_attention import create_block_mask
 from torch.nn.attention.flex_attention import flex_attention
 import torch.nn.functional as F
@@ -547,8 +548,7 @@ class MoSRAHCache(CacheLayerMixin):
         # boolean-mask transfer is correct without any explicit count verification.
         self.keys[dest_mask] = key_states[active_mask]
         self.values[dest_mask] = value_states[active_mask]
-        self._counts = post_counts
         return self.keys, self.values, self._make_active_mask()
@@ -1405,15 +1405,20 @@ Returns a plain dict with keys:
 """Decoder layer — a single transformer block.
 Each block applies pre-norm hybrid attention followed by pre-norm MLP, with
-residual connections around both sublayers:
     normed_attn = RMSNorm(x)
     attn_out, load_balance_loss, max_vio = SHRAMHybridLayer(normed_attn, ...)
-    h = x + attn_out
     normed_mlp = RMSNorm(h)
     mlp_out = SwiGLUMLP(normed_mlp)
-    out = h + mlp_out
 Pre-norm keeps the residual stream unnormalised. Gradients flow more cleanly
 through unnormalised residuals at depth, and each sublayer receives a stable,
@@ -2344,7 +2349,7 @@ def setup_packing(
         batch_size,
         sequence_length * num_selected_heads,
     )
     permutation = torch.argsort(flattened_selected_heads, dim=-1, stable=True)
     inverse_permutation = torch.argsort(permutation, dim=-1)
@@ -2352,6 +2357,7 @@ def setup_packing(
         "flattened_selected_heads": flattened_selected_heads,
         "permutation": permutation,
         "inverse_permutation": inverse_permutation,
     }
@@ -2493,6 +2499,7 @@ def pack_experts(
             (batch_size, num_experts, packed_length, *extra_shape),
             fill_value=padding_value,
         )
         packed_tensor[unpacking_mask] = sorted_tensor.reshape(-1, *extra_shape)
         packed_entries[key] = packed_tensor
@@ -2537,7 +2544,17 @@ def unpack_experts(
     batch_size, sequence_length, num_selected_heads = selected_heads.shape
     hidden_dim = expert_outputs.shape[-1]
-    active_outputs = expert_outputs[unpacking_mask]
     sorted_token_choice_outputs = active_outputs.reshape(
         batch_size,
         sequence_length * num_selected_heads,
@@ -2547,7 +2564,6 @@ def unpack_experts(
         dim=1,
         index=inverse_permutation.unsqueeze(-1).expand(-1, -1, hidden_dim),
     )
     return restored_outputs.reshape(
         batch_size,
         sequence_length,
@@ -2753,6 +2769,7 @@ class LoadBalanceLoss(torch.autograd.Function):
 class MoSRAHRouter(nn.Module):
     """Token-choice router for MoSRAH sparse attention.
@@ -2775,6 +2792,10 @@ class MoSRAHRouter(nn.Module):
         self.num_mosrah_heads = config.num_mosrah_heads
         self.num_selected_heads = config.num_selected_heads
         self.load_balance_p = config.load_balance_p
         # W_r: routing projection, no bias (paper specifies xW_r, no additional term).
         self.routing_projection = nn.Linear(
@@ -2786,8 +2807,69 @@ class MoSRAHRouter(nn.Module):
         # via the LoadBalanceLoss custom backward.
         self.expert_bias = nn.Parameter(torch.zeros(config.num_mosrah_heads))
     def forward(
-        self, x: torch.Tensor, active_mask: torch.Tensor
     ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
         """Route input tokens to K expert heads each and compute routing probabilities.
@@ -2796,7 +2878,7 @@ class MoSRAHRouter(nn.Module):
             active_mask: Current-chunk active mask of shape (batch, seq_len), where
                 True means the token is semantically live. Dead tokens do not
                 contribute to routing frequencies, load_balance_loss, or max_vio.
         Returns:
             selected_heads: Head indices I of shape (batch, seq_len, num_selected_heads).
                 Each token's K selected head indices, determined by TopK on biased scores.
@@ -2821,18 +2903,21 @@ class MoSRAHRouter(nn.Module):
         # Biased routing scores R̂ = Softmax(xW_r + b). Used only for TopK head
         # selection. expert_bias is added to logits before softmax so that the bias
         # shifts selection probability without rescaling the unbiased distribution.
         biased_routing_scores = F.softmax(                     # R̂, (B, N, L)
-            logits + self.expert_bias, dim=-1
         )
         # selected_heads I = TopK(R̂): K head indices per token, shape (B, N, K).
         selected_heads = biased_routing_scores.topk(K, dim=-1).indices
         # Routing probabilities P: gathered from unbiased R at selected_heads indices,
         # then renormalized so they sum to 1 per token. Gathering from routing_scores
         # (not biased_routing_scores) is the invariant that keeps the gradient path from
         # the output back to the router weights free of expert_bias influence.
-        gathered = routing_scores.gather(dim=-1, index=selected_heads)   # V, (B, N, K)
         routing_probs = gathered / gathered.sum(dim=-1, keepdim=True)    # P, (B, N, K)
         # Per-item routing frequencies f_{b,l}: for each batch item b and head l, what
@@ -3062,8 +3147,9 @@ class MoSRAHLayer(nn.Module):
         # B*N*K True entries) and the packed active mask (live slots only);
         # active_mask is rebound to the packed form after this point.
         # -------------------------------------------------------------------
         selected_heads, routing_probs, load_balance_loss, max_vio = self.router(
-            hidden_states, active_mask
         )
         setup = setup_packing(selected_heads)
@@ -3282,7 +3368,7 @@ class DecoderLayer(nn.Module):
         self.mlp_norm = nn.RMSNorm(config.embedding_width, eps=config.rms_norm_eps)
         self.attention = SHRAMHybridLayer(config)
         self.mlp = SwiGLUMLP(config)
     def num_mosrah_parameters(self) -> int:
         """Return the total number of trainable MoSRAH parameters in this decoder layer."""
         return self.attention.num_mosrah_parameters()
@@ -3318,8 +3404,8 @@ class DecoderLayer(nn.Module):
             active_mask=active_mask,
             cache=cache,
         )
-        hidden_states = x + attn_out
-        output = hidden_states + self.mlp(self.mlp_norm(hidden_states))
         return output, load_balance_loss, max_vio

 from torch.nn.attention.flex_attention import create_block_mask
 from torch.nn.attention.flex_attention import flex_attention
 import torch.nn.functional as F
+from typing import Optional
         # boolean-mask transfer is correct without any explicit count verification.
         self.keys[dest_mask] = key_states[active_mask]
         self.values[dest_mask] = value_states[active_mask]
+        self._counts[:] = post_counts[:]
         return self.keys, self.values, self._make_active_mask()
 """Decoder layer — a single transformer block.
 Each block applies pre-norm hybrid attention followed by pre-norm MLP, with
+gated residual connections around both sublayers:
     normed_attn = RMSNorm(x)
     attn_out, load_balance_loss, max_vio = SHRAMHybridLayer(normed_attn, ...)
+    h = x + residual_gate * attn_out
     normed_mlp = RMSNorm(h)
     mlp_out = SwiGLUMLP(normed_mlp)
+    out = h + residual_gate * mlp_out
+A single shared residual_gate vector (shape: embedding_width, init: zeros) gates
+both sublayer contributions. At initialisation the layer is a pure identity, which
+prevents variance explosion through depth regardless of how HuggingFace initialises
+the projection weights. The gate is a trainable parameter and opens during training.
 Pre-norm keeps the residual stream unnormalised. Gradients flow more cleanly
 through unnormalised residuals at depth, and each sublayer receives a stable,
         batch_size,
         sequence_length * num_selected_heads,
     )
+    num_elements = batch_size*sequence_length*num_selected_heads
     permutation = torch.argsort(flattened_selected_heads, dim=-1, stable=True)
     inverse_permutation = torch.argsort(permutation, dim=-1)
         "flattened_selected_heads": flattened_selected_heads,
         "permutation": permutation,
         "inverse_permutation": inverse_permutation,
+        "num_elements" : num_elements,
     }
             (batch_size, num_experts, packed_length, *extra_shape),
             fill_value=padding_value,
         )
         packed_tensor[unpacking_mask] = sorted_tensor.reshape(-1, *extra_shape)
         packed_entries[key] = packed_tensor
     batch_size, sequence_length, num_selected_heads = selected_heads.shape
     hidden_dim = expert_outputs.shape[-1]
+    coords = torch.nonzero_static(
+        unpacking_mask,
+        size=setup["num_elements"],
+    )  # shape: (B*N*K, 3)
+    active_outputs = expert_outputs[
+        coords[:, 0],
+        coords[:, 1],
+        coords[:, 2],
+    ]  # shape: (B*N*K, d)
     sorted_token_choice_outputs = active_outputs.reshape(
         batch_size,
         sequence_length * num_selected_heads,
         dim=1,
         index=inverse_permutation.unsqueeze(-1).expand(-1, -1, hidden_dim),
     )
     return restored_outputs.reshape(
         batch_size,
         sequence_length,
 class MoSRAHRouter(nn.Module):
     """Token-choice router for MoSRAH sparse attention.
         self.num_mosrah_heads = config.num_mosrah_heads
         self.num_selected_heads = config.num_selected_heads
         self.load_balance_p = config.load_balance_p
+        if config.use_cache:
+            self.capacity = config.mosrah_cache_length
+        else:
+            self.capacity = config.mosrah_packed_length
         # W_r: routing projection, no bias (paper specifies xW_r, no additional term).
         self.routing_projection = nn.Linear(
         # via the LoadBalanceLoss custom backward.
         self.expert_bias = nn.Parameter(torch.zeros(config.num_mosrah_heads))
+    @staticmethod
+    def balance_capacity(logits: torch.Tensor,
+                         used_capacity: torch.Tensor | None,
+                         capacity: int,
+                         )->torch.Tensor:
+        """
+        Balances capacity limits so that if choosing an
+        expert would go over capacity, the expert is simply
+        not chosen instead
+        :param logits: The logits to balance. (B, N, L)
+        :param used_capacity: The used capacity, if it exists. (B, L)
+        :param capacity: The maximum available capacity. Int.
+        :return: Modified logits.
+        """
+        if used_capacity is None:
+            # Presume we are in training mode.
+            # Looking up capacity limits only
+            # matters if it is, in fact, possible
+            # to exceed capacity limits.
+            if logits.shape[-2] < capacity:
+                return logits
+            # Look up the kthvalue and use that as
+            # the threshold to mask when below.
+            # Note we negate then negate again to sort
+            # in ascending order.
+            response = torch.kthvalue(-logits, capacity, dim=-2)
+            threshold = -response.values
+            threshold = threshold.unsqueeze(-2) #(B, 1, L)
+        else:
+            # We are operating in inference mode.
+            # We have to use padding to accomodate the
+            # response physically not being long enough
+            # to reach capacity
+            # Note that padding at zero and shifting
+            # the indexes prevents dereferencing a symint,
+            # as a version that just patted at 0, 1 and set to
+            # length + 1 would do. This prevents a graph break.
+            remaining_capacity = capacity - used_capacity # 0 means all used, can be at most capacity
+            response_length = logits.shape[-2]
+            index = torch.clamp(remaining_capacity, 0, response_length+1)
+            # Sort, and add padding. Anything asking for a sequence position
+            # outside the current sequence will get a threshold of -1e8; always include
+            # If we are asking for a value at zero, get 1e8, or full and we include
+            # nothing.
+            ordered_logits = torch.sort(logits, dim=-2, descending=True).values
+            ordered_logits = F.pad(ordered_logits, (0,0, 1, 0), value=1e8)
+            ordered_logits = F.pad(ordered_logits, (0, 0, 0, 1), value=-1e8)
+            threshold = ordered_logits.gather(-2, index.unsqueeze(-2)) #(B, 1, L)
+        mask = threshold > logits
+        logits = logits.masked_fill(mask, -1e8)
+        return logits
     def forward(
+        self,
+        x: torch.Tensor,
+        active_mask: torch.Tensor,
+        used_capacity: torch.Tensor | None
     ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
         """Route input tokens to K expert heads each and compute routing probabilities.
             active_mask: Current-chunk active mask of shape (batch, seq_len), where
                 True means the token is semantically live. Dead tokens do not
                 contribute to routing frequencies, load_balance_loss, or max_vio.
+            used_capacity: Slots already used per head, shape (batch, num_heads), used to
+                enforce capacity limits during inference. None during training (no cache).
         Returns:
             selected_heads: Head indices I of shape (batch, seq_len, num_selected_heads).
                 Each token's K selected head indices, determined by TopK on biased scores.
         # Biased routing scores R̂ = Softmax(xW_r + b). Used only for TopK head
         # selection. expert_bias is added to logits before softmax so that the bias
         # shifts selection probability without rescaling the unbiased distribution.
+        biased_logits = logits + self.expert_bias
+        biased_logits = self.balance_capacity(biased_logits, used_capacity, self.capacity)
         biased_routing_scores = F.softmax(                     # R̂, (B, N, L)
+           biased_logits, dim=-1
         )
         # selected_heads I = TopK(R̂): K head indices per token, shape (B, N, K).
+        # The unbiased routing scores are then gathered at those same indices.
         selected_heads = biased_routing_scores.topk(K, dim=-1).indices
+        gathered = routing_scores.gather(dim=-1, index=selected_heads)   # V, (B, N, K)
         # Routing probabilities P: gathered from unbiased R at selected_heads indices,
         # then renormalized so they sum to 1 per token. Gathering from routing_scores
         # (not biased_routing_scores) is the invariant that keeps the gradient path from
         # the output back to the router weights free of expert_bias influence.
         routing_probs = gathered / gathered.sum(dim=-1, keepdim=True)    # P, (B, N, K)
         # Per-item routing frequencies f_{b,l}: for each batch item b and head l, what
         # B*N*K True entries) and the packed active mask (live slots only);
         # active_mask is rebound to the packed form after this point.
         # -------------------------------------------------------------------
+        used_capacity = cache.get_heads_lengths() if cache is not None else None
         selected_heads, routing_probs, load_balance_loss, max_vio = self.router(
+            hidden_states, active_mask, used_capacity
         )
         setup = setup_packing(selected_heads)
         self.mlp_norm = nn.RMSNorm(config.embedding_width, eps=config.rms_norm_eps)
         self.attention = SHRAMHybridLayer(config)
         self.mlp = SwiGLUMLP(config)
+        self.residual_gate = nn.Parameter(torch.zeros([config.embedding_width]))
     def num_mosrah_parameters(self) -> int:
         """Return the total number of trainable MoSRAH parameters in this decoder layer."""
         return self.attention.num_mosrah_parameters()
             active_mask=active_mask,
             cache=cache,
         )
+        hidden_states = x + self.residual_gate*attn_out
+        output = hidden_states + self.residual_gate*self.mlp(self.mlp_norm(hidden_states))
         return output, load_balance_loss, max_vio