smithblack-0 committed 3 days ago

Commit

a86502d

verified ·

1 Parent(s): 0c295ed

Update architecture and tokenizer

Browse files

Files changed (20) hide show

README.md +5 -3
__attention__bottlenecked_ensemble_attention.py +15 -4
__attention__expert_packing.py +148 -142
__attention__mosrah.py +14 -14
__attention__positions_converter.py +14 -8
__attention__router.py +23 -15
__attention__shram.py +0 -13
__attention__sliding_window_attention.py +2 -1
__cache__mosrah_cache.py +69 -61
__cache__shram_cache.py +14 -28
__cache__shram_layer_cache.py +23 -35
__cache__sliding_window_cache.py +1 -1
__cache__slow_mosrah_cache.py +20 -39
config.json +6 -4
configuration.py +76 -5
decoder_layer.py +2 -2
huggingface.py +85 -8
mlp.py +3 -3
model.py +2 -2
rope.py +55 -48

README.md CHANGED Viewed

@@ -79,13 +79,15 @@ contains no weights. All values are overridable via kwargs.
 | `attention_dropout` | 0.0 |
 | `beta` | 32.0 |
 | `dtype` | None |
 | `head_dim` | 16 |
-| `hidden_size` | 512 |
 | `inference_sequence_length` | 1024 |
-| `intermediate_size` | 1366 |
 | `local_rope_theta` | 10000.0 |
 | `mosrah_rope_theta` | 10000.0 |
-| `num_hidden_layers` | 12 |
 | `num_mosrah_heads` | 16 |
 | `num_selected_heads` | 16 |
 | `num_sliding_window_heads` | 16 |

 | `attention_dropout` | 0.0 |
 | `beta` | 32.0 |
 | `dtype` | None |
+| `embedding_width` | 512 |
 | `head_dim` | 16 |
 | `inference_sequence_length` | 1024 |
+| `load_balance_p` | 2.0 |
 | `local_rope_theta` | 10000.0 |
+| `mlp_width` | 1366 |
+| `mosrah_overallocation_factor` | 2.0 |
 | `mosrah_rope_theta` | 10000.0 |
+| `num_decoder_layers` | 12 |
 | `num_mosrah_heads` | 16 |
 | `num_selected_heads` | 16 |
 | `num_sliding_window_heads` | 16 |

__attention__bottlenecked_ensemble_attention.py CHANGED Viewed

@@ -38,14 +38,14 @@ class BottleneckedEnsembleAttention(nn.Module):
     Args:
         config: SHRAM config. Must expose `hidden_size`, `num_mosrah_heads`,
-            `head_dim`, `mosrah_rope_theta`, `training_sequence_length`,
-            `inference_sequence_length`, `alpha`, and `beta`.
     """
     def __init__(self, config: ShramConfig) -> None:
         super().__init__()
-        self.hidden_size = config.hidden_size
         self.num_heads = config.num_mosrah_heads
         self.head_dim = config.head_dim
@@ -68,11 +68,22 @@ class BottleneckedEnsembleAttention(nn.Module):
         # BEA uses the YaRN-capable RoPE path. The caller supplies the position tensor;
         # this unit only consumes it. In training modes, dilation will be 1.0 and so
         # no yarn dilation occurs.
         self.rope = RotaryEmbedding(
             mode="yarn",
             head_dim=self.head_dim,
             theta=config.mosrah_rope_theta,
-            initial_seq_length=config.training_sequence_length,
             dilation=config.scale,
             alpha=config.alpha,
             beta=config.beta,

     Args:
         config: SHRAM config. Must expose `embedding_width`, `num_mosrah_heads`,
+            `head_dim`, `mosrah_rope_theta`, `inference_sequence_length`,
+            `mosrah_packed_length`, `rope_mode`, `scale`, `alpha`, and `beta`.
     """
     def __init__(self, config: ShramConfig) -> None:
         super().__init__()
+        self.hidden_size = config.embedding_width
         self.num_heads = config.num_mosrah_heads
         self.head_dim = config.head_dim
         # BEA uses the YaRN-capable RoPE path. The caller supplies the position tensor;
         # this unit only consumes it. In training modes, dilation will be 1.0 and so
         # no yarn dilation occurs.
+        #
+        # The required table size depends on position semantics:
+        #   main_sequence    — positions are original token positions, bounded by
+        #                      inference_sequence_length.
+        #   semantic_sequence — positions are local per-expert slot indices, bounded
+        #                       by mosrah_packed_length.
+        maximum_rope_length = (
+            config.mosrah_packed_length
+            if config.rope_mode == "semantic_sequence"
+            else config.inference_sequence_length
+        )
         self.rope = RotaryEmbedding(
             mode="yarn",
             head_dim=self.head_dim,
             theta=config.mosrah_rope_theta,
+            maximum_sequence_length=maximum_rope_length,
             dilation=config.scale,
             alpha=config.alpha,
             beta=config.beta,

__attention__expert_packing.py CHANGED Viewed

@@ -3,26 +3,30 @@
 This module implements the low-level token-choice -> expert-choice -> token-choice
 conversion boundary specified in the paper. The externally visible behavior is fixed:
-- setup_packing() prepares the auxiliary ordering data.
-- pack_experts() converts routed token-choice state into packed expert-choice state.
 - unpack_experts() restores token-choice ordering afterward.
 Stable sort is a correctness requirement. It preserves causal ordering inside each
 expert bucket, which is the foundation on which BEA's later triangular causal mask
 is correct.
-pack_experts() returns two distinct masks that serve different roles and must not
-be interchanged:
 - unpacking_mask: marks every packed slot that contains a routed token copy,
   live or dead. Always has exactly B*N*K True entries. Required by unpack_experts
   so its reshape invariant holds regardless of outer token liveness.
-- active_mask: marks only the packed slots whose source token was semantically
-  live. This is what BEA consumes for attention gating. Dead outer tokens must
-  not influence sparse attention outputs.
 """
 import torch
 # ---------------------------------------------------------------------------
@@ -31,7 +35,7 @@ import torch
 def setup_packing(
     selected_heads: torch.Tensor,
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     """Prepare the auxiliary ordering data used by pack/unpack.
     Routing produces token-choice state I of shape (B, N, K): for each token, which
@@ -48,10 +52,11 @@ def setup_packing(
         selected_heads: Routed token-choice head selections I of shape (B, N, K).
     Returns:
-        Tuple of:
-          - flattened_selected_heads: H of shape (B, N*K)
-          - permutation: stable expert-major permutation Pi of shape (B, N*K)
-          - inverse_permutation: inverse permutation Pi^{-1} of shape (B, N*K)
     """
     batch_size, sequence_length, num_selected_heads = selected_heads.shape
     flattened_selected_heads = selected_heads.reshape(
@@ -62,7 +67,11 @@ def setup_packing(
     permutation = torch.argsort(flattened_selected_heads, dim=-1, stable=True)
     inverse_permutation = torch.argsort(permutation, dim=-1)
-    return flattened_selected_heads, permutation, inverse_permutation
 # ---------------------------------------------------------------------------
@@ -70,27 +79,22 @@ def setup_packing(
 # ---------------------------------------------------------------------------
 def pack_experts(
-    hidden_states: torch.Tensor,
-    position_ids: torch.Tensor,
     selected_heads: torch.Tensor,
     num_experts: int,
-    flattened_selected_heads: torch.Tensor,
-    permutation: torch.Tensor,
-    outer_active_mask: torch.Tensor,
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-    """Pack token-choice hidden states into expert-choice padded form.
     The paper's packing path has two jobs:
     1. Convert routed token-choice copies into expert-major order.
     2. Materialize that expert-major order into a padded tensor layout BEA can consume.
-    The routed hidden-state copies are not stored explicitly in token-choice form.
-    Instead, the same token hidden state is conceptually copied once per selected expert.
-    The packing step reconstructs those copies by expanding local source-token indices,
-    reordering those indices with Pi, then gathering hidden states, positions, and outer
-    liveness in that packed order. All three are carried through the same expert-major
-    rearrangement so they remain aligned in the packed frame.
     Packed positions are sourced from the authoritative upstream position_ids tensor
     rather than synthesized locally from arange(N). This preserves advanced positions
@@ -98,40 +102,40 @@ def pack_experts(
     unchanged when position_ids is the ordinary sequential token positions.
     Args:
-        hidden_states: Token-choice hidden states x of shape (B, N, d).
-        position_ids: Authoritative upstream token positions J of shape (B, N).
         selected_heads: Routed head selections I of shape (B, N, K).
         num_experts: Total number of experts L.
-        flattened_selected_heads: H from setup_packing(), shape (B, N*K).
-        permutation: Pi from setup_packing(), shape (B, N*K).
-        outer_active_mask: Current-chunk active mask of shape (B, N), where True
-            means the token is semantically live. Dead tokens do not become
-            semantically active in the packed sparse representation.
     Returns:
         Tuple of:
-          - packed_hidden_states: x' of shape (B, L, T, d)
-          - packed_positions: J' of shape (B, L, T)
-          - unpacking_mask: of shape (B, L, T). True where a slot contains any
-            routed token copy, live or dead. Always has exactly B*N*K True entries.
-            Pass this to unpack_experts — not active_mask.
-          - active_mask: of shape (B, L, T). True only where a slot contains a
-            copy of a live outer token. Pass this to BEA for attention gating.
     """
-    batch_size, sequence_length, hidden_dim = hidden_states.shape
-    _, _, num_selected_heads = selected_heads.shape
     # -----------------------------------------------------------------------
     # Reconstruct routed local source-token indices in token-choice order.
     #
-    # The internal arange(N) is no longer the packed position tensor. It is only
-    # the local source-row index object used to gather from the current chunk
-    # tensor x. Flattening this object gives a (B, N*K) tensor aligned with H's
-    # token-major routed-copy order.
     # -----------------------------------------------------------------------
     source_token_indices = torch.arange(
         sequence_length,
-        device=hidden_states.device,
         dtype=torch.long,
     ).view(1, sequence_length, 1).expand(
         batch_size,
@@ -147,89 +151,71 @@ def pack_experts(
     # Reorder source-token indices into expert-major order.
     #
     # Applying Pi yields the local source-token rows in the packed expert-major
-    # order required by the paper. Those same reordered source indices are then
-    # used to gather hidden states, authoritative upstream positions, and outer
-    # liveness so all three remain aligned under the exact same packing
-    # transformation.
     # -----------------------------------------------------------------------
     sorted_source_indices = flattened_source_indices.gather(
         dim=1,
         index=permutation,
     )
-    sorted_hidden_states = hidden_states.gather(
-        dim=1,
-        index=sorted_source_indices.unsqueeze(-1).expand(-1, -1, hidden_dim),
-    )
-    sorted_positions = position_ids.gather(
-        dim=1,
-        index=sorted_source_indices,
-    )
-    sorted_active_mask = outer_active_mask.gather(
-        dim=1,
-        index=sorted_source_indices,
-    )
     # -----------------------------------------------------------------------
-    # Count how many routed copies land in each expert bucket.
     #
-    # S[b, l] is the number of routed token copies assigned to expert l in batch b.
-    # T is the maximum such count across all batches and experts; it determines the
-    # padded expert-length dimension of the packed representation.
     # -----------------------------------------------------------------------
-    tokens_per_expert = _bincount_rows(
-        values=flattened_selected_heads,
-        num_bins=num_experts,
-    )
-    max_tokens_per_expert = int(tokens_per_expert.max().item())
     # -----------------------------------------------------------------------
-    # Construct the active-token mask M.
     #
     # Each expert bucket is left-justified: if S[b, l] = s, then slots
-    # t = 0, ..., s-1 are active and all later slots are padding. The resulting
-    # mask therefore both identifies real packed tokens and enforces left-justified
-    # packing. This is the unpacking_mask — it marks slot occupancy regardless of
-    # outer token liveness, and always has exactly B*N*K True entries.
     # -----------------------------------------------------------------------
     time_axis = torch.arange(
-        max_tokens_per_expert,
-        device=hidden_states.device,
         dtype=torch.long,
-    ).view(1, 1, max_tokens_per_expert)
     unpacking_mask = time_axis < tokens_per_expert.unsqueeze(-1)
     # -----------------------------------------------------------------------
-    # Materialize the padded packed tensors.
     #
-    # The packed hidden states x', packed original-token positions J', and packed
-    # active-token mask are allocated as zero-filled tensors. Active entries are
-    # then written into those buffers in the expert-major order established above.
-    # Padding remains zero / inactive.
     # -----------------------------------------------------------------------
-    packed_hidden_states = hidden_states.new_zeros(
-        batch_size,
-        num_experts,
-        max_tokens_per_expert,
-        hidden_dim,
-    )
-    packed_positions = position_ids.new_zeros(
-        batch_size,
-        num_experts,
-        max_tokens_per_expert,
-    )
-    active_mask = torch.zeros(
-        batch_size,
-        num_experts,
-        max_tokens_per_expert,
-        dtype=torch.bool,
-        device=hidden_states.device,
-    )
-    packed_hidden_states[unpacking_mask] = sorted_hidden_states.reshape(-1, hidden_dim)
-    packed_positions[unpacking_mask] = sorted_positions.reshape(-1)
-    active_mask[unpacking_mask] = sorted_active_mask.reshape(-1)
-    return packed_hidden_states, packed_positions, unpacking_mask, active_mask
 # ---------------------------------------------------------------------------
@@ -238,9 +224,9 @@ def pack_experts(
 def unpack_experts(
     expert_outputs: torch.Tensor,
-    selected_heads: torch.Tensor,
     unpacking_mask: torch.Tensor,
-    inverse_permutation: torch.Tensor,
 ) -> torch.Tensor:
     """Restore token-choice ordering from BEA expert-choice output.
@@ -257,14 +243,16 @@ def unpack_experts(
     Args:
         expert_outputs: Expert-choice BEA output y of shape (B, L, T, d).
-        selected_heads: Routed head selections I of shape (B, N, K).
         unpacking_mask: From pack_experts(), shape (B, L, T). Identifies all
             occupied packed slots regardless of outer token liveness.
-        inverse_permutation: Pi^{-1} from setup_packing(), shape (B, N*K).
     Returns:
         Restored token-choice tensor y_tilde of shape (B, N, K, d).
     """
     batch_size, sequence_length, num_selected_heads = selected_heads.shape
     hidden_dim = expert_outputs.shape[-1]
@@ -291,45 +279,63 @@ def unpack_experts(
 # Helpers
 # ---------------------------------------------------------------------------
-def _bincount_rows(
-    values: torch.Tensor,
-    num_bins: int,
-) -> torch.Tensor:
-    """Count per-row integer occurrences for a 2D tensor.
-    torch.bincount operates on a flat 1D vector, but the packing algorithm needs
-    one bincount per batch row. The trick used here is to shift each row into its
-    own disjoint bin range before flattening:
-      row 0 uses bins [0, ..., num_bins - 1]
-      row 1 uses bins [num_bins, ..., 2*num_bins - 1]
-      row 2 uses bins [2*num_bins, ..., 3*num_bins - 1]
-      ...
-    After that shift, one global torch.bincount produces all row-local counts at
-    once. Reshaping the result back to (B, num_bins) recovers the per-row bincount.
-    This is a vectorized implementation detail only; externally visible behavior
-    remains exactly the paper's S tensor of per-batch per-expert token counts.
     Args:
-        values: Integer tensor of shape (B, M) with entries in [0, num_bins).
-        num_bins: Number of bins.
     Returns:
-        Counts tensor of shape (B, num_bins).
     """
-    batch_size = values.shape[0]
-    row_offsets = torch.arange(
         batch_size,
-        device=values.device,
-        dtype=values.dtype,
-    ).unsqueeze(1) * num_bins
-    shifted_values = values + row_offsets
-    counts = torch.bincount(
-        shifted_values.reshape(-1),
-        minlength=batch_size * num_bins,
     )
-    return counts.reshape(batch_size, num_bins)

 This module implements the low-level token-choice -> expert-choice -> token-choice
 conversion boundary specified in the paper. The externally visible behavior is fixed:
+- setup_packing() prepares the auxiliary ordering data and returns it as a dict
+  payload forwarded whole to pack_experts and unpack_experts.
+- pack_experts() converts a dict of routed token-choice tensors into packed
+  expert-choice form. Each entry is paired with its intended padding value; all
+  entries undergo the same expert-major gather-scatter so they remain aligned.
 - unpack_experts() restores token-choice ordering afterward.
 Stable sort is a correctness requirement. It preserves causal ordering inside each
 expert bucket, which is the foundation on which BEA's later triangular causal mask
 is correct.
+pack_experts() returns the packed entries dict together with a separate unpacking_mask.
+Two masks serve different roles and must not be interchanged:
 - unpacking_mask: marks every packed slot that contains a routed token copy,
   live or dead. Always has exactly B*N*K True entries. Required by unpack_experts
   so its reshape invariant holds regardless of outer token liveness.
+- active_mask (caller-supplied entry): marks only the packed slots whose source
+  token was semantically live. This is what BEA consumes for attention gating.
+  Dead outer tokens must not influence sparse attention outputs.
 """
 import torch
+from typing import Any
 # ---------------------------------------------------------------------------
 def setup_packing(
     selected_heads: torch.Tensor,
+) -> dict[str, torch.Tensor]:
     """Prepare the auxiliary ordering data used by pack/unpack.
     Routing produces token-choice state I of shape (B, N, K): for each token, which
         selected_heads: Routed token-choice head selections I of shape (B, N, K).
     Returns:
+        Auxiliary payload dict with keys:
+          - "flattened_selected_heads": H of shape (B, N*K)
+          - "permutation": stable expert-major permutation Pi of shape (B, N*K)
+          - "inverse_permutation": inverse permutation Pi^{-1} of shape (B, N*K)
+        This dict is forwarded whole to pack_experts and unpack_experts.
     """
     batch_size, sequence_length, num_selected_heads = selected_heads.shape
     flattened_selected_heads = selected_heads.reshape(
     permutation = torch.argsort(flattened_selected_heads, dim=-1, stable=True)
     inverse_permutation = torch.argsort(permutation, dim=-1)
+    return {
+        "flattened_selected_heads": flattened_selected_heads,
+        "permutation": permutation,
+        "inverse_permutation": inverse_permutation,
+    }
 # ---------------------------------------------------------------------------
 # ---------------------------------------------------------------------------
 def pack_experts(
+    entries: dict[str, tuple[torch.Tensor, Any]],
+    setup: dict[str, torch.Tensor],
     selected_heads: torch.Tensor,
     num_experts: int,
+    packed_length: int,
+) -> tuple[dict[str, torch.Tensor], torch.Tensor]:
+    """Pack token-choice tensors into expert-choice padded form.
     The paper's packing path has two jobs:
     1. Convert routed token-choice copies into expert-major order.
     2. Materialize that expert-major order into a padded tensor layout BEA can consume.
+    All entries in the provided dict undergo the same expert-major gather-scatter so
+    they remain mutually aligned in the packed frame. Each entry is paired with its
+    intended padding value, which fills slots that contain no routed token copy.
     Packed positions are sourced from the authoritative upstream position_ids tensor
     rather than synthesized locally from arange(N). This preserves advanced positions
     unchanged when position_ids is the ordinary sequential token positions.
     Args:
+        entries: Mapping from string keys to (tensor, padding_value) pairs. Each
+            tensor has shape (B, N, ...) and is rearranged into expert-choice layout
+            (B, L, T, ...). The returned dict carries the same keys.
+        setup: Auxiliary payload returned by setup_packing().
         selected_heads: Routed head selections I of shape (B, N, K).
         num_experts: Total number of experts L.
+        packed_length: Static packed time dimension T. All per-expert buffers are
+            allocated to exactly this length. Use config.mosrah_packed_length as the
+            source of this value. Raises if any actual per-expert token count exceeds
+            this value.
     Returns:
         Tuple of:
+          - packed_entries: Dict with same keys as entries; each value is the
+            packed tensor of shape (B, L, T, ...).
+          - unpacking_mask: Boolean tensor of shape (B, L, T). True where a slot
+            contains any routed token copy, live or dead. Always has exactly
+            B*N*K True entries. Pass this to unpack_experts — not active_mask.
     """
+    batch_size, sequence_length, num_selected_heads = selected_heads.shape
+    flattened_selected_heads = setup["flattened_selected_heads"]
+    permutation = setup["permutation"]
     # -----------------------------------------------------------------------
     # Reconstruct routed local source-token indices in token-choice order.
     #
+    # The internal arange(N) is only the local source-row index object used to
+    # gather from the current chunk tensors. Flattening gives a (B, N*K) tensor
+    # aligned with H's token-major routed-copy order.
     # -----------------------------------------------------------------------
     source_token_indices = torch.arange(
         sequence_length,
+        device=flattened_selected_heads.device,
         dtype=torch.long,
     ).view(1, sequence_length, 1).expand(
         batch_size,
     # Reorder source-token indices into expert-major order.
     #
     # Applying Pi yields the local source-token rows in the packed expert-major
+    # order required by the paper. All entries are then gathered using these same
+    # reordered indices so they remain aligned under the exact same transformation.
     # -----------------------------------------------------------------------
     sorted_source_indices = flattened_source_indices.gather(
         dim=1,
         index=permutation,
     )
     # -----------------------------------------------------------------------
+    # Count how many routed copies land in each expert bucket and verify
+    # that no bucket exceeds the statically preallocated packed_length T.
     #
+    # S[b, l] is the number of routed token copies assigned to expert l in
+    # batch b. T (packed_length) is a static allocation derived from config,
+    # not a data-dependent maximum. Overflow is detected here and raises in
+    # both eager and compiled modes.
     # -----------------------------------------------------------------------
+    tokens_per_expert = _count_tokens_per_expert(flattened_selected_heads, num_experts)
+    max_count = tokens_per_expert.max().item()
+    no_overflow = max_count <= packed_length
+    _enforce_no_overflow(no_overflow)
     # -----------------------------------------------------------------------
+    # Construct the unpacking mask.
     #
     # Each expert bucket is left-justified: if S[b, l] = s, then slots
+    # t = 0, ..., s-1 are occupied and all later slots are padding. The mask
+    # marks slot occupancy regardless of outer token liveness, and always has
+    # exactly B*N*K True entries.
     # -----------------------------------------------------------------------
     time_axis = torch.arange(
+        packed_length,
+        device=flattened_selected_heads.device,
         dtype=torch.long,
+    ).view(1, 1, packed_length)
     unpacking_mask = time_axis < tokens_per_expert.unsqueeze(-1)
     # -----------------------------------------------------------------------
+    # Materialize all entries into the packed expert-choice frame.
     #
+    # Each entry is gathered using the expert-major sorted source indices, then
+    # scattered into a padded buffer. The gather index is expanded to cover each
+    # tensor's trailing dimensions. Padding slots receive the caller-supplied fill
+    # value rather than an implicit zero.
     # -----------------------------------------------------------------------
+    packed_entries: dict[str, torch.Tensor] = {}
+    for key, (tensor, padding_value) in entries.items():
+        extra_shape = tensor.shape[2:]
+        # Expand gather index to cover trailing dimensions, if any.
+        idx = sorted_source_indices.view(
+            batch_size,
+            sequence_length * num_selected_heads,
+            *(1,) * len(extra_shape),
+        ).expand(-1, -1, *extra_shape)
+        sorted_tensor = tensor.gather(dim=1, index=idx)
+        packed_tensor = tensor.new_full(
+            (batch_size, num_experts, packed_length, *extra_shape),
+            fill_value=padding_value,
+        )
+        packed_tensor[unpacking_mask] = sorted_tensor.reshape(-1, *extra_shape)
+        packed_entries[key] = packed_tensor
+    return packed_entries, unpacking_mask
 # ---------------------------------------------------------------------------
 def unpack_experts(
     expert_outputs: torch.Tensor,
+    setup: dict[str, torch.Tensor],
     unpacking_mask: torch.Tensor,
+    selected_heads: torch.Tensor,
 ) -> torch.Tensor:
     """Restore token-choice ordering from BEA expert-choice output.
     Args:
         expert_outputs: Expert-choice BEA output y of shape (B, L, T, d).
+        setup: Auxiliary payload returned by setup_packing().
         unpacking_mask: From pack_experts(), shape (B, L, T). Identifies all
             occupied packed slots regardless of outer token liveness.
+        selected_heads: Routed head selections I of shape (B, N, K).
     Returns:
         Restored token-choice tensor y_tilde of shape (B, N, K, d).
     """
+    inverse_permutation = setup["inverse_permutation"]
     batch_size, sequence_length, num_selected_heads = selected_heads.shape
     hidden_dim = expert_outputs.shape[-1]
 # Helpers
 # ---------------------------------------------------------------------------
+def _enforce_no_overflow(condition: bool) -> None:
+    """Enforce that no expert bucket exceeds the preallocated packed length.
+    This check fires when the number of tokens assigned to any expert in any
+    batch item exceeds mosrah_packed_length. When that limit is exceeded, the
+    packed buffer is too small to hold all assignments and data would be dropped.
+    Increase mosrah_overallocation_factor in ShramConfig to resolve.
+    The caller must derive condition via .item() on the max count tensor so that
+    dynamo captures a SymInt and the comparison produces a SymBool. Passing a
+    tensor comparison result directly bypasses the SymInt mechanism and prevents
+    the check from firing at compiled runtime.
+    Args:
+        condition: True means no overflow has occurred; False means at least one
+            expert bucket exceeds packed_length. In compiled mode this is a SymBool
+            produced by comparing a SymInt against the static packed_length.
+    Raises:
+        RuntimeError: In eager mode, when condition is False. In compiled mode the
+            check is delegated to torch._check; the failure it produces is not
+            visible from this function.
+    """
+    # While compiling, hand the condition to torch._check rather than branching
+    # on it in Python (see the SymInt/SymBool note in the docstring).
+    if torch.compiler.is_compiling():
+        torch._check(condition)
+    else:
+        # Eager path: fail loudly and point the user at the config knob to raise.
+        if not condition:
+            raise RuntimeError(
+                "Expert packing overflow: at least one expert bucket contains more "
+                "tokens than mosrah_packed_length allows. Increase "
+                "mosrah_overallocation_factor in ShramConfig to resolve."
+            )
+def _count_tokens_per_expert(
+    flattened_selected_heads: torch.Tensor,
+    num_experts: int,
+) -> torch.Tensor:
+    """Count how many routed token copies are assigned to each expert per batch item.
+    Uses scatter_add into a pre-sized (B, num_experts) zero buffer, producing a
+    statically-shaped output that compiles without graph breaks. Each position in
+    flattened_selected_heads contributes one count to the corresponding expert slot.
     Args:
+        flattened_selected_heads: Expert assignments of shape (B, N*K) with values
+            in [0, num_experts).
+        num_experts: Total number of experts L.
     Returns:
+        Counts tensor of shape (B, num_experts).
+        It is allocated with the same dtype and device as
+        flattened_selected_heads.
     """
+    batch_size = flattened_selected_heads.shape[0]
+    # The accumulator takes the index tensor's dtype so that the ones_like source
+    # passed to scatter_add_ below has the same dtype as the buffer it adds into.
+    counts = torch.zeros(
         batch_size,
+        num_experts,
+        device=flattened_selected_heads.device,
+        dtype=flattened_selected_heads.dtype,
+    )
+    # One unit is added at column index[b, j] of row b for every routed copy j.
+    # NOTE(review): the same tensor is used as the scatter index, so this assumes
+    # flattened_selected_heads is an integer (long) tensor — confirm at call sites.
+    counts.scatter_add_(
+        dim=1,
+        index=flattened_selected_heads,
+        src=torch.ones_like(flattened_selected_heads),
     )
+    return counts

__attention__mosrah.py CHANGED Viewed

@@ -40,6 +40,7 @@ class MoSRAHLayer(nn.Module):
     def __init__(self, config: ShramConfig) -> None:
         super().__init__()
         self.num_experts = config.num_mosrah_heads
         self.router = MoSRAHRouter(config)
         self.positions = SparseMoSRAHPositions(config)
@@ -91,18 +92,16 @@ class MoSRAHLayer(nn.Module):
             hidden_states, active_mask
         )
-        flattened_selected_heads, permutation, inverse_permutation = setup_packing(
-            selected_heads
-        )
-        packed_hidden_states, packed_positions, unpacking_mask, active_mask = pack_experts(
-            hidden_states=hidden_states,
-            position_ids=position_ids,
-            selected_heads=selected_heads,
-            num_experts=self.num_experts,
-            flattened_selected_heads=flattened_selected_heads,
-            permutation=permutation,
-            outer_active_mask=active_mask,
-        )
         # -------------------------------------------------------------------
         # Sparse attention runs entirely in the packed expert-choice frame, so
@@ -114,6 +113,7 @@ class MoSRAHLayer(nn.Module):
         # -------------------------------------------------------------------
         bea_positions = self.positions(
             packed_positions=packed_positions,
             cache=cache,
         )
         packed_outputs = self.bea(
@@ -133,9 +133,9 @@ class MoSRAHLayer(nn.Module):
         # -------------------------------------------------------------------
         token_choice_outputs = unpack_experts(
             expert_outputs=packed_outputs,
-            selected_heads=selected_heads,
             unpacking_mask=unpacking_mask,
-            inverse_permutation=inverse_permutation,
         )
         final_output = (
             token_choice_outputs * routing_probs.unsqueeze(-1)

     def __init__(self, config: ShramConfig) -> None:
         super().__init__()
         self.num_experts = config.num_mosrah_heads
+        self.packed_length = config.mosrah_packed_length
         self.router = MoSRAHRouter(config)
         self.positions = SparseMoSRAHPositions(config)
             hidden_states, active_mask
         )
+        setup = setup_packing(selected_heads)
+        entries = {
+            "hidden_states": (hidden_states, 0.0),
+            "position_ids": (position_ids, 0),
+            "active_mask": (active_mask, False),
+        }
+        packed, unpacking_mask = pack_experts(entries, setup, selected_heads, self.num_experts, self.packed_length)
+        packed_hidden_states = packed["hidden_states"]
+        packed_positions = packed["position_ids"]
+        active_mask = packed["active_mask"]
         # -------------------------------------------------------------------
         # Sparse attention runs entirely in the packed expert-choice frame, so
         # -------------------------------------------------------------------
         bea_positions = self.positions(
             packed_positions=packed_positions,
+            active_mask=active_mask,
             cache=cache,
         )
         packed_outputs = self.bea(
         # -------------------------------------------------------------------
         token_choice_outputs = unpack_experts(
             expert_outputs=packed_outputs,
+            setup=setup,
             unpacking_mask=unpacking_mask,
+            selected_heads=selected_heads,
         )
         final_output = (
             token_choice_outputs * routing_probs.unsqueeze(-1)

__attention__positions_converter.py CHANGED Viewed

@@ -32,12 +32,17 @@ class SparseMoSRAHPositions(nn.Module):
     def forward(
         self,
         packed_positions: torch.Tensor,
         cache: MoSRAHCache | None,
     ) -> torch.Tensor:
         """Compute the packed position tensor P consumed by BEA.
         Args:
             packed_positions: Packed original-token positions J' of shape (B, L, T).
             cache: Optional layer-local MoSRAH cache. When present in semantic-sequence
                 mode, the current per-head occupancies offset the local packed sequence.
@@ -45,14 +50,15 @@ class SparseMoSRAHPositions(nn.Module):
             Packed position tensor P of shape (B, L, T).
         """
         if self.rope_mode == "main_sequence":
-            return self._main_sequence_positions(packed_positions)
-        if self.rope_mode == "semantic_sequence":
-            return self._semantic_sequence_positions(packed_positions, cache)
-        raise NotImplementedError(
-            f"Unsupported MoSRAH rope_mode '{self.rope_mode}'."
-        )
     def _main_sequence_positions(
         self,

     def forward(
         self,
         packed_positions: torch.Tensor,
+        active_mask: torch.Tensor,
         cache: MoSRAHCache | None,
     ) -> torch.Tensor:
         """Compute the packed position tensor P consumed by BEA.
         Args:
             packed_positions: Packed original-token positions J' of shape (B, L, T).
+            active_mask: Boolean active-token mask of shape (B, L, T). Inactive
+                positions are zeroed in the returned tensor regardless of mode —
+                their position value is semantically irrelevant and 0 is guaranteed
+                to be within any valid RoPE table.
             cache: Optional layer-local MoSRAH cache. When present in semantic-sequence
                 mode, the current per-head occupancies offset the local packed sequence.
             Packed position tensor P of shape (B, L, T).
         """
         if self.rope_mode == "main_sequence":
+            positions = self._main_sequence_positions(packed_positions)
+        elif self.rope_mode == "semantic_sequence":
+            positions = self._semantic_sequence_positions(packed_positions, cache)
+        else:
+            raise NotImplementedError(
+                f"Unsupported MoSRAH rope_mode '{self.rope_mode}'."
+            )
+        return torch.where(active_mask, positions, torch.zeros_like(positions))
     def _main_sequence_positions(
         self,

__attention__router.py CHANGED Viewed

@@ -57,10 +57,11 @@ class MoSRAHRouter(nn.Module):
         super().__init__()
         self.num_mosrah_heads = config.num_mosrah_heads
         self.num_selected_heads = config.num_selected_heads
         # W_r: routing projection, no bias (paper specifies xW_r, no additional term).
         self.routing_projection = nn.Linear(
-            config.hidden_size, config.num_mosrah_heads, bias=False
         )
         # b: learned per-head bias for load balancing. Initialized to zero so that all
@@ -117,25 +118,31 @@ class MoSRAHRouter(nn.Module):
         gathered = routing_scores.gather(dim=-1, index=selected_heads)   # V, (B, N, K)
         routing_probs = gathered / gathered.sum(dim=-1, keepdim=True)    # P, (B, N, K)
-        # Routing frequency f_l: fraction of active (batch, token, head_slot) triples
-        # assigned to each head. Dead tokens are excluded by zeroing their rows in the
-        # assignment mask before reduction. Normalization uses the active assignment
-        # count so frequencies remain properly scaled regardless of how many tokens
-        # are live in this chunk.
         assignment_mask = torch.zeros(B, N, L, device=x.device, dtype=x.dtype)
         assignment_mask.scatter_(-1, selected_heads, 1.0)
         active_assignments = assignment_mask * active_mask.unsqueeze(-1)
-        num_active_assignments = active_mask.sum() * K
-        routing_freqs = active_assignments.sum(dim=(0, 1)) / num_active_assignments  # f, (L,)
         # Load balance loss via custom autograd. expert_bias is an input so PyTorch
         # registers it as a graph node; the custom backward writes the DeepSeek-style
         # correction gradient to expert_bias.grad for the optimizer to consume.
         load_balance_loss = LoadBalanceLoss.apply(self.expert_bias, routing_freqs)
-        # MaxVio is a detached monitoring scalar derived from routing_freqs. It must
-        # not contribute gradients under any circumstance, so it is detached at the
-        # point of computation rather than left to callers to detach.
         max_vio = self._compute_max_vio(routing_freqs, L)
         return selected_heads, routing_probs, load_balance_loss, max_vio
@@ -145,15 +152,16 @@ class MoSRAHRouter(nn.Module):
         """Compute the MaxVio routing-imbalance scalar.
         MaxVio = L · max_l(f_l − 1/L), where f_l is the realised routing frequency of
-        head l and 1/L is the perfectly balanced target. A value of zero indicates
-        perfect balance; a value of 1 means the most overloaded head received exactly
-        double its fair share.
         The result is detached from the autograd graph — MaxVio is a monitoring scalar
         and must never contribute gradients to any parameter.
         Args:
-            routing_freqs: Per-head routing frequencies of shape (L,). Sums to 1.
             num_heads: Total number of MoSRAH heads L.
         Returns:

         super().__init__()
         self.num_mosrah_heads = config.num_mosrah_heads
         self.num_selected_heads = config.num_selected_heads
+        self.load_balance_p = config.load_balance_p
         # W_r: routing projection, no bias (paper specifies xW_r, no additional term).
         self.routing_projection = nn.Linear(
+            config.embedding_width, config.num_mosrah_heads, bias=False
         )
         # b: learned per-head bias for load balancing. Initialized to zero so that all
         gathered = routing_scores.gather(dim=-1, index=selected_heads)   # V, (B, N, K)
         routing_probs = gathered / gathered.sum(dim=-1, keepdim=True)    # P, (B, N, K)
+        # Per-item routing frequencies f_{b,l}: for each batch item b and head l, what
+        # fraction of that item's active K assignments over all tokens go to head l.
+        # Dead tokens are excluded before reduction. Normalization is per batch item so
+        # each item's frequencies sum to 1 independently of other items in the batch.
         assignment_mask = torch.zeros(B, N, L, device=x.device, dtype=x.dtype)
         assignment_mask.scatter_(-1, selected_heads, 1.0)
         active_assignments = assignment_mask * active_mask.unsqueeze(-1)
+        per_item_counts = active_assignments.sum(dim=1)             # (B, L)
+        per_item_total = active_mask.sum(dim=1, keepdim=True) * K   # (B, 1)
+        per_item_freqs = per_item_counts / per_item_total            # (B, L)
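+        # The power mean (p-mean) of per_item_freqs over the batch dimension produces
+        # routing_freqs (L,). For p > 1 the p-mean is at least the arithmetic mean and
+        # leans toward the largest per-item frequency, making the load balance signal
+        # sensitive to per-item spikes that cause packing overflow. As a consequence,
+        # routing_freqs need not sum to 1 across heads even though each item's
+        # per_item_freqs does.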
+        p = self.load_balance_p
+        routing_freqs = (per_item_freqs ** p).mean(dim=0) ** (1.0 / p)  # (L,)
         # Load balance loss via custom autograd. expert_bias is an input so PyTorch
         # registers it as a graph node; the custom backward writes the DeepSeek-style
         # correction gradient to expert_bias.grad for the optimizer to consume.
         load_balance_loss = LoadBalanceLoss.apply(self.expert_bias, routing_freqs)
+        # MaxVio is a detached monitoring scalar following the paper's formula
+        # L · max_l(f_l − 1/L) applied to routing_freqs. Must not contribute gradients.
         max_vio = self._compute_max_vio(routing_freqs, L)
         return selected_heads, routing_probs, load_balance_loss, max_vio
         """Compute the MaxVio routing-imbalance scalar.
         MaxVio = L · max_l(f_l − 1/L), where f_l is the realised routing frequency of
+        head l and 1/L is the perfectly balanced target. Follows the paper's definition
+        (Wang et al.) applied to routing_freqs. A value of zero indicates perfect
+        balance; a value of 0.5 means the most overloaded head received 50% more routed
+        tokens than ideal.
         The result is detached from the autograd graph — MaxVio is a monitoring scalar
         and must never contribute gradients to any parameter.
         Args:
+            routing_freqs: Per-head routing frequencies of shape (L,).
             num_heads: Total number of MoSRAH heads L.
         Returns:

__attention__shram.py CHANGED Viewed

@@ -64,19 +64,6 @@ class SHRAMHybridLayer(nn.Module):
             max_vio: Detached scalar routing-imbalance summary. Passed through
                 unchanged from MoSRAHLayer; see MoSRAHRouter for semantics.
         """
-        # ------------------------------------------------
-        # It is not possible, due to how bea constructs its block mask,
-        # for the model to process a sequence that does not start at zero
-        # without a cache to track the per-head offsets
-        # ------------------------------------------------
-        if cache is None and torch.any(position_ids[:, 0] != 0):
-            raise ValueError(
-                "Uncached SHRAMHybridLayer does not support nonzero starting positions. "
-                "Either provide a matching ShramLayerCache populated by the prefix for "
-                "continued decoding, or rebase the uncached sequence to start at 0."
-            )
         # -------------------------------------------------------------------
         # The hybrid layer's first responsibility is cache dispatch. The layer
         # cache already owns the concrete sub-cache objects required by each

             max_vio: Detached scalar routing-imbalance summary. Passed through
                 unchanged from MoSRAHLayer; see MoSRAHRouter for semantics.
         """
         # -------------------------------------------------------------------
         # The hybrid layer's first responsibility is cache dispatch. The layer
         # cache already owns the concrete sub-cache objects required by each

__attention__sliding_window_attention.py CHANGED Viewed

@@ -44,7 +44,7 @@ class SlidingWindowAttention(nn.Module):
     def __init__(self, config: ShramConfig) -> None:
         super().__init__()
-        self.hidden_size = config.hidden_size
         self.num_heads = config.num_sliding_window_heads
         self.head_dim = config.head_dim
         self.window_size = config.window_size
@@ -69,6 +69,7 @@ class SlidingWindowAttention(nn.Module):
             mode="default",
             head_dim=self.head_dim,
             theta=config.local_rope_theta,
         )
     def forward(

     def __init__(self, config: ShramConfig) -> None:
         super().__init__()
+        self.hidden_size = config.embedding_width
         self.num_heads = config.num_sliding_window_heads
         self.head_dim = config.head_dim
         self.window_size = config.window_size
             mode="default",
             head_dim=self.head_dim,
             theta=config.local_rope_theta,
+            maximum_sequence_length=config.inference_sequence_length,
         )
     def forward(

__cache__mosrah_cache.py CHANGED Viewed

@@ -61,12 +61,13 @@ class MoSRAHCache(CacheLayerMixin):
         batch_size: Number of sequences in the batch. Determines the first dimension
             of all storage tensors.
         device: Device on which to allocate all tensors. Should match the model device.
-        initial_buffer_size: Initial sequence capacity per (batch, head) slot. Doubled
-            when any slot overflows. Defaults to 64 to avoid repeated reallocation
-            during prompt processing.
     """
-    is_compileable = False
     is_sliding = False
     def __init__(
@@ -75,22 +76,23 @@ class MoSRAHCache(CacheLayerMixin):
         head_dim: int,
         batch_size: int,
         device: torch.device,
-        initial_buffer_size: int = 64,
     ) -> None:
         super().__init__()
         self.num_mosrah_heads = num_mosrah_heads
         self.head_dim = head_dim
         self.batch_size = batch_size
         self.device = device
         # Allocate primary storage into the mixin-standard self.keys / self.values so
         # that inherited methods (offload, prefetch) operate on real tensors. _counts
         # tracks valid occupancy per (batch, head) slot.
         self.keys: torch.Tensor = torch.zeros(
-            batch_size, num_mosrah_heads, initial_buffer_size, head_dim, device=device
         )
         self.values: torch.Tensor = torch.zeros(
-            batch_size, num_mosrah_heads, initial_buffer_size, head_dim, device=device
         )
         self._counts: torch.Tensor = torch.zeros(
             batch_size, num_mosrah_heads, dtype=torch.long, device=device
@@ -107,8 +109,8 @@ class MoSRAHCache(CacheLayerMixin):
     def buffer_capacity(self) -> int:
         """Current number of slots allocated per (batch, head) pair.
-        Derived directly from self.keys rather than tracked separately, so it is
-        always consistent with the actual buffer after expansion.
         """
         return self.keys.shape[2]
@@ -129,10 +131,11 @@ class MoSRAHCache(CacheLayerMixin):
         active_mask is (B, L, T) bool with True marking real tokens. Only active
         positions are written; inactive positions are ignored.
-        Uses a cumsum construction to derive the absolute buffer position for each
-        active token without any Python loops. For a given (batch, head) slot,
-        positions are assigned in the order tokens appear along the T dimension,
-        preserving causal ordering.
         Returns the full accumulated (keys, values, active_mask) across the cached
         sparse sequence. The returned active_mask is True exactly for slots t <
@@ -150,35 +153,36 @@ class MoSRAHCache(CacheLayerMixin):
         Returns:
             Tuple of (keys, values, active_mask):
-              keys: (B, L, T, u) float — full key buffer including junk slots.
-              values: (B, L, T, u) float — full value buffer including junk slots.
-              active_mask: (B, L, T) bool — True iff slot (b, l, t) has been written.
         """
         incoming_delta = active_mask.long().sum(dim=2)  # (B, L)
-        if (self._counts + incoming_delta).max().item() > self.buffer_capacity:
-            self._expand()
-        # Cumulative count of active positions along T for each (b, l) slot. Entry
-        # [b, l, t] is the 1-based rank of position t among all active positions in
-        # that slot. Subtract 1 for a zero-based within-slot index. Inactive positions
-        # produce a negative value, which is excluded by the mask gate below.
-        within_slot = active_mask.long().cumsum(dim=2) - 1  # (B, L, T)
-        # Add the pre-update count to get the absolute buffer position for each
-        # active token.
-        abs_pos = within_slot + self._counts.unsqueeze(-1)  # (B, L, T)
-        # Scatter key and value vectors at all active positions.
-        b_idx, l_idx, t_idx = torch.where(active_mask)
-        self.keys[b_idx, l_idx, abs_pos[b_idx, l_idx, t_idx]] = (
-            key_states[b_idx, l_idx, t_idx]
-        )
-        self.values[b_idx, l_idx, abs_pos[b_idx, l_idx, t_idx]] = (
-            value_states[b_idx, l_idx, t_idx]
         )
-        self._counts += incoming_delta
         return self.keys, self.values, self._make_active_mask()
@@ -303,10 +307,13 @@ class MoSRAHCache(CacheLayerMixin):
         )
     def get_max_cache_shape(self) -> int:  # type: ignore[override]
-        """Not supported — MoSRAHCache is dynamic and unbounded."""
-        raise NotImplementedError(
-            "MoSRAHCache is unbounded; get_max_cache_shape() is not supported."
-        )
     def get_mask_sizes(  # type: ignore[override]
         self,
@@ -335,25 +342,26 @@ class MoSRAHCache(CacheLayerMixin):
             < self._counts.unsqueeze(-1)
         )
-    def _expand(self) -> None:
-        """Double the buffer capacity, preserving existing data.
-        Called by update() when an incoming batch of tokens would cause any
-        (batch, head) slot to exceed the current buffer capacity. All existing
-        key and value data is copied into the low half of the new buffer; the
-        high half is zero-initialised and will be filled by subsequent writes.
-        After reassignment, buffer_capacity reflects the new size automatically.
         """
-        old_cap = self.buffer_capacity
-        new_cap = old_cap * 2
-        dev = self.keys.device
-        new_keys = torch.zeros(
-            self.batch_size, self.num_mosrah_heads, new_cap, self.head_dim, device=dev
-        )
-        new_values = torch.zeros(
-            self.batch_size, self.num_mosrah_heads, new_cap, self.head_dim, device=dev
-        )
-        new_keys[:, :, :old_cap, :] = self.keys
-        new_values[:, :, :old_cap, :] = self.values
-        self.keys = new_keys
-        self.values = new_values

         batch_size: Number of sequences in the batch. Determines the first dimension
             of all storage tensors.
         device: Device on which to allocate all tensors. Should match the model device.
+        mosrah_cache_length: Static sequence capacity per (batch, head) slot. Equal to
+            config.mosrah_cache_length. The buffer never grows; if any slot would exceed
+            this capacity, update() raises in both eager and compiled modes. Increase
+            mosrah_overallocation_factor in ShramConfig to resolve an overflow.
     """
+    is_compileable = True
     is_sliding = False
     def __init__(
         head_dim: int,
         batch_size: int,
         device: torch.device,
+        mosrah_cache_length: int,
     ) -> None:
         super().__init__()
         self.num_mosrah_heads = num_mosrah_heads
         self.head_dim = head_dim
         self.batch_size = batch_size
         self.device = device
+        self.mosrah_cache_length = mosrah_cache_length
         # Allocate primary storage into the mixin-standard self.keys / self.values so
         # that inherited methods (offload, prefetch) operate on real tensors. _counts
         # tracks valid occupancy per (batch, head) slot.
         self.keys: torch.Tensor = torch.zeros(
+            batch_size, num_mosrah_heads, mosrah_cache_length, head_dim, device=device
         )
         self.values: torch.Tensor = torch.zeros(
+            batch_size, num_mosrah_heads, mosrah_cache_length, head_dim, device=device
         )
         self._counts: torch.Tensor = torch.zeros(
             batch_size, num_mosrah_heads, dtype=torch.long, device=device
     def buffer_capacity(self) -> int:
         """Current number of slots allocated per (batch, head) pair.
+        Equal to mosrah_cache_length as supplied at construction. Derived from
+        self.keys so it remains consistent with the actual buffer shape.
         """
         return self.keys.shape[2]
         active_mask is (B, L, T) bool with True marking real tokens. Only active
         positions are written; inactive positions are ignored.
+        Uses a fixed-shape destination mask constructed from per-slot write intervals
+        to locate the target cache positions, so no cumsum or torch.where is needed to
+        compute them. Active tokens are left-justified within each packed slot by the
+        packing machinery, so the destination positions are a contiguous range
+        starting at the current slot count. Note that the final transfer still uses
+        boolean-mask indexing, whose element count depends on how many tokens are
+        active.
         Returns the full accumulated (keys, values, active_mask) across the cached
         sparse sequence. The returned active_mask is True exactly for slots t <
         Returns:
             Tuple of (keys, values, active_mask):
+              keys: (B, L, mosrah_cache_length, u) float — full key buffer including junk slots.
+              values: (B, L, mosrah_cache_length, u) float — full value buffer including junk slots.
+              active_mask: (B, L, mosrah_cache_length) bool — True iff slot t has been written.
         """
         incoming_delta = active_mask.long().sum(dim=2)  # (B, L)
+        post_counts = self._counts + incoming_delta
+        self._check_no_overflow(post_counts.max(), self.mosrah_cache_length)
+        # Build a fixed-shape destination mask in cache space. Active tokens within
+        # each (b, l) slot are left-justified by the packing machinery, so they occupy
+        # positions 0..s-1 in their packed slot. The corresponding cache positions are
+        # write_start[b,l]..write_start[b,l]+write_count[b,l]-1. Broadcasting a
+        # time arange against these per-slot intervals selects exactly the target
+        # positions without any data-dependent shape query.
+        write_start = self._counts.unsqueeze(-1)    # cache position where new tokens begin
+        write_count = incoming_delta.unsqueeze(-1)  # number of new tokens arriving per slot
+        time_arange = torch.arange(
+            self.mosrah_cache_length, device=active_mask.device
         )
+        dest_mask = (time_arange >= write_start) & (time_arange < write_start + write_count)
+        # dest_mask: (B, L, mosrah_cache_length)
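+        # Transfer key and value vectors. write_count is the per-slot True count of
+        # active_mask, and the overflow check above keeps every write interval inside
+        # the buffer, so dest_mask and active_mask have equal True counts per (b, l)
+        # slot. Boolean-mask indexing enumerates both masks in the same row-major
+        # order, so the transfer is correct without any explicit count verification.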
+        self.keys[dest_mask] = key_states[active_mask]
+        self.values[dest_mask] = value_states[active_mask]
+        self._counts = post_counts
         return self.keys, self.values, self._make_active_mask()
         )
     def get_max_cache_shape(self) -> int:  # type: ignore[override]
+        """Return the static per-(batch, head) slot capacity of this cache.
+        Equal to mosrah_cache_length as supplied at construction, which is derived
+        from config.mosrah_cache_length. Required by the HuggingFace static cache
+        contract; generation machinery uses this to size attention masks.
+        """
+        return self.mosrah_cache_length
     def get_mask_sizes(  # type: ignore[override]
         self,
             < self._counts.unsqueeze(-1)
         )
+    @staticmethod
+    def _check_no_overflow(max_count: torch.Tensor, capacity: int) -> None:
+        """Raise if any (batch, head) slot would exceed the static buffer capacity.
+        Uses the 19.F.1 pattern: branches on whether the graph is being compiled.
+        In compiled mode, `.item()` folds into the graph when capture_scalar_outputs=True
+        and `torch._check` issues a compile-time assertion. In eager mode, a plain
+        RuntimeError is raised with a descriptive message.
+        Args:
+            max_count: Scalar tensor — the maximum post-update count across all slots.
+            capacity: The static buffer capacity (mosrah_cache_length).
         """
+        if torch.compiler.is_compiling():
+            torch._check(max_count.item() <= capacity)
+        else:
+            if max_count.item() > capacity:
+                raise RuntimeError(
+                    f"MoSRAHCache overflow: a (batch, head) slot would reach "
+                    f"{max_count.item()} tokens but the static buffer capacity is "
+                    f"{capacity}. Increase mosrah_overallocation_factor in ShramConfig."
+                )

__cache__shram_cache.py CHANGED Viewed

@@ -21,6 +21,7 @@ what HuggingFace generation reads through get_seq_length().
 import torch
 from transformers.cache_utils import Cache
 from .__cache__shram_layer_cache import ShramLayerCache
@@ -36,44 +37,28 @@ class ShramCache(Cache):
     via cache.layers[layer_idx].sliding_window_cache or cache.layers[layer_idx].mosrah_cache.
     Args:
-        num_hidden_layers: Number of SHRAM decoder layers. Determines how many
-            ShramLayerCache objects are constructed.
-        sliding_window: Token window size passed to each layer's LocalSlidingWindowLayerCache.
-        num_local_heads: Number of local attention heads per layer.
-        local_head_dim: Per-head embedding width for the local path.
-        num_mosrah_heads: Total number of MoSRAH expert heads (L) per layer.
-        mosrah_head_dim: Bottlenecked head embedding width (u) for the MoSRAH path.
         batch_size: Number of sequences in the batch.
         device: Device on which to allocate cache tensors.
-        initial_buffer_size: Initial per-(batch, head) capacity for each MoSRAHCache.
-            Doubled when any slot overflows. Defaults to 64 to avoid repeated reallocation
-            during prompt processing.
     """
     def __init__(
         self,
-        num_hidden_layers: int,
-        sliding_window: int,
-        num_local_heads: int,
-        local_head_dim: int,
-        num_mosrah_heads: int,
-        mosrah_head_dim: int,
         batch_size: int,
         device: torch.device,
-        initial_buffer_size: int = 64,
     ) -> None:
         layers = [
             ShramLayerCache(
-                sliding_window=sliding_window,
-                num_local_heads=num_local_heads,
-                local_head_dim=local_head_dim,
-                num_mosrah_heads=num_mosrah_heads,
-                mosrah_head_dim=mosrah_head_dim,
                 batch_size=batch_size,
                 device=device,
-                initial_buffer_size=initial_buffer_size,
             )
-            for _ in range(num_hidden_layers)
         ]
         super().__init__(layers=layers)
@@ -133,9 +118,10 @@ class ShramCache(Cache):
     @property
     def max_cache_len(self) -> int:
-        """Not supported — ShramCache has no single maximum cache length.
-        The sliding-window side is bounded by sliding_window; the MoSRAH side is unbounded.
-        No truthful scalar maximum represents the composite.
         """
-        raise NotImplementedError("ShramCache does not expose max_cache_len.")

 import torch
 from transformers.cache_utils import Cache
+from .configuration import ShramConfig
 from .__cache__shram_layer_cache import ShramLayerCache
     via cache.layers[layer_idx].sliding_window_cache or cache.layers[layer_idx].mosrah_cache.
     Args:
+        config: ShramConfig instance. All layer counts, buffer sizes, and sub-cache
+            dimensions are derived from config so that a single source of truth governs
+            every buffer size across the full cache stack.
         batch_size: Number of sequences in the batch.
         device: Device on which to allocate cache tensors.
     """
+    is_compileable = True
     def __init__(
         self,
+        config: ShramConfig,
         batch_size: int,
         device: torch.device,
     ) -> None:
         layers = [
             ShramLayerCache(
+                config=config,
                 batch_size=batch_size,
                 device=device,
             )
+            for _ in range(config.num_decoder_layers)
         ]
         super().__init__(layers=layers)
     @property
     def max_cache_len(self) -> int:
+        """Return the maximum sequence length the cache can serve.
+        Delegates to layers[0].get_max_cache_shape(), which returns
+        config.inference_sequence_length. HuggingFace's static-cache machinery reads
+        this value to size generation loops and verify compileable cache contracts.
         """
+        return self.layers[0].get_max_cache_shape()

__cache__shram_layer_cache.py CHANGED Viewed

@@ -21,6 +21,7 @@ quantity HuggingFace generation reads through get_seq_length().
 import torch
 from transformers.cache_utils import CacheLayerMixin
 from .__cache__mosrah_cache import MoSRAHCache
 from .__cache__sliding_window_cache import LocalSlidingWindowLayerCache
@@ -40,46 +41,36 @@ class ShramLayerCache(CacheLayerMixin):
     tracks the cumulative count of token positions processed across all update() calls.
     Args:
-        sliding_window: Number of tokens retained by the local sliding-window cache.
-        num_local_heads: Number of local attention heads.
-        local_head_dim: Per-head embedding width for the local path.
-        num_mosrah_heads: Total number of MoSRAH expert heads (L).
-        mosrah_head_dim: Bottlenecked head embedding width (u) for the MoSRAH path.
         batch_size: Number of sequences in the batch.
         device: Device on which to allocate cache tensors.
-        initial_buffer_size: Initial per-(batch, head) capacity for MoSRAHCache. Doubled
-            when any slot overflows. Defaults to 64 to avoid repeated reallocation during
-            prompt processing.
     """
-    is_compileable = False
     is_sliding = False
     def __init__(
         self,
-        sliding_window: int,
-        num_local_heads: int,
-        local_head_dim: int,
-        num_mosrah_heads: int,
-        mosrah_head_dim: int,
         batch_size: int,
         device: torch.device,
-        initial_buffer_size: int = 64,
     ) -> None:
         super().__init__()
         self.sliding_window_cache = LocalSlidingWindowLayerCache(
-            sliding_window=sliding_window,
-            num_heads=num_local_heads,
-            head_dim=local_head_dim,
             batch_size=batch_size,
             device=device,
         )
         self.mosrah_cache = MoSRAHCache(
-            num_mosrah_heads=num_mosrah_heads,
-            head_dim=mosrah_head_dim,
             batch_size=batch_size,
             device=device,
-            initial_buffer_size=initial_buffer_size,
         )
     # ---------------------------------------------------------------------------
@@ -208,26 +199,23 @@ class ShramLayerCache(CacheLayerMixin):
         )
     def get_max_cache_shape(self) -> int:  # type: ignore[override]
-        """Not supported — the composite cache has no single maximum shape.
-        The sliding-window cache is bounded by sliding_window; the MoSRAH cache is
-        unbounded. No truthful scalar maximum represents the composite.
         """
-        raise NotImplementedError(
-            "ShramLayerCache has no single maximum cache shape. "
-            "Query sliding_window_cache or mosrah_cache directly."
-        )
     def get_mask_sizes(  # type: ignore[override]
         self,
         cache_position: torch.Tensor,
     ) -> tuple[int, int]:
-        """Not supported — ShramLayerCache does not participate in HF mask construction.
-        The two sub-caches have different mask semantics and their respective attention
-        paths handle masking directly.
         """
-        raise NotImplementedError(
-            "ShramLayerCache does not support get_mask_sizes(). "
-            "Query sliding_window_cache or mosrah_cache directly."
-        )

 import torch
 from transformers.cache_utils import CacheLayerMixin
+from .configuration import ShramConfig
 from .__cache__mosrah_cache import MoSRAHCache
 from .__cache__sliding_window_cache import LocalSlidingWindowLayerCache
     tracks the cumulative count of token positions processed across all update() calls.
     Args:
+        config: ShramConfig instance. All sub-cache dimensions and capacities are derived
+            from config so that a single source of truth governs every buffer size.
         batch_size: Number of sequences in the batch.
         device: Device on which to allocate cache tensors.
     """
+    is_compileable = True
     is_sliding = False
     def __init__(
         self,
+        config: ShramConfig,
         batch_size: int,
         device: torch.device,
     ) -> None:
         super().__init__()
+        self._inference_sequence_length = config.inference_sequence_length
         self.sliding_window_cache = LocalSlidingWindowLayerCache(
+            sliding_window=config.window_size,
+            num_heads=config.num_sliding_window_heads,
+            head_dim=config.head_dim,
             batch_size=batch_size,
             device=device,
         )
         self.mosrah_cache = MoSRAHCache(
+            num_mosrah_heads=config.num_mosrah_heads,
+            head_dim=config.head_dim,
             batch_size=batch_size,
             device=device,
+            mosrah_cache_length=config.mosrah_cache_length,
         )
     # ---------------------------------------------------------------------------
         )
     def get_max_cache_shape(self) -> int:  # type: ignore[override]
+        """Return the maximum sequence length this layer cache can serve.
+        The authoritative upper bound is ``config.inference_sequence_length``, which
+        governs the full accumulated token history the model is configured to handle.
+        HuggingFace's static-cache machinery reads this value to determine whether the
+        cache is compileable and to size generation loops.
         """
+        return self._inference_sequence_length
     def get_mask_sizes(  # type: ignore[override]
         self,
         cache_position: torch.Tensor,
     ) -> tuple[int, int]:
+        """Return the KV dimensions for HuggingFace causal mask construction.
+        Returns (inference_sequence_length, 0): the full static cache capacity as
+        kv_length and zero offset. HuggingFace reads these values to size the causal
+        attention mask when is_compileable is True.
         """
+        return self._inference_sequence_length, 0

__cache__sliding_window_cache.py CHANGED Viewed

@@ -39,7 +39,7 @@ class LocalSlidingWindowLayerCache(CacheLayerMixin):
         device: Device on which to allocate cache storage.
     """
-    is_compileable = False
     is_sliding = True
     def __init__(

         device: Device on which to allocate cache storage.
     """
+    is_compileable = True
     is_sliding = True
     def __init__(

__cache__slow_mosrah_cache.py CHANGED Viewed

@@ -41,9 +41,9 @@ class SlowMoSRAHCache(CacheLayerMixin):
         batch_size: Number of sequences in the batch. Determines the first dimension
             of all storage tensors.
         device: Device on which to allocate all tensors. Should match the model device.
-        initial_buffer_size: Initial sequence capacity per (batch, head) slot. Doubled
-            when any slot overflows. Defaults to 64 to avoid repeated reallocation
-            during prompt processing.
     """
     is_compileable = False
@@ -55,22 +55,23 @@ class SlowMoSRAHCache(CacheLayerMixin):
         head_dim: int,
         batch_size: int,
         device: torch.device,
-        initial_buffer_size: int = 64,
     ) -> None:
         super().__init__()
         self.num_mosrah_heads = num_mosrah_heads
         self.head_dim = head_dim
         self.batch_size = batch_size
         self.device = device
         # Allocate primary storage into the mixin-standard self.keys / self.values so
         # that inherited methods (offload, prefetch) operate on real tensors. _counts
         # tracks valid occupancy per (batch, head) slot.
         self.keys: torch.Tensor = torch.zeros(
-            batch_size, num_mosrah_heads, initial_buffer_size, head_dim, device=device
         )
         self.values: torch.Tensor = torch.zeros(
-            batch_size, num_mosrah_heads, initial_buffer_size, head_dim, device=device
         )
         self._counts: torch.Tensor = torch.zeros(
             batch_size, num_mosrah_heads, dtype=torch.long, device=device
@@ -87,8 +88,8 @@ class SlowMoSRAHCache(CacheLayerMixin):
     def buffer_capacity(self) -> int:
         """Current number of slots allocated per (batch, head) pair.
-        Derived directly from self.keys rather than tracked separately, so it is
-        always consistent with the actual buffer after expansion.
         """
         return self.keys.shape[2]
@@ -111,8 +112,8 @@ class SlowMoSRAHCache(CacheLayerMixin):
         because the t dimension is traversed from 0 to T-1 and counts are updated
         immediately after each write.
-        Buffer expansion (doubling buffer_capacity) is triggered before any writes if
-        the incoming tokens would cause any slot to overflow the current capacity.
         Args:
             key_states: Shape (B, L, T, u) — post-RoPE key vectors in expert-choice layout.
@@ -122,17 +123,19 @@ class SlowMoSRAHCache(CacheLayerMixin):
         Returns:
             Tuple of (keys, values, active_mask):
-              keys: (B, L, T, u) float — full key buffer including junk slots.
-              values: (B, L, T, u) float — full value buffer including junk slots.
-              active_mask: (B, L, T) bool — True iff slot (b, l, t) has been written.
         """
         B, L, T = active_mask.shape
-        # Expansion check uses the total active tokens per slot, same as the
-        # vectorized implementation, so both expand under identical conditions.
         incoming_delta = active_mask.long().sum(dim=2)  # (B, L)
-        if (self._counts + incoming_delta).max().item() > self.buffer_capacity:
-            self._expand()
         # Write each active position into the next available slot for its (batch, head)
         # pair. Iterating t from 0 to T-1 preserves causal ordering within each slot.
@@ -297,25 +300,3 @@ class SlowMoSRAHCache(CacheLayerMixin):
             < self._counts.unsqueeze(-1)
         )
-    def _expand(self) -> None:
-        """Double the buffer capacity, preserving existing data.
-        Called by update() when an incoming batch of tokens would cause any
-        (batch, head) slot to exceed the current buffer capacity. All existing
-        key and value data is copied into the low half of the new buffer; the
-        high half is zero-initialised and will be filled by subsequent writes.
-        After reassignment, buffer_capacity reflects the new size automatically.
-        """
-        old_cap = self.buffer_capacity
-        new_cap = old_cap * 2
-        dev = self.keys.device
-        new_keys = torch.zeros(
-            self.batch_size, self.num_mosrah_heads, new_cap, self.head_dim, device=dev
-        )
-        new_values = torch.zeros(
-            self.batch_size, self.num_mosrah_heads, new_cap, self.head_dim, device=dev
-        )
-        new_keys[:, :, :old_cap, :] = self.keys
-        new_values[:, :, :old_cap, :] = self.values
-        self.keys = new_keys
-        self.values = new_values

         batch_size: Number of sequences in the batch. Determines the first dimension
             of all storage tensors.
         device: Device on which to allocate all tensors. Should match the model device.
+        mosrah_cache_length: Static sequence capacity per (batch, head) slot. Equal to
+            config.mosrah_cache_length. The buffer never grows; if any slot would exceed
+            this capacity, update() raises a RuntimeError.
     """
     is_compileable = False
         head_dim: int,
         batch_size: int,
         device: torch.device,
+        mosrah_cache_length: int,
     ) -> None:
         super().__init__()
         self.num_mosrah_heads = num_mosrah_heads
         self.head_dim = head_dim
         self.batch_size = batch_size
         self.device = device
+        self.mosrah_cache_length = mosrah_cache_length
         # Allocate primary storage into the mixin-standard self.keys / self.values so
         # that inherited methods (offload, prefetch) operate on real tensors. _counts
         # tracks valid occupancy per (batch, head) slot.
         self.keys: torch.Tensor = torch.zeros(
+            batch_size, num_mosrah_heads, mosrah_cache_length, head_dim, device=device
         )
         self.values: torch.Tensor = torch.zeros(
+            batch_size, num_mosrah_heads, mosrah_cache_length, head_dim, device=device
         )
         self._counts: torch.Tensor = torch.zeros(
             batch_size, num_mosrah_heads, dtype=torch.long, device=device
     def buffer_capacity(self) -> int:
         """Current number of slots allocated per (batch, head) pair.
+        Equal to mosrah_cache_length as supplied at construction. Derived from
+        self.keys so it remains consistent with the actual buffer shape.
         """
         return self.keys.shape[2]
         because the t dimension is traversed from 0 to T-1 and counts are updated
         immediately after each write.
+        Raises RuntimeError before any writes if the incoming tokens would cause any
+        slot to exceed the static mosrah_cache_length capacity.
         Args:
             key_states: Shape (B, L, T, u) — post-RoPE key vectors in expert-choice layout.
         Returns:
             Tuple of (keys, values, active_mask):
+              keys: (B, L, mosrah_cache_length, u) float — full key buffer including junk slots.
+              values: (B, L, mosrah_cache_length, u) float — full value buffer including junk slots.
+              active_mask: (B, L, mosrah_cache_length) bool — True iff slot (b, l, t) has been written.
         """
         B, L, T = active_mask.shape
         incoming_delta = active_mask.long().sum(dim=2)  # (B, L)
+        if (self._counts + incoming_delta).max().item() > self.mosrah_cache_length:
+            raise RuntimeError(
+                f"SlowMoSRAHCache overflow: a (batch, head) slot would exceed the "
+                f"static buffer capacity of {self.mosrah_cache_length}. Increase "
+                f"mosrah_overallocation_factor in ShramConfig."
+            )
         # Write each active position into the next available slot for its (batch, head)
         # pair. Iterating t from 0 to T-1 preserves causal ordering within each slot.
             < self._counts.unsqueeze(-1)
         )

config.json CHANGED Viewed

@@ -6,14 +6,16 @@
     "AutoModelForCausalLM": "huggingface.ShramForCausalLM"
   },
   "beta": 32.0,
   "head_dim": 16,
-  "hidden_size": 512,
   "inference_sequence_length": 1024,
-  "intermediate_size": 1366,
   "local_rope_theta": 10000.0,
   "model_type": "shram",
   "mosrah_rope_theta": 10000.0,
-  "num_hidden_layers": 12,
   "num_mosrah_heads": 16,
   "num_selected_heads": 16,
   "num_sliding_window_heads": 16,
@@ -21,7 +23,7 @@
   "rope_mode": "main_sequence",
   "tie_word_embeddings": false,
   "training_sequence_length": 1024,
-  "transformers_version": "5.8.0",
   "use_cache": true,
   "vocab_size": 50277,
   "window_size": 128

     "AutoModelForCausalLM": "huggingface.ShramForCausalLM"
   },
   "beta": 32.0,
+  "embedding_width": 512,
   "head_dim": 16,
   "inference_sequence_length": 1024,
+  "load_balance_p": 2.0,
   "local_rope_theta": 10000.0,
+  "mlp_width": 1366,
   "model_type": "shram",
+  "mosrah_overallocation_factor": 2.0,
   "mosrah_rope_theta": 10000.0,
+  "num_decoder_layers": 12,
   "num_mosrah_heads": 16,
   "num_selected_heads": 16,
   "num_sliding_window_heads": 16,
   "rope_mode": "main_sequence",
   "tie_word_embeddings": false,
   "training_sequence_length": 1024,
+  "transformers_version": "5.8.1",
   "use_cache": true,
   "vocab_size": 50277,
   "window_size": 128

configuration.py CHANGED Viewed

@@ -11,6 +11,8 @@ parameters directly and constructs its own RotaryEmbedding instance explicitly
 HuggingFace rope infrastructure is used. See Unit 5.A design decisions in plan.md.
 """
 from transformers import PretrainedConfig
@@ -77,6 +79,15 @@ class ShramConfig(PretrainedConfig):
         use_cache: Whether to return past_key_values for KV caching.
         output_hidden_states: Whether to return hidden states after each layer.
         tie_word_embeddings: Whether input embedding and LM head share weights.
     """
     model_type = "shram"
@@ -109,7 +120,9 @@ class ShramConfig(PretrainedConfig):
         use_cache: bool = True,
         output_hidden_states: bool = False,
         tie_word_embeddings: bool = False,
-        **kwargs,
     ):
         if head_dim % 2 != 0:
             raise ValueError(
@@ -137,10 +150,22 @@ class ShramConfig(PretrainedConfig):
                 f"got {inference_sequence_length}."
             )
         self.vocab_size = vocab_size
-        self.hidden_size = embedding_width
-        self.intermediate_size = mlp_width
-        self.num_hidden_layers = num_decoder_layers
         self.num_sliding_window_heads = num_sliding_window_heads
         self.num_mosrah_heads = num_mosrah_heads
         self.num_selected_heads = num_selected_heads
@@ -154,13 +179,15 @@ class ShramConfig(PretrainedConfig):
         self.inference_sequence_length = inference_sequence_length
         self.alpha = alpha
         self.beta = beta
         self.attention_dropout = attention_dropout
         self.use_cache = use_cache
         super().__init__(
             tie_word_embeddings=tie_word_embeddings,
             output_hidden_states=output_hidden_states,
-            **kwargs,
         )
         # Promote auto_map to an instance attribute so PretrainedConfig.to_dict()
@@ -176,3 +203,47 @@ class ShramConfig(PretrainedConfig):
         """
         return self.inference_sequence_length / self.training_sequence_length

 HuggingFace rope infrastructure is used. See Unit 5.A design decisions in plan.md.
 """
+import math
 from transformers import PretrainedConfig
         use_cache: Whether to return past_key_values for KV caching.
         output_hidden_states: Whether to return hidden states after each layer.
         tie_word_embeddings: Whether input embedding and LM head share weights.
+        mosrah_overallocation_factor: Overallocation multiplier for the static MoSRAH
+            buffers. ``mosrah_packed_length`` = ceil(training_sequence_length *
+            num_selected_heads / num_mosrah_heads * mosrah_overallocation_factor);
+            ``mosrah_cache_length`` applies the same formula with
+            ``inference_sequence_length`` in place of ``training_sequence_length``.
+            Must be > 1.0 to guarantee buffers larger than the balanced-routing
+            baseline. Default 2.0.
+        load_balance_p: Exponent p for the p-mean aggregation of per-item routing
+            frequencies into the load balance signal. Higher p weights aggregation
+            toward the worst-case batch item, making the correction signal more
+            sensitive to per-item allocation spikes. Must be positive. Default 2.0.
     """
     model_type = "shram"
         use_cache: bool = True,
         output_hidden_states: bool = False,
         tie_word_embeddings: bool = False,
+        mosrah_overallocation_factor: float = 2.0,
+        load_balance_p: float = 2.0,
+        **kwargs
     ):
         if head_dim % 2 != 0:
             raise ValueError(
                 f"got {inference_sequence_length}."
             )
+        if mosrah_overallocation_factor <= 1.0:
+            raise ValueError(
+                f"mosrah_overallocation_factor must be > 1.0 to guarantee a packed "
+                f"buffer larger than the balanced-routing baseline. "
+                f"Got {mosrah_overallocation_factor}."
+            )
+        if load_balance_p <= 0.0:
+            raise ValueError(
+                f"load_balance_p must be positive, got {load_balance_p}."
+            )
         self.vocab_size = vocab_size
+        self.embedding_width = embedding_width
+        self.mlp_width = mlp_width
+        self.num_decoder_layers = num_decoder_layers
         self.num_sliding_window_heads = num_sliding_window_heads
         self.num_mosrah_heads = num_mosrah_heads
         self.num_selected_heads = num_selected_heads
         self.inference_sequence_length = inference_sequence_length
         self.alpha = alpha
         self.beta = beta
+        self.mosrah_overallocation_factor = mosrah_overallocation_factor
+        self.load_balance_p = load_balance_p
         self.attention_dropout = attention_dropout
         self.use_cache = use_cache
         super().__init__(
             tie_word_embeddings=tie_word_embeddings,
             output_hidden_states=output_hidden_states,
+            **kwargs
         )
         # Promote auto_map to an instance attribute so PretrainedConfig.to_dict()
         """
         return self.inference_sequence_length / self.training_sequence_length
+    @property
+    def mosrah_packed_length(self) -> int:
+        """Static packed time dimension T for expert packing.
+        The expected tokens per expert under perfectly balanced routing is
+        ``training_sequence_length * num_selected_heads / num_mosrah_heads``.
+        Multiplying by ``mosrah_overallocation_factor`` provides a buffer above
+        that baseline. The ceiling ensures T is always an integer >= 1.
+        All consumers of the packed buffer size must read this property rather
+        than deriving T independently.
+        """
+        return math.ceil(
+            self.training_sequence_length
+            * self.num_selected_heads
+            / self.num_mosrah_heads
+            * self.mosrah_overallocation_factor
+        )
+    @property
+    def mosrah_cache_length(self) -> int:
+        """Static per-(batch, head) slot capacity for the MoSRAH inference cache.
+        The expected tokens per expert over the full inference context under perfectly
+        balanced routing is ``inference_sequence_length * num_selected_heads /
+        num_mosrah_heads``. Multiplying by ``mosrah_overallocation_factor`` provides
+        a buffer above that baseline. The ceiling ensures the result is always an
+        integer >= 1.
+        Distinct from ``mosrah_packed_length``, which sizes the training packing buffer
+        using ``training_sequence_length``. This property uses
+        ``inference_sequence_length`` because the cache must hold the full accumulated
+        token history across the entire inference run.
+        All consumers of the MoSRAH cache buffer size must read this property rather
+        than deriving the capacity independently.
+        """
+        return math.ceil(
+            self.inference_sequence_length
+            * self.num_selected_heads
+            / self.num_mosrah_heads
+            * self.mosrah_overallocation_factor
+        )

decoder_layer.py CHANGED Viewed

@@ -46,8 +46,8 @@ class DecoderLayer(nn.Module):
     def __init__(self, config: ShramConfig) -> None:
         super().__init__()
-        self.attn_norm = nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.mlp_norm = nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.attention = SHRAMHybridLayer(config)
         self.mlp = SwiGLUMLP(config)

     def __init__(self, config: ShramConfig) -> None:
         super().__init__()
+        self.attn_norm = nn.RMSNorm(config.embedding_width, eps=config.rms_norm_eps)
+        self.mlp_norm = nn.RMSNorm(config.embedding_width, eps=config.rms_norm_eps)
         self.attention = SHRAMHybridLayer(config)
         self.mlp = SwiGLUMLP(config)

huggingface.py CHANGED Viewed

@@ -74,9 +74,9 @@ class ShramForCausalLM(PreTrainedModel, GenerationMixin):
     def __init__(self, config: ShramConfig) -> None:
         super().__init__(config)
-        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
         self.model = ShramModel(config)
-        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
         self._configure_tied_embeddings()
         self.post_init()
@@ -127,12 +127,7 @@ class ShramForCausalLM(PreTrainedModel, GenerationMixin):
     ) -> ShramCache:
         """Construct a fresh top-level SHRAM cache."""
         return ShramCache(
-            num_hidden_layers=self.config.num_hidden_layers,
-            sliding_window=self.config.window_size,
-            num_local_heads=self.config.num_sliding_window_heads,
-            local_head_dim=self.config.head_dim,
-            num_mosrah_heads=self.config.num_mosrah_heads,
-            mosrah_head_dim=self.config.head_dim,
             batch_size=batch_size,
             device=device,
         )
@@ -231,6 +226,26 @@ class ShramForCausalLM(PreTrainedModel, GenerationMixin):
         past_key_values.reorder_cache(beam_idx)
         return past_key_values
     def _validate_input_ids(self, input_ids: torch.Tensor) -> None:
         """Validate token IDs at the wrapper boundary."""
         if input_ids.ndim != 2:
@@ -352,6 +367,63 @@ class ShramForCausalLM(PreTrainedModel, GenerationMixin):
                 f"Unsupported forward kwargs for ShramForCausalLM: {unsupported}"
             )
     def _standardize_full_attention_mask(
         self,
         input_ids: torch.Tensor,
@@ -449,6 +521,7 @@ class ShramForCausalLM(PreTrainedModel, GenerationMixin):
         # This keeps the main sequence readable while ensuring invalid states
         # fail before they can silently contaminate backbone execution.
         # ------------------------------------------------------------------
         self._validate_input_ids(input_ids)
         self._validate_attention_mask(input_ids, attention_mask)
         self._validate_position_ids(input_ids, position_ids)
@@ -487,6 +560,10 @@ class ShramForCausalLM(PreTrainedModel, GenerationMixin):
         )
         shram_cache: ShramCache | None = past_key_values if use_cache else None
         # ------------------------------------------------------------------
         # Core wrapper responsibilities.
         #

     def __init__(self, config: ShramConfig) -> None:
         super().__init__(config)
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.embedding_width)
         self.model = ShramModel(config)
+        self.lm_head = nn.Linear(config.embedding_width, config.vocab_size, bias=False)
         self._configure_tied_embeddings()
         self.post_init()
     ) -> ShramCache:
         """Construct a fresh top-level SHRAM cache."""
         return ShramCache(
+            config=self.config,
             batch_size=batch_size,
             device=device,
         )
         past_key_values.reorder_cache(beam_idx)
         return past_key_values
+    @staticmethod
+    def create_masks_for_generate(
+        config: Any,
+        inputs_embeds: torch.Tensor,
+        attention_mask: torch.Tensor | None,
+        past_key_values: Cache | None,
+        position_ids: torch.Tensor | None = None,
+        **kwargs: Any,
+    ) -> torch.Tensor | None:
+        """Return the 2D attention_mask unchanged.
+        HuggingFace calls this during compiled generation to convert the 2D
+        attention mask into a 4D causal additive-bias mask. SHRAM uses flex
+        attention with custom masking and constructs causality internally; the
+        4D format is incompatible with the SHRAM masking contract. Overriding
+        as a no-op restores symmetry between compiled and non-compiled pathways
+        without any loss of correctness or performance (see Unit 19.G.4).
+        """
+        return attention_mask
     def _validate_input_ids(self, input_ids: torch.Tensor) -> None:
         """Validate token IDs at the wrapper boundary."""
         if input_ids.ndim != 2:
                 f"Unsupported forward kwargs for ShramForCausalLM: {unsupported}"
             )
+    @staticmethod
+    def _enforce_uncached_starting_position(condition: torch.Tensor) -> None:
+        """Enforce that an uncached forward pass begins at position 0.
+        An uncached forward has no prior KV state. Nonzero starting positions
+        produce silently incorrect RoPE encoding and attention outputs with no
+        downstream diagnostic. This method intercepts that misuse at the
+        outermost boundary before any backbone computation runs.
+        To resolve a violation: either supply a ShramCache populated with the
+        prefix (for continued decoding), or rebase the sequence so positions
+        start at 0.
+        Args:
+            condition: Scalar bool tensor. True = all batch items start at 0
+                (valid); False = at least one batch item starts nonzero
+                (violated).
+        """
+        if torch.compiler.is_compiling():
+            # bool.item() is not captured as a SymBool by dynamo; converting to
+            # int first produces a SymInt, and the Python comparison (!=0) then
+            # yields a SymBool that torch._check folds into the compiled graph.
+            condition_as_int = condition.to(torch.int).item()
+            torch._check(condition_as_int != 0)
+        else:
+            if not condition.item():
+                raise RuntimeError(
+                    "Uncached ShramForCausalLM forward does not support nonzero "
+                    "starting positions. Either provide a ShramCache populated "
+                    "with the prefix for continued decoding, or rebase the "
+                    "uncached sequence to start at 0.",
+                )
+    @staticmethod
+    def _enforce_capture_scalar_outputs() -> None:
+        """Enforce that capture_scalar_outputs is enabled when compiling.
+        The safety checks in this model (e.g. position-zero constraint, packing
+        overflow detection) rely on torch._check folding into the compiled graph,
+        which requires torch._dynamo.config.capture_scalar_outputs = True. Without
+        it those checks are silently absent in the compiled model while appearing
+        to work in eager mode — a misconfiguration with no diagnostic output.
+        This method fires during dynamo tracing so the missing flag is surfaced
+        immediately at compile time rather than discovered from downstream failures.
+        """
+        if torch.compiler.is_compiling():
+            torch._check(
+                torch._dynamo.config.capture_scalar_outputs,
+                lambda: RuntimeError(
+                    "ShramForCausalLM requires torch._dynamo.config.capture_scalar_outputs = True "
+                    "when compiled. Without it, runtime safety checks (position constraints, "
+                    "overflow detection) are silently absent in the compiled model. Set the flag "
+                    "before calling torch.compile()."
+                ),
+            )
     def _standardize_full_attention_mask(
         self,
         input_ids: torch.Tensor,
         # This keeps the main sequence readable while ensuring invalid states
         # fail before they can silently contaminate backbone execution.
         # ------------------------------------------------------------------
+        self._enforce_capture_scalar_outputs()
         self._validate_input_ids(input_ids)
         self._validate_attention_mask(input_ids, attention_mask)
         self._validate_position_ids(input_ids, position_ids)
         )
         shram_cache: ShramCache | None = past_key_values if use_cache else None
+        if shram_cache is None:
+            positions_start_sane = torch.all(current_position_ids[:, 0] == 0)
+            self._enforce_uncached_starting_position(positions_start_sane)
         # ------------------------------------------------------------------
         # Core wrapper responsibilities.
         #

mlp.py CHANGED Viewed

@@ -36,9 +36,9 @@ class SwiGLUMLP(nn.Module):
     def __init__(self, config: PretrainedConfig) -> None:
         super().__init__()
-        self.gate_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
-        self.up_proj   = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
-        self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """Apply the SwiGLU feed-forward transformation.

     def __init__(self, config: PretrainedConfig) -> None:
         super().__init__()
+        self.gate_proj = nn.Linear(config.embedding_width, config.mlp_width, bias=False)
+        self.up_proj   = nn.Linear(config.embedding_width, config.mlp_width, bias=False)
+        self.down_proj = nn.Linear(config.mlp_width, config.embedding_width, bias=False)
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """Apply the SwiGLU feed-forward transformation.

model.py CHANGED Viewed

@@ -58,9 +58,9 @@ class ShramModel(nn.Module):
         super().__init__()
         self.config = config
         self.layers = nn.ModuleList(
-            [DecoderLayer(config) for _ in range(config.num_hidden_layers)]
         )
-        self.norm = nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
     def num_mosrah_parameters(self) -> int:
         """Return the total number of trainable MoSRAH parameters across all decoder layers."""

         super().__init__()
         self.config = config
         self.layers = nn.ModuleList(
+            [DecoderLayer(config) for _ in range(config.num_decoder_layers)]
         )
+        self.norm = nn.RMSNorm(config.embedding_width, eps=config.rms_norm_eps)
     def num_mosrah_parameters(self) -> int:
         """Return the total number of trainable MoSRAH parameters across all decoder layers."""

rope.py CHANGED Viewed

@@ -26,10 +26,10 @@ Each attention path (h_l and BEA) constructs its own RotaryEmbedding with explic
 parameters — no shared instance, no config reading. See Unit 5.A design decisions.
 Cache sharing: all instances with identical parameters share one cos/sin table via a
-class-level registry. The first instance that needs a particular (parameters, seq_len,
-device, dtype) combination builds the table; all subsequent instances reference it
-directly. This avoids redundant builds across the num_hidden_layers instances that
-share the same parametrisation.
 """
 import math
@@ -66,17 +66,21 @@ class RotaryEmbedding(nn.Module):
     h_l always uses ``mode="default"``; BEA always uses ``mode="yarn"``. No
     config object is read inside this module.
-    The cos/sin cache is built lazily on the first forward call and extended
-    automatically when a longer sequence is encountered. Instances with identical
-    parameters share one cache via the class-level ``_cache`` registry,
-    avoiding redundant computation across decoder layers.
     Args:
         mode: ``"default"`` for standard RoPE; ``"yarn"`` for YaRN extrapolation.
         head_dim: Per-head embedding dimension ``u``. Must be even.
         theta: Base frequency ``b`` in θ_d = b^{-2d/u}.
-        initial_seq_length: ``C_train`` — context length the model was trained at.
-            Required for ``mode="yarn"``.
         dilation: Scale factor ``s = C_target / C_train`` — how much the context
             window is extended beyond training length. Required for ``mode="yarn"``.
             When ``dilation=1.0``, YaRN reduces to standard RoPE.
@@ -88,11 +92,11 @@ class RotaryEmbedding(nn.Module):
     Raises:
         NotImplementedError: If ``mode`` is not ``"default"`` or ``"yarn"``.
-        ValueError: If ``mode="yarn"`` and any of ``initial_seq_length``,
-            ``dilation``, ``alpha``, ``beta`` are absent.
     """
-    # Maps (freq_key, seq_len, device_str, dtype_str) → (cos_table, sin_table).
     # Shared across all RotaryEmbedding instances in the process. Keys include device
     # and dtype so that tables built on different devices or in different precisions
     # are stored independently.
@@ -103,7 +107,7 @@ class RotaryEmbedding(nn.Module):
         mode: str,
         head_dim: int,
         theta: float,
-        initial_seq_length: int | None = None,
         dilation: float | None = None,
         alpha: float | None = None,
         beta: float | None = None,
@@ -112,8 +116,9 @@ class RotaryEmbedding(nn.Module):
         super().__init__()
         self._validate_mode(mode)
-        self._validate_yarn_params(mode, initial_seq_length, dilation, alpha, beta)
         self.mode = mode
         # Compute per-dimension rotation frequencies θ_d (default) or θ_d' (yarn).
         # d_index ranges over 0, 2, 4, ..., head_dim-2 — one index per dimension pair,
@@ -128,9 +133,14 @@ class RotaryEmbedding(nn.Module):
         else:  # yarn
             s = dilation
             # r(d) = C_train · θ_d / (2π) — normalized frequency used by the ramp
             # function to classify each dimension into one of three regimes.
-            normalized_freqs = initial_seq_length * base_freqs / (2.0 * math.pi)
             # γ(r) ramp: 0 for r < α (fully interpolate), 1 for r > β (unchanged),
             # linear blend between α and β.
@@ -142,16 +152,13 @@ class RotaryEmbedding(nn.Module):
             # A_rope = (0.1 · ln(s) + 1)² — attention logit scaling returned to caller.
             self.attention_scaling = (0.1 * math.log(s) + 1.0) ** 2
-        # freq_key uniquely identifies the parameter set that produced rotation_freqs.
-        # Used as the primary component of the cache registry key.
         if mode == "default":
-            self._freq_key: tuple = ("default", head_dim, float(theta))
         else:
-            self._freq_key = (
-                "yarn", head_dim, float(theta),
-                int(initial_seq_length), float(dilation),
-                float(alpha), float(beta),
-            )
         # rotation_freqs is a non-persistent buffer so it moves with the model across
         # devices via .to() / .cuda() without appearing in saved checkpoints.
@@ -167,6 +174,11 @@ class RotaryEmbedding(nn.Module):
         self._cos_cached: torch.Tensor | None = None
         self._sin_cached: torch.Tensor | None = None
     # ---------------------------------------------------------------------------
     # Validation helpers
     # ---------------------------------------------------------------------------
@@ -182,7 +194,6 @@ class RotaryEmbedding(nn.Module):
     @staticmethod
     def _validate_yarn_params(
         mode: str,
-        initial_seq_length: int | None,
         dilation: float | None,
         alpha: float | None,
         beta: float | None,
@@ -192,7 +203,6 @@ class RotaryEmbedding(nn.Module):
             return
         missing = [
             name for name, val in [
-                ("initial_seq_length", initial_seq_length),
                 ("dilation", dilation),
                 ("alpha", alpha),
                 ("beta", beta),
@@ -206,20 +216,23 @@ class RotaryEmbedding(nn.Module):
     # Cache management
     # ---------------------------------------------------------------------------
-    def _extend_cache(self, seq_len: int, device: torch.device, dtype: torch.dtype) -> None:
-        """Build the cos/sin table to cover positions [0, seq_len).
         Checks the class-level registry first. If a table already exists for this
-        exact (parameters, seq_len, device, dtype) combination it is reused directly;
         otherwise it is computed and stored. The instance attributes are pointed at
         the registry entry so that all layers sharing the same parametrisation
         reference the same tensor.
         """
-        cache_key = (self._freq_key, seq_len, str(device), str(dtype))
         if cache_key not in RotaryEmbedding._cache:
-            positions = torch.arange(seq_len, device=device, dtype=torch.float32)
-            # outer product → (seq_len, head_dim // 2); duplicate to (seq_len, head_dim)
             freqs = torch.outer(
                 positions,
                 self.rotation_freqs.to(device=device, dtype=torch.float32),
@@ -240,11 +253,12 @@ class RotaryEmbedding(nn.Module):
     ) -> tuple[torch.Tensor, torch.Tensor, float]:
         """Apply rotary embeddings to query and key tensors.
-        The cos/sin cache is extended lazily when position_ids reference positions
-        beyond its current length, or when the device or dtype has changed.
-        ``position_ids`` may be any integer tensor shape. Its values are valid
-        position indices into the cos/sin cache:
         - h_l (standard causal): position_ids (B, N), q/k (B, H, N, head_dim).
         - BEA (packed):          position_ids (B, L, T), q/k (B, L, T, head_dim).
@@ -262,18 +276,11 @@ class RotaryEmbedding(nn.Module):
             1.0 for default mode; YaRN returns (0.1·ln(s)+1)² which the caller must
             apply to attention logits before softmax.
         """
-        seq_len = int(position_ids.max().item()) + 1
-        # The cache is valid when it exists, covers all positions referenced by
-        # position_ids, and matches q's dtype and device. Each condition is named
-        # separately so the rebuild trigger is readable rather than a compound predicate.
-        cache_missing = self._cos_cached is None
-        cache_too_short = not cache_missing and seq_len > self._cos_cached.shape[0]
-        wrong_dtype = not cache_missing and self._cos_cached.dtype != q.dtype
-        wrong_device = not cache_missing and self._cos_cached.device != q.device
-        if cache_missing or cache_too_short or wrong_dtype or wrong_device:
-            self._extend_cache(seq_len, device=q.device, dtype=q.dtype)
         cos = self._cos_cached[position_ids]
         sin = self._sin_cached[position_ids]

 parameters — no shared instance, no config reading. See Unit 5.A design decisions.
 Cache sharing: all instances with identical parameters share one cos/sin table via a
+class-level registry. The first instance that needs a particular (parameters, device,
+dtype) combination builds the table; all subsequent instances reference it directly.
+This avoids redundant builds across the num_decoder_layers instances that share the
+same parametrisation.
 """
 import math
     h_l always uses ``mode="default"``; BEA always uses ``mode="yarn"``. No
     config object is read inside this module.
+    The cos/sin table is built at construction time to cover all positions in
+    ``[0, maximum_sequence_length)``. In forward, the table is rebuilt only if
+    the query tensor's dtype or device has changed since construction.
+    Instances with identical parameters share one cos/sin table via the class-level
+    ``_cache`` registry, avoiding redundant computation across decoder layers.
     Args:
         mode: ``"default"`` for standard RoPE; ``"yarn"`` for YaRN extrapolation.
         head_dim: Per-head embedding dimension ``u``. Must be even.
         theta: Base frequency ``b`` in θ_d = b^{-2d/u}.
+        maximum_sequence_length: Maximum number of positions the table must cover.
+            The cos/sin table is preallocated to this length at construction time.
+            For ``mode="yarn"``, the training context length C_train is derived
+            internally as ``round(maximum_sequence_length / dilation)``.
         dilation: Scale factor ``s = C_target / C_train`` — how much the context
             window is extended beyond training length. Required for ``mode="yarn"``.
             When ``dilation=1.0``, YaRN reduces to standard RoPE.
     Raises:
         NotImplementedError: If ``mode`` is not ``"default"`` or ``"yarn"``.
+        ValueError: If ``mode="yarn"`` and any of ``dilation``, ``alpha``,
+            ``beta`` are absent.
     """
+    # Maps (freq_key, device_str, dtype_str) → (cos_table, sin_table).
     # Shared across all RotaryEmbedding instances in the process. Keys include device
     # and dtype so that tables built on different devices or in different precisions
     # are stored independently.
         mode: str,
         head_dim: int,
         theta: float,
+        maximum_sequence_length: int,
         dilation: float | None = None,
         alpha: float | None = None,
         beta: float | None = None,
         super().__init__()
         self._validate_mode(mode)
+        self._validate_yarn_params(mode, dilation, alpha, beta)
         self.mode = mode
+        self._maximum_sequence_length = maximum_sequence_length
         # Compute per-dimension rotation frequencies θ_d (default) or θ_d' (yarn).
         # d_index ranges over 0, 2, 4, ..., head_dim-2 — one index per dimension pair,
         else:  # yarn
             s = dilation
+            # C_train is the training context length, recovered from the inference
+            # context length and the dilation factor. round() guards against floating
+            # point error in the division: C_train and maximum_sequence_length are
+            # both integers, but dilation is a float.
+            c_train: int = round(maximum_sequence_length / dilation)
             # r(d) = C_train · θ_d / (2π) — normalized frequency used by the ramp
             # function to classify each dimension into one of three regimes.
+            normalized_freqs = c_train * base_freqs / (2.0 * math.pi)
             # γ(r) ramp: 0 for r < α (fully interpolate), 1 for r > β (unchanged),
             # linear blend between α and β.
             # A_rope = (0.1 · ln(s) + 1)² — attention logit scaling returned to caller.
             self.attention_scaling = (0.1 * math.log(s) + 1.0) ** 2
+        # freq_key uniquely identifies the parameter set that produced rotation_freqs,
+        # including maximum_sequence_length so instances with different table sizes
+        # do not collide in the registry.
         if mode == "default":
+            self._freq_key: tuple = ("default", head_dim, theta, maximum_sequence_length)
         else:
+            self._freq_key = ("yarn", head_dim, theta, maximum_sequence_length, dilation, alpha, beta)
         # rotation_freqs is a non-persistent buffer so it moves with the model across
         # devices via .to() / .cuda() without appearing in saved checkpoints.
         self._cos_cached: torch.Tensor | None = None
         self._sin_cached: torch.Tensor | None = None
+        # Build the table at construction time. Forward rebuilds only on dtype or
+        # device change. If no device is specified, build on CPU as the default.
+        build_device = device if device is not None else torch.device("cpu")
+        self._build_cache(device=build_device, dtype=torch.float32)
     # ---------------------------------------------------------------------------
     # Validation helpers
     # ---------------------------------------------------------------------------
     @staticmethod
     def _validate_yarn_params(
         mode: str,
         dilation: float | None,
         alpha: float | None,
         beta: float | None,
             return
         missing = [
             name for name, val in [
                 ("dilation", dilation),
                 ("alpha", alpha),
                 ("beta", beta),
     # Cache management
     # ---------------------------------------------------------------------------
+    def _build_cache(self, device: torch.device, dtype: torch.dtype) -> None:
+        """Build the cos/sin table to cover positions [0, maximum_sequence_length).
         Checks the class-level registry first. If a table already exists for this
+        exact (parameters, device, dtype) combination it is reused directly;
         otherwise it is computed and stored. The instance attributes are pointed at
         the registry entry so that all layers sharing the same parametrisation
         reference the same tensor.
         """
+        cache_key = (self._freq_key, str(device), str(dtype))
         if cache_key not in RotaryEmbedding._cache:
+            positions = torch.arange(
+                self._maximum_sequence_length, device=device, dtype=torch.float32
+            )
+            # outer product → (maximum_sequence_length, head_dim // 2);
+            # duplicate to (maximum_sequence_length, head_dim)
             freqs = torch.outer(
                 positions,
                 self.rotation_freqs.to(device=device, dtype=torch.float32),
     ) -> tuple[torch.Tensor, torch.Tensor, float]:
         """Apply rotary embeddings to query and key tensors.
+        The cos/sin table is built at construction time. It is rebuilt here only
+        if ``q``'s dtype or device differs from the cached table — for example,
+        after moving the model to a different device via ``.cuda()``.
+        ``position_ids`` may be any integer tensor shape. Its values must be in
+        ``[0, maximum_sequence_length)``:
         - h_l (standard causal): position_ids (B, N), q/k (B, H, N, head_dim).
         - BEA (packed):          position_ids (B, L, T), q/k (B, L, T, head_dim).
             1.0 for default mode; YaRN returns (0.1·ln(s)+1)² which the caller must
             apply to attention logits before softmax.
         """
+        wrong_dtype = self._cos_cached.dtype != q.dtype
+        wrong_device = self._cos_cached.device != q.device
+        if wrong_dtype or wrong_device:
+            self._build_cache(device=q.device, dtype=q.dtype)
         cos = self._cos_cached[position_ids]
         sin = self._sin_cached[position_ids]