smithblack-0
/

SHRAM-dev

@@ -48,7 +48,7 @@ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
 config = AutoConfig.from_pretrained(
     "smithblack-0/SHRAM-dev",
     trust_remote_code=True,
-    num_hidden_layers=16,       # example override
     num_mosrah_heads=32,        # example override
 )

 config = AutoConfig.from_pretrained(
     "smithblack-0/SHRAM-dev",
     trust_remote_code=True,
+    num_decoder_layers=16,      # example override
     num_mosrah_heads=32,        # example override
 )

configuration.py CHANGED Viewed

@@ -1,262 +1,262 @@
-"""Configuration for the SHRAM transformer.
-All architectural parameters that vary across model scales or are meaningful research
-variables are expressed here. Architectural constants (no bias in linear layers,
-SwiGLU activation with SiLU gate) are implemented in the relevant modules and
-documented at the point of use — they are not config parameters because they do not
-vary and changing them produces a different architecture, not a different scale.
-RoPE configuration is owned entirely by this config. Each attention path reads its
-parameters directly and constructs its own RotaryEmbedding instance explicitly — no
-HuggingFace rope infrastructure is used. See Unit 5.A design decisions in plan.md.
-"""
-import math
-from transformers import PretrainedConfig
-class ShramConfig(PretrainedConfig):
-    """Configuration class for the SHRAM decoder-only transformer.
-    SHRAM (Sparse Hybrid Token Routed Attention Mixture) replaces every standard
-    attention layer with a hybrid layer H(x) = h_l(x) + h_s(x), where h_l is a
-    local sliding-window causal attention path and h_s is the MoSRAH sparse routed
-    path. All other components follow the Llama 3 baseline.
-    This config is the single source of truth for every architectural dimension of the
-    model. Nothing in the architecture may use a literal number that belongs here.
-    Two independent RoPE configurations exist — one per attention path:
-    - h_l always uses standard RoPE with ``local_rope_theta``.
-    - BEA always uses YaRN with ``mosrah_rope_theta``, ``training_sequence_length``,
-      ``inference_sequence_length``, ``alpha``, and ``beta``. When
-      ``inference_sequence_length == training_sequence_length`` the YaRN scale factor
-      ``s = 1`` and YaRN reduces exactly to standard RoPE — this is the default state
-      and the correct setting for experiments that do not require context extension.
-    Registered with HuggingFace AutoClass via ``auto_map``. Instantiate from the Hub::
-        config = AutoConfig.from_pretrained(
-            "your-namespace/advanced-transformers-lib",
-            trust_remote_code=True,
-            num_hidden_layers=12,
-        )
-        model = AutoModelForCausalLM.from_config(config)
-    Args:
-        vocab_size: Vocabulary size. Controls the embedding table and output logits
-            dimension. Must match the tokenizer.
-        embedding_width: Model width ``d``. The dimension of the residual stream.
-        mlp_width: FFN hidden dimension.
-        num_decoder_layers: Number of transformer blocks stacked in sequence.
-        num_sliding_window_heads: Number of heads in the local sliding-window path h_l.
-        num_mosrah_heads: Total MoSRAH expert heads available ``L``.
-        num_selected_heads: MoSRAH heads each token selects ``K``.
-        head_dim: Per-head dimension, shared by both attention paths. Must be even
-            (RoPE rotates dimensions in pairs). Paper uses 16.
-        window_size: Sliding window size for h_l. Paper uses 128.
-        rope_mode: RoPE position encoding mode for BEA. ``"main_sequence"`` supplies
-            original sequence positions; ``"semantic_sequence"`` supplies local slot
-            indices. Both are required; experimentally correct mode is undetermined
-            (paper §4). Default ``"main_sequence"``.
-        rms_norm_eps: Epsilon for RMSNorm layers.
-        local_rope_theta: RoPE base frequency ``b`` for the local attention path h_l.
-            Paper uses b=10000.
-        mosrah_rope_theta: RoPE base frequency ``b`` for the BEA path. Paper uses
-            b=10000.
-        training_sequence_length: Context length ``C_train`` the model was or will be
-            trained at. Used to compute the YaRN scale factor for BEA.
-        inference_sequence_length: Context length ``C_target`` the model must support
-            at inference. Optional; defaults to ``training_sequence_length`` so that
-            ``scale=1`` and YaRN reduces to standard RoPE unless explicitly extended.
-        alpha: YaRN ramp lower boundary α (paper §A.2). Frequency dimensions with
-            ``r(d) < alpha`` are fully interpolated by scale s. Paper value: 1.0.
-        beta: YaRN ramp upper boundary β (paper §A.2). Frequency dimensions with
-            ``r(d) > beta`` are left unscaled. Paper value: 32.0.
-        attention_dropout: Dropout probability on attention weights. Default 0.0.
-        use_cache: Whether to return past_key_values for KV caching.
-        output_hidden_states: Whether to return hidden states after each layer.
-        tie_word_embeddings: Whether input embedding and LM head share weights.
-        mosrah_overallocation_factor: Overallocation multiplier for the expert packing
-            buffer. ``mosrah_packed_length`` = ceil(training_sequence_length *
-            num_selected_heads / num_mosrah_heads * mosrah_overallocation_factor).
-            Must be > 1.0 to guarantee a buffer larger than the balanced-routing
-            baseline. Default 2.0.
-        load_balance_p: Exponent p for the p-mean aggregation of per-item routing
-            frequencies into the load balance signal. Higher p weights aggregation
-            toward the worst-case batch item, making the correction signal more
-            sensitive to per-item allocation spikes. Must be positive. Default 2.0.
-        max_bid_rounds: Maximum bidding rounds for the deferred-acceptance capacity
-            solver in ``balance_capacity``. 10 covers convergence at approximately
-            the 98th percentile of routing densities; the top 2% of extreme-density
-            cases are not expected under normal training. The bound exists as a
-            correctness guard — exhausting it raises ``RuntimeError``. Must be >= 1.
-            Default 10.
-    """
-    model_type = "shram"
-    auto_map = {
-        "AutoConfig": "configuration.ShramConfig",
-        "AutoModelForCausalLM": "huggingface.ShramForCausalLM",
-    }
-    def __init__(
-        self,
-        vocab_size: int = 50277,
-        embedding_width: int = 512,
-        mlp_width: int = 1366,
-        num_decoder_layers: int = 12,
-        num_sliding_window_heads: int = 16,
-        num_mosrah_heads: int = 16,
-        num_selected_heads: int = 16,
-        head_dim: int = 16,
-        window_size: int = 128,
-        rope_mode: str = "main_sequence",
-        rms_norm_eps: float = 1e-5,
-        local_rope_theta: float = 10000.0,
-        mosrah_rope_theta: float = 10000.0,
-        training_sequence_length: int = 1024,
-        inference_sequence_length: int | None = None,
-        alpha: float = 1.0,
-        beta: float = 32.0,
-        attention_dropout: float = 0.0,
-        use_cache: bool = True,
-        output_hidden_states: bool = False,
-        tie_word_embeddings: bool = False,
-        mosrah_overallocation_factor: float = 2.0,
-        load_balance_p: float = 2.0,
-        max_bid_rounds: int = 10,
-        **kwargs
-    ):
-        if head_dim % 2 != 0:
-            raise ValueError(
-                f"head_dim must be even (RoPE rotates dimensions in pairs). "
-                f"Got head_dim={head_dim}."
-            )
-        if rope_mode not in {"main_sequence", "semantic_sequence"}:
-            raise ValueError(
-                f"rope_mode must be 'main_sequence' or 'semantic_sequence', "
-                f"got '{rope_mode}'."
-            )
-        if training_sequence_length <= 0:
-            raise ValueError(
-                f"training_sequence_length must be positive, "
-                f"got {training_sequence_length}."
-            )
-        if inference_sequence_length is None:
-            inference_sequence_length = training_sequence_length
-        if inference_sequence_length <= 0:
-            raise ValueError(
-                f"inference_sequence_length must be positive, "
-                f"got {inference_sequence_length}."
-            )
-        if mosrah_overallocation_factor <= 1.0:
-            raise ValueError(
-                f"mosrah_overallocation_factor must be > 1.0 to guarantee a packed "
-                f"buffer larger than the balanced-routing baseline. "
-                f"Got {mosrah_overallocation_factor}."
-            )
-        if load_balance_p <= 0.0:
-            raise ValueError(
-                f"load_balance_p must be positive, got {load_balance_p}."
-            )
-        if max_bid_rounds < 1:
-            raise ValueError(
-                f"max_bid_rounds must be at least 1, got {max_bid_rounds}."
-            )
-        self.vocab_size = vocab_size
-        self.embedding_width = embedding_width
-        self.mlp_width = mlp_width
-        self.num_decoder_layers = num_decoder_layers
-        self.num_sliding_window_heads = num_sliding_window_heads
-        self.num_mosrah_heads = num_mosrah_heads
-        self.num_selected_heads = num_selected_heads
-        self.head_dim = head_dim
-        self.window_size = window_size
-        self.rope_mode = rope_mode
-        self.rms_norm_eps = rms_norm_eps
-        self.local_rope_theta = local_rope_theta
-        self.mosrah_rope_theta = mosrah_rope_theta
-        self.training_sequence_length = training_sequence_length
-        self.inference_sequence_length = inference_sequence_length
-        self.alpha = alpha
-        self.beta = beta
-        self.mosrah_overallocation_factor = mosrah_overallocation_factor
-        self.load_balance_p = load_balance_p
-        self.max_bid_rounds = max_bid_rounds
-        self.attention_dropout = attention_dropout
-        self.use_cache = use_cache
-        super().__init__(
-            tie_word_embeddings=tie_word_embeddings,
-            output_hidden_states=output_hidden_states,
-            **kwargs
-        )
-        # Promote auto_map to an instance attribute so PretrainedConfig.to_dict()
-        # serialises it into config.json.
-        self.auto_map = type(self).auto_map
-    @property
-    def scale(self) -> float:
-        """YaRN context extension scale factor s = inference_sequence_length / training_sequence_length.
-        When scale == 1.0, YaRN reduces exactly to standard RoPE — all frequency
-        adjustments cancel and A_rope = 1. This is the default state.
-        """
-        return self.inference_sequence_length / self.training_sequence_length
-    @property
-    def mosrah_packed_length(self) -> int:
-        """Static packed time dimension T for expert packing.
-        The expected tokens per expert under perfectly balanced routing is
-        ``training_sequence_length * num_selected_heads / num_mosrah_heads``.
-        Multiplying by ``mosrah_overallocation_factor`` provides a buffer above
-        that baseline. The ceiling ensures T is always an integer >= 1.
-        All consumers of the packed buffer size must read this property rather
-        than deriving T independently.
-        """
-        return math.ceil(
-            self.training_sequence_length
-            * self.num_selected_heads
-            / self.num_mosrah_heads
-            * self.mosrah_overallocation_factor
-        )
-    @property
-    def mosrah_cache_length(self) -> int:
-        """Static per-(batch, head) slot capacity for the MoSRAH inference cache.
-        The expected tokens per expert over the full inference context under perfectly
-        balanced routing is ``inference_sequence_length * num_selected_heads /
-        num_mosrah_heads``. Multiplying by ``mosrah_overallocation_factor`` provides
-        a buffer above that baseline. The ceiling ensures the result is always an
-        integer >= 1.
-        Distinct from ``mosrah_packed_length``, which sizes the training packing buffer
-        using ``training_sequence_length``. This property uses
-        ``inference_sequence_length`` because the cache must hold the full accumulated
-        token history across the entire inference run.
-        All consumers of the MoSRAH cache buffer size must read this property rather
-        than deriving the capacity independently.
-        """
-        return math.ceil(
-            self.inference_sequence_length
-            * self.num_selected_heads
-            / self.num_mosrah_heads
-            * self.mosrah_overallocation_factor
-        )

+"""Configuration for the SHRAM transformer.
+All architectural parameters that vary across model scales or are meaningful research
+variables are expressed here. Architectural constants (no bias in linear layers,
+SwiGLU activation with SiLU gate) are implemented in the relevant modules and
+documented at the point of use — they are not config parameters because they do not
+vary and changing them produces a different architecture, not a different scale.
+RoPE configuration is owned entirely by this config. Each attention path reads its
+parameters directly and constructs its own RotaryEmbedding instance explicitly — no
+HuggingFace rope infrastructure is used. See Unit 5.A design decisions in plan.md.
+"""
+import math
+from transformers import PretrainedConfig
+class ShramConfig(PretrainedConfig):
+    """Configuration class for the SHRAM decoder-only transformer.
+
+    SHRAM (Sparse Hybrid Token Routed Attention Mixture) replaces every standard
+    attention layer with a hybrid layer H(x) = h_l(x) + h_s(x), where h_l is a
+    local sliding-window causal attention path and h_s is the MoSRAH sparse routed
+    path. All other components follow the Llama 3 baseline.
+
+    This config is the single source of truth for every architectural dimension of the
+    model. Nothing in the architecture may use a literal number that belongs here.
+
+    Two independent RoPE configurations exist — one per attention path:
+
+    - h_l always uses standard RoPE with ``local_rope_theta``.
+    - BEA always uses YaRN with ``mosrah_rope_theta``, ``training_sequence_length``,
+      ``inference_sequence_length``, ``alpha``, and ``beta``. When
+      ``inference_sequence_length == training_sequence_length`` the YaRN scale factor
+      ``s = 1`` and YaRN reduces exactly to standard RoPE — this is the default state
+      and the correct setting for experiments that do not require context extension.
+
+    Registered with HuggingFace AutoClass via ``auto_map``. Instantiate from the Hub::
+
+        config = AutoConfig.from_pretrained(
+            "your-namespace/advanced-transformers-lib",
+            trust_remote_code=True,
+            num_decoder_layers=12,
+        )
+        model = AutoModelForCausalLM.from_config(config)
+
+    Args:
+        vocab_size: Vocabulary size. Controls the embedding table and output logits
+            dimension. Must match the tokenizer.
+        embedding_width: Model width ``d``. The dimension of the residual stream.
+        mlp_width: FFN hidden dimension.
+        num_decoder_layers: Number of transformer blocks stacked in sequence.
+        num_sliding_window_heads: Number of heads in the local sliding-window path h_l.
+        num_mosrah_heads: Total MoSRAH expert heads available ``L``. Must be >= 1;
+            it is the divisor in ``mosrah_packed_length`` and ``mosrah_cache_length``.
+        num_selected_heads: MoSRAH heads each token selects ``K``. Must satisfy
+            ``1 <= K <= L``.
+        head_dim: Per-head dimension, shared by both attention paths. Must be even
+            (RoPE rotates dimensions in pairs). Paper uses 16.
+        window_size: Sliding window size for h_l. Paper uses 128.
+        rope_mode: RoPE position encoding mode for BEA. ``"main_sequence"`` supplies
+            original sequence positions; ``"semantic_sequence"`` supplies local slot
+            indices. Both modes are supported; which one is experimentally correct
+            is undetermined (paper §4). Default ``"main_sequence"``.
+        rms_norm_eps: Epsilon for RMSNorm layers.
+        local_rope_theta: RoPE base frequency ``b`` for the local attention path h_l.
+            Paper uses b=10000.
+        mosrah_rope_theta: RoPE base frequency ``b`` for the BEA path. Paper uses
+            b=10000.
+        training_sequence_length: Context length ``C_train`` the model was or will be
+            trained at. Used to compute the YaRN scale factor for BEA.
+        inference_sequence_length: Context length ``C_target`` the model must support
+            at inference. Optional; defaults to ``training_sequence_length`` so that
+            ``scale=1`` and YaRN reduces to standard RoPE unless explicitly extended.
+        alpha: YaRN ramp lower boundary α (paper §A.2). Frequency dimensions with
+            ``r(d) < alpha`` are fully interpolated by scale s. Paper value: 1.0.
+        beta: YaRN ramp upper boundary β (paper §A.2). Frequency dimensions with
+            ``r(d) > beta`` are left unscaled. Paper value: 32.0.
+        attention_dropout: Dropout probability on attention weights. Default 0.0.
+        use_cache: Whether to return past_key_values for KV caching.
+        output_hidden_states: Whether to return hidden states after each layer.
+        tie_word_embeddings: Whether input embedding and LM head share weights.
+        mosrah_overallocation_factor: Overallocation multiplier for the expert packing
+            buffer. ``mosrah_packed_length`` = ceil(training_sequence_length *
+            num_selected_heads / num_mosrah_heads * mosrah_overallocation_factor).
+            Must be > 1.0 to guarantee a buffer larger than the balanced-routing
+            baseline. Default 2.0.
+        load_balance_p: Exponent p for the p-mean aggregation of per-item routing
+            frequencies into the load balance signal. Higher p weights aggregation
+            toward the worst-case batch item, making the correction signal more
+            sensitive to per-item allocation spikes. Must be positive. Default 2.0.
+        max_bid_rounds: Maximum bidding rounds for the deferred-acceptance capacity
+            solver in ``balance_capacity``. 10 covers convergence at approximately
+            the 98th percentile of routing densities; the top 2% of extreme-density
+            cases are not expected under normal training. The bound exists as a
+            correctness guard — exhausting it raises ``RuntimeError``. Must be >= 1.
+            Default 10.
+
+    Raises:
+        ValueError: If ``head_dim`` is odd, ``num_mosrah_heads`` is < 1,
+            ``num_selected_heads`` is outside ``[1, num_mosrah_heads]``, ``rope_mode``
+            is unknown, either sequence length is non-positive,
+            ``mosrah_overallocation_factor`` is <= 1.0, ``load_balance_p`` is
+            non-positive, or ``max_bid_rounds`` is < 1.
+    """
+
+    model_type = "shram"
+    auto_map = {
+        "AutoConfig": "configuration.ShramConfig",
+        "AutoModelForCausalLM": "huggingface.ShramForCausalLM",
+    }
+
+    def __init__(
+        self,
+        vocab_size: int = 50277,
+        embedding_width: int = 512,
+        mlp_width: int = 1366,
+        num_decoder_layers: int = 12,
+        num_sliding_window_heads: int = 16,
+        num_mosrah_heads: int = 16,
+        num_selected_heads: int = 16,
+        head_dim: int = 16,
+        window_size: int = 128,
+        rope_mode: str = "main_sequence",
+        rms_norm_eps: float = 1e-5,
+        local_rope_theta: float = 10000.0,
+        mosrah_rope_theta: float = 10000.0,
+        training_sequence_length: int = 1024,
+        inference_sequence_length: int | None = None,
+        alpha: float = 1.0,
+        beta: float = 32.0,
+        attention_dropout: float = 0.0,
+        use_cache: bool = True,
+        output_hidden_states: bool = False,
+        tie_word_embeddings: bool = False,
+        mosrah_overallocation_factor: float = 2.0,
+        load_balance_p: float = 2.0,
+        max_bid_rounds: int = 10,
+        **kwargs
+    ):
+        # Validate before any attribute is set so an invalid config never exists
+        # in a half-initialised state.
+        if head_dim % 2 != 0:
+            raise ValueError(
+                f"head_dim must be even (RoPE rotates dimensions in pairs). "
+                f"Got head_dim={head_dim}."
+            )
+        # num_mosrah_heads is the divisor in mosrah_packed_length and
+        # mosrah_cache_length; reject it here rather than failing later with a
+        # ZeroDivisionError or a non-positive buffer size.
+        if num_mosrah_heads < 1:
+            raise ValueError(
+                f"num_mosrah_heads must be at least 1, got {num_mosrah_heads}."
+            )
+        # Each token selects K of the L available heads, so K must lie in [1, L].
+        if not 1 <= num_selected_heads <= num_mosrah_heads:
+            raise ValueError(
+                f"num_selected_heads must be in [1, num_mosrah_heads]; got "
+                f"num_selected_heads={num_selected_heads}, "
+                f"num_mosrah_heads={num_mosrah_heads}."
+            )
+        if rope_mode not in {"main_sequence", "semantic_sequence"}:
+            raise ValueError(
+                f"rope_mode must be 'main_sequence' or 'semantic_sequence', "
+                f"got '{rope_mode}'."
+            )
+        if training_sequence_length <= 0:
+            raise ValueError(
+                f"training_sequence_length must be positive, "
+                f"got {training_sequence_length}."
+            )
+        # Default to no context extension: scale == 1 and YaRN equals plain RoPE.
+        if inference_sequence_length is None:
+            inference_sequence_length = training_sequence_length
+        if inference_sequence_length <= 0:
+            raise ValueError(
+                f"inference_sequence_length must be positive, "
+                f"got {inference_sequence_length}."
+            )
+        if mosrah_overallocation_factor <= 1.0:
+            raise ValueError(
+                f"mosrah_overallocation_factor must be > 1.0 to guarantee a packed "
+                f"buffer larger than the balanced-routing baseline. "
+                f"Got {mosrah_overallocation_factor}."
+            )
+        if load_balance_p <= 0.0:
+            raise ValueError(
+                f"load_balance_p must be positive, got {load_balance_p}."
+            )
+        if max_bid_rounds < 1:
+            raise ValueError(
+                f"max_bid_rounds must be at least 1, got {max_bid_rounds}."
+            )
+
+        self.vocab_size = vocab_size
+        self.embedding_width = embedding_width
+        self.mlp_width = mlp_width
+        self.num_decoder_layers = num_decoder_layers
+        self.num_sliding_window_heads = num_sliding_window_heads
+        self.num_mosrah_heads = num_mosrah_heads
+        self.num_selected_heads = num_selected_heads
+        self.head_dim = head_dim
+        self.window_size = window_size
+        self.rope_mode = rope_mode
+        self.rms_norm_eps = rms_norm_eps
+        self.local_rope_theta = local_rope_theta
+        self.mosrah_rope_theta = mosrah_rope_theta
+        self.training_sequence_length = training_sequence_length
+        self.inference_sequence_length = inference_sequence_length
+        self.alpha = alpha
+        self.beta = beta
+        self.mosrah_overallocation_factor = mosrah_overallocation_factor
+        self.load_balance_p = load_balance_p
+        self.max_bid_rounds = max_bid_rounds
+        self.attention_dropout = attention_dropout
+        self.use_cache = use_cache
+
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings,
+            output_hidden_states=output_hidden_states,
+            **kwargs
+        )
+
+        # Promote auto_map to an instance attribute so PretrainedConfig.to_dict()
+        # serialises it into config.json. Copy the mapping so that mutating one
+        # instance's auto_map cannot leak into the class attribute or into other
+        # instances.
+        self.auto_map = dict(type(self).auto_map)
+
+    @property
+    def scale(self) -> float:
+        """YaRN context extension scale factor s = inference_sequence_length / training_sequence_length.
+
+        When scale == 1.0, YaRN reduces exactly to standard RoPE — all frequency
+        adjustments cancel and A_rope = 1. This is the default state.
+        """
+        return self.inference_sequence_length / self.training_sequence_length
+
+    @property
+    def mosrah_packed_length(self) -> int:
+        """Static packed time dimension T for expert packing.
+
+        The expected tokens per expert under perfectly balanced routing is
+        ``training_sequence_length * num_selected_heads / num_mosrah_heads``.
+        Multiplying by ``mosrah_overallocation_factor`` provides a buffer above
+        that baseline. Because ``__init__`` requires every factor to be positive,
+        the ceiling is always an integer >= 1.
+
+        All consumers of the packed buffer size must read this property rather
+        than deriving T independently.
+        """
+        return math.ceil(
+            self.training_sequence_length
+            * self.num_selected_heads
+            / self.num_mosrah_heads
+            * self.mosrah_overallocation_factor
+        )
+
+    @property
+    def mosrah_cache_length(self) -> int:
+        """Static per-(batch, head) slot capacity for the MoSRAH inference cache.
+
+        The expected tokens per expert over the full inference context under perfectly
+        balanced routing is ``inference_sequence_length * num_selected_heads /
+        num_mosrah_heads``. Multiplying by ``mosrah_overallocation_factor`` provides
+        a buffer above that baseline. Because ``__init__`` requires every factor to
+        be positive, the ceiling is always an integer >= 1.
+
+        Distinct from ``mosrah_packed_length``, which sizes the training packing buffer
+        using ``training_sequence_length``. This property uses
+        ``inference_sequence_length`` because the cache must hold the full accumulated
+        token history across the entire inference run.
+
+        All consumers of the MoSRAH cache buffer size must read this property rather
+        than deriving the capacity independently.
+        """
+        return math.ceil(
+            self.inference_sequence_length
+            * self.num_selected_heads
+            / self.num_mosrah_heads
+            * self.mosrah_overallocation_factor
+        )

huggingface.py CHANGED Viewed

@@ -128,7 +128,7 @@ class ShramConfig(PretrainedConfig):
         config = AutoConfig.from_pretrained(
             "your-namespace/advanced-transformers-lib",
             trust_remote_code=True,
-            num_hidden_layers=12,
         )
         model = AutoModelForCausalLM.from_config(config)
@@ -725,17 +725,21 @@ class MoSRAHCache(CacheLayerMixin):
     def _check_no_overflow(max_count: torch.Tensor, capacity: int) -> None:
         """Raise if any (batch, head) slot would exceed the static buffer capacity.
-        Uses the 19.F.1 pattern: branches on whether the graph is being compiled.
-        In compiled mode, `.item()` folds into the graph when capture_scalar_outputs=True
-        and `torch._check` issues a compile-time assertion. In eager mode, a plain
-        RuntimeError is raised with a descriptive message.
         Args:
             max_count: Scalar tensor — the maximum post-update count across all slots.
             capacity: The static buffer capacity (mosrah_cache_length).
         """
         if torch.compiler.is_compiling():
-            torch._check(max_count.item() <= capacity)
         else:
             if max_count.item() > capacity:
                 raise RuntimeError(
@@ -856,7 +860,7 @@ class LocalSlidingWindowLayerCache(CacheLayerMixin):
         # Cumulative count of all token positions presented through update() for
         # this cache instance. This is the quantity HuggingFace generation reads
         # through get_seq_length() to track how far along the sequence we are.
-        self._total_processed: int = 0
     def update(  # type: ignore[override]
         self,
@@ -996,7 +1000,7 @@ class LocalSlidingWindowLayerCache(CacheLayerMixin):
         generation reads to track sequence progress and is not the same as active-token
         count or current window occupancy.
         """
-        return self._total_processed
     def get_max_cache_shape(self) -> int:
         return self.sliding_window
@@ -2299,29 +2303,24 @@ class BottleneckedEnsembleAttention(nn.Module):
 # -----------
 """Expert packing and unpacking for the MoSRAH path.
-This module implements the low-level token-choice -> expert-choice -> token-choice
-conversion boundary specified in the paper. The externally visible behavior is fixed:
-- setup_packing() prepares the auxiliary ordering data and returns it as a dict
-  payload forwarded whole to pack_experts and unpack_experts.
-- pack_experts() converts a dict of routed token-choice tensors into packed
-  expert-choice form. Each entry is paired with its intended padding value; all
-  entries undergo the same expert-major gather-scatter so they remain aligned.
-- unpack_experts() restores token-choice ordering afterward.
-Stable sort is a correctness requirement. It preserves causal ordering inside each
-expert bucket, which is the foundation on which BEA's later triangular causal mask
-is correct.
-pack_experts() returns the packed entries dict together with a separate unpacking_mask.
-Two masks serve different roles and must not be interchanged:
-- unpacking_mask: marks every packed slot that contains a routed token copy,
-  live or dead. Always has exactly B*N*K True entries. Required by unpack_experts
-  so its reshape invariant holds regardless of outer token liveness.
-- active_mask (caller-supplied entry): marks only the packed slots whose source
-  token was semantically live. This is what BEA consumes for attention gating.
-  Dead outer tokens must not influence sparse attention outputs.
 """
@@ -2337,23 +2336,13 @@ def setup_packing(
 ) -> dict[str, torch.Tensor]:
     """Prepare the auxiliary ordering data used by pack/unpack.
-    Routing produces token-choice state I of shape (B, N, K): for each token, which
-    K experts were selected. Packing needs the same routed token copies reordered into
-    expert-major order so each expert bucket becomes contiguous.
-    The paper's setup step does this by flattening (N, K) into one axis to produce
-    H in token-major order, then computing a stable argsort permutation Pi over the
-    expert indices stored in H. Applying Pi reorders the flattened routed copies into
-    expert-major order while preserving their original token order *within* each expert
-    bucket. That preservation is why stable sort is required for causality.
     Args:
         selected_heads: Routed token-choice head selections I of shape (B, N, K).
     Returns:
         Auxiliary payload dict with keys:
           - "flattened_selected_heads": H of shape (B, N*K)
-          - "permutation": stable expert-major permutation Pi of shape (B, N*K)
           - "inverse_permutation": inverse permutation Pi^{-1} of shape (B, N*K)
         This dict is forwarded whole to pack_experts and unpack_experts.
     """
@@ -2362,7 +2351,14 @@ def setup_packing(
         batch_size,
         sequence_length * num_selected_heads,
     )
-    num_elements = batch_size*sequence_length*num_selected_heads
     permutation = torch.argsort(flattened_selected_heads, dim=-1, stable=True)
     inverse_permutation = torch.argsort(permutation, dim=-1)
@@ -2370,7 +2366,6 @@ def setup_packing(
         "flattened_selected_heads": flattened_selected_heads,
         "permutation": permutation,
         "inverse_permutation": inverse_permutation,
-        "num_elements" : num_elements,
     }
@@ -2387,20 +2382,6 @@ def pack_experts(
 ) -> tuple[dict[str, torch.Tensor], torch.Tensor]:
     """Pack token-choice tensors into expert-choice padded form.
-    The paper's packing path has two jobs:
-    1. Convert routed token-choice copies into expert-major order.
-    2. Materialize that expert-major order into a padded tensor layout BEA can consume.
-    All entries in the provided dict undergo the same expert-major gather-scatter so
-    they remain mutually aligned in the packed frame. Each entry is paired with its
-    intended padding value, which fills slots that contain no routed token copy.
-    Packed positions are sourced from the authoritative upstream position_ids tensor
-    rather than synthesized locally from arange(N). This preserves advanced positions
-    correctly during cached inference while leaving training/full-sequence behavior
-    unchanged when position_ids is the ordinary sequential token positions.
     Args:
         entries: Mapping from string keys to (tensor, padding_value) pairs. Each
             tensor has shape (B, N, ...) and is rearranged into expert-choice layout
@@ -2409,29 +2390,40 @@ def pack_experts(
         selected_heads: Routed head selections I of shape (B, N, K).
         num_experts: Total number of experts L.
         packed_length: Static packed time dimension T. All per-expert buffers are
-            allocated to exactly this length. Use config.mosrah_packed_length as the
-            source of this value. Raises if any actual per-expert token count exceeds
-            this value.
     Returns:
         Tuple of:
           - packed_entries: Dict with same keys as entries; each value is the
             packed tensor of shape (B, L, T, ...).
-          - unpacking_mask: Boolean tensor of shape (B, L, T). True where a slot
-            contains any routed token copy, live or dead. Always has exactly
-            B*N*K True entries. Pass this to unpack_experts — not active_mask.
     """
     batch_size, sequence_length, num_selected_heads = selected_heads.shape
     flattened_selected_heads = setup["flattened_selected_heads"]
     permutation = setup["permutation"]
     # -----------------------------------------------------------------------
-    # Reconstruct routed local source-token indices in token-choice order.
     #
-    # The internal arange(N) is only the local source-row index object used to
-    # gather from the current chunk tensors. Flattening gives a (B, N*K) tensor
-    # aligned with H's token-major routed-copy order.
     # -----------------------------------------------------------------------
     source_token_indices = torch.arange(
         sequence_length,
@@ -2442,81 +2434,91 @@ def pack_experts(
         sequence_length,
         num_selected_heads,
     )
-    flattened_source_indices = source_token_indices.reshape(
         batch_size,
-        sequence_length * num_selected_heads,
     )
-    # -----------------------------------------------------------------------
-    # Reorder source-token indices into expert-major order.
-    #
-    # Applying Pi yields the local source-token rows in the packed expert-major
-    # order required by the paper. All entries are then gathered using these same
-    # reordered indices so they remain aligned under the exact same transformation.
-    # -----------------------------------------------------------------------
-    sorted_source_indices = flattened_source_indices.gather(
         dim=1,
         index=permutation,
     )
     # -----------------------------------------------------------------------
-    # Count how many routed copies land in each expert bucket and verify
-    # that no bucket exceeds the statically preallocated packed_length T.
     #
-    # S[b, l] is the number of routed token copies assigned to expert l in
-    # batch b. T (packed_length) is a static allocation derived from config,
-    # not a data-dependent maximum. Overflow is detected here and raises in
-    # both eager and compiled modes.
     # -----------------------------------------------------------------------
     tokens_per_expert = _count_tokens_per_expert(flattened_selected_heads, num_experts)
-    max_count = tokens_per_expert.max().item()
-    no_overflow = max_count <= packed_length
-    _enforce_no_overflow(no_overflow, tokens_per_expert, packed_length)
     # -----------------------------------------------------------------------
-    # Construct the unpacking mask.
     #
-    # Each expert bucket is left-justified: if S[b, l] = s, then slots
-    # t = 0, ..., s-1 are occupied and all later slots are padding. The mask
-    # marks slot occupancy regardless of outer token liveness, and always has
-    # exactly B*N*K True entries.
     # -----------------------------------------------------------------------
-    time_axis = torch.arange(
-        packed_length,
         device=flattened_selected_heads.device,
         dtype=torch.long,
-    ).view(1, 1, packed_length)
-    unpacking_mask = time_axis < tokens_per_expert.unsqueeze(-1)
     # -----------------------------------------------------------------------
-    # Materialize all entries into the packed expert-choice frame.
     #
-    # Each entry is gathered using the expert-major sorted source indices, then
-    # scattered into a padded buffer. The gather index is expanded to cover each
-    # tensor's trailing dimensions. Padding slots receive the caller-supplied fill
-    # value rather than an implicit zero.
     # -----------------------------------------------------------------------
     packed_entries: dict[str, torch.Tensor] = {}
     for key, (tensor, padding_value) in entries.items():
         extra_shape = tensor.shape[2:]
-        # Expand gather index to cover trailing dimensions, if any.
-        idx = sorted_source_indices.view(
             batch_size,
-            sequence_length * num_selected_heads,
             *(1,) * len(extra_shape),
         ).expand(-1, -1, *extra_shape)
-        sorted_tensor = tensor.gather(dim=1, index=idx)
         packed_tensor = tensor.new_full(
-            (batch_size, num_experts, packed_length, *extra_shape),
             fill_value=padding_value,
         )
-        packed_tensor[unpacking_mask] = sorted_tensor.reshape(-1, *extra_shape)
-        packed_entries[key] = packed_tensor
-    return packed_entries, unpacking_mask
 # ---------------------------------------------------------------------------
@@ -2526,27 +2528,17 @@ def pack_experts(
 def unpack_experts(
     expert_outputs: torch.Tensor,
     setup: dict[str, torch.Tensor],
-    unpacking_mask: torch.Tensor,
     selected_heads: torch.Tensor,
 ) -> torch.Tensor:
     """Restore token-choice ordering from BEA expert-choice output.
-    Unpacking inverts the packing path only on occupied entries. Padding does not
-    participate: the output tensor is first filtered by unpacking_mask to recover
-    only the real routed-token copies in expert-major order, then Pi^{-1} restores
-    the original token-choice ordering, and finally the tensor is reshaped back to
-    (B, N, K, d).
-    The unpacking_mask — not active_mask — must be used here. Even copies of dead
-    outer tokens occupy slots and must be un-scattered correctly for the inverse
-    permutation to hold. The total True entry count in unpacking_mask is always
-    B*N*K, which is exactly what the reshape to (B, N*K, d) requires.
     Args:
         expert_outputs: Expert-choice BEA output y of shape (B, L, T, d).
         setup: Auxiliary payload returned by setup_packing().
-        unpacking_mask: From pack_experts(), shape (B, L, T). Identifies all
-            occupied packed slots regardless of outer token liveness.
         selected_heads: Routed head selections I of shape (B, N, K).
     Returns:
@@ -2555,22 +2547,22 @@ def unpack_experts(
     inverse_permutation = setup["inverse_permutation"]
     batch_size, sequence_length, num_selected_heads = selected_heads.shape
     hidden_dim = expert_outputs.shape[-1]
-    coords = torch.nonzero_static(
-        unpacking_mask,
-        size=setup["num_elements"],
-    )  # shape: (B*N*K, 3)
-    active_outputs = expert_outputs[
-        coords[:, 0],
-        coords[:, 1],
-        coords[:, 2],
-    ]  # shape: (B*N*K, d)
-    sorted_token_choice_outputs = active_outputs.reshape(
         batch_size,
-        sequence_length * num_selected_heads,
         hidden_dim,
     )
     restored_outputs = sorted_token_choice_outputs.gather(
@@ -2589,34 +2581,34 @@ def unpack_experts(
 # Helpers
 # ---------------------------------------------------------------------------
-def _enforce_no_overflow(condition: bool, tokens_per_expert, max_length) -> None:
-    """Enforce that no expert bucket exceeds the preallocated packed length.
-    This check fires when the number of tokens assigned to any expert in any
-    batch item exceeds mosrah_packed_length. When that limit is exceeded, the
-    packed buffer is too small to hold all assignments and data would be dropped.
-    Increase mosrah_overallocation_factor in ShramConfig to resolve.
-    The caller must derive condition via .item() on the max count tensor so that
-    dynamo captures a SymInt and the comparison produces a SymBool. Passing a
-    tensor comparison result directly bypasses the SymInt mechanism and prevents
-    the check from firing at compiled runtime.
     Args:
-        condition: True means no overflow has occurred; False means at least one
-            expert bucket exceeds packed_length. In compiled mode this is a SymBool
-            produced by comparing a SymInt against the static packed_length.
     """
     if torch.compiler.is_compiling():
-        torch._check(condition)
     else:
-        if not condition:
             raise RuntimeError(
                 "Expert packing overflow: at least one expert bucket contains more "
                 "tokens than mosrah_packed_length allows. Increase "
                 "mosrah_overallocation_factor in ShramConfig to resolve.\n"
-                f"Supported lengths were:\n {max_length}\n"
-                f"head lengths were:\n {tokens_per_expert}\n"
             )
@@ -2626,8 +2618,7 @@ def _count_tokens_per_expert(
 ) -> torch.Tensor:
     """Count how many routed token copies are assigned to each expert per batch item.
-    Uses scatter_add into a pre-sized (B, num_experts) zero buffer, producing a
-    statically-shaped output that compiles without graph breaks. Each position in
     flattened_selected_heads contributes one count to the corresponding expert slot.
     Args:
@@ -2639,19 +2630,18 @@ def _count_tokens_per_expert(
         Counts tensor of shape (B, num_experts).
     """
     batch_size = flattened_selected_heads.shape[0]
-    counts = torch.zeros(
         batch_size,
         num_experts,
         device=flattened_selected_heads.device,
-        dtype=flattened_selected_heads.dtype,
     )
-    counts.scatter_add_(
         dim=1,
         index=flattened_selected_heads,
-        src=torch.ones_like(flattened_selected_heads),
     )
-    return counts
 # -----------
 # Inlined from: router.py
 # -----------
@@ -2825,7 +2815,7 @@ class MoSRAHRouter(nn.Module):
         self.expert_bias = nn.Parameter(torch.zeros(config.num_mosrah_heads))
     @staticmethod
-    def get_mask(
             tensor: torch.Tensor,
             dim: int,
             n: int | torch.Tensor,
@@ -2958,7 +2948,7 @@ class MoSRAHRouter(nn.Module):
             choices_deficit = (min_choices - accepted_per_token).clamp_min(0)
             unproposed_logits = logits.masked_fill(proposals, float('-inf'))
-            new_proposals = cls.get_mask(
                 unproposed_logits, dim=-1, n=choices_deficit, capacity_scalar=min_choices,
             )
             proposals = proposals | new_proposals
@@ -2969,7 +2959,7 @@ class MoSRAHRouter(nn.Module):
             # Acceptances are recomputed from scratch each round so that a
             # stronger new proposal can displace a weaker prior one.
             proposed_logits = logits.masked_fill(~proposals, float('-inf'))
-            acceptances = cls.get_mask(
                 proposed_logits, dim=-2, n=remaining_capacity, capacity_scalar=capacity_scalar,
             )
@@ -3351,7 +3341,7 @@ class MoSRAHLayer(nn.Module):
             "position_ids": (position_ids, 0),
             "active_mask": (active_mask, False),
         }
-        packed, unpacking_mask = pack_experts(entries, setup, selected_heads, self.num_experts, self.packed_length)
         packed_hidden_states = packed["hidden_states"]
         packed_positions = packed["position_ids"]
         active_mask = packed["active_mask"]
@@ -3387,7 +3377,7 @@ class MoSRAHLayer(nn.Module):
         token_choice_outputs = unpack_experts(
             expert_outputs=packed_outputs,
             setup=setup,
-            unpacking_mask=unpacking_mask,
             selected_heads=selected_heads,
         )
         final_output = (
@@ -3886,11 +3876,7 @@ class ShramForCausalLM(PreTrainedModel, GenerationMixin):
     @staticmethod
     def create_masks_for_generate(
-        config: Any,
-        inputs_embeds: torch.Tensor,
         attention_mask: torch.Tensor | None,
-        past_key_values: Cache | None,
-        position_ids: torch.Tensor | None = None,
         **kwargs: Any,
     ) -> torch.Tensor | None:
         """Return the 2D attention_mask unchanged.
@@ -3944,7 +3930,7 @@ class ShramForCausalLM(PreTrainedModel, GenerationMixin):
             raise ValueError(
                 "position_ids must match the current input_ids shape exactly."
             )
-        if input_ids.dtype != torch.long:
             raise TypeError("position_ids must be an long tensor.")
     def _validate_labels(
@@ -3959,7 +3945,7 @@ class ShramForCausalLM(PreTrainedModel, GenerationMixin):
             raise ValueError("labels must have shape (batch, seq_len).")
         if labels.shape != input_ids.shape:
             raise ValueError("labels must have the same shape as input_ids.")
-        if input_ids.dtype != torch.long:
             raise TypeError("labels must be a long tensor.")
     def _validate_cache_inputs(
@@ -4044,11 +4030,11 @@ class ShramForCausalLM(PreTrainedModel, GenerationMixin):
                 (violated).
         """
         if torch.compiler.is_compiling():
-            # bool.item() is not captured as a SymBool by dynamo; converting to
-            # int first produces a SymInt, and the Python comparison (!=0) then
-            # yields a SymBool that torch._check folds into the compiled graph.
-            condition_as_int = condition.to(torch.int).item()
-            torch._check(condition_as_int != 0)
         else:
             if not condition.item():
                 raise RuntimeError(
@@ -4058,30 +4044,6 @@ class ShramForCausalLM(PreTrainedModel, GenerationMixin):
                     "uncached sequence to start at 0.",
                 )
-    @staticmethod
-    def _enforce_capture_scalar_outputs() -> None:
-        """Enforce that capture_scalar_outputs is enabled when compiling.
-        The safety checks in this model (e.g. position-zero constraint, packing
-        overflow detection) rely on torch._check folding into the compiled graph,
-        which requires torch._dynamo.config.capture_scalar_outputs = True. Without
-        it those checks are silently absent in the compiled model while appearing
-        to work in eager mode — a misconfiguration with no diagnostic output.
-        This method fires during dynamo tracing so the missing flag is surfaced
-        immediately at compile time rather than discovered from downstream failures.
-        """
-        if torch.compiler.is_compiling():
-            torch._check(
-                torch._dynamo.config.capture_scalar_outputs,
-                lambda: RuntimeError(
-                    "ShramForCausalLM requires torch._dynamo.config.capture_scalar_outputs = True "
-                    "when compiled. Without it, runtime safety checks (position constraints, "
-                    "overflow detection) are silently absent in the compiled model. Set the flag "
-                    "before calling torch.compile()."
-                ),
-            )
     def _standardize_full_attention_mask(
         self,
         input_ids: torch.Tensor,
@@ -4179,7 +4141,6 @@ class ShramForCausalLM(PreTrainedModel, GenerationMixin):
         # This keeps the main sequence readable while ensuring invalid states
         # fail before they can silently contaminate backbone execution.
         # ------------------------------------------------------------------
-        self._enforce_capture_scalar_outputs()
         self._validate_input_ids(input_ids)
         self._validate_attention_mask(input_ids, attention_mask)
         self._validate_position_ids(input_ids, position_ids)

         config = AutoConfig.from_pretrained(
             "your-namespace/advanced-transformers-lib",
             trust_remote_code=True,
+            num_decoder_layers=12,
         )
         model = AutoModelForCausalLM.from_config(config)
     def _check_no_overflow(max_count: torch.Tensor, capacity: int) -> None:
         """Raise if any (batch, head) slot would exceed the static buffer capacity.
+        Branches on whether the graph is being compiled. In compiled mode,
+        torch._assert_async fires asynchronously on the GPU when the condition
+        tensor is False. In eager mode, a plain RuntimeError is raised with a
+        descriptive message.
         Args:
             max_count: Scalar tensor — the maximum post-update count across all slots.
             capacity: The static buffer capacity (mosrah_cache_length).
         """
         if torch.compiler.is_compiling():
+            torch._assert_async(
+                max_count <= capacity,
+                "MoSRAHCache overflow: buffer capacity exceeded. "
+                "Increase mosrah_overallocation_factor in ShramConfig.",
+            )
         else:
             if max_count.item() > capacity:
                 raise RuntimeError(
         # Cumulative count of all token positions presented through update() for
         # this cache instance. This is the quantity HuggingFace generation reads
         # through get_seq_length() to track how far along the sequence we are.
+        self._total_processed = torch.tensor(0)
     def update(  # type: ignore[override]
         self,
         generation reads to track sequence progress and is not the same as active-token
         count or current window occupancy.
         """
+        return int(self._total_processed)
     def get_max_cache_shape(self) -> int:
         return self.sliding_window
 # -----------
 """Expert packing and unpacking for the MoSRAH path.
+This module owns the token-choice -> expert-choice -> token-choice conversion
+boundary used by the sparse routed attention path. Its public behavior is fixed:
+- setup_packing() prepares the auxiliary ordering data forwarded through packing
+  and unpacking.
+- pack_experts() converts routed token-choice tensors into padded expert-choice
+  tensors.
+- unpack_experts() restores token-choice ordering from padded expert-choice output.
+Packed expert-choice tensors are expert-major and left-justified. For each expert,
+routed token copies occupy the prefix of that expert's packed block; padding occupies
+the suffix. Every packed entry uses the same ordering and transfer artifact, so
+hidden states, positions, masks, and probabilities remain aligned across the boundary.
+pack_experts() returns a flat transfer index together with the packed entries. This
+index replaces the old boolean unpacking artifact as the source of truth for
+pack/unpack data movement: packing writes to those flat packed slots, and unpacking
+reads from those same slots.
 """
 ) -> dict[str, torch.Tensor]:
     """Prepare the auxiliary ordering data used by pack/unpack.
     Args:
         selected_heads: Routed token-choice head selections I of shape (B, N, K).
     Returns:
         Auxiliary payload dict with keys:
           - "flattened_selected_heads": H of shape (B, N*K)
+          - "permutation": expert-major permutation Pi of shape (B, N*K)
           - "inverse_permutation": inverse permutation Pi^{-1} of shape (B, N*K)
         This dict is forwarded whole to pack_experts and unpack_experts.
     """
         batch_size,
         sequence_length * num_selected_heads,
     )
+    # -----------------------------------------------------------------------
+    # Establish the expert-major ordering invariant.
+    #
+    # BEA later applies a triangular causal mask inside each expert bucket. That
+    # mask is only meaningful if routed copies for the same expert preserve their
+    # source-token order. Stable sorting by selected head establishes that order.
+    # -----------------------------------------------------------------------
     permutation = torch.argsort(flattened_selected_heads, dim=-1, stable=True)
     inverse_permutation = torch.argsort(permutation, dim=-1)
         "flattened_selected_heads": flattened_selected_heads,
         "permutation": permutation,
         "inverse_permutation": inverse_permutation,
     }
 ) -> tuple[dict[str, torch.Tensor], torch.Tensor]:
     """Pack token-choice tensors into expert-choice padded form.
     Args:
         entries: Mapping from string keys to (tensor, padding_value) pairs. Each
             tensor has shape (B, N, ...) and is rearranged into expert-choice layout
         selected_heads: Routed head selections I of shape (B, N, K).
         num_experts: Total number of experts L.
         packed_length: Static packed time dimension T. All per-expert buffers are
+            allocated to exactly this length. Raises if any actual per-expert token
+            count exceeds this value.
     Returns:
         Tuple of:
           - packed_entries: Dict with same keys as entries; each value is the
             packed tensor of shape (B, L, T, ...).
+          - flat_packed_transfer_indices: Long tensor of shape (B*N*K,). Each value
+            is the flattened padded expert-choice slot occupied by the corresponding
+            routed-copy row. Pass this to unpack_experts().
     """
     batch_size, sequence_length, num_selected_heads = selected_heads.shape
+    num_routed_copies_per_batch = sequence_length * num_selected_heads
+    num_routed_copies = batch_size * num_routed_copies_per_batch
     flattened_selected_heads = setup["flattened_selected_heads"]
     permutation = setup["permutation"]
     # -----------------------------------------------------------------------
+    # Algorithm overview.
+    #
+    # Packing first builds one routed-copy row for each selected token/expert
+    # pair, ordered by the stable expert-major permutation. Those rows contain
+    # no padding. The final packed tensor reserves packed_length slots per expert.
+    # The flat transfer index bridges those layouts by adding back the cumulative
+    # padding skipped before each expert block.
+    # -----------------------------------------------------------------------
+    # -----------------------------------------------------------------------
+    # Build the shared routed-copy source rows.
     #
+    # This tensor identifies the source token row for each selected token/expert
+    # pair after the stable expert-major permutation. Every packed entry uses this
+    # same row plan, so all entries remain aligned before padded materialization.
     # -----------------------------------------------------------------------
     source_token_indices = torch.arange(
         sequence_length,
         sequence_length,
         num_selected_heads,
     )
+    flattened_source_token_indices = source_token_indices.reshape(
         batch_size,
+        num_routed_copies_per_batch,
     )
+    sorted_source_token_indices = flattened_source_token_indices.gather(
         dim=1,
         index=permutation,
     )
     # -----------------------------------------------------------------------
+    # Establish packed expert occupancy and capacity.
     #
+    # tokens_per_expert tells how many routed-copy rows occupy the prefix of each
+    # expert block. The padded layout is valid only when every prefix fits inside
+    # the configured packed_length.
     # -----------------------------------------------------------------------
     tokens_per_expert = _count_tokens_per_expert(flattened_selected_heads, num_experts)
+    _enforce_no_overflow(tokens_per_expert, packed_length)
     # -----------------------------------------------------------------------
+    # Build the flat insertion points for the padded expert frame.
     #
+    # Routed-copy rows omit padding, while the packed frame reserves packed_length
+    # slots for every expert. The transfer index adds back the cumulative padding
+    # skipped before each expert block, producing one flat destination slot for
+    # every routed-copy row. This tensor is forwarded to unpack_experts so removal
+    # uses the same positions that insertion used.
     # -----------------------------------------------------------------------
+    flat_tokens_per_expert = tokens_per_expert.reshape(-1)
+    flat_padding_per_expert = packed_length - flat_tokens_per_expert
+    flat_padding_before_expert = (
+        flat_padding_per_expert.cumsum(dim=0) - flat_padding_per_expert
+    )
+    flat_padding_for_routed_rows = torch.repeat_interleave(
+        flat_padding_before_expert,
+        flat_tokens_per_expert,
+        output_size=num_routed_copies,
+    )
+    flat_routed_row_indices = torch.arange(
+        num_routed_copies,
         device=flattened_selected_heads.device,
         dtype=torch.long,
+    )
+    flat_packed_transfer_indices = (
+        flat_routed_row_indices + flat_padding_for_routed_rows
+    )
     # -----------------------------------------------------------------------
+    # Materialize each entry through the shared routing and transfer artifacts.
     #
+    # Each entry first gathers into the shared routed-copy order. The flat packed
+    # allocation supplies padding, and the transfer index writes each routed-copy
+    # row into its padded expert slot before the public shape is restored.
     # -----------------------------------------------------------------------
     packed_entries: dict[str, torch.Tensor] = {}
     for key, (tensor, padding_value) in entries.items():
         extra_shape = tensor.shape[2:]
+        # The sorted source index is shared across all entries; expanding it over
+        # trailing dimensions lets the same routing/order plan apply to hidden
+        # states, positions, masks, probabilities, and any other packed tensor.
+        sorted_gather_indices = sorted_source_token_indices.view(
             batch_size,
+            num_routed_copies_per_batch,
             *(1,) * len(extra_shape),
         ).expand(-1, -1, *extra_shape)
+        sorted_tensor = tensor.gather(dim=1, index=sorted_gather_indices)
         packed_tensor = tensor.new_full(
+            (batch_size * num_experts * packed_length, *extra_shape),
             fill_value=padding_value,
         )
+        packed_tensor[flat_packed_transfer_indices] = sorted_tensor.reshape(
+            num_routed_copies,
+            *extra_shape,
+        )
+        packed_entries[key] = packed_tensor.reshape(
+            batch_size,
+            num_experts,
+            packed_length,
+            *extra_shape,
+        )
+    return packed_entries, flat_packed_transfer_indices
 # ---------------------------------------------------------------------------
 def unpack_experts(
     expert_outputs: torch.Tensor,
     setup: dict[str, torch.Tensor],
+    flat_packed_transfer_indices: torch.Tensor,
     selected_heads: torch.Tensor,
 ) -> torch.Tensor:
     """Restore token-choice ordering from BEA expert-choice output.
     Args:
         expert_outputs: Expert-choice BEA output y of shape (B, L, T, d).
         setup: Auxiliary payload returned by setup_packing().
+        flat_packed_transfer_indices: Transfer index returned by pack_experts().
+            Each value identifies a routed-copy slot in the flattened padded
+            expert-choice frame.
         selected_heads: Routed head selections I of shape (B, N, K).
     Returns:
     inverse_permutation = setup["inverse_permutation"]
     batch_size, sequence_length, num_selected_heads = selected_heads.shape
+    num_routed_copies_per_batch = sequence_length * num_selected_heads
     hidden_dim = expert_outputs.shape[-1]
+    # -----------------------------------------------------------------------
+    # Recover routed-copy rows from the same packed slots used at insertion.
+    #
+    # Packing writes into the forwarded flat slots, and unpacking reads from those
+    # same slots before applying the inverse routing permutation back to
+    # token-choice order.
+    # -----------------------------------------------------------------------
+    flat_expert_outputs = expert_outputs.reshape(-1, hidden_dim)
+    flat_routed_copy_outputs = flat_expert_outputs[flat_packed_transfer_indices]
+    sorted_token_choice_outputs = flat_routed_copy_outputs.reshape(
         batch_size,
+        num_routed_copies_per_batch,
         hidden_dim,
     )
     restored_outputs = sorted_token_choice_outputs.gather(
 # Helpers
 # ---------------------------------------------------------------------------
+def _enforce_no_overflow(tokens_per_expert: torch.Tensor, packed_length: int) -> None:
+    """Enforce that no expert bucket exceeds the preallocated packed length.
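+    This check fires when the number of tokens assigned to any expert in any batch
+    item exceeds mosrah_packed_length. When that limit is exceeded, the packed buffer
+    is too small to hold all assignments and data would be dropped. Increase
+    mosrah_overallocation_factor in ShramConfig to resolve.
+    The check branches on whether the graph is being compiled. In eager mode the
+    maximum count is read back and a RuntimeError carrying the packed length and
+    the per-expert counts is raised. In compiled mode the comparison is handed to
+    torch._assert_async as a tensor condition with a fixed message instead.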
     Args:
+        tokens_per_expert: Per-expert token counts, shape (B, num_experts).
+        packed_length: The preallocated packed time dimension.
     """
     if torch.compiler.is_compiling():
+        torch._assert_async(
+            tokens_per_expert.max() <= packed_length,
+            "Expert packing overflow: expert bucket exceeds mosrah_packed_length. "
+            "Increase mosrah_overallocation_factor in ShramConfig.",
+        )
     else:
+        max_count = tokens_per_expert.max().item()
+        if max_count > packed_length:
             raise RuntimeError(
                 "Expert packing overflow: at least one expert bucket contains more "
                 "tokens than mosrah_packed_length allows. Increase "
                 "mosrah_overallocation_factor in ShramConfig to resolve.\n"
+                f"Packed length: {packed_length}\n"
+                f"Head lengths: {tokens_per_expert}\n"
             )
 ) -> torch.Tensor:
     """Count how many routed token copies are assigned to each expert per batch item.
+    Uses scatter_add into a pre-sized (B, num_experts) buffer. Each position in
     flattened_selected_heads contributes one count to the corresponding expert slot.
     Args:
         Counts tensor of shape (B, num_experts).
     """
     batch_size = flattened_selected_heads.shape[0]
+    tokens_per_expert = torch.zeros(
         batch_size,
         num_experts,
         device=flattened_selected_heads.device,
+        dtype=torch.long,
     )
+    tokens_per_expert.scatter_add_(
         dim=1,
         index=flattened_selected_heads,
+        src=torch.ones_like(flattened_selected_heads, dtype=torch.long),
     )
+    return tokens_per_expert
 # -----------
 # Inlined from: router.py
 # -----------
         self.expert_bias = nn.Parameter(torch.zeros(config.num_mosrah_heads))
     @staticmethod
+    def get_best_proposals(
             tensor: torch.Tensor,
             dim: int,
             n: int | torch.Tensor,
             choices_deficit = (min_choices - accepted_per_token).clamp_min(0)
             unproposed_logits = logits.masked_fill(proposals, float('-inf'))
+            new_proposals = cls.get_best_proposals(
                 unproposed_logits, dim=-1, n=choices_deficit, capacity_scalar=min_choices,
             )
             proposals = proposals | new_proposals
             # Acceptances are recomputed from scratch each round so that a
             # stronger new proposal can displace a weaker prior one.
             proposed_logits = logits.masked_fill(~proposals, float('-inf'))
+            acceptances = cls.get_best_proposals(
                 proposed_logits, dim=-2, n=remaining_capacity, capacity_scalar=capacity_scalar,
             )
             "position_ids": (position_ids, 0),
             "active_mask": (active_mask, False),
         }
+        packed, unpacking_map = pack_experts(entries, setup, selected_heads, self.num_experts, self.packed_length)
         packed_hidden_states = packed["hidden_states"]
         packed_positions = packed["position_ids"]
         active_mask = packed["active_mask"]
         token_choice_outputs = unpack_experts(
             expert_outputs=packed_outputs,
             setup=setup,
+            flat_packed_transfer_indices=unpacking_map,
             selected_heads=selected_heads,
         )
         final_output = (
     @staticmethod
     def create_masks_for_generate(
         attention_mask: torch.Tensor | None,
         **kwargs: Any,
     ) -> torch.Tensor | None:
         """Return the 2D attention_mask unchanged.
             raise ValueError(
                 "position_ids must match the current input_ids shape exactly."
             )
+        if position_ids.dtype != torch.long:
             raise TypeError("position_ids must be a long tensor.")
     def _validate_labels(
             raise ValueError("labels must have shape (batch, seq_len).")
         if labels.shape != input_ids.shape:
             raise ValueError("labels must have the same shape as input_ids.")
+        if labels.dtype != torch.long:
             raise TypeError("labels must be a long tensor.")
     def _validate_cache_inputs(
                 (violated).
         """
         if torch.compiler.is_compiling():
+            torch._assert_async(
+                condition,
+                "Uncached ShramForCausalLM: nonzero starting positions. "
+                "Supply a ShramCache with prefix or rebase sequence to start at 0.",
+            )
         else:
             if not condition.item():
                 raise RuntimeError(
                     "uncached sequence to start at 0.",
                 )
     def _standardize_full_attention_mask(
         self,
         input_ids: torch.Tensor,
         # This keeps the main sequence readable while ensuring invalid states
         # fail before they can silently contaminate backbone execution.
         # ------------------------------------------------------------------
         self._validate_input_ids(input_ids)
         self._validate_attention_mask(input_ids, attention_mask)
         self._validate_position_ids(input_ids, position_ids)