SHRAM-dev / huggingface.py

Update architecture and tokenizer

102f1bb verified 13 days ago

195 kB

	# This file is auto-generated by stage_for_hub.py from the source repository.
	# Do not edit it directly — changes will be overwritten on the next release.

	"""HuggingFace causal-LM wrapper for SHRAM.

	ShramForCausalLM is the HuggingFace-facing language-model boundary for SHRAM.
	It owns token embedding lookup, LM-head projection, wrapper-level next-token
	cross-entropy loss, config-controlled tied embeddings, and generation/cache
	orchestration at the wrapper boundary.

	The backbone remains a pure transformer stack. ShramModel accepts pre-embedded
	hidden states together with current position IDs, a current active mask, and an
	optional ShramCache. It has no knowledge of token IDs, vocabulary projection,
	or causal-LM loss.

	HuggingFace generation reaches this wrapper with two different tensor
	conventions:

	- ``position_ids`` is a current-step tensor. GenerationMixin updates the total
	sequence state between steps, then slices position-bearing tensors back down
	before calling ``forward()``.
	- ``attention_mask`` is a full 2D mask over the total sequence so far. This
	wrapper slices its recent chunk to produce the current semantic liveness mask
	expected by the backbone.

	Generation-created caches are handled in ``_prepare_cache_for_generation``.
	That hook ensures HuggingFace generation uses ShramCache rather than a generic
	dynamic cache. The direct ``forward()`` path does not silently create caches;
	when ``use_cache=True`` it expects a truthful ShramCache to have been supplied.
	"""
	from dataclasses import dataclass
	from typing import Any
	import torch
	import torch.nn as nn
	from transformers import GenerationMixin
	from transformers import PreTrainedModel
	from transformers.cache_utils import Cache
	from transformers.generation.configuration_utils import GenerationMode
	from transformers.modeling_outputs import CausalLMOutputWithPast
	import math
	from transformers import PretrainedConfig
	from transformers.cache_utils import CacheLayerMixin
	from torch import nn
	from torch.nn.attention.flex_attention import create_block_mask
	from torch.nn.attention.flex_attention import flex_attention
	import torch.nn.functional as F











	# -----------
	# Inlined from: shram_cache.py
	# -----------
	"""SHRAM top-level cache — model-wide owner for the full SHRAM decoder stack.

	The HuggingFace Cache protocol expects a single top-level Cache object that owns one
	CacheLayerMixin per decoder layer. The actual SHRAM caching responsibilities live one level
	lower in ShramLayerCache — each of which owns a LocalSlidingWindowLayerCache and a MoSRAHCache.
	ShramCache bridges those two levels: it constructs one ShramLayerCache per decoder layer,
	presents them through the Cache interface, and transparently forwards model-wide operations
	across all of them.

	ShramCache does not define a composite update() interface. The two attention paths inside each
	SHRAM layer have different update semantics, and neither the layer-level boundary (Unit 6.B)
	nor the model-level boundary here can meaningfully unify them. Callers must reach down to the
	relevant sub-cache directly. ShramCache's role is ownership, construction, and model-wide
	coordination of the layer caches — not routing attention inputs.

	Sequence length is reported by delegating to the local sliding-window sub-cache of the
	specified layer, which tracks the cumulative count of token positions processed. This is
	what HuggingFace generation reads through get_seq_length().
	"""




	# -----------
	# Inlined from: configuration.py
	# -----------
	"""Configuration for the SHRAM transformer.

	All architectural parameters that vary across model scales or are meaningful research
	variables are expressed here. Architectural constants (no bias in linear layers,
	SwiGLU activation with SiLU gate) are implemented in the relevant modules and
	documented at the point of use — they are not config parameters because they do not
	vary and changing them produces a different architecture, not a different scale.

	RoPE configuration is owned entirely by this config. Each attention path reads its
	parameters directly and constructs its own RotaryEmbedding instance explicitly — no
	HuggingFace rope infrastructure is used. See Unit 5.A design decisions in plan.md.
	"""






	class ShramConfig(PretrainedConfig):
	    """Configuration class for the SHRAM decoder-only transformer.

	    SHRAM (Sparse Hybrid Token Routed Attention Mixture) replaces every standard
	    attention layer with a hybrid layer H(x) = h_l(x) + h_s(x), where h_l is a
	    local sliding-window causal attention path and h_s is the MoSRAH sparse routed
	    path. All other components follow the Llama 3 baseline.

	    This config is the single source of truth for every architectural dimension of
	    the model. Nothing in the architecture may use a literal number that belongs
	    here.

	    Two independent RoPE configurations exist, one per attention path:

	    - h_l always uses standard RoPE with ``local_rope_theta``.
	    - BEA always uses YaRN with ``mosrah_rope_theta``, ``training_sequence_length``,
	      ``inference_sequence_length``, ``alpha``, and ``beta``. When
	      ``inference_sequence_length == training_sequence_length`` the YaRN scale
	      factor ``s = 1`` and YaRN reduces exactly to standard RoPE. This is the
	      default state and the correct setting for experiments that do not require
	      context extension.

	    Registered with HuggingFace AutoClass via ``auto_map``. Instantiate from the
	    Hub::

	        config = AutoConfig.from_pretrained(
	            "your-namespace/advanced-transformers-lib",
	            trust_remote_code=True,
	            num_decoder_layers=12,
	        )
	        model = AutoModelForCausalLM.from_config(config)

	    Args:
	        vocab_size: Vocabulary size. Controls the embedding table and output
	            logits dimension. Must match the tokenizer.
	        embedding_width: Model width ``d``. The dimension of the residual stream.
	        mlp_width: FFN hidden dimension.
	        num_decoder_layers: Number of transformer blocks stacked in sequence.
	        num_sliding_window_heads: Number of heads in the local sliding-window
	            path h_l.
	        num_mosrah_heads: Total MoSRAH expert heads available ``L``. Must be
	            positive and divisible by ``num_selected_heads``.
	        num_selected_heads: MoSRAH heads each token selects ``K``. Must be
	            positive.
	        head_dim: Per-head dimension, shared by both attention paths. Must be
	            even (RoPE rotates dimensions in pairs). Paper uses 16.
	        window_size: Sliding window size for h_l. Paper uses 128.
	        rope_mode: RoPE position encoding mode for BEA. ``"main_sequence"``
	            supplies original sequence positions; ``"semantic_sequence"``
	            supplies local slot indices. Both are required; experimentally
	            correct mode is undetermined (paper §4). Default ``"main_sequence"``.
	        rms_norm_eps: Epsilon for RMSNorm layers.
	        local_rope_theta: RoPE base frequency ``b`` for the local attention path
	            h_l. Paper uses b=10000.
	        mosrah_rope_theta: RoPE base frequency ``b`` for the BEA path. Paper uses
	            b=10000.
	        training_sequence_length: Context length ``C_train`` the model was or
	            will be trained at. Used to compute the YaRN scale factor for BEA.
	        inference_sequence_length: Context length ``C_target`` the model must
	            support at inference. Optional; defaults to
	            ``training_sequence_length`` so that ``scale=1`` and YaRN reduces to
	            standard RoPE unless explicitly extended.
	        alpha: YaRN ramp lower boundary α (paper §A.2). Frequency dimensions with
	            ``r(d) < alpha`` are fully interpolated by scale s. Paper value: 1.0.
	        beta: YaRN ramp upper boundary β (paper §A.2). Frequency dimensions with
	            ``r(d) > beta`` are left unscaled. Paper value: 32.0.
	        attention_dropout: Dropout probability on attention weights. Default 0.0.
	        use_cache: Whether to return past_key_values for KV caching.
	        output_hidden_states: Whether to return hidden states after each layer.
	        tie_word_embeddings: Whether input embedding and LM head share weights.
	        use_residual_gate: When True, each DecoderLayer gates its residual
	            contributions with a learnable scalar parameter (init: zero). When
	            False, uses a fixed ``1/√num_decoder_layers`` scale instead, which
	            preserves O(1) residual variance at depth with no learnable gate.
	            Default True.

	    Raises:
	        ValueError: If ``head_dim`` is odd, ``rope_mode`` is not one of the two
	            supported modes, either sequence length is non-positive, either
	            MoSRAH head count is non-positive, or ``num_mosrah_heads`` is not
	            divisible by ``num_selected_heads``.
	    """

	    model_type = "shram"

	    auto_map = {
	        "AutoConfig": "configuration.ShramConfig",
	        "AutoModelForCausalLM": "huggingface.ShramForCausalLM",
	    }

	    def __init__(
	        self,
	        vocab_size: int = 50277,
	        embedding_width: int = 512,
	        mlp_width: int = 1366,
	        num_decoder_layers: int = 12,
	        num_sliding_window_heads: int = 16,
	        num_mosrah_heads: int = 16,
	        num_selected_heads: int = 16,
	        head_dim: int = 16,
	        window_size: int = 128,
	        rope_mode: str = "main_sequence",
	        rms_norm_eps: float = 1e-5,
	        local_rope_theta: float = 10000.0,
	        mosrah_rope_theta: float = 10000.0,
	        training_sequence_length: int = 1024,
	        inference_sequence_length: int | None = None,
	        alpha: float = 1.0,
	        beta: float = 32.0,
	        attention_dropout: float = 0.0,
	        use_cache: bool = True,
	        output_hidden_states: bool = False,
	        tie_word_embeddings: bool = False,
	        use_residual_gate: bool = True,
	        **kwargs,
	    ):
	        if head_dim % 2 != 0:
	            raise ValueError(
	                f"head_dim must be even (RoPE rotates dimensions in pairs). "
	                f"Got head_dim={head_dim}."
	            )

	        if rope_mode not in {"main_sequence", "semantic_sequence"}:
	            raise ValueError(
	                f"rope_mode must be 'main_sequence' or 'semantic_sequence', "
	                f"got '{rope_mode}'."
	            )

	        if training_sequence_length <= 0:
	            raise ValueError(
	                f"training_sequence_length must be positive, "
	                f"got {training_sequence_length}."
	            )

	        # An omitted inference length means "no context extension": scale == 1.
	        if inference_sequence_length is None:
	            inference_sequence_length = training_sequence_length
	        if inference_sequence_length <= 0:
	            raise ValueError(
	                f"inference_sequence_length must be positive, "
	                f"got {inference_sequence_length}."
	            )

	        # Reject non-positive head counts up front. Without this guard,
	        # num_selected_heads == 0 surfaces as a bare ZeroDivisionError from the
	        # modulo below, and num_mosrah_heads == 0 passes the divisibility check
	        # and yields a zero block length.
	        if num_mosrah_heads <= 0 or num_selected_heads <= 0:
	            raise ValueError(
	                f"num_mosrah_heads and num_selected_heads must be positive, "
	                f"got num_mosrah_heads={num_mosrah_heads}, "
	                f"num_selected_heads={num_selected_heads}."
	            )

	        if num_mosrah_heads % num_selected_heads != 0:
	            raise ValueError(
	                f"num_mosrah_heads must be exactly divisible by num_selected_heads. "
	                f"Mechanical load balancing partitions the sequence into blocks of "
	                f"W = num_mosrah_heads // num_selected_heads tokens; each block covers "
	                f"every expert exactly once, which requires an integer W. "
	                f"Got num_mosrah_heads={num_mosrah_heads}, num_selected_heads={num_selected_heads}."
	            )

	        self.vocab_size = vocab_size
	        self.embedding_width = embedding_width
	        self.mlp_width = mlp_width
	        self.num_decoder_layers = num_decoder_layers
	        self.num_sliding_window_heads = num_sliding_window_heads
	        self.num_mosrah_heads = num_mosrah_heads
	        self.num_selected_heads = num_selected_heads
	        self.head_dim = head_dim
	        self.window_size = window_size
	        self.rope_mode = rope_mode
	        self.rms_norm_eps = rms_norm_eps
	        self.local_rope_theta = local_rope_theta
	        self.mosrah_rope_theta = mosrah_rope_theta
	        self.training_sequence_length = training_sequence_length
	        self.inference_sequence_length = inference_sequence_length
	        self.alpha = alpha
	        self.beta = beta
	        self.attention_dropout = attention_dropout
	        self.use_cache = use_cache
	        self.use_residual_gate = use_residual_gate

	        super().__init__(
	            tie_word_embeddings=tie_word_embeddings,
	            output_hidden_states=output_hidden_states,
	            **kwargs,
	        )

	        # Promote auto_map to an instance attribute so PretrainedConfig.to_dict()
	        # serialises it into config.json. A copy is stored so that mutating one
	        # instance's mapping cannot leak into the class attribute or into other
	        # instances.
	        self.auto_map = dict(type(self).auto_map)

	    @property
	    def scale(self) -> float:
	        """YaRN context extension scale factor.

	        Computed as ``inference_sequence_length / training_sequence_length``.
	        When the result is 1.0, YaRN reduces exactly to standard RoPE: all
	        frequency adjustments cancel and A_rope = 1. This is the default state.
	        """
	        return self.inference_sequence_length / self.training_sequence_length

	    def _expert_slot_capacity(self, sequence_length: int) -> int:
	        """Return the per-expert slot count for a context of ``sequence_length``.

	        The value is ``ceil(sequence_length * num_selected_heads /
	        num_mosrah_heads)`` plus ``block_length`` additional slots. The ceiling
	        covers sequence lengths that are not a multiple of the block length W.
	        NOTE(review): the extra ``block_length`` slots appear to be headroom on
	        top of the balanced per-expert share; confirm the intent with the
	        packing and routing code.
	        """
	        balanced_share = math.ceil(
	            sequence_length
	            * self.num_selected_heads
	            / self.num_mosrah_heads
	        )
	        return balanced_share + self.block_length

	    @property
	    def mosrah_packed_length(self) -> int:
	        """Static packed time dimension T for expert packing.

	        Sized from ``training_sequence_length``: the balanced per-expert share
	        ``ceil(training_sequence_length * num_selected_heads / num_mosrah_heads)``
	        plus ``block_length`` additional slots.

	        All consumers of the packed buffer size must read this property rather
	        than deriving T independently.
	        """
	        return self._expert_slot_capacity(self.training_sequence_length)

	    @property
	    def mosrah_cache_length(self) -> int:
	        """Static per-(batch, head) slot capacity for the MoSRAH inference cache.

	        Sized from ``inference_sequence_length``: the balanced per-expert share
	        ``ceil(inference_sequence_length * num_selected_heads / num_mosrah_heads)``
	        plus ``block_length`` additional slots.

	        Distinct from ``mosrah_packed_length``, which sizes the training packing
	        buffer using ``training_sequence_length``. This property uses
	        ``inference_sequence_length`` because the cache must hold the full
	        accumulated token history across the entire inference run.

	        All consumers of the MoSRAH cache buffer size must read this property
	        rather than deriving the capacity independently.
	        """
	        return self._expert_slot_capacity(self.inference_sequence_length)

	    @property
	    def block_length(self) -> int:
	        """Routing block length W = num_mosrah_heads // num_selected_heads.

	        Within each block of W consecutive tokens every expert is used exactly
	        once, giving perfect load balance by construction. The L % K == 0
	        constraint enforced at construction guarantees W is an exact integer.

	        All consumers of the routing block length must read this property rather
	        than deriving W independently.
	        """
	        return self.num_mosrah_heads // self.num_selected_heads

	# -----------
	# Inlined from: shram_layer_cache.py
	# -----------
	"""SHRAM per-layer cache — composite owner for one SHRAM decoder layer.

	A SHRAM decoder layer contains two distinct attention pathways at one attention slot: the
	local sliding-window path and the MoSRAH sparse path. Each path has its own cache with
	different semantics and a different downstream consumer. ShramLayerCache owns both, satisfies
	the HuggingFace per-layer cache role, and exposes each sub-cache directly so its attention
	path can interact with it without indirection.

	ShramLayerCache does not define a composite update() interface. The two paths have materially
	different update semantics — the local side uses chunk-local key/value/mask concatenation
	while the MoSRAH side uses expert-choice scatter with an active mask — and merging these
	behind a single update() would hide those differences behind a misleading abstraction. Instead,
	each attention path calls update() on the sub-cache it owns. ShramLayerCache acts as the
	ownership, coordination, and reset/reorder boundary for one decoder layer.

	Sequence length at this boundary is reported by delegating to the local sliding-window
	sub-cache, which tracks the cumulative count of token positions processed. This is the
	quantity HuggingFace generation reads through get_seq_length().
	"""





	# -----------
	# Inlined from: mosrah_cache.py
	# -----------
	"""MoSRAH sparse KV cache — single-layer implementation.

	MoSRAH routes each token to K of L available expert heads, so its KV cache is indexed
	by head rather than by sequence position. The routing is dynamic and produces a ragged
	distribution of token counts across (batch, head) slots — different batch items may
	route different numbers of tokens to the same head, and different heads accumulate at
	different rates. DynamicCache cannot represent this correctly: it concatenates along
	the sequence dimension and assumes uniform token counts across the batch. MoSRAHCache
	therefore uses a custom buffer design.

	Keys and values are stored in the CacheLayerMixin-standard self.keys and self.values
	attributes as (B, L, T, u) tensors, where B is batch size, L is the number of expert
	heads (num_mosrah_heads), T is the current buffer capacity, and u is the bottlenecked
	head embedding width (head_dim). A (B, L) integer count tensor _counts tracks the
	valid occupancy of each (batch, head) slot. Buffer capacity is exposed as the
	buffer_capacity property and is derived directly from self.keys rather than tracked
	as a separate variable.

	The primary interface is update(key_states, value_states, active_mask), which accepts
	expert-choice layout, stores only active entries in causal order, and returns the full
	accumulated (keys, values, active_mask) for immediate use by BEA. The returned
	active_mask identifies valid cached positions; everything beyond each slot's count is
	junk data that downstream attention must exclude.

	BEA applies RoPE and calls update() with post-RoPE keys (K̃). The occupancy counts
	exposed by get_heads_lengths() must be read before update() if the caller needs the
	pre-update occupancy for position computation (Unit 10.A). update() increments counts
	in-place and the pre-update values are not recoverable afterward.

	All buffers are allocated at construction time. MoSRAHCache is constructed by
	ShramLayerCache, which has access to batch size, device, and all model config parameters
	needed to fully specify the storage layout upfront.
	"""





	class MoSRAHCache(CacheLayerMixin):
	    """KV cache for the MoSRAH sparse attention path of a single decoder layer.

	    Subclasses CacheLayerMixin to satisfy the HuggingFace per-layer cache role.
	    Stores keys and values in the mixin-standard self.keys and self.values
	    attributes using a custom (B, L, T, u) layout rather than delegating to
	    DynamicCache, which cannot represent MoSRAH's ragged per-(batch, head) token
	    counts correctly.

	    All storage is allocated at construction time and is_initialized is True
	    immediately. The caller (ShramLayerCache) provides batch size, device, and
	    model config parameters so no lazy allocation is needed.

	    Input is expected in expert-choice layout: (B, L, T, u) key/value tensors with
	    a (B, L, T) boolean active_mask. Only positions where active_mask is True are
	    written. This matches the packed representation produced by expert packing in
	    the MoSRAH forward pass, where BEA has already applied RoPE before calling
	    update().

	    Args:
	        num_mosrah_heads: Total number of MoSRAH expert heads (L). Determines the
	            second dimension of all storage tensors.
	        head_dim: Bottlenecked head embedding width (u). Determines the fourth
	            dimension of all storage tensors.
	        batch_size: Number of sequences in the batch. Determines the first
	            dimension of all storage tensors.
	        device: Device on which to allocate all tensors. Should match the model
	            device.
	        mosrah_cache_length: Static sequence capacity per (batch, head) slot.
	            Equal to config.mosrah_cache_length. The buffer never grows; if any
	            slot would exceed this capacity, update() raises in both eager and
	            compiled modes. The capacity is derived from
	            ``inference_sequence_length`` in ShramConfig, so increase that value
	            to resolve an overflow.
	    """

	    is_compileable = True
	    is_sliding = False

	    def __init__(
	        self,
	        num_mosrah_heads: int,
	        head_dim: int,
	        batch_size: int,
	        device: torch.device,
	        mosrah_cache_length: int,
	    ) -> None:
	        super().__init__()
	        self.num_mosrah_heads = num_mosrah_heads
	        self.head_dim = head_dim
	        self.batch_size = batch_size
	        self.device = device
	        self.mosrah_cache_length = mosrah_cache_length

	        # Allocate primary storage into the mixin-standard self.keys / self.values
	        # so that inherited methods (offload, prefetch) operate on real tensors.
	        # No dtype is passed, so both buffers use torch's default dtype.
	        self.keys: torch.Tensor = torch.zeros(
	            batch_size, num_mosrah_heads, mosrah_cache_length, head_dim, device=device
	        )
	        self.values: torch.Tensor = torch.zeros(
	            batch_size, num_mosrah_heads, mosrah_cache_length, head_dim, device=device
	        )
	        # _counts tracks valid occupancy per (batch, head) slot.
	        self._counts: torch.Tensor = torch.zeros(
	            batch_size, num_mosrah_heads, dtype=torch.long, device=device
	        )

	        # Storage is fully allocated at construction, so the cache is initialized.
	        self.is_initialized = True

	    # ---------------------------------------------------------------------------
	    # Properties
	    # ---------------------------------------------------------------------------

	    @property
	    def buffer_capacity(self) -> int:
	        """Current number of slots allocated per (batch, head) pair.

	        Equal to mosrah_cache_length as supplied at construction. Derived from
	        self.keys so it remains consistent with the actual buffer shape.
	        """
	        return self.keys.shape[2]

	    # ---------------------------------------------------------------------------
	    # Primary API
	    # ---------------------------------------------------------------------------

	    def update(  # type: ignore[override]
	        self,
	        key_states: torch.Tensor,
	        value_states: torch.Tensor,
	        active_mask: torch.Tensor,
	        cache_kwargs: dict | None = None,
	    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
	        """Write active key/value states into the buffer and return the full cache.

	        Accepts expert-choice layout: key_states and value_states are
	        (B, L, T, u); active_mask is (B, L, T) bool with True marking real tokens.
	        Only active positions are written; inactive positions are ignored.

	        New entries for slot (b, l) are appended at cache positions
	        ``counts[b, l] .. counts[b, l] + n - 1``, where ``n`` is the number of
	        True entries of ``active_mask[b, l]``. The destination mask is built by
	        broadcasting a time arange against those per-slot intervals, with no
	        cumsum or torch.where. The packing machinery is expected to left-justify
	        active tokens within each packed slot.

	        Returns the full accumulated (keys, values, active_mask) across the
	        cached sparse sequence. The returned active_mask is True exactly for
	        slots t < counts[b, l]; everything beyond is junk data that BEA must
	        exclude.

	        Note: get_heads_lengths() must be called before update() if the caller
	        needs the pre-update occupancy for position computation (Unit 10.A).
	        update() increments counts in-place and the pre-update values are not
	        recoverable.

	        Args:
	            key_states: Shape (B, L, T, u), post-RoPE key vectors in
	                expert-choice layout.
	            value_states: Shape (B, L, T, u), value vectors in expert-choice
	                layout.
	            active_mask: Shape (B, L, T) bool, True for real tokens and False
	                for padding.
	            cache_kwargs: Unused; present to satisfy the CacheLayerMixin
	                signature.

	        Returns:
	            Tuple of (keys, values, active_mask):
	                keys: (B, L, mosrah_cache_length, u), the full key buffer
	                    including junk slots.
	                values: (B, L, mosrah_cache_length, u), the full value buffer
	                    including junk slots.
	                active_mask: (B, L, mosrah_cache_length) bool, True iff slot t
	                    has been written.

	        Raises:
	            RuntimeError: In eager mode, if any (batch, head) slot would exceed
	                the static capacity. In compiled mode the same condition is
	                checked through torch._assert_async.
	        """
	        incoming_delta = active_mask.long().sum(dim=2)  # (B, L)

	        post_counts = self._counts + incoming_delta
	        self._check_no_overflow(post_counts.max(), self.mosrah_cache_length)

	        # Per-slot write interval in cache space: [write_start, write_start + write_count).
	        write_start = self._counts.unsqueeze(-1)  # cache position where new tokens begin
	        write_count = incoming_delta.unsqueeze(-1)  # number of new tokens arriving per slot
	        time_arange = torch.arange(
	            self.mosrah_cache_length, device=active_mask.device
	        )
	        dest_mask = (time_arange >= write_start) & (time_arange < write_start + write_count)
	        # dest_mask: (B, L, mosrah_cache_length)

	        # dest_mask holds exactly incoming_delta[b, l] True entries per slot, the
	        # same number active_mask holds for that slot. Boolean indexing visits
	        # both masks in the same (b, l, t) order, so the i-th active token of a
	        # slot lands in the i-th free cache position of that slot.
	        self.keys[dest_mask] = key_states[active_mask]
	        self.values[dest_mask] = value_states[active_mask]
	        self._counts[:] = post_counts[:]

	        return self.keys, self.values, self._make_active_mask()

	    def get_heads_lengths(self) -> torch.Tensor:
	        """Return the per-(batch, head) token count for this layer.

	        This is the authoritative occupancy tensor consumed by BEA for attention
	        masking and by position computation (Unit 10.A) for semantic-sequence
	        position computation.

	        Note: in the MoSRAH forward pass, this must be called before update() if
	        the caller needs the pre-update occupancy. update() increments these
	        counts in-place, and the tensor returned here is the live internal
	        buffer rather than a copy.

	        Returns:
	            Integer tensor of shape (B, L) where entry [b, h] is the number of
	            valid tokens stored in the (b, h) slot. Zero for slots with no
	            writes yet.
	        """
	        return self._counts

	    # ---------------------------------------------------------------------------
	    # CacheLayerMixin: overridden coordination methods
	    # ---------------------------------------------------------------------------

	    def reset(self) -> None:
	        """Clear all cached key and value tensors.

	        Zeroes self.keys, self.values, and _counts in place. Storage remains
	        allocated and is_initialized remains True; only the contents are cleared.
	        """
	        self.keys.zero_()
	        self.values.zero_()
	        self._counts.zero_()

	    def reorder_cache(self, beam_idx: torch.LongTensor) -> None:
	        """Reorder the batch dimension of all cached tensors for beam search.

	        Applied across self.keys, self.values, and _counts together. Beam search
	        must reorder all three or the occupancy counts and buffer contents will
	        correspond to different beam hypotheses. batch_size is refreshed from
	        beam_idx so it stays truthful, matching batch_select_indices().

	        Overrides the parent because, per the original implementation notes, the
	        parent's version calls get_seq_length(), which is not supported for this
	        cache.

	        Args:
	            beam_idx: Permutation indices of shape (batch,) produced by the beam
	                search algorithm.
	        """
	        self.keys = self.keys[beam_idx]
	        self.values = self.values[beam_idx]
	        self._counts = self._counts[beam_idx]
	        self.batch_size = beam_idx.shape[0]

	    def batch_repeat_interleave(self, repeats: int) -> None:
	        """Expand the batch dimension by repeating each entry ``repeats`` times.

	        Used at beam search initialisation to expand the cache from batch size B
	        to B * repeats, matching the expanded beam candidate batch. Applied
	        across keys, values, and _counts together; batch_size is updated to
	        reflect the new size.

	        Args:
	            repeats: Number of times to repeat each batch entry.
	        """
	        self.keys = self.keys.repeat_interleave(repeats, dim=0)
	        self.values = self.values.repeat_interleave(repeats, dim=0)
	        self._counts = self._counts.repeat_interleave(repeats, dim=0)
	        self.batch_size = self.batch_size * repeats

	    def batch_select_indices(self, indices: torch.Tensor) -> None:
	        """Select a subset of batch entries by index.

	        Used in contrastive search to retain only the selected candidate entries.
	        Applied across keys, values, and _counts together; batch_size is updated
	        to reflect the number of retained entries.

	        Args:
	            indices: 1-D integer tensor of batch indices to retain.
	        """
	        self.keys = self.keys[indices]
	        self.values = self.values[indices]
	        self._counts = self._counts[indices]
	        self.batch_size = indices.shape[0]

	    def offload(self) -> None:
	        """Offload all cached tensors to CPU.

	        Extends the parent to also offload _counts, which the parent does not
	        know about, so that all three tensors end up on the same device.
	        """
	        super().offload()
	        self._counts = self._counts.to("cpu", non_blocking=True)

	    def prefetch(self) -> None:
	        """Move all cached tensors back to the model device ahead of time.

	        Extends the parent to also prefetch _counts, which the parent does not
	        know about. _counts is synced to self.keys.device after the parent call,
	        so all three remain consistent.
	        """
	        super().prefetch()
	        if self._counts.device != self.keys.device:
	            self._counts = self._counts.to(self.keys.device, non_blocking=True)

	    def lazy_initialization(  # type: ignore[override]
	        self, key_states: torch.Tensor, value_states: torch.Tensor
	    ) -> None:
	        """No-op: storage is fully allocated at construction time."""
	        pass

	    # ---------------------------------------------------------------------------
	    # CacheLayerMixin: unsupported abstract methods
	    # ---------------------------------------------------------------------------

	    def get_seq_length(self) -> int:  # type: ignore[override]
	        """Not supported: no single sequence length represents this cache's state.

	        MoSRAH heads accumulate independently; (batch, head) slots have different
	        lengths depending on routing history. There is no meaningful scalar
	        summary. Use get_heads_lengths() for per-head occupancy.

	        Raises:
	            NotImplementedError: Always.
	        """
	        raise NotImplementedError(
	            "MoSRAHCache has no single sequence length. "
	            "Use get_heads_lengths() for per-head occupancy."
	        )

	    def get_max_cache_shape(self) -> int:  # type: ignore[override]
	        """Return the static per-(batch, head) slot capacity of this cache.

	        Equal to mosrah_cache_length as supplied at construction, which is
	        derived from config.mosrah_cache_length. Required by the HuggingFace
	        static cache contract; generation machinery uses this to size attention
	        masks.
	        """
	        return self.mosrah_cache_length

	    def get_mask_sizes(  # type: ignore[override]
	        self,
	        cache_position: torch.Tensor,
	    ) -> tuple[int, int]:
	        """Not supported: MoSRAHCache does not participate in HF mask construction.

	        Raises:
	            NotImplementedError: Always.
	        """
	        raise NotImplementedError(
	            "MoSRAHCache does not support get_mask_sizes()."
	        )

	    # ---------------------------------------------------------------------------
	    # Internal helpers
	    # ---------------------------------------------------------------------------

	    def _make_active_mask(self) -> torch.Tensor:
	        """Construct the (B, L, T) active mask from current counts.

	        Returns True at position [b, l, t] iff t < _counts[b, l], i.e. the slot
	        has been written. Positions at or beyond the count are junk and must be
	        excluded by downstream attention.

	        The batch and head dimensions come from broadcasting against _counts
	        rather than from the batch_size attribute, so the mask always matches
	        the tensors actually stored even after the batch has been reordered.
	        """
	        slot_index = torch.arange(self.buffer_capacity, device=self.keys.device)
	        # (T,) < (B, L, 1) broadcasts to (B, L, T).
	        return slot_index < self._counts.unsqueeze(-1)

	    @staticmethod
	    def _check_no_overflow(max_count: torch.Tensor, capacity: int) -> None:
	        """Raise if any (batch, head) slot would exceed the static buffer capacity.

	        Branches on whether the graph is being compiled. In compiled mode,
	        torch._assert_async is used, so the check does not force a host
	        synchronisation. In eager mode, a plain RuntimeError is raised with a
	        descriptive message.

	        The guidance in both messages points at ``inference_sequence_length``
	        because that is the ShramConfig value the cache capacity is derived from.

	        Args:
	            max_count: Scalar tensor holding the maximum post-update count across
	                all slots.
	            capacity: The static buffer capacity (mosrah_cache_length).

	        Raises:
	            RuntimeError: In eager mode, when ``max_count`` exceeds ``capacity``.
	        """
	        if torch.compiler.is_compiling():
	            torch._assert_async(
	                max_count <= capacity,
	                "MoSRAHCache overflow: buffer capacity exceeded. "
	                "Increase inference_sequence_length in ShramConfig.",
	            )
	        else:
	            # Read the scalar once; each .item() call forces a device sync.
	            observed = max_count.item()
	            if observed > capacity:
	                raise RuntimeError(
	                    f"MoSRAHCache overflow: a (batch, head) slot would reach "
	                    f"{observed} tokens but the static buffer capacity is "
	                    f"{capacity}. Increase inference_sequence_length in ShramConfig."
	                )


	# -----------
	# Inlined from: router_cache.py
	# -----------
	"""Block-state cache for the MoSRAH causal block-balanced router.

	The block-balanced router partitions the token sequence into non-overlapping blocks
	of W = L/K tokens. Within each block every expert is assigned exactly once, giving
	perfect load balance by construction. During training the full sequence is available
	and block state is managed locally in MoSRAHRouter.forward(). During inference tokens
	arrive one at a time and the router must remember which experts have been claimed in
	the current partial block across decode steps.

	RouterCache holds two pieces of state across decode steps:

	- _used_in_block: Boolean mask (B, L) tracking which experts have been claimed by
	earlier tokens in the current block. The decode router masks these to -inf before
	TopK, preserving the one-usage-per-block invariant.

	- _step_in_block: Integer counter (B,) of how many tokens have been processed in
	the current block. Reaches block_length W when the block completes, at which
	point both tensors are reset in-place for the next block.

	All decode-step operations (update_decode) use fixed-shape in-place tensor ops and
	are fully compileable under torch.compile(dynamic=False, fullgraph=True). The prefill
	update (update_prefill) may use data-dependent indexing and must not be called inside
	a compiled graph; prefill runs in eager mode before the compiled decode loop in
	standard HuggingFace generate().

	RouterCache is constructed by ShramLayerCache and passed directly to
	MoSRAHRouter.forward(). ShramLayerCache.reset() clears the router state atomically
	with the KV caches it also owns.
	"""





	class RouterCache(CacheLayerMixin):
	    """Per-layer block state for the MoSRAH causal block-balanced router.

	    The router must never hand the same expert to two tokens of one routing
	    block. During decode, tokens arrive one step at a time, so this cache
	    remembers (a) which experts the current block has already consumed and
	    (b) how many tokens of the current block have been seen so far.

	    Both state tensors are allocated once in ``__init__``. ``update_decode``
	    touches them only through fixed-shape in-place ops. ``update_prefill``
	    slices with a Python-int length derived from ``seq_len`` and is meant for
	    eager execution only.

	    Args:
	        block_length: Tokens per routing block (W). Block state is cleared
	            after every W consecutive decode tokens.
	        num_mosrah_heads: Total expert count (L); width of the used-expert mask.
	        batch_size: Number of sequences in the batch.
	        device: Device on which the state tensors are allocated, and the
	            device ``prefetch()`` moves them back to.
	    """

	    is_compileable = True
	    is_sliding = False

	    def __init__(
	        self,
	        block_length: int,
	        num_mosrah_heads: int,
	        batch_size: int,
	        device: torch.device,
	    ) -> None:
	        super().__init__()
	        self._block_length = block_length
	        self._device = device

	        # (B, L) bool. True = expert already claimed inside the current block.
	        # Starts all-False and is cleared again whenever a block completes.
	        self._used_in_block = torch.zeros(
	            (batch_size, num_mosrah_heads), dtype=torch.bool, device=device
	        )

	        # (B,) int64. Tokens already processed in the current block; wraps to
	        # zero when the block completes.
	        self._step_in_block = torch.zeros(
	            (batch_size,), dtype=torch.int64, device=device
	        )

	    # ---------------------------------------------------------------------------
	    # is_initialized: state exists from construction onwards
	    # ---------------------------------------------------------------------------

	    @property
	    def is_initialized(self) -> bool:
	        """Always True: all state is allocated in ``__init__``."""
	        return True

	    @is_initialized.setter
	    def is_initialized(self, value: bool) -> None:
	        # The base-class constructor assigns to ``self.is_initialized``. The
	        # value is intentionally discarded because this cache is never in an
	        # uninitialized state.
	        return None

	    # ---------------------------------------------------------------------------
	    # Router-facing interface
	    # ---------------------------------------------------------------------------

	    def get_used_in_block(self) -> torch.Tensor:
	        """Return the (B, L) boolean mask of experts claimed in the current block.

	        The tensor returned is the live internal buffer, not a copy.
	        """
	        return self._used_in_block

	    def update_decode(self, step_heads: torch.Tensor) -> None:
	        """Fold one decode step's expert selection into the block state.

	        The selected experts are flagged as used. Rows whose block is completed
	        by this token then get both their used-expert mask and their step
	        counter cleared; all other rows just advance their counter by one.
	        Every operation is in-place on the pre-allocated buffers.

	        Args:
	            step_heads: Expert indices chosen at this step, shape (B, K).
	        """
	        # Completion is decided from the pre-increment counter: a row sitting at
	        # W-1 is consuming the last slot of its block right now.
	        finishes_block = self._step_in_block == (self._block_length - 1)  # (B,) bool

	        # Flag this step's experts, then wipe the mask for rows whose block ended
	        # so their next token starts from a clean slate.
	        self._used_in_block.scatter_(-1, step_heads, True)
	        self._used_in_block.masked_fill_(finishes_block[:, None], False)

	        # Advance every counter, then wrap the rows that just finished a block.
	        self._step_in_block.add_(1)
	        self._step_in_block.masked_fill_(finishes_block, 0)

	    def update_prefill(
	        self,
	        selected_heads_blocked: torch.Tensor,
	        seq_len: int,
	    ) -> None:
	        """Seed the block state from the trailing partial block of a prefill pass.

	        When ``seq_len`` is not a multiple of the block length, the final block
	        of the prefill assignment is only partly made of real tokens. Their
	        expert choices are replayed into the used-expert mask and the step
	        counter is set to the number of real tokens in that block, so decoding
	        continues the same block.

	        Intended for eager mode only: the slice length depends on ``seq_len``.

	        Args:
	            selected_heads_blocked: Prefill assignment of shape
	                (B, num_blocks, W, K). Only the first ``seq_len % W`` steps of
	                the last block are read; the remaining steps are ignored.
	            seq_len: Real (unpadded) prefill length.
	        """
	        batch = selected_heads_blocked.shape[0]
	        leftover = seq_len % self._block_length

	        # Any previous block state is discarded unconditionally.
	        self._used_in_block.zero_()

	        if leftover == 0:
	            # Prefill ended exactly on a block boundary: begin a fresh block.
	            self._step_in_block.zero_()
	            return

	        # Flatten the real steps of the last block to (B, leftover * K) and mark
	        # every expert they used.
	        claimed = selected_heads_blocked[:, -1, :leftover, :].reshape(batch, -1)
	        self._used_in_block.scatter_(-1, claimed, True)
	        self._step_in_block.fill_(leftover)

	    # ---------------------------------------------------------------------------
	    # CacheLayerMixin: reset, batch reshuffling, device movement
	    # ---------------------------------------------------------------------------

	    def reset(self) -> None:
	        """Zero both state tensors in-place for a new generation session."""
	        self._step_in_block.zero_()
	        self._used_in_block.zero_()

	    def reorder_cache(self, beam_idx: torch.LongTensor) -> None:
	        """Re-index the batch dimension of both state tensors with ``beam_idx``.

	        Args:
	            beam_idx: Permutation indices of shape (batch,).
	        """
	        self._used_in_block = self._used_in_block[beam_idx]
	        self._step_in_block = self._step_in_block[beam_idx]

	    def batch_repeat_interleave(self, repeats: int) -> None:
	        """Repeat every batch row ``repeats`` times along the batch dimension.

	        Args:
	            repeats: Number of copies of each batch entry.
	        """
	        self._used_in_block = self._used_in_block.repeat_interleave(repeats, dim=0)
	        self._step_in_block = self._step_in_block.repeat_interleave(repeats, dim=0)

	    def batch_select_indices(self, indices: torch.Tensor) -> None:
	        """Keep only the batch rows listed in ``indices``.

	        Args:
	            indices: 1-D integer tensor of batch indices to retain.
	        """
	        self._used_in_block = self._used_in_block[indices]
	        self._step_in_block = self._step_in_block[indices]

	    def offload(self) -> None:
	        """Move both state tensors to CPU."""
	        self._used_in_block = self._used_in_block.cpu()
	        self._step_in_block = self._step_in_block.cpu()

	    def prefetch(self) -> None:
	        """Move both state tensors back to the device given at construction."""
	        self._used_in_block = self._used_in_block.to(self._device)
	        self._step_in_block = self._step_in_block.to(self._device)

	    # ---------------------------------------------------------------------------
	    # CacheLayerMixin: parts of the interface that have no meaning here
	    # ---------------------------------------------------------------------------

	    def lazy_initialization(  # type: ignore[override]
	        self, key_states: torch.Tensor, value_states: torch.Tensor
	    ) -> None:
	        """Do nothing: there is no deferred allocation to perform."""
	        return None

	    def update(  # type: ignore[override]
	        self,
	        key_states: torch.Tensor,
	        value_states: torch.Tensor,
	        cache_kwargs: dict | None = None,
	    ) -> tuple[torch.Tensor, torch.Tensor]:
	        """Always raises: use update_decode() or update_prefill() instead.

	        Raises:
	            NotImplementedError: Unconditionally.
	        """
	        raise NotImplementedError(
	            "RouterCache has no composite key/value update interface. "
	            "Use update_decode() for single decode steps or update_prefill() after prefill."
	        )

	    def get_seq_length(self) -> int:
	        """Always raises: only the position inside a block is tracked.

	        Raises:
	            NotImplementedError: Unconditionally.
	        """
	        raise NotImplementedError("RouterCache does not track sequence length.")

	    def get_max_cache_shape(self) -> int:
	        """Always raises: no key/value pairs are stored here.

	        Raises:
	            NotImplementedError: Unconditionally.
	        """
	        raise NotImplementedError("RouterCache does not have a KV cache shape.")

	    def get_mask_sizes(  # type: ignore[override]
	        self,
	        cache_position: torch.Tensor,
	    ) -> tuple[int, int]:
	        """Always raises: this cache plays no part in KV attention masking.

	        Raises:
	            NotImplementedError: Unconditionally.
	        """
	        raise NotImplementedError("RouterCache does not participate in KV masking.")

	# -----------
	# Inlined from: sliding_window_cache.py
	# -----------
	# src/shram/model/cache/sliding_window_cache.py

	"""Local sliding-window cache for the SHRAM local attention path.

	This file defines `LocalSlidingWindowLayerCache`, the local sub-cache owned by
	`ShramLayerCache` and consumed by `SlidingWindowAttention`.

	Its job is narrow:

	- accept the current chunk's local key/value tensors and active mask
	- return the current-step local frame consumed by local attention
	- separately retain the next-step sliding-window cache state

	It does not decide local causal visibility. That is owned by
	`SlidingWindowAttention`, which consumes the returned key/value/mask frame and
	constructs the effective local attention mask from it.
	"""





	class LocalSlidingWindowLayerCache(CacheLayerMixin):
	    """Fixed-width local cache for one SHRAM decoder layer.

	    The cache keeps a retained buffer of ``sliding_window`` key/value slots
	    together with an aligned active mask and aligned absolute positions. Each
	    ``update()`` returns the current-step frame (retained state followed by the
	    new chunk) and then keeps only the last ``sliding_window`` slots of that
	    frame for the next step.

	    Dead (inactive) slots may live in both the returned frame and the retained
	    buffer; the aligned active mask is what tells consumers to ignore them.

	    Args:
	        sliding_window: Width of the retained local buffer.
	        num_heads: Number of local attention heads.
	        head_dim: Per-head embedding width for the local path.
	        batch_size: Number of sequences in the batch.
	        device: Device on which to allocate cache storage.

	    Raises:
	        ValueError: If any of the four integer sizes is smaller than 1.
	    """

	    is_compileable = True
	    is_sliding = True

	    def __init__(
	        self,
	        sliding_window: int,
	        num_heads: int,
	        head_dim: int,
	        batch_size: int,
	        device: torch.device,
	    ) -> None:
	        super().__init__()

	        if sliding_window < 1:
	            raise ValueError(
	                f"sliding_window must be >= 1, got {sliding_window}."
	            )
	        if num_heads < 1:
	            raise ValueError(f"num_heads must be >= 1, got {num_heads}.")
	        if head_dim < 1:
	            raise ValueError(f"head_dim must be >= 1, got {head_dim}.")
	        if batch_size < 1:
	            raise ValueError(f"batch_size must be >= 1, got {batch_size}.")

	        self.sliding_window = sliding_window
	        self.num_heads = num_heads
	        self.head_dim = head_dim
	        self.batch_size = batch_size
	        self.device = device

	        # Retained next-step local cache state. Storage is fixed-width from the
	        # start; semantic validity is carried by `active_mask`.
	        self.keys = torch.zeros(
	            batch_size,
	            num_heads,
	            sliding_window,
	            head_dim,
	            device=device,
	        )
	        self.values = torch.zeros(
	            batch_size,
	            num_heads,
	            sliding_window,
	            head_dim,
	            device=device,
	        )
	        self.active_mask = torch.zeros(
	            batch_size,
	            sliding_window,
	            dtype=torch.bool,
	            device=device,
	        )

	        # Absolute sequence positions of each retained slot. Inactive slots
	        # retain zero; correctness is carried by active_mask.
	        self.positions = torch.zeros(
	            batch_size,
	            sliding_window,
	            dtype=torch.long,
	            device=device,
	        )

	        self.is_initialized = True

	        # Cumulative count of all token positions presented through update().
	        # Held as a 0-d tensor: update() adds to it in-place and reset() zeroes
	        # it in-place, so the attribute keeps one type for the cache's lifetime.
	        self._total_processed = torch.tensor(0)

	    def update(  # type: ignore[override]
	        self,
	        key_states: torch.Tensor,
	        value_states: torch.Tensor,
	        active_mask: torch.Tensor,
	        positions: torch.Tensor,
	        cache_kwargs: dict | None = None,
	    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
	        """Return the current-step local frame and retain the next-step window.

	        Args:
	            key_states: Shape `(B, H, T_new, D)` local key vectors for the
	                current chunk.
	            value_states: Shape `(B, H, T_new, D)` local value vectors for the
	                current chunk.
	            active_mask: Shape `(B, T_new)` bool. `True` means the
	                corresponding token position in the current chunk is active.
	            positions: Shape `(B, T_new)` long. Absolute sequence position of
	                each token in the current chunk.
	            cache_kwargs: Present only to satisfy the `CacheLayerMixin`
	                interface. Unused by this cache.

	        Returns:
	            Tuple of:
	            - visible_keys: `(B, H, sliding_window + T_new, D)`
	            - visible_values: `(B, H, sliding_window + T_new, D)`
	            - visible_active_mask: `(B, sliding_window + T_new)`
	            - visible_positions: `(B, sliding_window + T_new)`

	            These are the tensors the local attention path should consume
	            directly for the current step.
	        """
	        self._ensure_state_compatibility(
	            key_states=key_states,
	            value_states=value_states,
	        )

	        # The current-step frame is the retained state followed by the new
	        # chunk, in chronological order.
	        frame = self._make_composite_frame(
	            key_states=key_states,
	            value_states=value_states,
	            active_mask=active_mask,
	            positions=positions,
	        )
	        composite_keys, composite_values, composite_mask, composite_positions = frame

	        # Only the last raw `sliding_window` slots of the frame survive to the
	        # next step. Dead slots may survive too; the mask marks them.
	        self._retain_next_window(
	            composite_keys=composite_keys,
	            composite_values=composite_values,
	            composite_mask=composite_mask,
	            composite_positions=composite_positions,
	        )

	        # Count every presented position (active or not) towards seq length.
	        self._total_processed += key_states.shape[2]

	        return composite_keys, composite_values, composite_mask, composite_positions

	    def _ensure_state_compatibility(
	        self,
	        key_states: torch.Tensor,
	        value_states: torch.Tensor,
	    ) -> None:
	        """Keep retained cache buffers compatible with the incoming update tensors.

	        The cache is allocated eagerly for simplicity. If later updates arrive on
	        a different device or in a different floating dtype, move the retained
	        state to match while preserving its contents. The mask and position
	        buffers keep their dtype and only follow the device of ``key_states``.
	        """
	        if self.keys.dtype != key_states.dtype or self.keys.device != key_states.device:
	            self.keys = self.keys.to(
	                device=key_states.device,
	                dtype=key_states.dtype,
	            )

	        if (
	            self.values.dtype != value_states.dtype
	            or self.values.device != value_states.device
	        ):
	            self.values = self.values.to(
	                device=value_states.device,
	                dtype=value_states.dtype,
	            )

	        if self.active_mask.device != key_states.device:
	            self.active_mask = self.active_mask.to(
	                key_states.device,
	                non_blocking=True,
	            )

	        if self.positions.device != key_states.device:
	            self.positions = self.positions.to(
	                key_states.device,
	                non_blocking=True,
	            )

	    def _make_composite_frame(
	        self,
	        key_states: torch.Tensor,
	        value_states: torch.Tensor,
	        active_mask: torch.Tensor,
	        positions: torch.Tensor,
	    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
	        """Build the current-step local frame in chronological order.

	        Retained state comes first and the new chunk is appended along the
	        sequence axis of each tensor. The results are fresh tensors, so writing
	        into the retained buffers afterwards cannot corrupt the returned frame.
	        """
	        return (
	            torch.cat([self.keys, key_states], dim=-2),
	            torch.cat([self.values, value_states], dim=-2),
	            torch.cat([self.active_mask, active_mask], dim=-1),
	            torch.cat([self.positions, positions], dim=-1),
	        )

	    def _retain_next_window(
	        self,
	        composite_keys: torch.Tensor,
	        composite_values: torch.Tensor,
	        composite_mask: torch.Tensor,
	        composite_positions: torch.Tensor,
	    ) -> None:
	        """Remember the next-step retained local state.

	        This is a raw positional trim to the last `sliding_window` positions, not
	        a semantic live-token trim. The copy is written into the existing
	        buffers rather than rebinding them.
	        """
	        window = self.sliding_window
	        self.keys[:] = composite_keys[:, :, -window:, :]
	        self.values[:] = composite_values[:, :, -window:, :]
	        self.active_mask[:] = composite_mask[:, -window:]
	        self.positions[:] = composite_positions[:, -window:]

	    def get_seq_length(self) -> int:
	        """Return the cumulative number of token positions processed by this cache.

	        This is the total count of token positions presented across all update()
	        calls since construction or the last reset(). It is not the same as the
	        active-token count or the current window occupancy.
	        """
	        return int(self._total_processed)

	    def get_max_cache_shape(self) -> int:
	        """Return the retained window width (``sliding_window``)."""
	        return self.sliding_window

	    def get_mask_sizes(  # type: ignore[override]
	        self,
	        cache_position: torch.Tensor,
	    ) -> tuple[int, int]:
	        """Always raises: mask sizing is not provided by this cache.

	        Raises:
	            NotImplementedError: Unconditionally.
	        """
	        raise NotImplementedError(
	            "LocalSlidingWindowLayerCache does not support get_mask_sizes()."
	        )

	    def reset(self) -> None:
	        """Restore fresh-cache behavior.

	        Every buffer, including the processed-token counter, is zeroed in-place.
	        The counter previously was rebound to the Python int ``0`` here, which
	        silently changed its type from the tensor created in ``__init__``.
	        """
	        self.keys.zero_()
	        self.values.zero_()
	        self.active_mask.zero_()
	        self.positions.zero_()
	        self._total_processed.zero_()

	    def reorder_cache(self, beam_idx: torch.LongTensor) -> None:
	        """Reorder the batch dimension for beam search."""
	        self.keys = self.keys[beam_idx]
	        self.values = self.values[beam_idx]
	        self.active_mask = self.active_mask[beam_idx]
	        self.positions = self.positions[beam_idx]

	    def batch_repeat_interleave(self, repeats: int) -> None:
	        """Expand the batch dimension for beam-search initialisation.

	        Also multiplies the recorded ``batch_size`` by ``repeats``.
	        """
	        self.keys = self.keys.repeat_interleave(repeats, dim=0)
	        self.values = self.values.repeat_interleave(repeats, dim=0)
	        self.active_mask = self.active_mask.repeat_interleave(repeats, dim=0)
	        self.positions = self.positions.repeat_interleave(repeats, dim=0)
	        self.batch_size = self.batch_size * repeats

	    def batch_select_indices(self, indices: torch.Tensor) -> None:
	        """Select a subset of batch entries for contrastive search.

	        Also updates the recorded ``batch_size`` to the number of kept rows.
	        """
	        self.keys = self.keys[indices]
	        self.values = self.values[indices]
	        self.active_mask = self.active_mask[indices]
	        self.positions = self.positions[indices]
	        self.batch_size = int(indices.shape[0])

	    def offload(self) -> None:
	        """Offload cache tensors to CPU.

	        Key/value handling is delegated to the base-class ``offload()``
	        (NOTE(review): presumably it moves ``keys``/``values`` to CPU; confirm
	        against CacheLayerMixin). The mask and position buffers are moved here.
	        """
	        super().offload()
	        self.active_mask = self.active_mask.to("cpu", non_blocking=True)
	        self.positions = self.positions.to("cpu", non_blocking=True)

	    def prefetch(self) -> None:
	        """Move cache tensors back to the model device ahead of time.

	        Key/value handling is delegated to the base-class ``prefetch()``. The
	        mask and position buffers then follow whatever device ``keys`` ends up
	        on. Each buffer is checked on its own so neither depends on where the
	        other currently lives.
	        """
	        super().prefetch()
	        target = self.keys.device
	        if self.active_mask.device != target:
	            self.active_mask = self.active_mask.to(target, non_blocking=True)
	        if self.positions.device != target:
	            self.positions = self.positions.to(target, non_blocking=True)

	    def crop(self, max_length: int) -> None:
	        """Always raises: cropping is not supported by this cache.

	        Raises:
	            NotImplementedError: Unconditionally.
	        """
	        raise NotImplementedError(
	            "LocalSlidingWindowLayerCache does not support crop()."
	        )

	    def lazy_initialization(
	        self,
	        key_states: torch.Tensor,
	        value_states: torch.Tensor,
	    ) -> None:
	        """No-op: this cache allocates its fixed buffers at construction time."""
	        return


	class ShramLayerCache(CacheLayerMixin):
	    """Cache subsystem for one SHRAM decoder layer.

	    Bundles the three sub-caches a layer needs and keeps them in lock-step:

	    - ``sliding_window_cache``: LocalSlidingWindowLayerCache for the local path.
	    - ``mosrah_cache``: MoSRAHCache for the MoSRAH sparse attention path.
	    - ``router_cache``: RouterCache holding the block-balanced router's state.

	    Consumers update the sub-caches directly through those attributes; there is
	    deliberately no composite ``update()``. Lifecycle and batch-dimension
	    operations are fanned out to all three sub-caches, always in the order
	    listed above. Sequence length is read from the sliding-window sub-cache.

	    Args:
	        config: ShramConfig instance from which every sub-cache dimension and
	            capacity is taken.
	        batch_size: Number of sequences in the batch.
	        device: Device on which to allocate cache tensors.
	    """

	    is_compileable = True
	    is_sliding = False

	    def __init__(
	        self,
	        config: ShramConfig,
	        batch_size: int,
	        device: torch.device,
	    ) -> None:
	        super().__init__()
	        self._inference_sequence_length = config.inference_sequence_length

	        # Arguments every sub-cache constructor has in common.
	        shared = {"batch_size": batch_size, "device": device}

	        self.sliding_window_cache = LocalSlidingWindowLayerCache(
	            sliding_window=config.window_size,
	            num_heads=config.num_sliding_window_heads,
	            head_dim=config.head_dim,
	            **shared,
	        )
	        self.mosrah_cache = MoSRAHCache(
	            num_mosrah_heads=config.num_mosrah_heads,
	            head_dim=config.head_dim,
	            mosrah_cache_length=config.mosrah_cache_length,
	            **shared,
	        )
	        self.router_cache = RouterCache(
	            block_length=config.block_length,
	            num_mosrah_heads=config.num_mosrah_heads,
	            **shared,
	        )

	    def _sub_caches(self) -> tuple:
	        """Return the three owned sub-caches in their canonical delegation order."""
	        return (self.sliding_window_cache, self.mosrah_cache, self.router_cache)

	    # ---------------------------------------------------------------------------
	    # Properties
	    # ---------------------------------------------------------------------------

	    @property
	    def is_initialized(self) -> bool:
	        """Conjunction of the ``is_initialized`` flags of all three sub-caches."""
	        return (
	            self.sliding_window_cache.is_initialized
	            and self.mosrah_cache.is_initialized
	            and self.router_cache.is_initialized
	        )

	    @is_initialized.setter
	    def is_initialized(self, value: bool) -> None:
	        # The base-class constructor assigns to ``self.is_initialized``; because
	        # this is a property, that assignment lands here. The value is dropped
	        # since readiness is derived from the sub-caches, not stored.
	        return None

	    # ---------------------------------------------------------------------------
	    # CacheLayerMixin: operations that make sense on the composite
	    # ---------------------------------------------------------------------------

	    def get_seq_length(self) -> int:  # type: ignore[override]
	        """Return the sequence length reported by the sliding-window sub-cache."""
	        local_cache = self.sliding_window_cache
	        return local_cache.get_seq_length()

	    def reset(self) -> None:
	        """Reset all three sub-caches so their state stays mutually consistent."""
	        for sub_cache in self._sub_caches():
	            sub_cache.reset()

	    def reorder_cache(self, beam_idx: torch.LongTensor) -> None:
	        """Reorder the batch dimension of all three sub-caches for beam search.

	        Args:
	            beam_idx: Permutation indices of shape (batch,) produced by beam search.
	        """
	        for sub_cache in self._sub_caches():
	            sub_cache.reorder_cache(beam_idx)

	    def batch_repeat_interleave(self, repeats: int) -> None:
	        """Expand the batch dimension of all three sub-caches together.

	        Args:
	            repeats: Number of times to repeat each batch entry.
	        """
	        for sub_cache in self._sub_caches():
	            sub_cache.batch_repeat_interleave(repeats)

	    def batch_select_indices(self, indices: torch.Tensor) -> None:
	        """Keep the same subset of batch entries in all three sub-caches.

	        Args:
	            indices: 1-D integer tensor of batch indices to retain.
	        """
	        for sub_cache in self._sub_caches():
	            sub_cache.batch_select_indices(indices)

	    def offload(self) -> None:
	        """Offload all three sub-caches.

	        The base-class implementation is intentionally not called: this object
	        holds no key/value tensors of its own, only the sub-caches.
	        """
	        for sub_cache in self._sub_caches():
	            sub_cache.offload()

	    def prefetch(self) -> None:
	        """Prefetch all three sub-caches.

	        The base-class implementation is intentionally not called: this object
	        holds no key/value tensors of its own, only the sub-caches.
	        """
	        for sub_cache in self._sub_caches():
	            sub_cache.prefetch()

	    def lazy_initialization(  # type: ignore[override]
	        self, key_states: torch.Tensor, value_states: torch.Tensor
	    ) -> None:
	        """Do nothing: each sub-caches takes care of its own initialization."""
	        return None

	    # ---------------------------------------------------------------------------
	    # CacheLayerMixin: parts of the interface with no composite meaning
	    # ---------------------------------------------------------------------------

	    def update(  # type: ignore[override]
	        self,
	        key_states: torch.Tensor,
	        value_states: torch.Tensor,
	        cache_kwargs: dict | None = None,
	    ) -> tuple[torch.Tensor, torch.Tensor]:
	        """Always raises: there is no composite update interface.

	        The attention paths update their sub-caches with different arguments
	        and semantics, so callers must go through ``sliding_window_cache`` or
	        ``mosrah_cache`` directly.

	        Raises:
	            NotImplementedError: Unconditionally.
	        """
	        raise NotImplementedError(
	            "ShramLayerCache has no composite update interface. "
	            "Update sliding_window_cache or mosrah_cache directly."
	        )

	    def get_max_cache_shape(self) -> int:  # type: ignore[override]
	        """Return ``config.inference_sequence_length`` as captured at construction."""
	        return self._inference_sequence_length

	    def get_mask_sizes(  # type: ignore[override]
	        self,
	        cache_position: torch.Tensor,
	    ) -> tuple[int, int]:
	        """Return ``(inference_sequence_length, 0)``.

	        The first element is the KV length and the second the KV offset;
	        ``cache_position`` is not consulted.
	        """
	        kv_length = self._inference_sequence_length
	        kv_offset = 0
	        return kv_length, kv_offset



	class ShramCache(Cache):
	"""Top-level cache for the full SHRAM model.

	Owns one ShramLayerCache per decoder layer. Satisfies the HuggingFace top-level Cache
	role and transparently forwards reset, reorder, and sequence-length queries across all
	owned layer caches.

	No composite update() interface is provided. The two attention paths inside each SHRAM
	layer have materially different update semantics; callers must update sub-caches directly
	via cache.layers[layer_idx].sliding_window_cache or cache.layers[layer_idx].mosrah_cache.

	ShramCache also tracks per-batch cumulative active token counts via
	``_active_token_counts``. ``total_active_tokens(active_mask)`` returns the accumulated
	count before the current step and updates the buffer in-place; the caller uses this as a
	per-batch position bias for contiguous arange-based position ID resolution. All counter
	updates are in-place to satisfy CUDAGraph fixed-memory requirements. ``reset()``
	zeroes the buffer along with all layer caches.

	Args:
	config: ShramConfig instance. All layer counts, buffer sizes, and sub-cache
	dimensions are derived from config so that a single source of truth governs
	every buffer size across the full cache stack.
	batch_size: Number of sequences in the batch.
	device: Device on which to allocate cache tensors.
	"""

	is_compileable = True

	def __init__(
	self,
	config: ShramConfig,
	batch_size: int,
	device: torch.device,
	) -> None:
	layers = [
	ShramLayerCache(
	config=config,
	batch_size=batch_size,
	device=device,
	)
	for _ in range(config.num_decoder_layers)
	]
	super().__init__(layers=layers)

	# Active token counter for position ID resolution (Unit 23.B). Pre-allocated
	# at construction so all updates remain in-place across forward passes,
	# satisfying CUDAGraph fixed-memory requirements.
	self._active_token_counts: torch.Tensor = torch.zeros(
	batch_size, dtype=torch.long, device=device
	)

	# ---------------------------------------------------------------------------
	# Cache — composite-meaningful methods
	# ---------------------------------------------------------------------------
	#
	# reset(): Overridden. Zeroes _active_token_counts in-place, then delegates to
	# the inherited implementation to reset all layer caches.
	#
	# reorder_cache(beam_idx): Inherited. Iterates all layer caches and reorders each.
	#
	# is_initialized: Inherited property. True iff all layer caches are initialized.
	# Since ShramLayerCache.is_initialized is True from construction, this is True
	# immediately after ShramCache.__init__ returns.

	def total_active_tokens(self, active_mask: torch.BoolTensor) -> torch.Tensor:
	"""Return the per-batch accumulated active token count before this step, then update.

	Reads the current per-batch accumulated count as a position bias for the caller,
	then increments the internal counter in-place by the number of active tokens in
	``active_mask`` for each batch item. The pre-update count is returned so the
	caller can offset an arange-based position tensor to the correct starting position
	for this forward pass.

	All updates are in-place to satisfy CUDAGraph fixed-memory requirements. The
	counter persists across forward passes until ``reset()`` is called.

	Args:
	active_mask: Boolean mask of shape ``(B, N)`` for the current forward step,
	where True marks an active (non-padding) token position.

	Returns:
	Integer tensor of shape ``(B,)`` — the accumulated count before this update.
	"""
	prior_counts = self._active_token_counts.clone()
	self._active_token_counts.add_(active_mask.sum(dim=-1))
	return prior_counts

	def reset(self) -> None:
	"""Clear all layer caches and reset the active token counter.

	Zeroes ``_active_token_counts`` in-place, then delegates to the inherited
	implementation to reset all ShramLayerCache instances. In-place mutation of
	the counter is required for CUDAGraph compatibility — the buffer must remain
	at the same memory address across steps.
	"""
	self._active_token_counts.zero_()
	super().reset()

	def get_seq_length(self, layer_idx: int = 0) -> int: # type: ignore[override]
	"""Return the cumulative sequence length for the specified layer.

	Delegates to the layer cache at layer_idx, which in turn delegates to the
	local sliding-window sub-cache. That sub-cache is authoritative for sequence
	progress: it sees every token presented to the layer and accumulates a truthful
	total count. Defaults to layer 0, which is sufficient for HuggingFace generation.
	"""
	return self.layers[layer_idx].get_seq_length()

	# ---------------------------------------------------------------------------
	# Cache — unsupported methods
	# ---------------------------------------------------------------------------

	def update( # type: ignore[override]
	self,
	key_states: torch.Tensor,
	value_states: torch.Tensor,
	layer_idx: int,
	cache_kwargs: dict \| None = None,
	) -> tuple[torch.Tensor, torch.Tensor]:
	"""Not supported — ShramCache has no composite update interface.

	The two attention paths inside each SHRAM layer have different update semantics.
	Callers must update sub-caches directly:
	cache.layers[layer_idx].sliding_window_cache.update(key_states, value_states)
	cache.layers[layer_idx].mosrah_cache.update(key_states, value_states, active_mask)
	"""
	raise NotImplementedError(
	"ShramCache has no composite update interface. "
	"Update sliding_window_cache or mosrah_cache on the relevant layer directly."
	)

	def crop(self, max_length: int) -> None:
	    """Unsupported: ShramCache layers do not implement crop().

	    Raises:
	        NotImplementedError: Always; ``max_length`` is ignored.
	    """
	    message = "ShramCache does not support crop()."
	    raise NotImplementedError(message)

	@property
	def max_batch_size(self) -> int:
	    """Unsupported: ShramCache does not track a uniform batch size across layers.

	    Raises:
	        NotImplementedError: On every access.
	    """
	    message = "ShramCache does not expose max_batch_size."
	    raise NotImplementedError(message)

	@property
	def max_cache_len(self) -> int:
	    """Maximum sequence length the cache can serve.

	    The value is obtained from ``get_max_cache_shape()`` of the first layer
	    cache, which per the cache design returns
	    ``config.inference_sequence_length``. HuggingFace's static-cache machinery
	    reads this property to size generation loops and to verify compileable
	    cache contracts.
	    """
	    first_layer = self.layers[0]
	    return first_layer.get_max_cache_shape()


	# -----------
	# Inlined from: model.py
	# -----------
	"""Transformer backbone for Shram.

	ShramModel is a pure PyTorch module: a sequence of DecoderLayer blocks followed
	by a final RMSNorm. It accepts pre-embedded hidden states and returns contextual
	representations. It has no knowledge of tokens, vocabulary, generation, or the
	HuggingFace causal-LM wrapper contract.

	Keeping the embedding out of the backbone is the correct convention and makes
	the backbone genuinely modality-agnostic. The token interface — embedding lookup,
	LM head, weight tying, and generation-facing naming conventions — belongs on the
	task wrapper (ShramForCausalLM), which is the only class that knows this
	backbone is being used for language modelling.

	The final RMSNorm is necessary because the decoder stack uses pre-norm throughout:
	each sublayer normalises its own input, leaving the residual stream itself
	unnormalised. After many layers of accumulated residuals, that stream arrives at
	the top with uncontrolled magnitude. The final norm brings it to a well-scaled
	state before any projection. Without it, the LM head would receive signals of
	arbitrary scale.

	Caching is caller-managed. If a ShramCache is provided, ShramModel threads the
	corresponding per-layer ShramLayerCache into each DecoderLayer and returns the
	same top-level ShramCache object in the output dict. If None is provided, no
	caching occurs.

	Returns a plain dict with keys:
	- "last_hidden_state": normed backbone output, shape (batch, seq_len, hidden_size)
	- "past_key_values": the ShramCache object passed in, or None
	- "hidden_states": tuple of per-layer activations if output_hidden_states=True, else None
	- "regret_loss": scalar sum of per-layer SHRAM regret losses
	- "logit_regret": detached scalar mean per-layer logit-space regret
	- "logit_std": detached scalar mean per-layer per-token routing logit spread
	"""






	# -----------
	# Inlined from: decoder_layer.py
	# -----------
	"""Decoder layer — a single transformer block.

	Each block applies pre-norm hybrid attention followed by pre-norm MLP, with
	gated residual connections around both sublayers:

	normed_attn = RMSNorm(x)
	attn_out, router_diagnostics = SHRAMHybridLayer(normed_attn, ...)
	h = x + attn_residual_scale * attn_out

	normed_mlp = RMSNorm(h)
	mlp_out = SwiGLUMLP(normed_mlp)
	out = h + mlp_residual_scale * mlp_out

	``attn_residual_scale`` and ``mlp_residual_scale`` are always present. Their nature
	depends on ``config.use_residual_gate``:

	- ``True`` (default): learnable scalar ``nn.Parameter`` initialised to zero. The layer
	is a pure identity at initialisation and the scales open during training.
	- ``False``: fixed buffer ``1/√num_decoder_layers``. No learnable parameter; residual
	variance sums to O(1) across depth by construction.

	Pre-norm keeps the residual stream unnormalised. Gradients flow more cleanly
	through unnormalised residuals at depth, and each sublayer receives a stable,
	normalised view of the signal.

	Two independent RMSNorm instances are used — one before attention, one before
	MLP. They learn different scalings because they precede layers with different
	dynamic ranges. Sharing them would be wrong.

	torch.nn.RMSNorm is used directly (available from PyTorch 2.4+). It omits mean
	subtraction, is faster than LayerNorm, and proved more stable at scale.
	"""






	# -----------
	# Inlined from: shram.py
	# -----------
	"""SHRAM hybrid attention layer.

	This module implements the hybrid attention construction H(x) = h_l(x) + h_s(x)
	used at one decoder attention slot in SHRAM.

	The local sliding-window path and the MoSRAH sparse path are already verified
	independently. The responsibility here is therefore not to introduce new
	attention logic, but to preserve the bridge contracts between them: both paths
	must consume the same input hidden state, each path must receive the sub-cache
	it actually owns, the two model-space outputs must be summed directly, and the
	sparse-path load-balance loss must remain visible to the caller.
	"""






	# -----------
	# Inlined from: sliding_window_attention.py
	# -----------
	# src/shram/model/attention/sliding_window_attention.py

	"""Local sliding-window attention path for SHRAM.

	This file defines `SlidingWindowAttention`, the local short-range attention path
	used inside the SHRAM hybrid layer.

	In the masked-continuation variant, the local cache no longer returns a
	semantically dense visible frame. Instead, `LocalSlidingWindowLayerCache`
	returns:

	- the retained local window memory concatenated with the current chunk
	- an aligned active mask over that returned frame
	- the absolute sequence position of every slot in that returned frame

	This module consumes that returned frame directly and constructs effective local
	causal/window visibility from the mask and the positions. It does not own cache
	retention policy; it owns only local attention semantics.
	"""










	# -----------
	# Inlined from: rope.py
	# -----------
	"""Rotary Position Embeddings (RoPE).

	RoPE encodes position in the relationship between query and key vectors. When the
	attention dot product Q·Kᵀ is computed, the per-position rotations cancel to produce
	a score that depends only on the relative distance — not on absolute positions.

	Two modes are supported:

	default Standard RoPE with base frequency b. Each dimension pair d is assigned
	frequency θ_d = b^{-2d/u} where u is the head dimension. The attention
	scaling A_rope = 1.

	yarn YaRN frequency interpolation for long-context extrapolation (Peng et al.,
	"YaRN: Efficient Context Window Extension of Large Language Models", 2023,
	§A.2). Three frequency regimes:
	- Low-frequency dimensions (r < α): fully interpolated by scale s.
	These dimensions have long wavelengths relative to the training window
	and must be compressed to avoid out-of-distribution positions.
	- High-frequency dimensions (r > β): left unchanged. Short-wavelength
	dimensions already encode relative position accurately at any scale.
	- Intermediate dimensions (α ≤ r ≤ β): linearly blended via ramp γ(r).
	Returns A_rope = (0.1·ln(s)+1)². When s = 1, YaRN reduces exactly to
	standard RoPE.

	Each attention path (h_l and BEA) constructs its own RotaryEmbedding with explicit
	parameters — no shared instance, no config reading. See Unit 5.A design decisions.

	Cache sharing: all instances with identical parameters share one cos/sin table via a
	class-level registry. The first instance that needs a particular (parameters, device,
	dtype) combination builds the table; all subsequent instances reference it directly.
	This avoids redundant builds across the num_hidden_layers instances that share the
	same parametrisation.
	"""






	# ---------------------------------------------------------------------------
	# Rotation helper
	# ---------------------------------------------------------------------------

	def _rotate_half(x: torch.Tensor) -> torch.Tensor:
	    """Rotate the last dimension by 90°, as required by the RoPE update formula.

	    The last dimension is cut at its midpoint into ``[front, back]`` and
	    ``[-back, front]`` is returned. Used as
	    ``x * cos + _rotate_half(x) * sin``, this realises a 2D rotation of each
	    (front[i], back[i]) pair, i.e. the block-diagonal operator R^u_{Θ,p} of
	    the paper.

	    Args:
	        x: Tensor whose last dimension is the head dimension.

	    Returns:
	        Tensor of the same shape as ``x``.
	    """
	    midpoint = x.size(-1) // 2
	    front = x[..., :midpoint]
	    back = x[..., midpoint:]
	    return torch.cat((-back, front), dim=-1)


	# ---------------------------------------------------------------------------
	# RotaryEmbedding
	# ---------------------------------------------------------------------------

	class RotaryEmbedding(nn.Module):
	"""Rotary Position Embeddings with explicit mode and parameter control.

	Each caller constructs its own instance with the exact parameters it needs.
	h_l always uses ``mode="default"``; BEA always uses ``mode="yarn"``. No
	config object is read inside this module.

	The cos/sin table is built at construction time to cover all positions in
	``[0, maximum_sequence_length)``. In forward, the table is rebuilt only if
	the query tensor's dtype or device has changed since construction.

	Instances with identical parameters share one cos/sin table via the class-level
	``_cache`` registry, avoiding redundant computation across decoder layers.

	Args:
	mode: ``"default"`` for standard RoPE; ``"yarn"`` for YaRN extrapolation.
	head_dim: Per-head embedding dimension ``u``. Must be even.
	theta: Base frequency ``b`` in θ_d = b^{-2d/u}.
	maximum_sequence_length: Maximum number of positions the table must cover.
	The cos/sin table is preallocated to this length at construction time.
	For ``mode="yarn"``, the training context length C_train is derived
	internally as ``round(maximum_sequence_length / dilation)``.
	dilation: Scale factor ``s = C_target / C_train`` — how much the context
	window is extended beyond training length. Required for ``mode="yarn"``.
	When ``dilation=1.0``, YaRN reduces to standard RoPE.
	alpha: YaRN ramp lower boundary α. Dimensions with r(d) < α are fully
	interpolated. Required for ``mode="yarn"``.
	beta: YaRN ramp upper boundary β. Dimensions with r(d) > β are left
	unchanged. Required for ``mode="yarn"``.
	device: Optional device for initial buffer placement.

	Raises:
	NotImplementedError: If ``mode`` is not ``"default"`` or ``"yarn"``.
	ValueError: If ``mode="yarn"`` and any of ``dilation``, ``alpha``,
	``beta`` are absent.
	"""

	# Maps (freq_key, device_str, dtype_str) → (cos_table, sin_table).
	# Shared across all RotaryEmbedding instances in the process. Keys include device
	# and dtype so that tables built on different devices or in different precisions
	# are stored independently.
	_cache: dict = {}

	def __init__(
	self,
	mode: str,
	head_dim: int,
	theta: float,
	maximum_sequence_length: int,
	dilation: float \| None = None,
	alpha: float \| None = None,
	beta: float \| None = None,
	device: torch.device \| None = None,
	) -> None:
	super().__init__()

	self._validate_mode(mode)
	self._validate_yarn_params(mode, dilation, alpha, beta)
	self.mode = mode
	self._maximum_sequence_length = maximum_sequence_length
	device = torch.device("cpu") if device is None else device

	# Compute per-dimension rotation frequencies θ_d (default) or θ_d' (yarn).
	# d_index ranges over 0, 2, 4, ..., head_dim-2 — one index per dimension pair,
	# so rotation_freqs has head_dim/2 entries.
	d_index = torch.arange(0, head_dim, 2, dtype=torch.float32, device=device)
	base_freqs = 1.0 / (theta ** (d_index / head_dim)) # θ_d = b^{-2d/u}

	if mode == "default":
	rotation_freqs = base_freqs
	self.attention_scaling: float = 1.0

	else: # yarn
	s = dilation

	# C_train is the training context length, recovered from the inference
	# context length and the dilation factor. round() guards against floating
	# point error since both underlying quantities are integers.
	c_train: int = round(maximum_sequence_length / dilation)

	# r(d) = C_train · θ_d / (2π) — normalized frequency used by the ramp
	# function to classify each dimension into one of three regimes.
	normalized_freqs = c_train * base_freqs / (2.0 * math.pi)

	# γ(r) ramp: 0 for r < α (fully interpolate), 1 for r > β (unchanged),
	# linear blend between α and β.
	blend_weights = ((normalized_freqs - alpha) / (beta - alpha)).clamp(0.0, 1.0)

	# θ_d' = (1 − γ) · θ_d / s + γ · θ_d
	rotation_freqs = (1.0 - blend_weights) * (base_freqs / s) + blend_weights * base_freqs

	# A_rope = (0.1 · ln(s) + 1)² — attention logit scaling returned to caller.
	self.attention_scaling = (0.1 * math.log(s) + 1.0) ** 2

	# freq_key uniquely identifies the parameter set that produced rotation_freqs,
	# including maximum_sequence_length so instances with different table sizes
	# do not collide in the registry.
	if mode == "default":
	self._freq_key: tuple = ("default", head_dim, theta, maximum_sequence_length)
	else:
	self._freq_key = ("yarn", head_dim, theta, maximum_sequence_length, dilation, alpha, beta)

	# rotation_freqs is a plain instance attribute, not a registered buffer.
	# This keeps it out of the state dict and prevents HuggingFace's fast-init
	# path from turning it into a meta tensor, which would break _build_cache.
	self.rotation_freqs = rotation_freqs

	# Cache tensors are plain instance attributes (not registered buffers) so that
	# sharing across identically-parametrised instances survives .to() calls.
	# Registered buffers are copied on device move; plain attributes are aliased,
	# preserving the shared-tensor identity that the cache design depends on.
	self._cos_cached: torch.Tensor \| None = None
	self._sin_cached: torch.Tensor \| None = None

	# Build the table at construction time. Forward rebuilds only on dtype or
	# device change. If no device is specified, build on CPU as the default.
	self._build_cache(device=device, dtype=torch.float32)

	# ---------------------------------------------------------------------------
	# Validation helpers
	# ---------------------------------------------------------------------------

	@staticmethod
	def _validate_mode(mode: str) -> None:
	"""Raise NotImplementedError if mode is not a supported value."""
	if mode not in {"default", "yarn"}:
	raise NotImplementedError(
	f"RoPE mode '{mode}' is not supported. Supported modes: 'default', 'yarn'."
	)

	@staticmethod
	def _validate_yarn_params(
	mode: str,
	dilation: float \| None,
	alpha: float \| None,
	beta: float \| None,
	) -> None:
	"""Raise ValueError if mode='yarn' and any required parameter is absent."""
	if mode != "yarn":
	return
	missing = [
	name for name, val in [
	("dilation", dilation),
	("alpha", alpha),
	("beta", beta),
	]
	if val is None
	]
	if missing:
	raise ValueError(f"mode='yarn' requires {missing}.")

	# ---------------------------------------------------------------------------
	# Cache management
	# ---------------------------------------------------------------------------

	def _build_cache(self, device: torch.device, dtype: torch.dtype) -> None:
	"""Build the cos/sin table to cover positions [0, maximum_sequence_length).

	Checks the class-level registry first. If a table already exists for this
	exact (parameters, device, dtype) combination it is reused directly;
	otherwise it is computed and stored. The instance attributes are pointed at
	the registry entry so that all layers sharing the same parametrisation
	reference the same tensor.
	"""
	cache_key = (self._freq_key, str(device), str(dtype))

	if cache_key not in RotaryEmbedding._cache:
	positions = torch.arange(
	self._maximum_sequence_length, device=device, dtype=torch.float32
	)
	# outer product → (maximum_sequence_length, head_dim // 2);
	# duplicate to (maximum_sequence_length, head_dim)
	freqs = torch.outer(
	positions,
	self.rotation_freqs.to(device=device, dtype=torch.float32),
	)
	angle_embedding = torch.cat((freqs, freqs), dim=-1)
	RotaryEmbedding._cache[cache_key] = (
	angle_embedding.cos().to(dtype),
	angle_embedding.sin().to(dtype),
	)

	self._cos_cached, self._sin_cached = RotaryEmbedding._cache[cache_key]

	def forward(
	self,
	q: torch.Tensor,
	k: torch.Tensor,
	position_ids: torch.Tensor,
	) -> tuple[torch.Tensor, torch.Tensor, float]:
	"""Apply rotary embeddings to query and key tensors.

	The cos/sin table is built at construction time. It is rebuilt here only
	if ``q``'s dtype or device differs from the cached table — for example,
	after moving the model to a different device via ``.cuda()``.

	``position_ids`` may be any integer tensor shape. Its values must be in
	``[0, maximum_sequence_length)``:

	- h_l (standard causal): position_ids (B, N), q/k (B, H, N, head_dim).
	- BEA (packed): position_ids (B, L, T), q/k (B, L, T, head_dim).

	When q/k have head dimensions absent from position_ids, broadcast dimensions
	are inserted automatically at dim 1.

	Args:
	q: Query tensor of shape (batch, [heads,] *pos_dims, head_dim).
	k: Key tensor of shape (batch, [heads,] *pos_dims, head_dim).
	position_ids: Integer positions of shape (batch, *pos_dims).

	Returns:
	Tuple of (q_rotated, k_rotated, attention_scaling). attention_scaling is
	1.0 for default mode; YaRN returns (0.1·ln(s)+1)² which the caller must
	apply to attention logits before softmax.
	"""
	wrong_dtype = self._cos_cached.dtype != q.dtype
	wrong_device = self._cos_cached.device != q.device

	if wrong_dtype or wrong_device:
	self._build_cache(device=q.device, dtype=q.dtype)

	cos = self._cos_cached[position_ids]
	sin = self._sin_cached[position_ids]

	# Insert broadcast dimensions for any head axes present in q/k but absent
	# from position_ids. Standard: pos (B,N) → cos (B,N,D), q (B,H,N,D) → unsqueeze once.
	# BEA: pos (B,L,T) → cos (B,L,T,D), q (B,L,T,D) → no unsqueeze needed.
	while cos.ndim < q.ndim:
	cos = cos.unsqueeze(1)
	sin = sin.unsqueeze(1)

	q_rotated = q * cos + _rotate_half(q) * sin
	k_rotated = k * cos + _rotate_half(k) * sin

	return q_rotated, k_rotated, self.attention_scaling



	class SlidingWindowAttention(nn.Module):
	    """Local causal sliding-window attention path of one SHRAM layer.

	    Args:
	        config: SHRAM config. The constructor reads `embedding_width`,
	            `num_sliding_window_heads`, `head_dim`, `window_size`,
	            `attention_dropout`, `local_rope_theta`, and
	            `inference_sequence_length`.

	    Raises:
	        NotImplementedError: If `attention_dropout != 0.0`.
	    """

	    def __init__(self, config: ShramConfig) -> None:
	        super().__init__()

	        # The model width is stored under the name `hidden_size` on this module
	        # but is read from `config.embedding_width`.
	        self.hidden_size = config.embedding_width
	        self.num_heads = config.num_sliding_window_heads
	        self.head_dim = config.head_dim
	        self.window_size = config.window_size
	        self.attention_dropout = config.attention_dropout

	        dropout_requested = self.attention_dropout != 0.0
	        if dropout_requested:
	            raise NotImplementedError(
	                "SlidingWindowAttention currently supports only "
	                "attention_dropout == 0.0."
	            )

	        self.inner_dim = self.num_heads * self.head_dim

	        # Bias-free multi-head projections for the local path. Input projections
	        # map model width -> H*D; the output projection maps back.
	        model_width, heads_width = self.hidden_size, self.inner_dim
	        self.q_proj = nn.Linear(model_width, heads_width, bias=False)
	        self.k_proj = nn.Linear(model_width, heads_width, bias=False)
	        self.v_proj = nn.Linear(model_width, heads_width, bias=False)
	        self.o_proj = nn.Linear(heads_width, model_width, bias=False)

	        # Default-mode RoPE with the local path's own theta, sized for the full
	        # inference sequence length.
	        self.rope = RotaryEmbedding(
	            mode="default",
	            head_dim=self.head_dim,
	            theta=config.local_rope_theta,
	            maximum_sequence_length=config.inference_sequence_length,
	        )

	    def forward(
	        self,
	        x: torch.Tensor,
	        position_ids: torch.Tensor,
	        active_mask: torch.Tensor,
	        cache: LocalSlidingWindowLayerCache | None = None,
	    ) -> torch.Tensor:
	        """Run local causal sliding-window attention over the current chunk.

	        Args:
	            x: Input tensor of shape `(B, N, hidden_size)`.
	            position_ids: Position tensor of shape `(B, N)`.
	            active_mask: Boolean current-chunk mask of shape `(B, N)`; `True`
	                marks an active token.
	            cache: Optional `LocalSlidingWindowLayerCache`. When given, its
	                `update()` supplies the key/value frame, mask and positions
	                that the current queries attend over.

	        Returns:
	            Output tensor of shape `(B, N, hidden_size)`.

	        Raises:
	            ValueError: If `position_ids` or `active_mask` does not have shape
	                `(B, N)`, or if `active_mask` is not boolean.
	        """
	        batch_size, query_len, _ = x.shape

	        self._validate_position_shape(x, position_ids)
	        self._validate_active_mask_shape(x, active_mask)

	        def split_heads(projected: torch.Tensor) -> torch.Tensor:
	            # (B, N, H*D) -> (B, H, N, D)
	            per_head = projected.view(
	                batch_size, query_len, self.num_heads, self.head_dim
	            )
	            return per_head.transpose(1, 2)

	        q = split_heads(self.q_proj(x))
	        k = split_heads(self.k_proj(x))
	        v = split_heads(self.v_proj(x))

	        q, k, attention_scaling = self.rope(q, k, position_ids)

	        if cache is None:
	            # Uncached: the visible frame is exactly the current chunk.
	            k_full, v_full = k, v
	            full_active_mask, full_positions = active_mask, position_ids
	        else:
	            # The cache hands back the current-step visible local frame, not
	            # merely the buffer it retains for the next step.
	            k_full, v_full, full_active_mask, full_positions = cache.update(
	                k, v, active_mask, position_ids
	            )

	        block_mask = self._make_block_mask(
	            active_mask=full_active_mask,
	            positions=full_positions,
	            batch_size=batch_size,
	            num_heads=self.num_heads,
	            query_len=query_len,
	            kv_len=k_full.shape[-2],
	            window_size=self.window_size,
	            device=x.device,
	        )

	        softmax_scale = attention_scaling / math.sqrt(self.head_dim)
	        attn_output = flex_attention(
	            q,
	            k_full,
	            v_full,
	            block_mask=block_mask,
	            scale=softmax_scale,
	        )

	        # (B, H, N, D) -> (B, N, H, D) -> (B, N, H*D), then project to model width.
	        merged_heads = attn_output.transpose(1, 2).contiguous()
	        merged_heads = merged_heads.view(batch_size, query_len, self.inner_dim)
	        return self.o_proj(merged_heads)

	    def _validate_position_shape(
	        self,
	        x: torch.Tensor,
	        position_ids: torch.Tensor,
	    ) -> None:
	        """Reject a position tensor whose shape is not `(B, N)` of `x`."""
	        if position_ids.shape == x.shape[:2]:
	            return
	        raise ValueError(
	            f"position_ids must have shape {tuple(x.shape[:2])}, "
	            f"got {tuple(position_ids.shape)}."
	        )

	    def _validate_active_mask_shape(
	        self,
	        x: torch.Tensor,
	        active_mask: torch.Tensor,
	    ) -> None:
	        """Reject a current-chunk mask that is not boolean with shape `(B, N)`.

	        The shape is checked before the dtype, so a mask that is wrong on both
	        counts reports the shape problem.
	        """
	        if active_mask.shape != x.shape[:2]:
	            raise ValueError(
	                f"active_mask must have shape {tuple(x.shape[:2])}, "
	                f"got {tuple(active_mask.shape)}."
	            )
	        if active_mask.dtype == torch.bool:
	            return
	        raise ValueError(
	            f"active_mask must have dtype torch.bool, got {active_mask.dtype}."
	        )

	    def _make_block_mask(
	        self,
	        active_mask: torch.Tensor,
	        positions: torch.Tensor,
	        batch_size: int,
	        num_heads: int,
	        query_len: int,
	        kv_len: int,
	        window_size: int,
	        device: torch.device,
	    ) -> Any:
	        """Build the FlexAttention block mask for masked local continuation.

	        `active_mask` and `positions` describe the whole key/value frame (length
	        `kv_len`). The queries are taken to be the last `query_len` slots of
	        that frame, so query row `q` is looked up at frame index
	        `kv_len - query_len + q`.

	        The frame is chronological in raw buffer order and may still contain
	        dead slots; liveness comes from `active_mask`. Causality and window
	        distance come from `positions`, the absolute sequence position of every
	        slot. Per the original design note, using absolute positions instead of
	        a cumsum over the active mask avoids a data-dependent computation that
	        blocks torch.compile.

	        A (query, key) pair is visible when both slots are active, the key
	        position does not exceed the query position, and the key lies fewer
	        than `window_size` positions behind the query.
	        """
	        first_query_slot = kv_len - query_len

	        def sliding_window_mask(
	            b: torch.Tensor,
	            h: torch.Tensor,
	            q_row: torch.Tensor,
	            kv_slot: torch.Tensor,
	        ) -> torch.Tensor:
	            q_slot = first_query_slot + q_row

	            both_active = active_mask[b, q_slot] & active_mask[b, kv_slot]

	            query_position = positions[b, q_slot]
	            key_position = positions[b, kv_slot]
	            not_in_future = key_position <= query_position
	            within_window = (query_position - key_position) < window_size

	            return both_active & not_in_future & within_window

	        return create_block_mask(
	            sliding_window_mask,
	            B=batch_size,
	            H=num_heads,
	            Q_LEN=query_len,
	            KV_LEN=kv_len,
	            device=device,
	        )
	# -----------
	# Inlined from: mosrah.py
	# -----------
	"""Full MoSRAH sparse path for SHRAM.

	This module coordinates the routed sparse attention path used inside the SHRAM
	hybrid attention layer. The underlying mechanics already live in verified
	subunits. The responsibility here is to connect those subunits without
	corrupting their bridge contracts.

	In particular, this path must preserve three architectural distinctions:

	- selected head indices are not routing probabilities
	- packed position semantics are chosen before BEA, not inside it
	- weighted reduction must consume the router's unbiased renormalized
	probabilities after token-choice order has been restored
	"""







	# -----------
	# Inlined from: bottlenecked_ensemble_attention.py
	# -----------
	"""Bottlenecked Ensemble Attention (BEA) for the MoSRAH sparse path.

	BEA is the packed expert-choice attention operator over the MoSRAH sparse path.
	It consumes packed expert-choice tensors, a supplied position tensor, an active
	token mask, and an optional layer-local MoSRAH cache. It returns outputs in the
	same packed expert-choice space expected by later unpacking.

	BEA does not compute positions and does not choose packed-position semantics.
	Those are supplied by the caller. If caching is used, BEA stores post-RoPE keys
	(K̃) and raw values (V) into the sparse cache and attends against the
	accumulated cached state returned by that cache.
	"""












	class BottleneckedEnsembleAttention(nn.Module):
	    """
	    Packed expert-choice attention operator for the MoSRAH sparse path.
	    Operates per-head independently on an ensemble of tokens.
	    FlexAttention saves flops on dead tokens.

	    Architectural properties:
	    - consumes packed expert-choice tensors of shape (B, L, T, d)
	    - uses independent per-head Q/K/V/O projection parameters
	    - applies YaRN-capable RoPE using supplied position_ids
	    - stores post-RoPE K̃ and raw V in MoSRAHCache when caching is enabled
	    - uses a fast fused attention path
	    - returns outputs in the same packed expert-choice space (B, L, T, d)

	    Args:
	        config: SHRAM config. The constructor reads `embedding_width`,
	            `num_mosrah_heads`, `head_dim`, `mosrah_rope_theta`, `rope_mode`,
	            `mosrah_packed_length`, `inference_sequence_length`, `scale`,
	            `alpha`, and `beta`.
	    """

	    def __init__(self, config: ShramConfig) -> None:
	        super().__init__()

	        # The model width is stored under the name `hidden_size` on this module
	        # but is read from `config.embedding_width`.
	        self.hidden_size = config.embedding_width
	        self.num_heads = config.num_mosrah_heads
	        self.head_dim = config.head_dim

	        # Independent per-head projections, stacked along dim 0. No cross-head
	        # parameter sharing.
	        self.q_proj = nn.Parameter(
	            torch.empty(self.num_heads, self.hidden_size, self.head_dim)
	        )
	        self.k_proj = nn.Parameter(
	            torch.empty(self.num_heads, self.hidden_size, self.head_dim)
	        )
	        self.v_proj = nn.Parameter(
	            torch.empty(self.num_heads, self.hidden_size, self.head_dim)
	        )
	        self.o_proj = nn.Parameter(
	            torch.empty(self.num_heads, self.head_dim, self.hidden_size)
	        )

	        self._reset_parameters()

	        # BEA uses the YaRN-capable RoPE path. The caller supplies the position
	        # tensor; this unit only consumes it. With config.scale == 1.0, YaRN
	        # reduces to standard RoPE.
	        #
	        # The required table size depends on position semantics:
	        #   main_sequence     — positions are original token positions, bounded
	        #                       by inference_sequence_length.
	        #   semantic_sequence — positions are local per-expert slot indices,
	        #                       bounded by mosrah_packed_length.
	        maximum_rope_length = (
	            config.mosrah_packed_length
	            if config.rope_mode == "semantic_sequence"
	            else config.inference_sequence_length
	        )
	        self.rope = RotaryEmbedding(
	            mode="yarn",
	            head_dim=self.head_dim,
	            theta=config.mosrah_rope_theta,
	            maximum_sequence_length=maximum_rope_length,
	            dilation=config.scale,
	            alpha=config.alpha,
	            beta=config.beta,
	        )

	    def forward(
	        self,
	        packed_embeddings: torch.Tensor,
	        position_ids: torch.Tensor,
	        active_mask: torch.Tensor,
	        cache: MoSRAHCache | None = None,
	    ) -> torch.Tensor:
	        """Apply BEA to packed expert-choice tensors.

	        Args:
	            packed_embeddings: Packed expert-choice hidden states of shape
	                (B, L, T, d).
	            position_ids: Supplied packed positions of shape (B, L, T).
	            active_mask: Boolean active-token mask of shape (B, L, T).
	            cache: Optional layer-local MoSRAH cache. When given, post-RoPE
	                keys and raw values are pushed into it and attention runs
	                against the key/value/mask tensors it returns.

	        Returns:
	            Packed expert-choice output tensor of shape (B, L, T, d).

	        Raises:
	            ValueError: If `packed_embeddings` does not have `num_heads` on
	                dim 1 or the model width on its last dim, or if
	                `position_ids` / `active_mask` do not have shape (B, L, T).
	        """
	        batch_size, _, query_length, _ = packed_embeddings.shape
	        self._validate_tensor_shape(packed_embeddings)
	        self._validate_position_shape(packed_embeddings, position_ids)
	        self._validate_active_mask_shape(packed_embeddings, active_mask)

	        # Independent per-head projections:
	        # (B, L, T, d) x (L, d, u) -> (B, L, T, u)
	        query_states = torch.einsum("bltd,ldu->bltu", packed_embeddings, self.q_proj)
	        key_states = torch.einsum("bltd,ldu->bltu", packed_embeddings, self.k_proj)
	        value_states = torch.einsum("bltd,ldu->bltu", packed_embeddings, self.v_proj)

	        rotated_query_states, rotated_key_states, attention_scaling = self.rope(
	            query_states,
	            key_states,
	            position_ids,
	        )

	        if cache is not None:
	            # In cached execution, the current query tensor uses local tensor
	            # rows 0..Q-1, but the key tensor returned by the cache is the full
	            # accumulated packed sequence for each (batch, head) slot. The only
	            # additional data needed to align those two views is the pre-update
	            # cached prefix length, which indicates how many tokens were already
	            # stored for that slot. It is cloned before update() so the value
	            # is not affected by the update.
	            num_tokens_processed = cache.get_heads_lengths().clone()
	            key_states, value_states, key_active_mask = cache.update(
	                rotated_key_states,
	                value_states,
	                active_mask,
	            )
	        else:
	            # Uncached: no prefix exists, keys are the current post-RoPE keys
	            # and values are the current raw value projections.
	            num_tokens_processed = torch.zeros(
	                batch_size,
	                self.num_heads,
	                dtype=torch.long,
	                device=packed_embeddings.device,
	            )
	            key_states = rotated_key_states
	            key_active_mask = active_mask

	        block_mask = self._make_block_mask(
	            query_active_mask=active_mask,
	            key_active_mask=key_active_mask,
	            num_tokens_processed=num_tokens_processed,
	            query_length=query_length,
	            key_length=key_states.shape[2],
	            device=packed_embeddings.device,
	        )

	        attended_states = flex_attention(
	            rotated_query_states,
	            key_states,
	            value_states,
	            block_mask=block_mask,
	            scale=attention_scaling / math.sqrt(self.head_dim),
	        )

	        # Project back to model width:
	        # (B, L, T, u) x (L, u, d) -> (B, L, T, d)
	        return torch.einsum("bltu,lud->bltd", attended_states, self.o_proj)

	    def _reset_parameters(self) -> None:
	        """Initialize every head's projection matrix with its own Xavier draw.

	        The projections are stacked 3-D tensors of shape (L, fan_in, fan_out).
	        Calling ``nn.init.xavier_uniform_`` on the whole stack would treat dim 0
	        as the output axis and the trailing dim as a receptive field, giving
	        fans of ``fan_in * fan_out`` and ``L * fan_out``. The bound would then
	        depend on the number of heads instead of on each head's own
	        (fan_in, fan_out) matrix. Initializing one head slice at a time gives
	        each independent projection the standard Xavier-uniform bound.
	        """
	        with torch.no_grad():
	            for stacked_weight in (self.q_proj, self.k_proj, self.v_proj, self.o_proj):
	                for head_index in range(self.num_heads):
	                    # stacked_weight[head_index] is a 2-D view, so the draw is
	                    # written straight into the parameter's storage.
	                    nn.init.xavier_uniform_(stacked_weight[head_index])

	    def _validate_tensor_shape(self, packed_embeddings: torch.Tensor) -> None:
	        """Validate the local packed-embedding shape contract required by BEA."""
	        if packed_embeddings.shape[1] != self.num_heads:
	            raise ValueError(
	                f"Expected packed_embeddings.shape[1] == num_mosrah_heads={self.num_heads}, "
	                f"got {packed_embeddings.shape[1]}."
	            )

	        if packed_embeddings.shape[-1] != self.hidden_size:
	            raise ValueError(
	                f"Expected packed_embeddings last dim == hidden_size={self.hidden_size}, "
	                f"got {packed_embeddings.shape[-1]}."
	            )

	    def _validate_position_shape(
	        self,
	        packed_embeddings: torch.Tensor,
	        position_ids: torch.Tensor,
	    ) -> None:
	        """Validate the supplied packed-position tensor shape."""
	        if position_ids.shape != packed_embeddings.shape[:3]:
	            raise ValueError(
	                f"position_ids must have shape {tuple(packed_embeddings.shape[:3])}, "
	                f"got {tuple(position_ids.shape)}."
	            )

	    def _validate_active_mask_shape(
	        self,
	        packed_embeddings: torch.Tensor,
	        active_mask: torch.Tensor,
	    ) -> None:
	        """Validate the supplied active-token mask shape (dtype is not checked)."""
	        if active_mask.shape != packed_embeddings.shape[:3]:
	            raise ValueError(
	                f"active_mask must have shape {tuple(packed_embeddings.shape[:3])}, "
	                f"got {tuple(active_mask.shape)}."
	            )

	    def _make_block_mask(
	        self,
	        query_active_mask: torch.Tensor,
	        key_active_mask: torch.Tensor,
	        num_tokens_processed: torch.Tensor,
	        query_length: int,
	        key_length: int,
	        device: torch.device,
	    ):
	        """Create the packed-sequence causal mask for FlexAttention.

	        At the root, causality is still triangular. The only nuance is cached
	        execution: query rows are indexed locally as 0..Q-1 inside the current
	        query tensor, but the key tensor may already contain a cached prefix
	        for that (batch, head) slot. The causal horizon for query tensor row q
	        is therefore:

	            num_tokens_processed[b, h] + q

	        Query and key activity masks are then composed with that triangular
	        rule so FlexAttention can skip padded query rows and ignore inactive
	        key slots.

	        Args:
	            query_active_mask: Mask over current queries, indexed (b, h, q).
	            key_active_mask: Mask over the key frame, indexed (b, h, k).
	            num_tokens_processed: Per-(batch, head) prefix lengths, indexed
	                (b, h); all zeros when no cache is used.
	            query_length: Number of query rows.
	            key_length: Number of key slots.
	            device: Device for the helper tensors and the block mask.

	        Returns:
	            The object produced by ``create_block_mask``.
	        """
	        batch_size, num_heads, _ = query_active_mask.shape

	        # Build the per-(batch, head, query_row) triangular horizon from a simple
	        # arange over query rows plus the cached prefix lengths for each slot.
	        relative_query_positions = torch.arange(
	            query_length,
	            device=device,
	            dtype=torch.long,
	        ).view(1, 1, query_length)
	        causal_query_positions = num_tokens_processed.unsqueeze(-1) + relative_query_positions

	        def packed_causal_mask(
	            batch_idx: torch.Tensor,
	            head_idx: torch.Tensor,
	            query_idx: torch.Tensor,
	            key_idx: torch.Tensor,
	        ) -> torch.Tensor:
	            query_is_active = query_active_mask[batch_idx, head_idx, query_idx]
	            key_is_active = key_active_mask[batch_idx, head_idx, key_idx]
	            is_causal = key_idx <= causal_query_positions[batch_idx, head_idx, query_idx]
	            return query_is_active & key_is_active & is_causal

	        return create_block_mask(
	            packed_causal_mask,
	            B=batch_size,
	            H=num_heads,
	            Q_LEN=query_length,
	            KV_LEN=key_length,
	            device=device,
	        )
	# -----------
	# Inlined from: expert_packing.py
	# -----------
	"""Expert packing and unpacking for the MoSRAH path.

	This module owns the token-choice -> expert-choice -> token-choice conversion
	boundary used by the sparse routed attention path. Its public behavior is fixed:

	- setup_packing() prepares the auxiliary ordering data forwarded through packing
	and unpacking.
	- pack_experts() converts routed token-choice tensors into padded expert-choice
	tensors.
	- unpack_experts() restores token-choice ordering from padded expert-choice output.

	Packed expert-choice tensors are expert-major and left-justified. For each expert,
	routed token copies occupy the prefix of that expert's packed block; padding occupies
	the suffix. Every packed entry uses the same ordering and transfer artifact, so
	hidden states, positions, masks, and probabilities remain aligned across the boundary.

	pack_experts() returns a flat transfer index together with the packed entries. This
	index replaces the old boolean unpacking artifact as the source of truth for
	pack/unpack data movement: packing writes to those flat packed slots, and unpacking
	reads from those same slots.
	"""





	# ---------------------------------------------------------------------------
	# Setup
	# ---------------------------------------------------------------------------

	def setup_packing(
	    selected_heads: torch.Tensor,
	) -> dict[str, torch.Tensor]:
	    """Build the ordering payload shared by pack_experts and unpack_experts.

	    Args:
	        selected_heads: Routed token-choice head selections I of shape (B, N, K).

	    Returns:
	        Dict forwarded unchanged to pack_experts and unpack_experts, with keys:
	            - "flattened_selected_heads": selections reshaped to (B, N*K)
	            - "permutation": stable expert-major sort order, shape (B, N*K)
	            - "inverse_permutation": order that undoes "permutation", shape (B, N*K)
	    """
	    batch_size, sequence_length, num_selected_heads = selected_heads.shape
	    routed_copies = sequence_length * num_selected_heads
	    heads_per_copy = selected_heads.reshape(batch_size, routed_copies)

	    # The sort must be stable: copies routed to the same expert have to keep
	    # their source-token order, because BEA later applies a triangular causal
	    # mask inside each expert bucket and relies on that ordering.
	    expert_major_order = torch.argsort(heads_per_copy, dim=-1, stable=True)

	    # Sorting the permutation itself yields the order that reverses it.
	    token_major_order = torch.argsort(expert_major_order, dim=-1)

	    payload: dict[str, torch.Tensor] = {}
	    payload["flattened_selected_heads"] = heads_per_copy
	    payload["permutation"] = expert_major_order
	    payload["inverse_permutation"] = token_major_order
	    return payload


	# ---------------------------------------------------------------------------
	# Packing
	# ---------------------------------------------------------------------------

	def pack_experts(
	    entries: dict[str, tuple[torch.Tensor, Any]],
	    setup: dict[str, torch.Tensor],
	    selected_heads: torch.Tensor,
	    num_experts: int,
	    packed_length: int,
	) -> tuple[dict[str, torch.Tensor], torch.Tensor]:
	    """Pack token-choice tensors into expert-choice padded form.

	    Args:
	        entries: Mapping from string keys to (tensor, padding_value) pairs. Each
	            tensor has shape (B, N, ...) and is rearranged into expert-choice layout
	            (B, L, T, ...). The returned dict carries the same keys.
	        setup: Auxiliary payload returned by setup_packing().
	        selected_heads: Routed head selections I of shape (B, N, K).
	        num_experts: Total number of experts L.
	        packed_length: Static packed time dimension T. All per-expert buffers are
	            allocated to exactly this length.

	    Returns:
	        Tuple of:
	            - packed_entries: Dict with same keys as entries; each value is the
	              packed tensor of shape (B, L, T, ...).
	            - flat_packed_transfer_indices: Long tensor of shape (BNK,). Each value
	              is the flattened padded expert-choice slot occupied by the corresponding
	              routed-copy row. Pass this to unpack_experts().

	    Raises:
	        RuntimeError: In eager execution, when any per-expert token count exceeds
	            packed_length (raised by _enforce_no_overflow).
	    """
	    batch_size, sequence_length, num_selected_heads = selected_heads.shape
	    num_routed_copies_per_batch = sequence_length * num_selected_heads
	    num_routed_copies = batch_size * num_routed_copies_per_batch

	    flattened_selected_heads = setup["flattened_selected_heads"]
	    permutation = setup["permutation"]

	    # -----------------------------------------------------------------------
	    # Algorithm overview.
	    #
	    # Packing first builds one routed-copy row for each selected token/expert
	    # pair, ordered by the stable expert-major permutation. Those rows contain
	    # no padding. The final packed tensor reserves packed_length slots per expert.
	    # The flat transfer index bridges those layouts by adding back the cumulative
	    # padding skipped before each expert block.
	    # -----------------------------------------------------------------------

	    # -----------------------------------------------------------------------
	    # Build the shared routed-copy source rows.
	    #
	    # This tensor identifies the source token row for each selected token/expert
	    # pair after the stable expert-major permutation. Every packed entry uses this
	    # same row plan, so all entries remain aligned before padded materialization.
	    # -----------------------------------------------------------------------
	    source_token_indices = torch.arange(
	        sequence_length,
	        device=flattened_selected_heads.device,
	        dtype=torch.long,
	    ).view(1, sequence_length, 1).expand(
	        batch_size,
	        sequence_length,
	        num_selected_heads,
	    )
	    flattened_source_token_indices = source_token_indices.reshape(
	        batch_size,
	        num_routed_copies_per_batch,
	    )
	    sorted_source_token_indices = flattened_source_token_indices.gather(
	        dim=1,
	        index=permutation,
	    )

	    # -----------------------------------------------------------------------
	    # Establish packed expert occupancy and capacity.
	    #
	    # tokens_per_expert tells how many routed-copy rows occupy the prefix of each
	    # expert block. The padded layout is valid only when every prefix fits inside
	    # the configured packed_length.
	    # -----------------------------------------------------------------------
	    tokens_per_expert = _count_tokens_per_expert(flattened_selected_heads, num_experts)
	    _enforce_no_overflow(tokens_per_expert, packed_length)

	    # -----------------------------------------------------------------------
	    # Build the flat insertion points for the padded expert frame.
	    #
	    # Routed-copy rows omit padding, while the packed frame reserves packed_length
	    # slots for every expert. The transfer index adds back the cumulative padding
	    # skipped before each expert block, producing one flat destination slot for
	    # every routed-copy row. This tensor is forwarded to unpack_experts so removal
	    # uses the same positions that insertion used.
	    # -----------------------------------------------------------------------
	    flat_tokens_per_expert = tokens_per_expert.reshape(-1)
	    flat_padding_per_expert = packed_length - flat_tokens_per_expert
	    # Exclusive cumulative sum: padding that precedes each expert block.
	    flat_padding_before_expert = (
	        flat_padding_per_expert.cumsum(dim=0) - flat_padding_per_expert
	    )

	    flat_padding_for_routed_rows = torch.repeat_interleave(
	        flat_padding_before_expert,
	        flat_tokens_per_expert,
	        output_size=num_routed_copies,
	    )
	    flat_routed_row_indices = torch.arange(
	        num_routed_copies,
	        device=flattened_selected_heads.device,
	        dtype=torch.long,
	    )
	    flat_packed_transfer_indices = (
	        flat_routed_row_indices + flat_padding_for_routed_rows
	    )

	    # -----------------------------------------------------------------------
	    # Materialize each entry through the shared routing and transfer artifacts.
	    #
	    # Each entry first gathers into the shared routed-copy order. The flat packed
	    # allocation supplies padding, and the transfer index writes each routed-copy
	    # row into its padded expert slot before the public shape is restored.
	    # -----------------------------------------------------------------------
	    packed_entries: dict[str, torch.Tensor] = {}
	    for key, (tensor, padding_value) in entries.items():
	        extra_shape = tensor.shape[2:]

	        # The sorted source index is shared across all entries; expanding it over
	        # trailing dimensions lets the same routing/order plan apply to hidden
	        # states, positions, masks, probabilities, and any other packed tensor.
	        # One singleton dim is appended per trailing dim so expand() can
	        # broadcast the (B, N*K) index up to (B, N*K, *extra_shape).
	        sorted_gather_indices = sorted_source_token_indices.view(
	            batch_size,
	            num_routed_copies_per_batch,
	            *((1,) * len(extra_shape)),
	        ).expand(-1, -1, *extra_shape)
	        sorted_tensor = tensor.gather(dim=1, index=sorted_gather_indices)

	        packed_tensor = tensor.new_full(
	            (batch_size * num_experts * packed_length, *extra_shape),
	            fill_value=padding_value,
	        )
	        packed_tensor[flat_packed_transfer_indices] = sorted_tensor.reshape(
	            num_routed_copies,
	            *extra_shape,
	        )
	        packed_entries[key] = packed_tensor.reshape(
	            batch_size,
	            num_experts,
	            packed_length,
	            *extra_shape,
	        )

	    return packed_entries, flat_packed_transfer_indices


	# ---------------------------------------------------------------------------
	# Unpacking
	# ---------------------------------------------------------------------------

	def unpack_experts(
	    expert_outputs: torch.Tensor,
	    setup: dict[str, torch.Tensor],
	    flat_packed_transfer_indices: torch.Tensor,
	    selected_heads: torch.Tensor,
	) -> torch.Tensor:
	    """Convert BEA expert-choice output back into token-choice order.

	    Args:
	        expert_outputs: Expert-choice BEA output y of shape (B, L, T, d).
	        setup: Auxiliary payload returned by setup_packing(); only its
	            "inverse_permutation" entry is read here.
	        flat_packed_transfer_indices: Transfer index returned by pack_experts().
	            Each value names one routed-copy slot in the flattened padded
	            expert-choice frame.
	        selected_heads: Routed head selections I of shape (B, N, K); used for
	            its shape only.

	    Returns:
	        Restored token-choice tensor y_tilde of shape (B, N, K, d).
	    """
	    undo_order = setup["inverse_permutation"]
	    batch_size, sequence_length, num_selected_heads = selected_heads.shape
	    width = expert_outputs.shape[-1]

	    # Step 1: drop padding by reading exactly the flat slots that packing wrote.
	    slot_rows = expert_outputs.reshape(-1, width)
	    routed_rows = slot_rows[flat_packed_transfer_indices]

	    # Step 2: regroup the rows per batch item; they are still expert-major.
	    expert_major = routed_rows.reshape(
	        batch_size,
	        sequence_length * num_selected_heads,
	        width,
	    )

	    # Step 3: apply the inverse routing permutation to return to token order.
	    gather_index = undo_order[..., None].expand(-1, -1, width)
	    token_major = expert_major.gather(dim=1, index=gather_index)

	    return token_major.reshape(
	        batch_size,
	        sequence_length,
	        num_selected_heads,
	        width,
	    )


	# ---------------------------------------------------------------------------
	# Helpers
	# ---------------------------------------------------------------------------


	def _enforce_no_overflow(tokens_per_expert: torch.Tensor, packed_length: int) -> None:
	"""Enforce that no expert bucket exceeds the preallocated packed length.

	This check fires when the number of tokens assigned to any expert in any batch
	item exceeds mosrah_packed_length. When that limit is exceeded, the packed buffer
	is too small to hold all assignments and data would be dropped. Reduce the input
	sequence length or increase training_sequence_length (for training) or
	inference_sequence_length (for inference) in ShramConfig to resolve.

	Args:
	tokens_per_expert: Per-expert token counts, shape (B, num_experts).
	packed_length: The preallocated packed time dimension.
	"""
	if torch.compiler.is_compiling():
	torch._assert_async(
	tokens_per_expert.max() <= packed_length,
	"Expert packing overflow: expert bucket exceeds mosrah_packed_length. "
	"Reduce sequence length or increase training_sequence_length / "
	"inference_sequence_length in ShramConfig.",
	)
	else:
	max_count = tokens_per_expert.max().item()
	if max_count > packed_length:
	raise RuntimeError(
	"Expert packing overflow: at least one expert bucket contains more "
	"tokens than mosrah_packed_length allows. Reduce sequence length or "
	"increase training_sequence_length / inference_sequence_length in "
	"ShramConfig to resolve.\n"
	f"Packed length: {packed_length}\n"
	f"Head lengths: {tokens_per_expert}\n"
	)


	def _count_tokens_per_expert(
	    flattened_selected_heads: torch.Tensor,
	    num_experts: int,
	) -> torch.Tensor:
	    """Tally routed token copies per expert for every batch item.

	    A zeroed (B, num_experts) long buffer receives one increment per entry of
	    flattened_selected_heads, added at the column named by that entry.

	    Args:
	        flattened_selected_heads: Expert assignments of shape (B, N*K) with values
	            in [0, num_experts).
	        num_experts: Total number of experts L.

	    Returns:
	        Counts tensor of shape (B, num_experts), dtype long.
	    """
	    counts = torch.zeros(
	        (flattened_selected_heads.shape[0], num_experts),
	        dtype=torch.long,
	        device=flattened_selected_heads.device,
	    )
	    unit_increments = torch.ones_like(flattened_selected_heads, dtype=torch.long)
	    # scatter_add_ works in place and returns the same buffer.
	    return counts.scatter_add_(1, flattened_selected_heads, unit_increments)
	# -----------
	# Inlined from: router.py
	# -----------
	"""Token-choice router for the MoSRAH sparse attention path.

	This module implements mechanically load-balanced routing for MoSRAH. Given an
	input hidden state x, the router produces two outputs used downstream:

	- selected_heads (I): which K of the L available expert heads each token
	routes to, determined by a block-balanced causal solver.
	- routing_probs (P): the weights used for the weighted output reduction,
	gathered from the softmax routing scores at the selected indices and
	renormalized to sum to 1 per token.

	Routing uses a single learnable projection:

	- routing_weight: shape (L, embedding_width). Maps input to per-head routing
	scores. Task loss trains this parameter through routing_probs; regret_loss
	trains it to prefer expert assignments at positions of peak preference.

	Block-balanced routing partitions the sequence into non-overlapping blocks of
	W = L/K tokens. Within each block every expert is assigned to exactly one token,
	guaranteeing perfect load balance by construction. The L % K == 0 compatibility
	constraint (enforced in ShramConfig) makes W an exact integer.

	Selection is causal within each block: at each of the W steps the current
	token chooses its K experts from those not yet claimed by earlier tokens in
	the same block. All W steps execute in parallel across blocks and batch via
	a fully-unrolled Python for loop, keeping the compiled graph flat.

	Paper ref: Appendix A.Routing.
	"""









	class MoSRAHRouter(nn.Module):
	    """Token-choice router for MoSRAH sparse attention.

	    Each input token independently selects K of the L available expert heads
	    through a block-balanced causal solver. Within each block of W = L/K
	    consecutive tokens every expert is used exactly once, giving perfect load
	    balance by construction.

	    routing_weight is nn.Parameter rather than nn.Linear so that HuggingFace
	    _init_weights does not override its kaiming initialization at construction.

	    Attributes:
	        routing_weight: Shape (L, embedding_width). Maps input hidden states to
	            per-head routing scores.
	        block_length: Tokens per routing block W = L / K. Within each block
	            every expert is used exactly once.

	    Args:
	        config: Model configuration. Must expose ``embedding_width``,
	            ``num_mosrah_heads`` (L), ``num_selected_heads`` (K), and
	            ``block_length`` (W).
	    """

	    def __init__(self, config: ShramConfig) -> None:
	        super().__init__()
	        self.num_mosrah_heads = config.num_mosrah_heads
	        self.num_selected_heads = config.num_selected_heads
	        self.block_length = config.block_length

	        # Routing projection: maps input (B, N, d) to per-head routing scores (B, N, L).
	        # nn.Parameter ensures HuggingFace _init_weights does not override kaiming init.
	        self.routing_weight = nn.Parameter(
	            torch.empty(config.num_mosrah_heads, config.embedding_width)
	        )
	        nn.init.kaiming_normal_(self.routing_weight)

	    def forward(
	        self,
	        x: torch.Tensor,
	        active_mask: torch.Tensor,
	        router_cache: RouterCache | None = None,
	    ) -> tuple[torch.Tensor, torch.Tensor, dict[str, torch.Tensor]]:
	        """Route input tokens to K expert heads each and compute routing probabilities.

	        Args:
	            x: Input hidden states of shape (batch, seq_len, embedding_width).
	            active_mask: Current-chunk active mask of shape (batch, seq_len), where
	                True marks a semantically live token. Dead tokens do not contribute
	                to regret_loss or logit_regret.
	            router_cache: Optional routing cache. When None, the call runs in
	                training mode with no cache interaction. When supplied with
	                seq_len > 1 (prefill), the solver's block assignments are handed
	                to ``update_prefill``. When supplied with seq_len == 1 (decode),
	                ``get_used_in_block`` provides the claimed-expert mask and
	                ``update_decode`` records this step's selections.

	        Returns:
	            selected_heads: Head indices I of shape (batch, seq_len, num_selected_heads).
	                Each token's K selected head indices from the block-balanced solver.
	            routing_probs: Routing probabilities P of shape (batch, seq_len,
	                num_selected_heads). Gathered from the pre-balance softmax at
	                selected_heads and renormalized to sum to 1 per token.
	            router_diagnostics: Dict of routing scalars:
	                - ``regret_loss``: gradient-carrying mean regret, mean of
	                  max(p_max_active − p_chosen, 0) over live (B, num_blocks, L)
	                  entries. In [0, 1]. Zero when every expert is assigned at its
	                  peak-preference token within the block. Always zero in decode
	                  mode.
	                - ``logit_regret``: detached logit-space regret; same formula
	                  applied to routing logits rather than softmax probabilities.
	                  In [0, ∞). Monitoring only. Always zero in decode mode.
	                - ``logit_std``: detached mean per-token std of routing logits.
	        """
	        # ── Algorithm overview ──────────────────────────────────────────────────────
	        #
	        # Problem: each token independently selects its top-K heads with no knowledge
	        # of what other tokens in the same sequence will choose. Independent selection
	        # means a single popular head can be chosen by every token while another is
	        # never used — statistics-based corrections (auxiliary losses, bias vectors)
	        # can only push routing probabilistically and have proven unstable when tuned
	        # strongly enough to prevent degeneracy.
	        #
	        # Approach: the compatibility constraint L % K == 0 (enforced in ShramConfig)
	        # makes W = L / K an exact integer. A block of W consecutive tokens contains
	        # exactly W × K = L selection slots — one per expert. Enforcing that each
	        # expert is used exactly once per block makes the block perfectly balanced by
	        # construction, eliminating any need for auxiliary losses or correction steps.
	        # Enforcement is causal: at each of the W steps the current position picks its
	        # K experts from those not yet claimed earlier in the same block, by masking
	        # claimed experts with -inf before top-K. All W steps run simultaneously across
	        # blocks and batch via a Python for loop that is fully unrolled at compile time.

	        B, N, _ = x.shape
	        L = self.num_mosrah_heads
	        K = self.num_selected_heads
	        W = self.block_length

	        # ── Phase: pre-balance scoring ─────────────────────────────────────────
	        #
	        # Establishes the clean routing distribution before any -inf masking.
	        # logit_std is captured here because the block solver's masking would
	        # corrupt the standard deviation. routing_scores is used both for
	        # regret_loss and for the final routing_probs.
	        routing_logits = self._compute_routing_logits(x)  # (B, N, L)
	        logit_std = routing_logits.std(dim=-1).mean().detach()
	        routing_scores = F.softmax(routing_logits, dim=-1)  # (B, N, L)

	        # ── Phase: block-balanced causal selection ─────────────────────────────
	        #
	        # Three execution modes, distinguished by router_cache and sequence length:
	        #
	        # Training (router_cache is None): the full sequence is available. All W
	        # steps of the block solver run simultaneously across every block in the
	        # sequence. No cache interaction.
	        #
	        # Prefill (router_cache is not None, N > 1): identical to training, but
	        # the partial last-block state is written to the cache so decode steps can
	        # continue within the same block without a gap.
	        #
	        # Decode (router_cache is not None, N == 1): one token arrives at a known
	        # position within the current block. The cached used_in_block mask is
	        # applied before TopK to enforce the one-usage-per-block contract, then
	        # the cache is updated in-place with this step's selections.

	        if router_cache is not None and N == 1:
	            # ── Decode mode ───────────────────────────────────────────────────
	            #
	            # Single token; the claimed-expert state for the current block comes
	            # from the cache. Regret is not computed for decode steps: the regret
	            # phase below returns zeros for this mode without calling
	            # _compute_regret, so no blocked assignment tensor is built here.
	            used_in_block = router_cache.get_used_in_block()  # (B, L)
	            step_logits = routing_logits[:, 0, :]  # (B, L)
	            available = step_logits.masked_fill(used_in_block, float('-inf'))
	            step_heads = available.topk(K, dim=-1).indices  # (B, K)

	            router_cache.update_decode(step_heads)

	            selected_heads = step_heads.unsqueeze(1)  # (B, 1, K)
	        else:
	            # ── Training / prefill mode ───────────────────────────────────────
	            #
	            # The full N-token sequence is available. Padding extends it to a
	            # multiple of W; padded tokens occupy the tail of the last block and
	            # never consume experts needed by real tokens because the real tokens
	            # preceding them have already had their pick each step. The pad is
	            # discarded after the solver.
	            num_blocks = (N + W - 1) // W
	            N_pad = num_blocks * W
	            pad_len = N_pad - N

	            if pad_len > 0:
	                padded_logits = torch.cat(
	                    [routing_logits, routing_logits.new_zeros(B, pad_len, L)], dim=1
	                )  # (B, N_pad, L)
	            else:
	                padded_logits = routing_logits

	            blocked_logits = padded_logits.view(B, num_blocks, W, L)  # (B, blk, W, L)

	            # used_in_block tracks which experts have been claimed within each block.
	            # No gradient here — expert availability is a hard structural constraint,
	            # not a differentiable quantity. Gradient flows through routing_probs.
	            used_in_block = torch.zeros(B, num_blocks, L, dtype=torch.bool, device=x.device)
	            step_heads_list = []

	            for step in range(W):
	                step_logits = blocked_logits[:, :, step, :]  # (B, blk, L)

	                # Claimed experts receive -inf so top-K never selects them.
	                available = step_logits.masked_fill(used_in_block, float('-inf'))
	                step_heads = available.topk(K, dim=-1).indices  # (B, blk, K)
	                step_heads_list.append(step_heads)

	                # Mark the K chosen experts as unavailable for the rest of this block.
	                used_in_block = used_in_block.scatter(-1, step_heads, True)

	            # Stack W steps and reshape to (B, N_pad, K), then unpad.
	            selected_heads_blocked = torch.stack(step_heads_list, dim=2)  # (B, blk, W, K)
	            selected_heads = selected_heads_blocked.view(B, N_pad, K)[:, :N, :]  # (B, N, K)

	            if router_cache is not None:
	                # Prefill: persist the partial last-block state so decode steps
	                # that follow can continue within the same block.
	                router_cache.update_prefill(selected_heads_blocked, N)

	        # ── Phase: regret loss ─────────────────────────────────────────────────
	        #
	        # Regret measures how much routing preference was sacrificed at each expert
	        # assignment relative to the peak active preference within the same block.
	        # A non-zero regret at expert l in block bl means some other active token
	        # in that block would have preferred expert l more than the one assigned.
	        # Minimising regret trains the router to save experts for the tokens that
	        # want them most.
	        #
	        # Decode mode returns zeros: regret is only defined over complete W-token
	        # blocks, and a single decode step is not a complete block. Backward is
	        # never called during inference so the zero is a correct no-op.
	        if router_cache is not None and N == 1:
	            regret_loss = routing_logits.new_zeros(())
	            logit_regret = routing_logits.new_zeros(()).detach()
	        else:
	            regret_loss, logit_regret = self._compute_regret(
	                routing_scores,
	                routing_logits,
	                selected_heads_blocked,
	                active_mask,
	            )

	        # ── Phase: routing probabilities ────────────────────────────────────────
	        #
	        # Gathered from the pre-balance routing_scores to reflect genuine routing
	        # preference; renormalized so they sum to 1 per token.
	        gathered = routing_scores.gather(dim=-1, index=selected_heads)  # (B, N, K)
	        routing_probs = gathered / gathered.sum(dim=-1, keepdim=True)  # (B, N, K)

	        router_diagnostics = {
	            "regret_loss": regret_loss,
	            "logit_regret": logit_regret,
	            "logit_std": logit_std,
	        }
	        return selected_heads, routing_probs, router_diagnostics

	    def _compute_routing_logits(self, x: torch.Tensor) -> torch.Tensor:
	        """Compute per-head routing logits from input hidden states.

	        Args:
	            x: Input hidden states, shape (batch, seq_len, embedding_width).

	        Returns:
	            Routing logits, shape (batch, seq_len, num_mosrah_heads).
	        """
	        return F.linear(x, self.routing_weight)  # (B, N, L)

	    @staticmethod
	    def _compute_regret(
	        routing_scores: torch.Tensor,
	        routing_logits: torch.Tensor,
	        selected_heads_blocked: torch.Tensor,
	        active_mask: torch.Tensor,
	    ) -> tuple[torch.Tensor, torch.Tensor]:
	        """Compute regret_loss and logit_regret from a completed block assignment.

	        Regret at expert l in block bl = max(p_max_active − p_chosen, 0), where
	        p_max_active is the highest routing probability any active token holds for
	        expert l within the block, and p_chosen is the routing probability of the
	        token actually assigned to expert l (0 if that token is dead).

	        regret_loss is the mean over live (batch, block, expert) triples. A block is
	        live iff it contains at least one active token; all L experts in a live block
	        contribute. Result is in [0, 1].

	        logit_regret applies the same formula to routing_logits and is returned
	        detached — it is a monitoring scalar only, in [0, ∞).

	        Args:
	            routing_scores: Softmax routing probabilities, shape (B, N, L).
	                Gradient flows through this tensor into regret_loss.
	            routing_logits: Pre-softmax routing logits, shape (B, N, L).
	                Used only for the detached logit_regret.
	            selected_heads_blocked: Expert assignments from the block solver,
	                shape (B, num_blocks, W, K). Block geometry
	                (num_blocks, W) is derived from this shape.
	            active_mask: Boolean live-token mask, shape (B, N).

	        Returns:
	            regret_loss: Gradient-carrying scalar in [0, 1].
	            logit_regret: Detached scalar in [0, ∞).
	        """
	        B, num_blocks, W, _K = selected_heads_blocked.shape
	        L = routing_scores.shape[-1]
	        N = routing_scores.shape[1]
	        N_pad = num_blocks * W

	        # ── Reshape into block form ─────────────────────────────────────────
	        #
	        # Block geometry is read from selected_heads_blocked — no recomputation
	        # needed here. Padded tail positions receive zero scores and False
	        # activity; they do not contribute to any block metric.
	        if N_pad > N:
	            pad_len = N_pad - N
	            scores_blocked = torch.cat(
	                [routing_scores, routing_scores.new_zeros(B, pad_len, L)], dim=1
	            ).view(B, num_blocks, W, L)  # (B, nb, W, L)
	            logits_blocked = torch.cat(
	                [routing_logits, routing_logits.new_zeros(B, pad_len, L)], dim=1
	            ).view(B, num_blocks, W, L)  # (B, nb, W, L)
	            active_blocked = torch.cat(
	                [active_mask, active_mask.new_zeros(B, pad_len)], dim=1
	            ).view(B, num_blocks, W)  # (B, nb, W)
	        else:
	            scores_blocked = routing_scores.view(B, num_blocks, W, L)
	            logits_blocked = routing_logits.view(B, num_blocks, W, L)
	            active_blocked = active_mask.view(B, num_blocks, W)

	        active_float = active_blocked.float()  # (B, nb, W)
	        block_active = active_blocked.any(dim=-1)  # (B, nb)

	        # ── Assignment mask ─────────────────────────────────────────────────
	        #
	        # One-hot indicator of which token was assigned to each expert. Block
	        # balance guarantees exactly one entry per (b, bl, l) triple, so
	        # summing over W recovers exactly one score value per expert.
	        assignment_mask = scores_blocked.new_zeros(B, num_blocks, W, L)
	        assignment_mask.scatter_(dim=-1, index=selected_heads_blocked, value=1.0)
	        # (B, nb, W, L)

	        # ── Prob regret (gradient flows through routing_scores) ─────────────
	        #
	        # p_chosen: routing score at the assigned token, gated by active_float
	        # so dead assignments contribute 0 — the expert accrues full regret
	        # against the active maximum rather than no penalty.
	        # p_max: peak routing score over active tokens; dead tokens zeroed before
	        # max (safe because softmax outputs are non-negative).
	        p_chosen = (assignment_mask * active_float.unsqueeze(-1) * scores_blocked).sum(dim=2)
	        # (B, nb, L)
	        p_max = (active_float.unsqueeze(-1) * scores_blocked).max(dim=2).values
	        # (B, nb, L)

	        regret = (p_max - p_chosen).clamp(min=0.0)  # (B, nb, L)

	        # Mean over live (B, num_blocks, L) entries. Clamped to 1 for the
	        # all-dead edge case where the numerator is already 0.
	        num_live = block_active.float().sum()  # scalar
	        regret_loss = (
	            block_active.float().unsqueeze(-1) * regret
	        ).sum() / num_live.mul(L).clamp(min=1.0)

	        # ── Logit regret (detached monitoring) ──────────────────────────────
	        #
	        # Same formula applied to routing_logits. Dead tokens cannot be zeroed
	        # before max (logits may be negative), so they are masked to -inf;
	        # dead blocks are replaced with 0 before subtraction. Detached so it
	        # never influences any parameter during backward.
	        logit_chosen = (
	            assignment_mask * active_float.unsqueeze(-1) * logits_blocked
	        ).sum(dim=2)  # (B, nb, L)

	        logit_max = logits_blocked.masked_fill(
	            ~active_blocked.unsqueeze(-1), float('-inf')
	        ).max(dim=2).values  # (B, nb, L)
	        logit_max = logit_max.masked_fill(~block_active.unsqueeze(-1), 0.0)

	        logit_regret = (
	            block_active.float().unsqueeze(-1) * (logit_max - logit_chosen).clamp(min=0.0)
	        ).sum() / num_live.mul(L).clamp(min=1.0)
	        logit_regret = logit_regret.detach()

	        return regret_loss, logit_regret

	# -----------
	# Inlined from: positions_converter.py
	# -----------
	"""Position computation for the MoSRAH sparse path.

	This layer computes the packed position tensor P consumed by BEA.

	- In main-sequence mode, P is the packed original-token position tensor from the
	packing path.
	- In semantic-sequence mode, P is a per-expert local sequence over the packed
	expert-choice layout, optionally offset by the current sparse-cache occupancies
	during cached inference.
	"""








	class SparseMoSRAHPositions(nn.Module):
	    """Choose the packed RoPE positions that BEA consumes on the MoSRAH path.

	    Everything here happens in the packed expert-choice frame. The incoming
	    ``packed_positions`` tensor always carries the packed original-token
	    positions; ``config.rope_mode`` decides whether those are passed through
	    (``"main_sequence"``) or swapped for a per-expert local slot index
	    (``"semantic_sequence"``).
	    """

	    def __init__(self, config: ShramConfig) -> None:
	        super().__init__()
	        # The mode string is stored as-is; unsupported values only surface
	        # when forward() is called.
	        self.rope_mode = config.rope_mode

	    def forward(
	        self,
	        packed_positions: torch.Tensor,
	        active_mask: torch.Tensor,
	        cache: MoSRAHCache | None,
	    ) -> torch.Tensor:
	        """Build the packed position tensor P handed to BEA.

	        Args:
	            packed_positions: Packed original-token positions J', shape (B, L, T).
	            active_mask: Boolean liveness mask, shape (B, L, T). Wherever it is
	                False the returned position is forced to 0, in either mode.
	            cache: Optional layer-local MoSRAH cache. Only consulted in
	                semantic-sequence mode, where its per-head occupancies shift
	                the local slot index.

	        Returns:
	            Packed position tensor P, shape (B, L, T).

	        Raises:
	            NotImplementedError: If ``rope_mode`` is neither
	                ``"main_sequence"`` nor ``"semantic_sequence"``.
	        """
	        mode = self.rope_mode
	        if mode == "semantic_sequence":
	            resolved = self._semantic_sequence_positions(packed_positions, cache)
	        elif mode == "main_sequence":
	            resolved = self._main_sequence_positions(packed_positions)
	        else:
	            raise NotImplementedError(
	                f"Unsupported MoSRAH rope_mode '{self.rope_mode}'."
	            )

	        # Inactive slots get position 0 so their value can never fall outside
	        # whatever RoPE table is in use.
	        zeros = torch.zeros_like(resolved)
	        return torch.where(active_mask, resolved, zeros)

	    def _main_sequence_positions(
	        self,
	        packed_positions: torch.Tensor,
	    ) -> torch.Tensor:
	        """Main-sequence mode: hand back the packed positions untouched."""
	        return packed_positions

	    def _semantic_sequence_positions(
	        self,
	        packed_positions: torch.Tensor,
	        cache: MoSRAHCache | None,
	    ) -> torch.Tensor:
	        """Semantic-sequence mode: per-expert local slot indices.

	        With no cache the result is 0, 1, 2, ... along the last (T) axis,
	        repeated for every batch item and expert. With a cache the same ramp
	        is shifted by ``cache.get_heads_lengths()``, one offset per
	        (batch, expert) pair.
	        """
	        batch_size, num_experts, packed_length = packed_positions.shape

	        # Ramp over the expert-local axis, in the same dtype/device as the
	        # input, then broadcast (as a view) over batch and experts.
	        ramp = torch.arange(
	            packed_length,
	            device=packed_positions.device,
	            dtype=packed_positions.dtype,
	        )
	        local_positions = ramp.view(1, 1, packed_length).expand(
	            batch_size, num_experts, packed_length
	        )

	        if cache is not None:
	            # Continue counting from what the sparse cache already holds
	            # rather than restarting each chunk at zero.
	            occupancy = cache.get_heads_lengths().to(
	                device=packed_positions.device,
	                dtype=packed_positions.dtype,
	            )
	            return local_positions + occupancy.unsqueeze(-1)

	        return local_positions


	class MoSRAHLayer(nn.Module):
	    """Routed sparse-attention path of SHRAM.

	    Takes model-space hidden states plus per-token positions, routes each
	    token to experts, runs attention in the packed expert-choice frame, and
	    folds the result back into model space. Alongside the output it returns
	    the router's diagnostics dict.
	    """

	    def __init__(self, config: ShramConfig) -> None:
	        super().__init__()
	        self.num_experts = config.num_mosrah_heads
	        # The packed length is taken from ``mosrah_cache_length`` when
	        # ``config.use_cache`` is truthy, otherwise from ``mosrah_packed_length``.
	        self.packed_length = (
	            config.mosrah_cache_length
	            if config.use_cache
	            else config.mosrah_packed_length
	        )

	        self.router = MoSRAHRouter(config)
	        self.positions = SparseMoSRAHPositions(config)
	        self.bea = BottleneckedEnsembleAttention(config)

	    def num_mosrah_parameters(self) -> int:
	        """Return the summed element count of every parameter in this layer.

	        No ``requires_grad`` filter is applied, so frozen parameters are
	        included in the count.
	        """
	        return sum(param.numel() for param in self.parameters())

	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        position_ids: torch.Tensor,
	        active_mask: torch.Tensor,
	        cache: MoSRAHCache | None,
	        router_cache: RouterCache | None = None,
	    ) -> tuple[torch.Tensor, dict[str, torch.Tensor]]:
	        """Execute route -> pack -> attend -> unpack -> reduce.

	        Args:
	            hidden_states: Model-space hidden states x, shape (B, N, d).
	            position_ids: Per-token positions, shape (B, N).
	            active_mask: Current-chunk liveness mask, shape (B, N); True marks
	                a semantically live token. It is given to the router and is
	                also packed so liveness survives into the expert-choice frame.
	            cache: Layer-local MoSRAH cache, or None for uncached execution.
	            router_cache: Optional router-side cache, forwarded to the router
	                as its third positional argument.

	        Returns:
	            sparse_output: Model-space sparse-path output, shape (B, N, d).
	            router_diagnostics: Dict of router feedback scalars. Keys:
	                ``regret_loss`` (has grad), ``logit_regret`` (detached),
	                ``logit_std`` (detached). See MoSRAHRouter for semantics.
	        """
	        # -------------------------------------------------------------------
	        # Token-choice -> expert-choice. The router picks the experts for each
	        # token and supplies the probabilities used in the final reduction.
	        # -------------------------------------------------------------------
	        selected_heads, routing_probs, router_diagnostics = self.router(
	            hidden_states, active_mask, router_cache
	        )

	        setup = setup_packing(selected_heads)
	        # Each entry is a (tensor, scalar) pair; the scalar is presumably the
	        # fill value for unoccupied packed slots -- confirm against pack_experts.
	        packed, unpacking_map = pack_experts(
	            {
	                "hidden_states": (hidden_states, 0.0),
	                "position_ids": (position_ids, 0),
	                "active_mask": (active_mask, False),
	            },
	            setup,
	            selected_heads,
	            self.num_experts,
	            self.packed_length,
	        )
	        # From here on the liveness mask is the packed one.
	        packed_mask = packed["active_mask"]

	        # -------------------------------------------------------------------
	        # Attention runs in the packed frame, so the RoPE positions are chosen
	        # there as well before BEA consumes them together with the cache.
	        # -------------------------------------------------------------------
	        bea_positions = self.positions(
	            packed_positions=packed["position_ids"],
	            active_mask=packed_mask,
	            cache=cache,
	        )
	        packed_outputs = self.bea(
	            packed_embeddings=packed["hidden_states"],
	            position_ids=bea_positions,
	            active_mask=packed_mask,
	            cache=cache,
	        )

	        # -------------------------------------------------------------------
	        # Expert-choice -> token-choice. routing_probs live in token-choice
	        # space, so unpacking has to happen before the weighted sum over the
	        # routed copies (dim 2).
	        # -------------------------------------------------------------------
	        token_choice_outputs = unpack_experts(
	            expert_outputs=packed_outputs,
	            setup=setup,
	            flat_packed_transfer_indices=unpacking_map,
	            selected_heads=selected_heads,
	        )
	        weighted = token_choice_outputs * routing_probs.unsqueeze(-1)

	        return weighted.sum(dim=2), router_diagnostics



	class SHRAMHybridLayer(nn.Module):
	    """Hybrid attention H(x) = h_l(x) + h_s(x) for a single decoder slot.

	    ``h_l`` is sliding-window causal attention (local path) and ``h_s`` is the
	    MoSRAH routed attention (sparse path). Both read the same model-space
	    hidden state and both return model-space tensors, so the two outputs are
	    simply added.
	    """

	    def __init__(self, config: ShramConfig) -> None:
	        super().__init__()
	        self.local_attention = SlidingWindowAttention(config)
	        self.sparse_attention = MoSRAHLayer(config)

	    def num_mosrah_parameters(self) -> int:
	        """Return the parameter count reported by the sparse (MoSRAH) path."""
	        return self.sparse_attention.num_mosrah_parameters()

	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        position_ids: torch.Tensor,
	        active_mask: torch.Tensor,
	        cache: ShramLayerCache | None,
	    ) -> tuple[torch.Tensor, dict[str, torch.Tensor]]:
	        """Run both attention paths and sum their outputs.

	        Args:
	            hidden_states: Input hidden states, shape (B, N, d).
	            position_ids: Token positions, shape (B, N).
	            active_mask: Current-chunk liveness mask, shape (B, N); True marks
	                a semantically live token. Passed unchanged to both paths.
	            cache: Optional per-layer SHRAM cache. When given, its
	                ``sliding_window_cache``, ``mosrah_cache`` and ``router_cache``
	                attributes are handed to the matching sub-path as-is.

	        Returns:
	            hybrid_output: Model-space output, shape (B, N, d).
	            router_diagnostics: The dict returned by MoSRAHLayer, unchanged.
	        """
	        # -------------------------------------------------------------------
	        # Cache dispatch: forward the exact sub-cache objects the layer cache
	        # already owns; nothing is wrapped or re-interpreted here.
	        # -------------------------------------------------------------------
	        if cache is not None:
	            sliding_window_cache = cache.sliding_window_cache
	            mosrah_cache = cache.mosrah_cache
	            router_cache = cache.router_cache
	        else:
	            sliding_window_cache = mosrah_cache = router_cache = None

	        # -------------------------------------------------------------------
	        # Both paths see the same hidden state for this decoder layer.
	        # -------------------------------------------------------------------
	        local_output = self.local_attention(
	            x=hidden_states,
	            position_ids=position_ids,
	            active_mask=active_mask,
	            cache=sliding_window_cache,
	        )
	        sparse_output, router_diagnostics = self.sparse_attention(
	            hidden_states=hidden_states,
	            position_ids=position_ids,
	            active_mask=active_mask,
	            cache=mosrah_cache,
	            router_cache=router_cache,
	        )

	        # Composition is a plain sum; no extra mixing happens at this level.
	        return local_output + sparse_output, router_diagnostics


	# -----------
	# Inlined from: mlp.py
	# -----------
	"""SwiGLU feed-forward sublayer.

	SwiGLU is a gated linear unit variant that multiplies a SiLU-gated projection
	element-wise against a separate up-projection:

	output = W_down(SiLU(W_gate(x)) ⊙ W_up(x))

	The gating mechanism gives the network more expressive control over which features
	to propagate than a plain two-matrix FFN. It requires three weight matrices instead
	of two, which is why intermediate_size in Llama 3 is set lower than the 4× multiplier
	typical of two-matrix FFNs — the total parameter count remains comparable.

	SiLU is used as the gate activation because Llama 3 committed to SwiGLU specifically
	— a fixed architectural choice.
	"""







	class SwiGLUMLP(nn.Module):
	    """SwiGLU feed-forward sublayer.

	    Implements the three-matrix gated FFN:

	        output = W_down(SiLU(W_gate(x)) ⊙ W_up(x))

	    None of the three projections has a bias term.

	    Args:
	        config: Model config. Must expose ``embedding_width`` (input/output
	            feature size) and ``mlp_width`` (inner projection size).
	    """

	    def __init__(self, config: PretrainedConfig) -> None:
	        super().__init__()
	        model_width = config.embedding_width
	        inner_width = config.mlp_width
	        # Creation order (gate, up, down) is kept so seeded initialisation
	        # and state-dict ordering stay the same.
	        self.gate_proj = nn.Linear(model_width, inner_width, bias=False)
	        self.up_proj = nn.Linear(model_width, inner_width, bias=False)
	        self.down_proj = nn.Linear(inner_width, model_width, bias=False)

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Apply the SwiGLU feed-forward transformation.

	        Args:
	            x: Input tensor whose last dimension is ``embedding_width``,
	                e.g. (batch, seq_len, embedding_width).

	        Returns:
	            Tensor of the same shape as ``x``.
	        """
	        # SiLU is reached through the already-imported ``nn`` namespace so the
	        # block does not depend on a separate functional alias being in scope.
	        gate = nn.functional.silu(self.gate_proj(x))
	        return self.down_proj(gate * self.up_proj(x))



	class DecoderLayer(nn.Module):
	    """One pre-norm SHRAM decoder block.

	    Structure: ``x + s_a * Attn(RMSNorm(x))`` followed by
	    ``h + s_m * MLP(RMSNorm(h))``, where attention is SHRAMHybridLayer and the
	    MLP is SwiGLUMLP. Each sublayer has its own RMSNorm.

	    The residual scales ``s_a`` / ``s_m`` are either learnable parameters
	    initialised to zero (``config.use_residual_gate`` truthy) or fixed buffers
	    holding ``1 / sqrt(config.num_decoder_layers)``.

	    Args:
	        config: SHRAM config. This class reads ``embedding_width``,
	            ``rms_norm_eps``, ``num_decoder_layers`` and ``use_residual_gate``;
	            the sublayers read their own fields.
	    """

	    def __init__(self, config: ShramConfig) -> None:
	        super().__init__()
	        width = config.embedding_width
	        eps = config.rms_norm_eps
	        self.attn_norm = nn.RMSNorm(width, eps=eps)
	        self.mlp_norm = nn.RMSNorm(width, eps=eps)
	        self.attention = SHRAMHybridLayer(config)
	        self.mlp = SwiGLUMLP(config)

	        fixed_scale = 1.0 / math.sqrt(config.num_decoder_layers)
	        if not config.use_residual_gate:
	            # Constant depth-dependent scaling, stored as buffers.
	            self.register_buffer(
	                "attn_residual_scale", torch.full((1,), fixed_scale)
	            )
	            self.register_buffer(
	                "mlp_residual_scale", torch.full((1,), fixed_scale)
	            )
	        else:
	            # Learnable gates, zero at initialisation.
	            self.attn_residual_scale = nn.Parameter(torch.zeros(1))
	            self.mlp_residual_scale = nn.Parameter(torch.zeros(1))

	    def num_mosrah_parameters(self) -> int:
	        """Return the MoSRAH parameter count reported by the attention sublayer."""
	        return self.attention.num_mosrah_parameters()

	    def forward(
	        self,
	        x: torch.Tensor,
	        position_ids: torch.Tensor,
	        active_mask: torch.Tensor,
	        cache: ShramLayerCache | None = None,
	    ) -> tuple[torch.Tensor, dict[str, torch.Tensor]]:
	        """Apply attention and MLP sublayers with scaled residual connections.

	        Args:
	            x: Input, shape (batch, seq_len, embedding_width).
	            position_ids: Positions, shape (batch, seq_len).
	            active_mask: Current-chunk liveness mask, shape (batch, seq_len);
	                True marks a semantically live token. Forwarded unchanged to
	                the hybrid attention layer.
	            cache: Optional per-layer SHRAM cache, forwarded unchanged to the
	                hybrid attention layer.

	        Returns:
	            output: Tensor with the same shape as ``x``.
	            router_diagnostics: The dict returned by SHRAMHybridLayer,
	                unchanged.
	        """
	        normed_input = self.attn_norm(x)
	        attn_out, router_diagnostics = self.attention(
	            hidden_states=normed_input,
	            position_ids=position_ids,
	            active_mask=active_mask,
	            cache=cache,
	        )
	        after_attention = x + self.attn_residual_scale * attn_out

	        mlp_out = self.mlp(self.mlp_norm(after_attention))
	        output = after_attention + self.mlp_residual_scale * mlp_out
	        return output, router_diagnostics


	class ShramModel(nn.Module):
	"""Pure transformer backbone: decoder stack and final normalisation.

	Accepts pre-embedded hidden states of shape (batch, seq_len, hidden_size)
	and returns contextual representations of the same shape. No token embedding,
	vocabulary projection, or causal-LM lifecycle concerns.

	RoPE is applied inside each attention layer. Positional information is
	encoded in the relationship between Q and K, not added to the residual
	stream, so the backbone is agnostic to how positions are represented.

	Args:
	config: Model configuration. Must be a ``ShramConfig`` instance.
	"""

	def __init__(self, config: ShramConfig) -> None:
	super().__init__()
	self.config = config
	self.layers = nn.ModuleList(
	[DecoderLayer(config) for _ in range(config.num_decoder_layers)]
	)
	self.norm = nn.RMSNorm(config.embedding_width, eps=config.rms_norm_eps)

	def num_mosrah_parameters(self) -> int:
	"""Return the total number of trainable MoSRAH parameters across all decoder layers."""
	return sum(layer.num_mosrah_parameters() for layer in self.layers)

	def forward(
	self,
	inputs_embeds: torch.Tensor,
	position_ids: torch.Tensor,
	active_mask: torch.Tensor,
	cache: ShramCache \| None = None,
	output_hidden_states: bool = False,
	) -> dict:
	"""Run the transformer stack over a batch of pre-embedded sequences.

	Args:
	inputs_embeds: Pre-embedded input of shape (batch, seq_len, hidden_size).
	position_ids: Absolute positions of shape (batch, seq_len). Required.
	Must be provided explicitly by the caller — this module does not
	infer positions from cache state.
	active_mask: Current-chunk active mask of shape (batch, seq_len),
	where True means the token is semantically live. Forwarded
	unchanged to every decoder layer.
	cache: Optional top-level ShramCache. When provided, each DecoderLayer
	receives its own layer-local cache via ``cache.layers[layer_idx]``.
	The top-level cache object is updated in place and returned unchanged.
	output_hidden_states: When True, the output dict includes a tuple of
	per-layer hidden states: (inputs_embeds, layer_0_out, ..., layer_N_out),
	collected before the final norm.

	Returns:
	Plain dict with keys:
	- ``"last_hidden_state"``: normed backbone output,
	shape (batch, seq_len, hidden_size).
	- ``"past_key_values"``: the cache object passed in, or None.
	- ``"hidden_states"``: tuple of per-layer activations (including
	inputs_embeds as position 0) if ``output_hidden_states`` is True,
	else None. Collected before the final norm so each entry reflects the
	unnormalised residual stream at that depth.
	- ``"regret_loss"``: scalar sum of per-layer SHRAM regret losses.
	Gradient flows through this tensor into the router.
	- ``"logit_regret"``: detached scalar — mean across layers of the
	logit-space regret. Monitoring metric for assignment quality.
	- ``"logit_std"``: detached scalar — mean across layers of the
	per-token routing logit spread. Monitoring metric for routing
	sharpness.
	"""
	hidden_states = inputs_embeds
	all_hidden_states = (hidden_states,) if output_hidden_states else None
	total_regret_loss = inputs_embeds.new_zeros(())
	total_logit_regret = inputs_embeds.new_zeros(())
	total_logit_std = inputs_embeds.new_zeros(())

	for layer_idx, layer in enumerate(self.layers):
	layer_cache = None if cache is None else cache.layers[layer_idx]
	hidden_states, layer_diagnostics = layer(
	hidden_states,
	position_ids,
	active_mask,
	cache=layer_cache,
	)
	total_regret_loss = total_regret_loss + layer_diagnostics["regret_loss"]
	total_logit_regret = total_logit_regret + layer_diagnostics["logit_regret"]
	total_logit_std = total_logit_std + layer_diagnostics["logit_std"]

	if output_hidden_states:
	all_hidden_states = all_hidden_states + (hidden_states,)

	hidden_states = self.norm(hidden_states)
	num_layers = len(self.layers)

	return {
	"last_hidden_state": hidden_states,
	"past_key_values": cache,
	"hidden_states": all_hidden_states,
	"regret_loss": total_regret_loss,
	"logit_regret": total_logit_regret / num_layers,
	"logit_std": total_logit_std / num_layers,
	}


	@dataclass
	class ShramCausalLMOutput(CausalLMOutputWithPast):
	    """Output container returned by the SHRAM causal-LM wrapper.

	    Extends HuggingFace's ``CausalLMOutputWithPast`` through ordinary
	    dataclass inheritance: every standard field is inherited and four
	    SHRAM-specific fields are appended after them.
	    """

	    # Every parent field has a default, so dataclass rules require defaults
	    # on the fields added here as well. The ``None`` defaults are a language
	    # requirement rather than a statement that the value is optional:
	    # regret_loss, logit_regret and logit_std are expected to be filled in by
	    # ShramForCausalLM.forward(), while ce_loss is only present when labels
	    # were supplied.
	    ce_loss: torch.FloatTensor | None = None
	    regret_loss: torch.FloatTensor | None = None
	    logit_regret: torch.Tensor | None = None
	    logit_std: torch.Tensor | None = None

	class ShramForCausalLM(PreTrainedModel, GenerationMixin):
	"""HuggingFace-facing causal language model wrapper for SHRAM.

	Owns token embeddings, LM-head projection, wrapper-level shifted CE loss,
	tied embedding configuration, and generation/cache boundary behavior.
	Delegates all transformer computation to ``ShramModel``.

	Args:
	config: SHRAM model configuration.
	"""

	config_class = ShramConfig
	base_model_prefix = "model"
	_no_split_modules = ["DecoderLayer"]
	supports_gradient_checkpointing = True
	_supports_assign_param_buffer = False
	def __init__(self, config: ShramConfig) -> None:
	    """Build embeddings, backbone and LM head, then apply tying and post-init.

	    Args:
	        config: SHRAM model configuration; ``vocab_size`` and
	            ``embedding_width`` size the embedding table and the LM head.
	    """
	    super().__init__(config)
	    vocab_size = config.vocab_size
	    width = config.embedding_width
	    self.embed_tokens = nn.Embedding(vocab_size, width)
	    self.model = ShramModel(config)
	    self.lm_head = nn.Linear(width, vocab_size, bias=False)
	    # Tying is configured before post_init() runs.
	    self._configure_tied_embeddings()
	    self.post_init()

	def _configure_tied_embeddings(self) -> None:
	"""Apply config-controlled tied embedding behavior on this instance."""
	if self.config.tie_word_embeddings:
	self.lm_head.weight = self.embed_tokens.weight
	self._tied_weights_keys = {
	"lm_head.weight": "embed_tokens.weight",
	}
	else:
	self._tied_weights_keys = {}

	def num_mosrah_parameters(self) -> int:
	"""Return the total number of trainable parameters belonging to MoSRAH layers.

	Aggregates across all decoder layers. Excludes sliding-window path parameters,
	FFN parameters, norms, and embeddings. Use this for experimental plotting of
	MoSRAH parameter count versus performance.

	Returns:
	Total count of trainable MoSRAH parameters.
	"""
	return self.model.num_mosrah_parameters()

	def get_input_embeddings(self) -> nn.Embedding:
	"""Return the token embedding matrix."""
	return self.embed_tokens

	def set_input_embeddings(self, value: nn.Embedding) -> None:
	"""Replace the token embedding matrix."""
	self.embed_tokens = value
	self._configure_tied_embeddings()

	def get_output_embeddings(self) -> nn.Linear:
	"""Return the LM head."""
	return self.lm_head

	def set_output_embeddings(self, value: nn.Linear) -> None:
	"""Replace the LM head."""
	self.lm_head = value
	self._configure_tied_embeddings()

	def _build_shram_cache(
	    self,
	    batch_size: int,
	    device: torch.device,
	) -> ShramCache:
	    """Create a new top-level ``ShramCache`` for this model's config.

	    Args:
	        batch_size: Batch size passed through to the cache constructor.
	        device: Device passed through to the cache constructor.

	    Returns:
	        The freshly constructed ``ShramCache``.
	    """
	    fresh_cache = ShramCache(
	        config=self.config,
	        batch_size=batch_size,
	        device=device,
	    )
	    return fresh_cache

	def _validate_generation_cache_request(
	    self,
	    generation_config: Any,
	    model_kwargs: dict[str, Any],
	    generation_mode: GenerationMode,
	) -> None:
	    """Reject generation setups that SHRAM's cache policy cannot serve.

	    Args:
	        generation_config: Active generation configuration; ``use_cache`` and
	            ``cache_implementation`` are read from it.
	        model_kwargs: Generation kwargs; only ``"past_key_values"`` is read.
	        generation_mode: HuggingFace generation mode.

	    Raises:
	        NotImplementedError: For assisted generation or contrastive search.
	        ValueError: If ``cache_implementation`` is set (with or without a
	            user cache), or if the user cache is a tuple.
	        TypeError: If the user cache is not a ``ShramCache``.
	    """
	    if generation_mode in {
	        GenerationMode.ASSISTED_GENERATION,
	        GenerationMode.CONTRASTIVE_SEARCH,
	    }:
	        raise NotImplementedError(
	            "ShramForCausalLM does not currently support assisted generation "
	            "or contrastive search because ShramCache does not support crop()."
	        )

	    user_defined_cache = model_kwargs.get("past_key_values")

	    # No caller-supplied cache: the only thing to reject is a request for a
	    # named cache implementation while caching is switched on.
	    if user_defined_cache is None:
	        if (
	            generation_config.use_cache
	            and generation_config.cache_implementation is not None
	        ):
	            raise ValueError(
	                "ShramForCausalLM does not support `cache_implementation`. "
	                "Generation-created caches must be `ShramCache` objects."
	            )
	        return

	    # Caller-supplied cache: it must be the only cache source and it must
	    # be a ShramCache.
	    if generation_config.cache_implementation is not None:
	        raise ValueError(
	            "Passing both `cache_implementation` and `past_key_values` "
	            "is unsupported. Please use only one."
	        )
	    if isinstance(user_defined_cache, tuple):
	        raise ValueError(
	            "Passing a tuple of `past_key_values` is not supported. "
	            "Please use a `ShramCache` instance."
	        )
	    if not isinstance(user_defined_cache, ShramCache):
	        raise TypeError(
	            "ShramForCausalLM requires `past_key_values` to be a "
	            "`ShramCache` instance."
	        )

	def _prepare_cache_for_generation(
	    self,
	    generation_config: Any,
	    model_kwargs: dict[str, Any],
	    generation_mode: GenerationMode,
	    batch_size: int,
	    max_cache_length: int,
	) -> None:
	    """Generation hook that makes sure a ``ShramCache`` is used.

	    After validating the request, a fresh cache is placed in
	    ``model_kwargs["past_key_values"]`` unless the caller already supplied
	    one or caching is disabled.

	    Args:
	        generation_config: Active generation configuration.
	        model_kwargs: Generation kwargs, updated in place.
	        generation_mode: HuggingFace generation mode.
	        batch_size: Generation batch size; multiplied here by the larger of
	            ``num_beams`` and ``num_return_sequences`` to size the cache.
	        max_cache_length: Accepted for signature compatibility; not used.
	    """
	    self._validate_generation_cache_request(
	        generation_config=generation_config,
	        model_kwargs=model_kwargs,
	        generation_mode=generation_mode,
	    )

	    caller_supplied_cache = model_kwargs.get("past_key_values") is not None
	    if caller_supplied_cache or not generation_config.use_cache:
	        return

	    # A missing or zero value counts as 1 for either setting.
	    beams = generation_config.num_beams or 1
	    returned = generation_config.num_return_sequences or 1
	    num_repeats = max(beams, returned)

	    model_kwargs["past_key_values"] = self._build_shram_cache(
	        batch_size=batch_size * num_repeats,
	        device=self.embed_tokens.weight.device,
	    )

	def _reorder_cache(
	self,
	past_key_values: Cache,
	beam_idx: torch.Tensor,
	) -> Cache:
	"""Reorder the cache in place for beam search."""
	past_key_values.reorder_cache(beam_idx)
	return past_key_values

	@staticmethod
	def create_masks_for_generate(
	attention_mask: torch.Tensor \| None,
	**kwargs: Any,
	) -> torch.Tensor \| None:
	"""Return the 2D attention_mask unchanged.

	HuggingFace calls this during compiled generation to convert the 2D
	attention mask into a 4D causal additive-bias mask. SHRAM uses flex
	attention with custom masking and constructs causality internally; the
	4D format is incompatible with the SHRAM masking contract. Overriding
	as a no-op restores symmetry between compiled and non-compiled pathways
	without any loss of correctness or performance (see Unit 19.G.4).
	"""
	return attention_mask

	def _validate_input_ids(self, input_ids: torch.Tensor) -> None:
	"""Validate token IDs at the wrapper boundary."""
	if input_ids.ndim != 2:
	raise ValueError("input_ids must have shape (batch, seq_len).")
	if input_ids.shape[1] == 0:
	raise ValueError("input_ids sequence length must be nonzero.")
	if input_ids.dtype != torch.long:
	raise TypeError("input_ids must be an long int tensor.")

	def _validate_attention_mask(
	self,
	input_ids: torch.Tensor,
	attention_mask: torch.Tensor \| None,
	) -> None:
	"""Validate the full-sequence attention mask."""
	if attention_mask is None:
	return
	if attention_mask.ndim != 2:
	raise ValueError("attention_mask must have shape (batch, total_seq_len).")
	if attention_mask.shape[0] != input_ids.shape[0]:
	raise ValueError("attention_mask batch dimension must match input_ids.")
	if attention_mask.shape[1] < input_ids.shape[1]:
	raise ValueError(
	"attention_mask must be at least as long as the current input_ids chunk."
	)

	def _validate_position_ids(
	self,
	input_ids: torch.Tensor,
	position_ids: torch.Tensor \| None,
	) -> None:
	"""Validate current-step position IDs."""
	if position_ids is None:
	return
	if position_ids.ndim != 2:
	raise ValueError("position_ids must have shape (batch, seq_len).")
	if position_ids.shape != input_ids.shape:
	raise ValueError(
	"position_ids must match the current input_ids shape exactly."
	)
	if position_ids.dtype != torch.long:
	raise TypeError("position_ids must be an long tensor.")

	def _validate_labels(
	self,
	input_ids: torch.Tensor,
	labels: torch.Tensor \| None,
	) -> None:
	"""Validate label shape at the wrapper boundary."""
	if labels is None:
	return
	if labels.ndim != 2:
	raise ValueError("labels must have shape (batch, seq_len).")
	if labels.shape != input_ids.shape:
	raise ValueError("labels must have the same shape as input_ids.")
	if labels.dtype != torch.long:
	raise TypeError("labels must be a long tensor.")

	def _validate_cache_inputs(
	self,
	use_cache: bool,
	past_key_values: Cache \| None,
	) -> None:
	"""Validate cache policy for direct wrapper calls."""
	if use_cache:
	if past_key_values is None:
	raise ValueError(
	"use_cache=True requires an explicit ShramCache. During "
	"generate(), HuggingFace should supply this through "
	"_prepare_cache_for_generation()."
	)
	if not isinstance(past_key_values, ShramCache):
	raise TypeError(
	"past_key_values must be a ShramCache when use_cache=True."
	)
	return

	if past_key_values is not None:
	raise ValueError("past_key_values was provided while use_cache=False.")

	def _validate_position_sources(
	self,
	use_cache: bool,
	attention_mask: torch.Tensor \| None,
	position_ids: torch.Tensor \| None,
	) -> None:
	"""Validate that cached forward has a truthful source of positions."""
	if use_cache and attention_mask is None and position_ids is None:
	raise ValueError(
	"Cached forward requires either position_ids or attention_mask."
	)

	def _validate_hf_boundary(
	self,
	output_attentions: bool \| None,
	return_dict: bool \| None,
	inputs_embeds: torch.Tensor \| None,
	cache_position: torch.Tensor \| None,
	extra_kwargs: dict[str, Any],
	) -> None:
	"""Validate unsupported HuggingFace-facing wrapper inputs."""
	if output_attentions:
	raise NotImplementedError(
	"ShramForCausalLM does not expose output_attentions."
	)
	if return_dict is False:
	raise ValueError(
	"return_dict=False is not supported. "
	"ShramForCausalLM always returns ShramCausalLMOutput."
	)
	if inputs_embeds is not None:
	raise ValueError(
	"inputs_embeds is not supported at the SHRAM wrapper boundary. "
	"Pass input_ids instead."
	)
	if extra_kwargs:
	unsupported = ", ".join(sorted(extra_kwargs))
	raise TypeError(
	f"Unsupported forward kwargs for ShramForCausalLM: {unsupported}"
	)

	@staticmethod
	def _enforce_uncached_starting_position(condition: torch.Tensor) -> None:
	"""Enforce that an uncached forward pass begins at position 0.

	An uncached forward has no prior KV state. Nonzero starting positions
	produce silently incorrect RoPE encoding and attention outputs with no
	downstream diagnostic. This method intercepts that misuse at the
	outermost boundary before any backbone computation runs.

	To resolve a violation: either supply a ShramCache populated with the
	prefix (for continued decoding), or rebase the sequence so positions
	start at 0.

	Args:
	condition: Scalar bool tensor. True = all batch items start at 0
	(valid); False = at least one batch item starts nonzero
	(violated).
	"""
	if torch.compiler.is_compiling():
	torch._assert_async(
	condition,
	"Uncached ShramForCausalLM: nonzero starting positions. "
	"Supply a ShramCache with prefix or rebase sequence to start at 0.",
	)
	else:
	if not condition.item():
	raise RuntimeError(
	"Uncached ShramForCausalLM forward does not support nonzero "
	"starting positions. Either provide a ShramCache populated "
	"with the prefix for continued decoding, or rebase the "
	"uncached sequence to start at 0.",
	)

	def _standardize_full_attention_mask(
	self,
	input_ids: torch.Tensor,
	attention_mask: torch.Tensor \| None,
	) -> torch.BoolTensor:
	"""Return a concrete full-sequence boolean attention mask."""
	if attention_mask is None:
	return torch.ones_like(input_ids, dtype=torch.bool)
	return attention_mask.to(dtype=torch.bool)

	def _resolve_current_position_ids(
		self,
		input_ids: torch.Tensor,
		position_ids: torch.Tensor | None,
		current_active_mask: torch.BoolTensor,
		cache: ShramCache | None,
	) -> torch.LongTensor:
		"""Resolve concrete current-step position IDs for the backbone.

		When the caller supplies ``position_ids`` they are honoured as-is and
		only cast to ``torch.long``; no bias computation takes place. Note that
		``Tensor.to`` hands back the same tensor when the dtype already
		matches, so this path does not guarantee a fresh allocation.

		Otherwise positions are built via arange + per-batch bias. No cumsum or
		stride-based views are produced on this path; the result is a new
		allocation, intended to be safe for Inductor tracing at the
		FlexAttention boundary.

		When a cache is present, ``total_active_tokens()`` provides the
		per-batch accumulated active token count as a position bias. Uncached
		calls use a zero bias. In both cases positions are
		``bias + arange(current_length)``, with inactive positions masked to 0.

		Args:
			input_ids: Current token IDs of shape ``(B, N)``.
			position_ids: Explicit positions if supplied by the caller;
				returned cast to long. Bias computation is skipped entirely.
			current_active_mask: Boolean mask of shape ``(B, N)`` for the
				current step.
			cache: Active ``ShramCache``, or ``None`` for uncached forward
				passes.

		Returns:
			Long tensor of shape ``(B, N)``: position index per token. On the
			computed path inactive positions are 0.
		"""
		if position_ids is not None:
			return position_ids.to(dtype=torch.long)

		current_length = input_ids.shape[1]

		if cache is not None:
			# Continue numbering from what the cache has already accumulated.
			position_bias = cache.total_active_tokens(current_active_mask)
		else:
			# Uncached sequences always start counting from zero.
			position_bias = torch.zeros(
				input_ids.shape[0], dtype=torch.long, device=input_ids.device
			)

		# (B, 1) bias broadcast against (N,) offsets -> (B, N) positions.
		positions = position_bias.unsqueeze(1) + torch.arange(
			current_length, device=input_ids.device, dtype=torch.long
		)
		return positions.masked_fill(~current_active_mask, 0)

	def forward(
		self,
		input_ids: torch.Tensor,
		attention_mask: torch.Tensor | None = None,
		position_ids: torch.Tensor | None = None,
		past_key_values: Cache | None = None,
		use_cache: bool | None = None,
		output_hidden_states: bool | None = None,
		labels: torch.Tensor | None = None,
		return_dict: bool | None = None,
		ce_weight: float = 1.0,
		load_balance_weight: float = 0.01,
		**kwargs: Any,
	) -> ShramCausalLMOutput:
		"""Run the SHRAM causal language model wrapper.

		Args:
			input_ids: Current token IDs of shape ``(batch, seq_len)``.
			attention_mask: Optional full 2D mask of shape
				``(batch, total_seq_len)``. The wrapper slices its recent chunk
				to produce the current semantic liveness mask expected by the
				backbone.
			position_ids: Optional current-step position IDs of shape
				``(batch, seq_len)``. In ordinary HuggingFace generation this is
				already the current-step tensor when it reaches ``forward()``.
			past_key_values: Optional SHRAM cache. Required when
				``use_cache=True``.
			use_cache: Whether to use and return a cache. Defaults to
				``config.use_cache``.
			output_hidden_states: Whether to return backbone hidden states.
				Defaults to ``config.output_hidden_states``.
			labels: Optional target token IDs of shape ``(batch, seq_len)``.
				Pass unshifted labels (same alignment as ``input_ids``). This
				wrapper shifts internally: ``logits[:, :-1]`` is compared
				against ``labels[:, 1:]``. Do not pre-shift on the caller side.
			return_dict: Must be ``True`` or ``None``.
			ce_weight: Weight applied to the cross-entropy loss when combining
				with the regret loss. Default 1.0.
			load_balance_weight: Weight applied to the regret loss.
				Default 0.01, matching the paper's recommendation.
			**kwargs: Unsupported HuggingFace kwargs fail explicitly.

		Returns:
			``ShramCausalLMOutput`` with:
			- ``logits`` of shape ``(batch, seq_len, vocab_size)``,
			- ``loss`` = ``ce_weight * ce_loss + load_balance_weight * regret_loss``
			  when labels are provided (``None`` otherwise),
			- ``ce_loss`` — raw unweighted cross-entropy loss for logging
			  (``None`` when no labels are provided),
			- ``past_key_values`` as returned by the backbone,
			- ``hidden_states`` as returned by the backbone,
			- ``regret_loss`` — raw unweighted regret loss from the backbone,
			- ``logit_regret`` — detached mean logit-space regret across layers,
			- ``logit_std`` — detached mean per-token routing logit spread across layers.
		"""
		use_cache = use_cache if use_cache is not None else self.config.use_cache
		output_hidden_states = (
			output_hidden_states
			if output_hidden_states is not None
			else self.config.output_hidden_states
		)

		# Pull the known HF knobs out of kwargs so that whatever remains can be
		# handed to the boundary validator as "extra".
		inputs_embeds = kwargs.pop("inputs_embeds", None)
		output_attentions = kwargs.pop("output_attentions", None)
		cache_position = kwargs.pop("cache_position", None)

		# ------------------------------------------------------------------
		# Validation zone.
		#
		# The wrapper boundary is where HuggingFace-facing inputs are judged
		# for truthfulness before any internal work begins. These checks are
		# intentionally front-loaded so the core logic below can assume one
		# coherent interpretation of the call rather than defensively checking
		# shapes, cache policy, or unsupported HF knobs at the point of use.
		# This keeps the main sequence readable while ensuring invalid states
		# fail before they can silently contaminate backbone execution.
		# ------------------------------------------------------------------
		self._validate_input_ids(input_ids)
		self._validate_attention_mask(input_ids, attention_mask)
		self._validate_position_ids(input_ids, position_ids)
		self._validate_labels(input_ids, labels)
		self._validate_cache_inputs(use_cache, past_key_values)
		self._validate_position_sources(use_cache, attention_mask, position_ids)
		self._validate_hf_boundary(
			output_attentions=output_attentions,
			return_dict=return_dict,
			inputs_embeds=inputs_embeds,
			cache_position=cache_position,
			extra_kwargs=kwargs,
		)

		# ------------------------------------------------------------------
		# Standardization zone.
		#
		# HuggingFace and SHRAM use different boundary conventions: generation
		# carries a full-sequence 2D attention mask, while the SHRAM backbone
		# wants a current-step active mask and concrete current position IDs.
		# This zone collapses those wrapper-facing conventions into one valid
		# backbone-facing state. After this point the core no longer reasons
		# about optional or ambiguous input forms; it works only with concrete
		# tensors whose semantics are already fixed.
		# ------------------------------------------------------------------
		full_attention_mask: torch.BoolTensor = self._standardize_full_attention_mask(
			input_ids=input_ids,
			attention_mask=attention_mask,
		)
		current_length: int = input_ids.shape[1]
		# The trailing ``current_length`` columns of the full mask correspond to
		# the tokens being processed in this call.
		current_active_mask: torch.BoolTensor = full_attention_mask[:, -current_length:]
		# A supplied cache is only honoured when caching is actually enabled.
		shram_cache: ShramCache | None = past_key_values if use_cache else None
		current_position_ids: torch.LongTensor = self._resolve_current_position_ids(
			input_ids=input_ids,
			position_ids=position_ids,
			current_active_mask=current_active_mask,
			cache=shram_cache,
		)

		if shram_cache is None:
			# Without a cache there is no prefix, so every batch item's first
			# position must be 0.
			positions_start_sane = torch.all(current_position_ids[:, 0] == 0)
			self._enforce_uncached_starting_position(positions_start_sane)

		# ------------------------------------------------------------------
		# Core wrapper responsibilities.
		#
		# The wrapper's primary job is kept visible here: convert token IDs to
		# embeddings, delegate transformer computation to ShramModel, project
		# hidden states back to vocabulary logits, optionally compute the
		# wrapper-level shifted next-token loss, and return the HuggingFace-
		# facing output object. The backbone remains responsible only for
		# transformer semantics; token/vocabulary/loss concerns stay here.
		# ------------------------------------------------------------------
		token_embeddings: torch.FloatTensor = self.embed_tokens(input_ids)
		backbone_outputs = self.model(
			inputs_embeds=token_embeddings,
			position_ids=current_position_ids,
			active_mask=current_active_mask,
			cache=shram_cache,
			output_hidden_states=output_hidden_states,
		)

		logits: torch.FloatTensor = self.lm_head(backbone_outputs["last_hidden_state"])

		ce_loss: torch.FloatTensor | None = None
		loss: torch.FloatTensor | None = None
		if labels is not None:
			# Next-token objective: the prediction at step t is scored against
			# the label at step t + 1, so drop the last logit and first label.
			shift_logits = logits[:, :-1, :].contiguous()
			shift_labels = labels[:, 1:].contiguous()
			ce_loss = nn.functional.cross_entropy(
				shift_logits.view(-1, self.config.vocab_size),
				shift_labels.view(-1),
			)
			loss = ce_weight * ce_loss + load_balance_weight * backbone_outputs["regret_loss"]

		return ShramCausalLMOutput(
			loss=loss,
			ce_loss=ce_loss,
			logits=logits,
			past_key_values=backbone_outputs["past_key_values"],
			hidden_states=backbone_outputs["hidden_states"],
			regret_loss=backbone_outputs["regret_loss"],
			logit_regret=backbone_outputs["logit_regret"],
			logit_std=backbone_outputs["logit_std"],
		)