Update architecture and tokenizer

Browse files

Files changed (5) hide show

README.md +2 -1
config.json +2 -1
configuration.py +19 -1
huggingface.py +176 -79
tokenizer_config.json +1 -1

README.md CHANGED Viewed

@@ -82,7 +82,8 @@ contains no weights. All values are overridable via kwargs.
 | `embedding_width` | 512 |
 | `head_dim` | 16 |
 | `inference_sequence_length` | 1024 |
-| `load_balance_p` | 2.0 |
 | `local_rope_theta` | 10000.0 |
 | `max_bid_rounds` | 10 |
 | `mlp_width` | 1366 |

 | `embedding_width` | 512 |
 | `head_dim` | 16 |
 | `inference_sequence_length` | 1024 |
+| `load_balance_loss_type` | ce |
+| `load_balance_p` | 1.0 |
 | `local_rope_theta` | 10000.0 |
 | `max_bid_rounds` | 10 |
 | `mlp_width` | 1366 |

config.json CHANGED Viewed

@@ -9,7 +9,8 @@
   "embedding_width": 512,
   "head_dim": 16,
   "inference_sequence_length": 1024,
-  "load_balance_p": 2.0,
   "local_rope_theta": 10000.0,
   "max_bid_rounds": 10,
   "mlp_width": 1366,

   "embedding_width": 512,
   "head_dim": 16,
   "inference_sequence_length": 1024,
+  "load_balance_loss_type": "ce",
+  "load_balance_p": 1.0,
   "local_rope_theta": 10000.0,
   "max_bid_rounds": 10,
   "mlp_width": 1366,

configuration.py CHANGED Viewed

@@ -94,6 +94,11 @@ class ShramConfig(PretrainedConfig):
             cases are not expected under normal training. The bound exists as a
             correctness guard — exhausting it raises ``RuntimeError``. Must be >= 1.
             Default 10.
     """
     model_type = "shram"
@@ -127,8 +132,9 @@ class ShramConfig(PretrainedConfig):
         output_hidden_states: bool = False,
         tie_word_embeddings: bool = False,
         mosrah_overallocation_factor: float = 2.0,
-        load_balance_p: float = 2.0,
         max_bid_rounds: int = 10,
         **kwargs
     ):
         if head_dim % 2 != 0:
@@ -174,6 +180,17 @@ class ShramConfig(PretrainedConfig):
                 f"max_bid_rounds must be at least 1, got {max_bid_rounds}."
             )
         self.vocab_size = vocab_size
         self.embedding_width = embedding_width
         self.mlp_width = mlp_width
@@ -194,6 +211,7 @@ class ShramConfig(PretrainedConfig):
         self.mosrah_overallocation_factor = mosrah_overallocation_factor
         self.load_balance_p = load_balance_p
         self.max_bid_rounds = max_bid_rounds
         self.attention_dropout = attention_dropout
         self.use_cache = use_cache

             cases are not expected under normal training. The bound exists as a
             correctness guard — exhausting it raises ``RuntimeError``. Must be >= 1.
             Default 10.
+        load_balance_loss_type: Formula used for the load-balance auxiliary loss.
+            One of ``"gshard"``, ``"ce"``, or ``"bce"``. ``"ce"`` (cross-entropy)
+            is the default; its log-probability signal scales with violation severity
+            and makes correction magnitude proportional to routing imbalance.
+            ``"ce"`` requires ``load_balance_p == 1.0``; any other value raises
+            ``ValueError``. Default ``"ce"``.
     """
     model_type = "shram"
         output_hidden_states: bool = False,
         tie_word_embeddings: bool = False,
         mosrah_overallocation_factor: float = 2.0,
+        load_balance_p: float = 1.0,
         max_bid_rounds: int = 10,
+        load_balance_loss_type: str = "ce",
         **kwargs
     ):
         if head_dim % 2 != 0:
                 f"max_bid_rounds must be at least 1, got {max_bid_rounds}."
             )
+        _supported_loss_types = {"gshard", "ce", "bce"}
+        if load_balance_loss_type not in _supported_loss_types:
+            supported = ", ".join(f'"{t}"' for t in sorted(_supported_loss_types))
+            raise ValueError(
+                f"load_balance_loss_type must be one of {supported}, "
+                f"got {load_balance_loss_type!r}."
+            )
+        if load_balance_loss_type == "ce" and load_balance_p != 1.0:
+            raise ValueError("In cross entropy mode, aggregation of "
+                             "frequencies must be with mean 1.0")
         self.vocab_size = vocab_size
         self.embedding_width = embedding_width
         self.mlp_width = mlp_width
         self.mosrah_overallocation_factor = mosrah_overallocation_factor
         self.load_balance_p = load_balance_p
         self.max_bid_rounds = max_bid_rounds
+        self.load_balance_loss_type = load_balance_loss_type
         self.attention_dropout = attention_dropout
         self.use_cache = use_cache

huggingface.py CHANGED Viewed

@@ -44,6 +44,7 @@ from torch import nn
 from torch.nn.attention.flex_attention import create_block_mask
 from torch.nn.attention.flex_attention import flex_attention
 import torch.nn.functional as F
 from typing import Optional
@@ -181,6 +182,11 @@ class ShramConfig(PretrainedConfig):
             cases are not expected under normal training. The bound exists as a
             correctness guard — exhausting it raises ``RuntimeError``. Must be >= 1.
             Default 10.
     """
     model_type = "shram"
@@ -214,8 +220,9 @@ class ShramConfig(PretrainedConfig):
         output_hidden_states: bool = False,
         tie_word_embeddings: bool = False,
         mosrah_overallocation_factor: float = 2.0,
-        load_balance_p: float = 2.0,
         max_bid_rounds: int = 10,
         **kwargs
     ):
         if head_dim % 2 != 0:
@@ -261,6 +268,17 @@ class ShramConfig(PretrainedConfig):
                 f"max_bid_rounds must be at least 1, got {max_bid_rounds}."
             )
         self.vocab_size = vocab_size
         self.embedding_width = embedding_width
         self.mlp_width = mlp_width
@@ -281,6 +299,7 @@ class ShramConfig(PretrainedConfig):
         self.mosrah_overallocation_factor = mosrah_overallocation_factor
         self.load_balance_p = load_balance_p
         self.max_bid_rounds = max_bid_rounds
         self.attention_dropout = attention_dropout
         self.use_cache = use_cache
@@ -2714,9 +2733,10 @@ This separation is architecturally critical: expert_bias drives selection (and t
 balancing) but does not corrupt the gradient path from the output through routing_probs
 back to the routing projection weights.
-The router also computes and returns the load balance loss via the LoadBalanceLoss custom
-autograd operator (see load_balance_loss.py). This loss is a scalar that the training
-loop can weight and add to the language modeling loss.
 The router additionally computes and returns MaxVio, a detached scalar summarising
 routing imbalance for the current forward pass:
@@ -2737,94 +2757,165 @@ Paper ref: Appendix A.Routing, Appendix A.Load Balancing, §MaxVio.
 # -----------
 # Inlined from: load_balance_loss.py
 # -----------
-"""Auxiliary-loss-free load balancing operator for MoSRAH routing.
-This module implements the custom autograd Function H(b, f) described in the paper's
-Implementation Concerns section. The operator bridges two requirements that are in
-tension: it must behave like a standard auxiliary loss (scalar output, scalable via
-multiplication) so that existing training loops remain compatible, while simultaneously
-implementing DeepSeek-style bias correction rather than the usual auxiliary-loss gradient
-path through the router weights.
-The resolution is a custom backward pass. The forward emits the load balance imbalance
-as a scalar loss. The backward, instead of differentiating that scalar with respect to
-its inputs, writes a bias-correction gradient directly to expert_bias. This gradient is
-then consumed by the main AdamW optimizer in the normal way, achieving DeepSeek-style
-correction without a standalone SGD update step.
-Paper ref: Appendix A.Implementation Concerns.
 """
-class LoadBalanceLoss(torch.autograd.Function):
-    """Custom autograd operator for DeepSeek-style auxiliary-loss-free load balancing.
-    Forward computes the load balance imbalance:
-        L_load_balance = H(b, f) = sum_l | f_l - 1/L |
-    Backward emits a bias-correction gradient to expert_bias:
-        grad_b = L_grad * sign(f_l - 1/L)
-    expert_bias (b) is included as a forward input so PyTorch registers it as a node
-    in the computation graph and routes gradients through it. routing_freqs (f) receives
-    no gradient — its origin is the discrete TopK operation which has no gradient, so
-    defining a gradient for f here would be mathematically incorrect.
-    Paper ref: Appendix A.Implementation Concerns.
     """
-    @staticmethod
-    def forward(
-        ctx: torch.autograd.function.FunctionCtx,
-        expert_bias: torch.Tensor,
-        routing_freqs: torch.Tensor,
-    ) -> torch.Tensor:
-        """Compute the load balance loss.
-        Args:
-            ctx: Autograd context for saving state needed in backward.
-            expert_bias: Learned per-head bias b, shape (L,). Included as an input so
-                PyTorch tracks it as a computation graph node needing a gradient.
-            routing_freqs: Realized routing frequency f_l per head, shape (L,). Computed
-                from the discrete TopK selection — not differentiable.
-        Returns:
-            Scalar loss equal to sum_l |f_l - 1/L|.
-        """
-        L = expert_bias.shape[0]
-        # imbalance = f_l - 1/L for each head: positive means overloaded, negative means
-        # underloaded. Saved for backward where sign(imbalance) determines the direction
-        # of the bias-correction update.
-        imbalance = routing_freqs - 1.0 / L
-        ctx.save_for_backward(imbalance)
-        return imbalance.abs().sum()
-    @staticmethod
-    def backward(
-        ctx: torch.autograd.function.FunctionCtx,
-        grad_output: torch.Tensor,
-    ) -> tuple[torch.Tensor, None]:
-        """Emit the DeepSeek-style bias-correction gradient.
-        Args:
-            ctx: Autograd context carrying imbalance saved in forward.
-            grad_output: Incoming gradient L_grad (scalar). Any rescaling of the loss
-                by the training loop arrives here and is propagated to grad_b, so the
-                correction magnitude is proportional to the loss weight chosen by the
-                consumer.
-        Returns:
-            Gradient for expert_bias: L_grad * sign(f_l - 1/L), shape (L,).
-            None for routing_freqs: no gradient is defined for the discrete routing
-            frequency.
-        """
-        (imbalance,) = ctx.saved_tensors
-        grad_expert_bias = grad_output * imbalance.sign()
-        return grad_expert_bias, None
@@ -2857,6 +2948,7 @@ class MoSRAHRouter(nn.Module):
             self.capacity = config.mosrah_packed_length
         self.max_bid_rounds = config.max_bid_rounds
         # W_r: routing projection, no bias (paper specifies xW_r, no additional term).
         self.routing_projection = nn.Linear(
@@ -3143,12 +3235,13 @@ class MoSRAHRouter(nn.Module):
             logits, self.expert_bias.expand_as(logits), dim=-1
         ).mean().detach()
         routing_scores = F.softmax(logits, dim=-1)             # R, (B, N, L)
         # Biased routing scores R̂ = Softmax(xW_r + b). Used only for TopK head
         # selection. expert_bias is added to logits before softmax so that the bias
         # shifts selection probability without rescaling the unbiased distribution.
-        biased_logits = logits + self.expert_bias
         biased_logits = self.balance_capacity(
             biased_logits,
             used_capacity,
@@ -3189,10 +3282,14 @@ class MoSRAHRouter(nn.Module):
         p = self.load_balance_p
         routing_freqs = (per_item_freqs ** p).mean(dim=0) ** (1.0 / p)  # (L,)
-        # Load balance loss via custom autograd. expert_bias is an input so PyTorch
-        # registers it as a graph node; the custom backward writes the DeepSeek-style
-        # correction gradient to expert_bias.grad for the optimizer to consume.
-        load_balance_loss = LoadBalanceLoss.apply(self.expert_bias, routing_freqs)
         # MaxVio is a detached monitoring scalar following the paper's formula
         # L · max_l(f_l − 1/L) applied to routing_freqs. Must not contribute gradients.

 from torch.nn.attention.flex_attention import create_block_mask
 from torch.nn.attention.flex_attention import flex_attention
 import torch.nn.functional as F
+from typing import Callable
 from typing import Optional
             cases are not expected under normal training. The bound exists as a
             correctness guard — exhausting it raises ``RuntimeError``. Must be >= 1.
             Default 10.
+        load_balance_loss_type: Formula used for the load-balance auxiliary loss.
+            One of ``"gshard"``, ``"ce"``, or ``"bce"``. ``"ce"`` (cross-entropy)
+            is the default; its log-probability signal scales with violation severity
+            and makes correction magnitude proportional to routing imbalance.
+            ``"ce"`` requires ``load_balance_p == 1.0``; any other value raises
+            ``ValueError``. Default ``"ce"``.
     """
     model_type = "shram"
         output_hidden_states: bool = False,
         tie_word_embeddings: bool = False,
         mosrah_overallocation_factor: float = 2.0,
+        load_balance_p: float = 1.0,
         max_bid_rounds: int = 10,
+        load_balance_loss_type: str = "ce",
         **kwargs
     ):
         if head_dim % 2 != 0:
                 f"max_bid_rounds must be at least 1, got {max_bid_rounds}."
             )
+        _supported_loss_types = {"gshard", "ce", "bce"}
+        if load_balance_loss_type not in _supported_loss_types:
+            supported = ", ".join(f'"{t}"' for t in sorted(_supported_loss_types))
+            raise ValueError(
+                f"load_balance_loss_type must be one of {supported}, "
+                f"got {load_balance_loss_type!r}."
+            )
+        if load_balance_loss_type == "ce" and load_balance_p != 1.0:
+            raise ValueError("In cross entropy mode, aggregation of "
+                             "frequencies must be with mean 1.0")
         self.vocab_size = vocab_size
         self.embedding_width = embedding_width
         self.mlp_width = mlp_width
         self.mosrah_overallocation_factor = mosrah_overallocation_factor
         self.load_balance_p = load_balance_p
         self.max_bid_rounds = max_bid_rounds
+        self.load_balance_loss_type = load_balance_loss_type
         self.attention_dropout = attention_dropout
         self.use_cache = use_cache
 balancing) but does not corrupt the gradient path from the output through routing_probs
 back to the routing projection weights.
+The router also computes and returns the load balance loss via a configurable auxiliary
+loss (see load_balance_loss.py). The formulation — ``"gshard"``, ``"ce"``, or ``"bce"`` —
+is selected by config; the default is cross-entropy. Gradients flow only to expert_bias —
+routing_projection.weight is isolated by detaching logits before computing assignment
+probabilities.
 The router additionally computes and returns MaxVio, a detached scalar summarising
 routing imbalance for the current forward pass:
 # -----------
 # Inlined from: load_balance_loss.py
 # -----------
+"""Log-probability auxiliary loss functions for MoSRAH load balancing.
+This module provides three load-balance loss formulations and a factory that selects
+among them. All formulations share the same external contract and the same gradient
+isolation property: assignment probabilities are computed from detached logits plus
+expert_bias, so only expert_bias receives gradients from the loss signal. The routing
+projection weights are not reachable from any returned loss.
+The factory is the intended entry point. The caller (MoSRAHRouter) constructs the
+loss callable once at init and invokes it each forward pass.
+Log-probability formulations (ce, bce) are preferred over linear ones (gshard) because
+their gradient magnitude scales with how far the distribution deviates from the target.
+A linear signal can be outrun by routing concentrations that diverge nonlinearly; a
+log-probability signal cannot.
+The external contract for all returned callables is:
+    loss_fn(routing_freqs, assignment_probs) -> scalar Tensor
+    routing_freqs:    (L,) realized routing frequencies f_i, detached.
+    assignment_probs: (L,) soft assignment probabilities p_i with gradient through
+                      expert_bias. Caller must compute these via
+                      softmax(logits.detach() + expert_bias) to preserve isolation.
 """
+# ---------------------------------------------------------------------------
+# Loss functions
+# ---------------------------------------------------------------------------
def gshard_loss(
    routing_freqs: torch.Tensor,
    assignment_probs: torch.Tensor,
) -> torch.Tensor:
    """GShard-style linear load-balance loss.

    Computes (1/L) * Σ_i f_i * p_i, where L is the number of expert heads,
    f_i is the realized routing frequency for head i, and p_i is the soft
    assignment probability for head i.

    The gradient with respect to p_i is f_i / L, which does not depend on p_i:
    the signal is linear and does not grow as the assignment distribution
    deviates from the target. With uniform probabilities p_i = 1/L the value is
    (Σ_i f_i) / L². Provided for comparison with the log-probability losses.

    Args:
        routing_freqs: Realized routing frequencies f_i, shape (L,). Detached.
        assignment_probs: Soft assignment probabilities p_i, shape (L,). Gradient
            flows to expert_bias through this tensor.

    Returns:
        Scalar loss tensor.

    Raises:
        ValueError: If the inputs are not 1-D tensors of identical shape.
    """
    # Without this check a mismatched pair would broadcast silently and the
    # reduction below would return a plausible-looking but wrong scalar.
    if routing_freqs.dim() != 1 or routing_freqs.shape != assignment_probs.shape:
        raise ValueError(
            "routing_freqs and assignment_probs must both have shape (L,), "
            f"got {tuple(routing_freqs.shape)} and {tuple(assignment_probs.shape)}."
        )
    num_heads = routing_freqs.shape[0]
    return (routing_freqs * assignment_probs).sum() / num_heads
def ce_loss(
    routing_freqs: torch.Tensor,
    assignment_probs: torch.Tensor,
) -> torch.Tensor:
    """Cross-entropy load-balance loss.

    Computes -(1/(L-1)) * Σ_i (1 - f_i) * log(p_i), where the weight (1 - f_i)
    suppresses the signal for overloaded heads (high f_i → weight near zero) and
    amplifies it for underloaded heads (low f_i → weight near 1). This makes the
    loss push probability mass toward under-utilized experts.

    The gradient with respect to p_i is -(1 - f_i) / ((L - 1) * p_i), so the
    correction grows as p_i shrinks. The 1/(L-1) factor normalizes the sum of
    weights when the frequencies sum to one.

    With a single head (L == 1) there is nothing to balance and the 1/(L-1)
    normalization is undefined, so a zero loss is returned. It stays attached to
    assignment_probs so that calling backward on it remains valid.

    Args:
        routing_freqs: Realized routing frequencies f_i, shape (L,). Detached.
        assignment_probs: Soft assignment probabilities p_i, shape (L,). Gradient
            flows to expert_bias through this tensor.

    Returns:
        Scalar loss tensor.

    Raises:
        ValueError: If the inputs are not 1-D tensors of identical shape.
    """
    # Without this check a mismatched pair would broadcast silently and the
    # reduction below would return a plausible-looking but wrong scalar.
    if routing_freqs.dim() != 1 or routing_freqs.shape != assignment_probs.shape:
        raise ValueError(
            "routing_freqs and assignment_probs must both have shape (L,), "
            f"got {tuple(routing_freqs.shape)} and {tuple(assignment_probs.shape)}."
        )
    num_heads = routing_freqs.shape[0]
    if num_heads == 1:
        # (L - 1) == 0 would turn the expression below into 0 / 0 = NaN.
        return assignment_probs.sum() * 0.0
    # torch.log is applied directly: softmax outputs are positive, so log(p_i)
    # is finite unless a probability underflows to exactly zero.
    underload_weight = 1.0 - routing_freqs
    weighted_log_probs = underload_weight * torch.log(assignment_probs)
    return -weighted_log_probs.sum() / (num_heads - 1)
def bce_loss(
    routing_freqs: torch.Tensor,
    assignment_probs: torch.Tensor,
) -> torch.Tensor:
    """Binary cross-entropy load-balance loss.

    Computes -(1/L) * Σ_i [(1 - f_i) * log(p_i) + f_i * log(1 - p_i)], where
    each head is treated as an independent binary target. Unlike CE, BCE maintains
    a repulsion signal from saturated experts: when f_i → 1, the weight on
    log(1 - p_i) drives p_i away from 1, preventing runaway concentration.

    log(1 - p_i) is computed as log1p(-p_i) for numerical safety near p_i = 1.

    With a single head (L == 1) there is nothing to balance: a softmax over one
    head yields p = 1, which would make log(1 - p) = -inf and the loss infinite.
    A zero loss is returned instead. It stays attached to assignment_probs so
    that calling backward on it remains valid.

    Args:
        routing_freqs: Realized routing frequencies f_i, shape (L,). Detached.
        assignment_probs: Soft assignment probabilities p_i, shape (L,). Gradient
            flows to expert_bias through this tensor.

    Returns:
        Scalar loss tensor.

    Raises:
        ValueError: If the inputs are not 1-D tensors of identical shape.
    """
    # Without this check a mismatched pair would broadcast silently and the
    # reduction below would return a plausible-looking but wrong scalar.
    if routing_freqs.dim() != 1 or routing_freqs.shape != assignment_probs.shape:
        raise ValueError(
            "routing_freqs and assignment_probs must both have shape (L,), "
            f"got {tuple(routing_freqs.shape)} and {tuple(assignment_probs.shape)}."
        )
    num_heads = routing_freqs.shape[0]
    if num_heads == 1:
        # Degenerate single-head case: avoid the infinite log(1 - p) term.
        return assignment_probs.sum() * 0.0
    attract = (1.0 - routing_freqs) * torch.log(assignment_probs)
    # log1p(-p) instead of log(1 - p): avoids catastrophic cancellation when p is
    # close to 1, where (1 - p) loses precision and log produces large errors.
    repel = routing_freqs * torch.log1p(-assignment_probs)
    return -(attract + repel).sum() / num_heads
+# ---------------------------------------------------------------------------
+# Factory
+# ---------------------------------------------------------------------------
# Maps each supported ``load_balance_loss_type`` value to its loss function.
_LOSS_REGISTRY: dict[str, Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = {
    "gshard": gshard_loss,
    "ce": ce_loss,
    "bce": bce_loss,
}


def make_load_balance_loss(
    loss_type: str,
) -> Callable[[torch.Tensor, torch.Tensor], torch.Tensor]:
    """Return a load-balance loss callable for the requested formulation.

    All returned callables share the same external contract:
        loss_fn(routing_freqs: Tensor, assignment_probs: Tensor) -> scalar Tensor

    The caller is responsible for computing assignment_probs via
    softmax(logits.detach() + expert_bias) to ensure gradient isolation.

    Args:
        loss_type: One of ``"gshard"``, ``"ce"``, or ``"bce"``.

    Returns:
        Loss callable matching the shared contract.

    Raises:
        ValueError: If loss_type is not one of the supported values. This
            includes unhashable values, which cannot be registry keys.
    """
    try:
        return _LOSS_REGISTRY[loss_type]
    except (KeyError, TypeError):
        # Names are sorted so the message matches the one ShramConfig raises
        # for the same mistake. TypeError covers unhashable inputs such as
        # lists, which would otherwise escape as an undocumented exception.
        supported = ", ".join(f'"{name}"' for name in sorted(_LOSS_REGISTRY))
        raise ValueError(
            f"load_balance_loss_type must be one of {supported}, got {loss_type!r}."
        ) from None
             self.capacity = config.mosrah_packed_length
         self.max_bid_rounds = config.max_bid_rounds
+        self._load_balance_loss = make_load_balance_loss(config.load_balance_loss_type)
         # W_r: routing projection, no bias (paper specifies xW_r, no additional term).
         self.routing_projection = nn.Linear(
             logits, self.expert_bias.expand_as(logits), dim=-1
         ).mean().detach()
+        # Unbiased routing scores R = Softmax(xW_r), taken directly from the raw logits.
         routing_scores = F.softmax(logits, dim=-1)             # R, (B, N, L)
         # Biased routing scores R̂ = Softmax(xW_r + b). Used only for TopK head
         # selection. expert_bias is added to logits before softmax so that the bias
         # shifts selection probability without rescaling the unbiased distribution.
+        biased_logits = logits.detach() + self.expert_bias
         biased_logits = self.balance_capacity(
             biased_logits,
             used_capacity,
         p = self.load_balance_p
         routing_freqs = (per_item_freqs ** p).mean(dim=0) ** (1.0 / p)  # (L,)
+        # Active-token mean softmax probabilities. Detaching logits before softmax
+        # ensures the only differentiable path into p is through expert_bias — the
+        # load balance loss cannot reach routing_projection.weight.
+        biased_probs = biased_routing_scores                                   # (B, N, L)
+        active_float = active_mask.float().unsqueeze(-1)                       # (B, N, 1)
+        assignment_probs = (biased_probs * active_float).sum(dim=(0, 1))       # (L,) unnorm
+        assignment_probs = assignment_probs / active_mask.float().sum()        # (L,) norm
+        load_balance_loss = self._load_balance_loss(routing_freqs, assignment_probs)
         # MaxVio is a detached monitoring scalar following the paper's formula
         # L · max_l(f_l − 1/L) applied to routing_freqs. Must not contribute gradients.

tokenizer_config.json CHANGED Viewed

@@ -4,7 +4,7 @@
   "bos_token": "<|endoftext|>",
   "eos_token": "<|endoftext|>",
   "errors": "replace",
-  "is_local": false,
   "local_files_only": false,
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "<|padding|>",

   "bos_token": "<|endoftext|>",
   "eos_token": "<|endoftext|>",
   "errors": "replace",
+  "is_local": true,
   "local_files_only": false,
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "<|padding|>",