smithblack-0
/

SHRAM-dev

@@ -2872,8 +2872,20 @@ class MoSRAHRouter(nn.Module):
         else:
             element_included = ranks < n.unsqueeze(positive_dim)
-        mask = torch.zeros_like(tensor, dtype=torch.bool)
-        mask.scatter_(dim, topk_indices, element_included.expand_as(topk_indices))
         return mask
     @staticmethod
@@ -2910,8 +2922,9 @@ class MoSRAHRouter(nn.Module):
         Tokens propose experts in descending preference order; experts provisionally
         accept their top-``remaining_capacity`` proposed tokens each round. Proposals
-        are monotone (never retracted). The loop continues until every token has at
-        least ``min_choices`` accepted experts or ``max_rounds`` is exhausted.
         Both the column bound (per-expert token count ≤ remaining_capacity) and the
         row bound (per-token expert count ≥ min_choices) are satisfied simultaneously
@@ -2922,35 +2935,26 @@ class MoSRAHRouter(nn.Module):
             remaining_capacity: Per-expert token budget. Scalar int for training;
                 (B, L) tensor for inference.
             min_choices: Minimum experts each token must have accepted (K).
-            max_rounds: Iteration ceiling; raises via ``_check_bidding_converged``
-                if exhausted.
             capacity_scalar: Static upper bound on remaining_capacity, passed to
                 ``get_mask`` as the topk k bound for the acceptance step.
         Returns:
             accepted: (B, N, L) bool — True at positions accepted by the solver.
         """
-        # ── initialise loop variables ─────────────────────────────────────────
-        #
-        # All three loop_vars must be tensors of fixed shape across iterations,
-        # as required by torch.while_loop. logits and remaining_capacity are
-        # captured read-only by the closures; they do not travel as loop_vars.
-        proposals  = torch.zeros_like(logits, dtype=torch.bool)
         acceptances = torch.zeros_like(logits, dtype=torch.bool)
-        round_count = torch.zeros((), device=logits.device, dtype=torch.int64)
-        max_rounds_t = torch.full((), max_rounds, device=logits.device, dtype=torch.int64)
-        def cond_fn(proposals, acceptances, round_count):
-            all_satisfied = (acceptances.sum(dim=-1) >= min_choices).all()
-            return (round_count < max_rounds_t) & ~all_satisfied
-        def body_fn(proposals, acceptances, round_count):
             # ── token proposal step ───────────────────────────────────────────
             #
             # Tokens with fewer than min_choices accepted experts propose their
             # next-best unproposed expert(s). The deficit determines how many new
-            # proposals each token makes this round; already-satisfied tokens
-            # propose nothing (deficit = 0 → get_mask returns all-False).
             accepted_per_token = acceptances.sum(dim=-1)           # (B, N)
             choices_deficit = (min_choices - accepted_per_token).clamp_min(0)
@@ -2969,12 +2973,18 @@ class MoSRAHRouter(nn.Module):
             updated_acceptances = cls.get_mask(
                 proposed_logits, dim=-2, n=remaining_capacity, capacity_scalar=capacity_scalar,
             )
-            return updated_proposals, updated_acceptances, round_count + 1
-        proposals, acceptances, _ = torch.while_loop(
-            cond_fn, body_fn, (proposals, acceptances, round_count),
-        )
         return acceptances
     @classmethod
@@ -2993,13 +3003,10 @@ class MoSRAHRouter(nn.Module):
           - Column bound: per-expert unmasked token count ≤ remaining_capacity.
           - Row bound:    per-token unmasked expert count ≥ min_choices.
-        A training fast path and a column-capacity fast path are attempted before
-        falling back to the bidding solver:
         1. Training with N ≤ capacity: return logits unchanged.
-        2. Column-capacity fast path: if the most permissive column-bound-satisfying
-           mask already gives every token at least min_choices choices, return it.
-        3. Bidding fallback: deferred-acceptance solver guaranteeing both bounds.
         Args:
             logits: Routing scores of shape (B, N, L).
@@ -3029,15 +3036,8 @@ class MoSRAHRouter(nn.Module):
         # terminates when every token has min_choices accepted experts or
         # max_bid_rounds is exhausted (RuntimeError in the latter case).
         #
-        # Two cheaper paths precede the solver:
-        #
-        #   Training fast path — when N ≤ capacity and all experts start empty,
-        #   no expert can overflow regardless of routing. No masking is needed.
-        #
-        #   Column-capacity fast path — the most permissive mask satisfying the
-        #   column bound selects each expert's top-remaining_capacity tokens. If
-        #   that mask also satisfies the row bound, both constraints hold and the
-        #   solver is skipped entirely.
         # Training fast path: N ≤ capacity with empty experts → no overflow possible.
         if used_capacity is None and logits.shape[-2] <= capacity:
@@ -3052,32 +3052,15 @@ class MoSRAHRouter(nn.Module):
         else:
             remaining_capacity = (capacity - used_capacity).clamp(min=0)  # (B, L)
-        # Column-capacity fast path: select each expert's top-remaining_capacity
-        # tokens — the most permissive mask satisfying the column bound. If it
-        # also satisfies the row bound, both constraints hold simultaneously.
-        # Mask computation runs under no_grad: the boolean mask is a hard routing
-        # decision and must not accumulate gradient memory through the solver.
-        def skip(mask: torch.Tensor, logits: torch.Tensor)->torch.Tensor:
-            """Skip bidding on the mask"""
-            return mask.clone()
-        def resolve_mask(mask: torch.Tensor, logits: torch.Tensor) -> torch.Tensor:
-            """Execute full bidding process"""
-            return cls._run_bidding(logits,
-                                    remaining_capacity,
-                                    min_choices,
-                                    max_rounds,
-                                    capacity)
         with torch.no_grad():
-            col_capacity_mask = cls.get_mask(logits,
-                                             dim=-2,
-                                             n=remaining_capacity,
-                                             capacity_scalar=capacity)
-            mask_sufficient = (col_capacity_mask.sum(dim=-1) >= min_choices).all()
-            final_mask = torch.cond(mask_sufficient, skip, resolve_mask, [col_capacity_mask, logits])
             cls._check_bidding_converged(final_mask, min_choices, max_rounds)
         return logits.masked_fill(~final_mask, mask_value)
     def forward(
         self,
         x: torch.Tensor,

         else:
             element_included = ranks < n.unsqueeze(positive_dim)
+        # Allocate from explicit logical shape rather than using zeros_like. This keeps
+        # the output mask tied to tensor.shape, not to any stride/layout metadata carried
+        # by tensor from earlier view operations or compiler lowering.
+        mask = torch.zeros(
+            tuple(tensor.shape),
+            device=tensor.device,
+            dtype=torch.bool,
+        )
+        # Materialize the scatter source shape explicitly. This avoids passing a
+        # broadcast-view source into scatter while preserving the same logical rule:
+        # every selected top-k index receives True iff its rank is within budget.
+        scatter_values = torch.broadcast_to(element_included, topk_indices.shape)
+        mask = mask.scatter(dim, topk_indices, scatter_values)
         return mask
     @staticmethod
         Tokens propose experts in descending preference order; experts provisionally
         accept their top-``remaining_capacity`` proposed tokens each round. Proposals
+        are monotone (never retracted). Runs for exactly ``max_rounds`` iterations;
+        each round is skipped via ``torch.cond`` once all tokens are satisfied, so
+        subsequent iterations are no-ops without data-dependent Python control flow.
         Both the column bound (per-expert token count ≤ remaining_capacity) and the
         row bound (per-token expert count ≥ min_choices) are satisfied simultaneously
             remaining_capacity: Per-expert token budget. Scalar int for training;
                 (B, L) tensor for inference.
             min_choices: Minimum experts each token must have accepted (K).
+            max_rounds: Number of iterations to run. Convergence is checked by the
+                caller after all rounds via ``_check_bidding_converged`` (raises if not met).
             capacity_scalar: Static upper bound on remaining_capacity, passed to
                 ``get_mask`` as the topk k bound for the acceptance step.
         Returns:
             accepted: (B, N, L) bool — True at positions accepted by the solver.
         """
+        proposals   = torch.zeros_like(logits, dtype=torch.bool)
         acceptances = torch.zeros_like(logits, dtype=torch.bool)
+        # Branch functions defined once so Dynamo sees stable function objects
+        # across all loop iterations. logits, remaining_capacity, min_choices, and
+        # capacity_scalar are captured read-only from the enclosing scope.
+        def body_fn(proposals: torch.Tensor, acceptances: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
             # ── token proposal step ───────────────────────────────────────────
             #
             # Tokens with fewer than min_choices accepted experts propose their
             # next-best unproposed expert(s). The deficit determines how many new
+            # proposals each token makes; satisfied tokens propose nothing.
             accepted_per_token = acceptances.sum(dim=-1)           # (B, N)
             choices_deficit = (min_choices - accepted_per_token).clamp_min(0)
             updated_acceptances = cls.get_mask(
                 proposed_logits, dim=-2, n=remaining_capacity, capacity_scalar=capacity_scalar,
             )
+            return updated_proposals, updated_acceptances
+        def skip_fn(proposals: torch.Tensor, acceptances: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+            # Already converged — clone, since torch.cond branch outputs may not alias inputs.
+            return proposals.clone(), acceptances.clone()
+        for _ in range(max_rounds):
+            # Skip this round if every token already has min_choices accepted experts.
+            # torch.cond avoids data-dependent Python branches in compiled graphs.
+            not_done = ~(acceptances.sum(dim=-1) >= min_choices).all()
+            proposals, acceptances = torch.cond(not_done, body_fn, skip_fn, [proposals, acceptances])
         return acceptances
     @classmethod
           - Column bound: per-expert unmasked token count ≤ remaining_capacity.
           - Row bound:    per-token unmasked expert count ≥ min_choices.
+        A training fast path is attempted before the bidding solver:
         1. Training with N ≤ capacity: return logits unchanged.
+        2. Bidding: deferred-acceptance solver guaranteeing both bounds simultaneously.
         Args:
             logits: Routing scores of shape (B, N, L).
         # terminates when every token has min_choices accepted experts or
         # max_bid_rounds is exhausted (RuntimeError in the latter case).
         #
+        # Training fast path — when N ≤ capacity and all experts start empty,
+        # no expert can overflow regardless of routing. No masking is needed.
         # Training fast path: N ≤ capacity with empty experts → no overflow possible.
         if used_capacity is None and logits.shape[-2] <= capacity:
         else:
             remaining_capacity = (capacity - used_capacity).clamp(min=0)  # (B, L)
+        # Bidding solver: jointly satisfies column and row bounds. Runs under
+        # no_grad because the boolean mask is a hard routing decision and must
+        # not accumulate gradient memory.
         with torch.no_grad():
+            final_mask = cls._run_bidding(logits, remaining_capacity,
+                                          min_choices, max_rounds, capacity)
             cls._check_bidding_converged(final_mask, min_choices, max_rounds)
         return logits.masked_fill(~final_mask, mask_value)
     def forward(
         self,
         x: torch.Tensor,