Update modeling_minicpm.py
Browse files- modeling_minicpm.py +81 -24
modeling_minicpm.py
CHANGED
|
@@ -314,36 +314,93 @@ class MiniCPMMoE(nn.Module):
|
|
| 314 |
)
|
| 315 |
self.gate = nn.Linear(config.hidden_size, config.num_experts, bias=False)
|
| 316 |
self.intermediate_size = config.intermediate_size
|
| 317 |
-
|
| 318 |
def forward(self, hidden_states):
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 329 |
|
| 330 |
if self.training:
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 341 |
balance_loss = self.num_experts * torch.sum(importance_mean * load_mean)
|
| 342 |
|
| 343 |
-
|
|
|
|
| 344 |
else:
|
| 345 |
-
|
| 346 |
-
|
|
|
|
|
|
|
|
|
|
| 347 |
|
| 348 |
@torch.no_grad()
|
| 349 |
def moe_infer(self, x, flat_expert_indices, flat_expert_weights):
|
|
|
|
| 314 |
)
|
| 315 |
self.gate = nn.Linear(config.hidden_size, config.num_experts, bias=False)
|
| 316 |
self.intermediate_size = config.intermediate_size
|
| 317 |
+
|
def forward(self, hidden_states):
    """
    DenseMixer-style forward for the MiniCPM MoE layer.

    Sparse forward / dense backward:
    - Forward value comes from the sparse top-k computation (only selected
      experts contribute to the output for each token).
    - Backward gradient flows through a dense accumulation over ALL experts,
      combined with the sparse value via a straight-through estimator, so the
      router (gate) receives gradient for every expert, not just the top-k.
    - A gradient hook on each expert's output zeroes the gradient for tokens
      that did not select that expert, so expert parameters still only update
      from their activated tokens (expert-side sparsity is preserved).

    In eval mode the layer falls back to the efficient sparse `moe_infer` path.

    Args:
        hidden_states: input activations of shape (batch, seq_len, hidden_dim)
            — shape established by the `.shape` unpack and final `.view` below.

    Returns:
        Tensor of shape (batch, seq_len, hidden_dim). In training mode the
        auxiliary load-balance loss is attached via `AddAuxiliaryLoss.apply`
        (a custom autograd fn defined elsewhere in this file — its output is
        value-identical to its first input; TODO confirm against that class).
    """
    batch_size, seq_length, hidden_dim = hidden_states.shape
    dtype = hidden_states.dtype
    device = hidden_states.device

    # Flatten batch and sequence dims: one router decision per token.
    flat_hidden = hidden_states.view(-1, hidden_dim)  # (B*seq_len, hidden_dim)
    N_tokens = flat_hidden.size(0)

    # Router logits and full (all-expert) softmax probabilities.
    # Softmax is done in float32 for numerical stability, cast back later.
    router_logits = self.gate(flat_hidden).to(dtype=dtype)  # (N_tokens, num_experts)
    routing_weights = F.softmax(router_logits, dim=-1, dtype=torch.float32)  # (N_tokens, num_experts)

    # Select the top-k experts per token. topk returns distinct experts,
    # so each token/expert pair appears at most once below.
    routing_weights_topk, selected_experts = torch.topk(routing_weights, self.num_experts_per_tok, dim=-1)

    # MiniCPM normalizes the top-k weights to sum to 1; norm_ratio is the
    # per-token sum of the selected probabilities.
    norm_ratio = routing_weights_topk.sum(dim=-1, keepdim=True)
    # Normalize top-k routing weights (used by the sparse path).
    routing_weights_topk = routing_weights_topk / norm_ratio

    # Rescale the FULL routing matrix by the same normalizer so the dense
    # accumulation matches the sparse one on the selected positions.
    # mask is 1 at (token, expert) positions that are in the top-k, else 0.
    mask = F.one_hot(selected_experts, num_classes=self.num_experts).sum(dim=1).to(dtype)
    # Both selected and non-selected weights are divided by norm_ratio; the
    # difference is gradient flow: the non-selected term uses
    # norm_ratio.detach(), so the normalizer only receives gradient through
    # the selected (top-k) positions. NOTE(review): an earlier comment said
    # non-selected weights "remain unchanged" — the code divides them too;
    # forward values of non-selected experts never reach the output anyway
    # (the STE takes the sparse value), so only the gradient scale matters.
    routing_weights = routing_weights * (1.0 - mask) / norm_ratio.detach() + routing_weights * mask / norm_ratio

    # Cast routing tensors back to the activation dtype.
    routing_weights_topk = routing_weights_topk.to(dtype=dtype)
    routing_weights = routing_weights.to(dtype=dtype)

    if self.training:
        # DenseMixer training mode: sparse forward value, dense backward.
        # Two accumulators: dense_outputs carries the gradient path,
        # sparse_outputs carries the forward value.
        dense_outputs = torch.zeros((N_tokens, hidden_dim), dtype=dtype, device=device)
        sparse_outputs = torch.zeros((N_tokens, hidden_dim), dtype=dtype, device=device)

        # Every expert runs on every token (dense compute — this is the
        # training-time cost of DenseMixer).
        for expert_idx in range(self.num_experts):
            expert_layer = self.experts[expert_idx]
            # Dense forward through this expert for ALL tokens.
            expert_output = expert_layer(flat_hidden).to(dtype=dtype)  # (N_tokens, hidden_dim)

            # Hook: zero the incoming gradient on tokens that did not route
            # to this expert. This masks gradient into the expert's
            # parameters (and into flat_hidden via this expert) but does NOT
            # affect the router gradient, which is d(out)/d(weight) =
            # expert_output and is computed from forward values.
            activation_mask = (selected_experts == expert_idx).any(dim=1).float().unsqueeze(-1).to(dtype)
            # requires_grad guard: hooks cannot be registered on tensors
            # outside the autograd graph (e.g. under no_grad).
            if expert_output.requires_grad:
                # mask is bound as a default arg to avoid the late-binding
                # closure pitfall inside this loop.
                expert_output.register_hook(lambda grad, mask=activation_mask: grad * mask)

            # Dense accumulation: full routing weight column for this expert.
            weight_full = routing_weights[:, expert_idx].unsqueeze(-1)  # (N_tokens, 1)
            dense_outputs = dense_outputs + expert_output * weight_full

            # Sparse accumulation: only tokens that selected this expert.
            matches = (selected_experts == expert_idx)
            if matches.any():
                token_indices, k_indices = torch.where(matches)
                weights_topk = routing_weights_topk[token_indices, k_indices].unsqueeze(-1)  # (num_matches, 1)
                # In-place indexed add is safe here: sparse_outputs is
                # detached below, so it never participates in backward.
                sparse_outputs[token_indices] = sparse_outputs[token_indices] + expert_output[token_indices] * weights_topk

        # Straight-through estimator:
        #   forward value  = sparse_outputs
        #   backward grad  = d(dense_outputs)
        final_flat = sparse_outputs.detach() + (dense_outputs - dense_outputs.detach())
        final_flat = final_flat.to(dtype=dtype)
        final_output = final_flat.view(batch_size, seq_length, hidden_dim)

        # Load-balance auxiliary loss (importance * load, scaled by
        # num_experts): load_mean is the empirical routing frequency,
        # importance_mean the mean softmax probability per expert.
        load = selected_experts.view(-1).bincount(minlength=self.num_experts)
        load_mean = load.float() / (N_tokens * self.num_experts_per_tok)
        importance_mean = F.softmax(router_logits, dim=-1, dtype=torch.float32).mean(dim=0)
        balance_loss = self.num_experts * torch.sum(importance_mean * load_mean)

        # Attach the aux loss to the output's autograd graph without
        # changing its forward value.
        final_output = AddAuxiliaryLoss.apply(final_output, balance_loss)
        return final_output
    else:
        # Inference mode: original sparse dispatch path (no_grad inside
        # moe_infer), for efficiency.
        topk_idx_flat = selected_experts.view(-1)
        expert_weights = routing_weights_topk
        y = self.moe_infer(flat_hidden, topk_idx_flat, expert_weights.view(-1, 1)).view(batch_size, seq_length, hidden_dim)
        return y
| 404 |
|
| 405 |
@torch.no_grad()
|
| 406 |
def moe_infer(self, x, flat_expert_indices, flat_expert_weights):
|