Update modeling_deepseek.py
Browse files — modeling_deepseek.py (+86 −11)
modeling_deepseek.py
CHANGED
|
@@ -521,12 +521,24 @@ class AddAuxiliaryLoss(torch.autograd.Function):
|
|
| 521 |
class DeepseekV2MoE(nn.Module):
|
| 522 |
"""
|
| 523 |
A mixed expert module containing shared experts.
|
|
|
|
| 524 |
"""
|
| 525 |
|
| 526 |
-
def __init__(self, config):
|
| 527 |
super().__init__()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 528 |
self.config = config
|
| 529 |
self.num_experts_per_tok = config.num_experts_per_tok
|
|
|
|
|
|
|
|
|
|
|
|
|
| 530 |
|
| 531 |
if hasattr(config, "ep_size") and config.ep_size > 1:
|
| 532 |
assert config.ep_size == dist.get_world_size()
|
|
@@ -565,24 +577,87 @@ class DeepseekV2MoE(nn.Module):
|
|
| 565 |
config=config, intermediate_size=intermediate_size
|
| 566 |
)
|
| 567 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 568 |
def forward(self, hidden_states):
|
| 569 |
identity = hidden_states
|
| 570 |
orig_shape = hidden_states.shape
|
|
|
|
|
|
|
| 571 |
topk_idx, topk_weight, aux_loss = self.gate(hidden_states)
|
| 572 |
-
|
| 573 |
-
|
|
|
|
| 574 |
if self.training:
|
| 575 |
-
|
| 576 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 577 |
)
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 583 |
y = AddAuxiliaryLoss.apply(y, aux_loss)
|
| 584 |
else:
|
| 585 |
-
|
|
|
|
|
|
|
|
|
|
| 586 |
if self.config.n_shared_experts is not None:
|
| 587 |
y = y + self.shared_experts(identity)
|
| 588 |
return y
|
|
|
|
| 521 |
class DeepseekV2MoE(nn.Module):
|
| 522 |
"""
|
| 523 |
A mixed expert module containing shared experts.
|
| 524 |
+
Modified to use Default MoE for dense backpropagation.
|
| 525 |
"""
|
| 526 |
|
| 527 |
+
def __init__(self, config, beta=0.9):
|
| 528 |
super().__init__()
|
| 529 |
+
print("=" * 80)
|
| 530 |
+
print("初始化 Default MoE 版本的 DeepseekV2MoE")
|
| 531 |
+
print(f" - 路由专家数量: {config.n_routed_experts}")
|
| 532 |
+
print(f" - Top-K: {config.num_experts_per_tok}")
|
| 533 |
+
print(f" - EMA beta: {beta}")
|
| 534 |
+
print("=" * 80)
|
| 535 |
+
|
| 536 |
self.config = config
|
| 537 |
self.num_experts_per_tok = config.num_experts_per_tok
|
| 538 |
+
self.n_routed_experts = config.n_routed_experts
|
| 539 |
+
|
| 540 |
+
# Default MoE: EMA 参数
|
| 541 |
+
self.beta = beta
|
| 542 |
|
| 543 |
if hasattr(config, "ep_size") and config.ep_size > 1:
|
| 544 |
assert config.ep_size == dist.get_world_size()
|
|
|
|
| 577 |
config=config, intermediate_size=intermediate_size
|
| 578 |
)
|
| 579 |
|
| 580 |
+
# Default MoE: 为每个路由专家注册 default vector
|
| 581 |
+
# persistent=False: 不保存到 checkpoint,避免兼容性问题
|
| 582 |
+
for expert_idx in range(config.n_routed_experts):
|
| 583 |
+
self.register_buffer(
|
| 584 |
+
f'default_vector_{expert_idx}',
|
| 585 |
+
torch.zeros(config.hidden_size),
|
| 586 |
+
persistent=False
|
| 587 |
+
)
|
| 588 |
+
|
| 589 |
def forward(self, hidden_states):
    """Route tokens through the MoE layer.

    Training uses a "Default MoE" dense-backprop scheme: tokens routed to an
    expert receive the real expert output (weighted by the gate's normalized
    top-k weights), while every other token receives that expert's "default
    vector" — an EMA of the expert's recent mean output — weighted by the
    full-softmax router probability.  Because the softmax weight on the
    default vector is differentiable, the router receives gradient for *all*
    experts, not just the top-k ones.  Inference uses the original efficient
    sparse `moe_infer` path unchanged.

    Args:
        hidden_states: input activations whose last dimension is
            ``hidden_size`` (typically ``(batch, seq_len, hidden_size)``).

    Returns:
        Tensor with the same shape as ``hidden_states``.
    """
    identity = hidden_states
    orig_shape = hidden_states.shape
    # Only the last dim is assumed to be hidden_size; the previous version
    # unpacked a strict 3-D (bsz, seq, hidden) shape, which needlessly broke
    # non-3-D callers and bound two unused locals.
    hidden_dim = hidden_states.shape[-1]

    topk_idx, topk_weight, aux_loss = self.gate(hidden_states)
    flat_hidden = hidden_states.view(-1, hidden_dim)
    n_tokens = flat_hidden.size(0)

    if self.training:
        # ========== Default MoE training path ==========
        dtype = hidden_states.dtype
        device = hidden_states.device

        # Recompute the full routing softmax over *all* experts (in fp32 for
        # numerical stability) so non-activated experts also get a gradient
        # signal through the default-vector term below.
        # NOTE(review): assumes self.gate exposes a `weight` matrix of shape
        # (n_routed_experts, hidden_size) — true for the stock MoEGate.
        router_logits = F.linear(
            flat_hidden.type(torch.float32),
            self.gate.weight.type(torch.float32),
            None,
        )
        routing_weights = F.softmax(router_logits, dim=-1, dtype=torch.float32)
        routing_weights = routing_weights.to(dtype=dtype)

        final_output = torch.zeros((n_tokens, hidden_dim), dtype=dtype, device=device)

        # Iterate over every routed expert.
        for expert_idx in range(self.n_routed_experts):
            expert_layer = self.experts[expert_idx]
            # fp32 EMA buffer for this expert (registered in __init__,
            # persistent=False so checkpoints stay compatible).
            default_buf = getattr(self, f'default_vector_{expert_idx}')

            # Tokens whose top-k selection includes this expert.
            matches = (topk_idx == expert_idx)   # (n_tokens, top_k)
            is_activated = matches.any(dim=1)    # (n_tokens,)

            if is_activated.any():
                # ----- activated tokens: real expert output -----
                activated_token_indices = torch.where(is_activated)[0]
                real_expert_output = expert_layer(
                    flat_hidden[activated_token_indices]
                ).to(dtype=dtype)

                # ----- EMA update of the default vector (train only) -----
                # Accumulate in the buffer's own fp32 precision; the previous
                # version down-cast the buffer to the model dtype first, so in
                # bf16/fp16 the running mean slowly degraded.
                with torch.no_grad():
                    mean_output = real_expert_output.float().mean(dim=0)
                    default_buf.mul_(self.beta).add_(mean_output, alpha=1 - self.beta)

                # ----- accumulate real outputs (normalized top-k weights) -----
                # torch.where(matches) enumerates activated tokens in the same
                # ascending order as activated_token_indices (each token hits
                # a given expert at most once in top-k), so rows line up with
                # real_expert_output.
                token_indices, k_indices = torch.where(matches)
                weights = topk_weight[token_indices, k_indices, None]
                final_output.index_add_(
                    0, token_indices, (real_expert_output * weights).to(dtype)
                )

            # ----- non-activated tokens: default vector, softmax-weighted -----
            non_activated_indices = torch.where(~is_activated)[0]
            if len(non_activated_indices) > 0:
                w_default = routing_weights[non_activated_indices, expert_idx].unsqueeze(-1)
                final_output[non_activated_indices] += w_default * default_buf.to(dtype=dtype)

        y = final_output.view(*orig_shape)
        y = AddAuxiliaryLoss.apply(y, aux_loss)
    else:
        # ========== inference: original efficient sparse dispatch ==========
        y = self.moe_infer(flat_hidden, topk_idx, topk_weight).view(*orig_shape)

    # Shared experts are always applied on top of the routed output.
    if self.config.n_shared_experts is not None:
        y = y + self.shared_experts(identity)
    return y
|