Fix small-expert load-balancing loss calculation so it contributes to the gradient
Browse files- myolmoe/modeling_myolmoe.py +30 -23
myolmoe/modeling_myolmoe.py
CHANGED
|
@@ -593,13 +593,6 @@ class OlmoeSparseMoeBlock(nn.Module):
|
|
| 593 |
small_expert_mask = torch.zeros_like(expert_mask)
|
| 594 |
for idx in self.small_expert_indices:
|
| 595 |
small_expert_mask[idx] = expert_mask[idx]
|
| 596 |
-
|
| 597 |
-
small_expert_loss = load_balancing_loss_func(
|
| 598 |
-
router_logits,
|
| 599 |
-
self.num_experts,
|
| 600 |
-
self.top_k,
|
| 601 |
-
None
|
| 602 |
-
) * self.small_expert_load_balancing_coef
|
| 603 |
|
| 604 |
for expert_idx in range(self.num_experts):
|
| 605 |
expert_layer = self.experts[expert_idx]
|
|
@@ -611,7 +604,7 @@ class OlmoeSparseMoeBlock(nn.Module):
|
|
| 611 |
final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
|
| 612 |
|
| 613 |
final_hidden_states = final_hidden_states.view(batch_size, sequence_length, hidden_dim)
|
| 614 |
-
return final_hidden_states, router_logits
|
| 615 |
|
| 616 |
class OlmoeDecoderLayer(nn.Module):
|
| 617 |
def __init__(self, config: OlmoeConfig, layer_idx: int):
|
|
@@ -657,9 +650,9 @@ class OlmoeDecoderLayer(nn.Module):
|
|
| 657 |
hidden_states = residual + hidden_states
|
| 658 |
residual = hidden_states
|
| 659 |
hidden_states = self.post_attention_layernorm(hidden_states)
|
| 660 |
-
hidden_states, router_logits
|
| 661 |
hidden_states = residual + hidden_states #
|
| 662 |
-
outputs = (hidden_states,
|
| 663 |
if output_attentions:
|
| 664 |
outputs += (self_attn_weights,)
|
| 665 |
if use_cache:
|
|
@@ -1048,29 +1041,43 @@ class MyOlmoeForCausalLM(OlmoePreTrainedModel, GenerationMixin):
|
|
| 1048 |
loss = None
|
| 1049 |
if labels is not None:
|
| 1050 |
loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)
|
|
|
|
| 1051 |
aux_loss = None
|
|
|
|
|
|
|
| 1052 |
if output_router_logits:
|
|
|
|
| 1053 |
aux_loss = load_balancing_loss_func(
|
| 1054 |
outputs.router_logits if return_dict else outputs[-1],
|
| 1055 |
self.num_experts,
|
| 1056 |
self.num_experts_per_tok,
|
| 1057 |
attention_mask,
|
| 1058 |
)
|
| 1059 |
-
|
| 1060 |
-
|
| 1061 |
-
|
| 1062 |
-
|
| 1063 |
-
|
| 1064 |
-
|
| 1065 |
-
|
| 1066 |
-
|
| 1067 |
-
|
| 1068 |
-
for layer_output in outputs:
|
| 1069 |
-
if len(layer_output) > 1 and isinstance(layer_output[1], torch.Tensor):
|
| 1070 |
-
total_small_expert_loss += layer_output[1]
|
| 1071 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1072 |
if labels is not None:
|
| 1073 |
-
loss
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1074 |
#
|
| 1075 |
return MoeCausalLMOutputWithPast(
|
| 1076 |
loss=loss,
|
|
|
|
| 593 |
small_expert_mask = torch.zeros_like(expert_mask)
|
| 594 |
for idx in self.small_expert_indices:
|
| 595 |
small_expert_mask[idx] = expert_mask[idx]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 596 |
|
| 597 |
for expert_idx in range(self.num_experts):
|
| 598 |
expert_layer = self.experts[expert_idx]
|
|
|
|
| 604 |
final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
|
| 605 |
|
| 606 |
final_hidden_states = final_hidden_states.view(batch_size, sequence_length, hidden_dim)
|
| 607 |
+
return final_hidden_states, router_logits
|
| 608 |
|
| 609 |
class OlmoeDecoderLayer(nn.Module):
|
| 610 |
def __init__(self, config: OlmoeConfig, layer_idx: int):
|
|
|
|
| 650 |
hidden_states = residual + hidden_states
|
| 651 |
residual = hidden_states
|
| 652 |
hidden_states = self.post_attention_layernorm(hidden_states)
|
| 653 |
+
hidden_states, router_logits = self.mlp(hidden_states) #
|
| 654 |
hidden_states = residual + hidden_states #
|
| 655 |
+
outputs = (hidden_states,) #
|
| 656 |
if output_attentions:
|
| 657 |
outputs += (self_attn_weights,)
|
| 658 |
if use_cache:
|
|
|
|
| 1041 |
loss = None
|
| 1042 |
if labels is not None:
|
| 1043 |
loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)
|
| 1044 |
+
#
|
| 1045 |
aux_loss = None
|
| 1046 |
+
total_small_expert_loss = torch.tensor(0.0, device=logits.device)
|
| 1047 |
+
|
| 1048 |
if output_router_logits:
|
| 1049 |
+
# Calculate regular load balancing loss
|
| 1050 |
aux_loss = load_balancing_loss_func(
|
| 1051 |
outputs.router_logits if return_dict else outputs[-1],
|
| 1052 |
self.num_experts,
|
| 1053 |
self.num_experts_per_tok,
|
| 1054 |
attention_mask,
|
| 1055 |
)
|
| 1056 |
+
|
| 1057 |
+
# Calculate small expert load balancing loss
|
| 1058 |
+
router_logits = outputs.router_logits if return_dict else outputs[-1]
|
| 1059 |
+
if isinstance(router_logits, tuple):
|
| 1060 |
+
small_expert_mask = torch.zeros_like(router_logits[0])
|
| 1061 |
+
# Create mask for small experts
|
| 1062 |
+
for idx in range(self.config.num_experts - self.config.small_expert_count,
|
| 1063 |
+
self.config.num_experts):
|
| 1064 |
+
small_expert_mask = small_expert_mask.scatter(-1, torch.tensor([idx]), 1.0)
|
|
|
|
|
|
|
|
|
|
| 1065 |
|
| 1066 |
+
# Apply mask and calculate loss
|
| 1067 |
+
masked_router_logits = [rl * small_expert_mask for rl in router_logits]
|
| 1068 |
+
total_small_expert_loss = load_balancing_loss_func(
|
| 1069 |
+
tuple(masked_router_logits),
|
| 1070 |
+
self.num_experts,
|
| 1071 |
+
self.num_experts_per_tok,
|
| 1072 |
+
attention_mask,
|
| 1073 |
+
) * self.config.small_expert_load_balancing_coef
|
| 1074 |
+
|
| 1075 |
if labels is not None:
|
| 1076 |
+
loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)
|
| 1077 |
+
if aux_loss is not None:
|
| 1078 |
+
loss += self.router_aux_loss_coef * aux_loss.to(loss.device)
|
| 1079 |
+
if total_small_expert_loss is not None:
|
| 1080 |
+
loss += total_small_expert_loss.to(loss.device)
|
| 1081 |
#
|
| 1082 |
return MoeCausalLMOutputWithPast(
|
| 1083 |
loss=loss,
|