reorder small experts
Browse files- myolmoe/config.json +1 -1
- myolmoe/modeling_myolmoe.py +6 -7
- scripts/train.py +2 -2
myolmoe/config.json
CHANGED
|
@@ -32,6 +32,6 @@
|
|
| 32 |
"use_cache": true,
|
| 33 |
"vocab_size": 50304,
|
| 34 |
"small_expert_intermediate_ratio": 16,
|
| 35 |
-
"
|
| 36 |
"small_expert_load_balancing_coef": 0.1
|
| 37 |
}
|
|
|
|
| 32 |
"use_cache": true,
|
| 33 |
"vocab_size": 50304,
|
| 34 |
"small_expert_intermediate_ratio": 16,
|
| 35 |
+
"small_expert_count": 64,
|
| 36 |
"small_expert_load_balancing_coef": 0.1
|
| 37 |
}
|
myolmoe/modeling_myolmoe.py
CHANGED
|
@@ -27,7 +27,7 @@ class OlmoeConfig(PretrainedConfig):
|
|
| 27 |
[Previous args remain the same...]
|
| 28 |
small_expert_intermediate_ratio (`float`, *optional*, defaults to 0.5):
|
| 29 |
Ratio of intermediate size for small experts compared to regular experts.
|
| 30 |
-
|
| 31 |
Frequency of small experts - every Nth expert will be small.
|
| 32 |
small_expert_load_balancing_coef (`float`, *optional*, defaults to 0.1):
|
| 33 |
Coefficient for small expert load balancing loss.
|
|
@@ -63,7 +63,7 @@ class OlmoeConfig(PretrainedConfig):
|
|
| 63 |
router_aux_loss_coef=0.01,
|
| 64 |
norm_topk_prob=False,
|
| 65 |
small_expert_intermediate_ratio=64,
|
| 66 |
-
|
| 67 |
small_expert_load_balancing_coef=0.1,
|
| 68 |
**kwargs,
|
| 69 |
):
|
|
@@ -96,7 +96,7 @@ class OlmoeConfig(PretrainedConfig):
|
|
| 96 |
|
| 97 |
# Small expert parameters
|
| 98 |
self.small_expert_intermediate_ratio = small_expert_intermediate_ratio
|
| 99 |
-
self.
|
| 100 |
self.small_expert_load_balancing_coef = small_expert_load_balancing_coef
|
| 101 |
|
| 102 |
# Validate the correctness of rotary position embeddings parameters
|
|
@@ -558,13 +558,12 @@ class OlmoeSparseMoeBlock(nn.Module):
|
|
| 558 |
self.n_step = getattr(config, "nth_step", 2)
|
| 559 |
|
| 560 |
# Track which experts are small
|
| 561 |
-
self.small_expert_indices =
|
| 562 |
self.experts = nn.ModuleList()
|
| 563 |
|
| 564 |
for i in range(self.num_experts):
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
self.small_expert_indices.append(i)
|
| 568 |
self.experts.append(OlmoeMLP(config, is_small=is_small))
|
| 569 |
|
| 570 |
self.gate = nn.Linear(config.hidden_size, self.num_experts, bias=False)
|
|
|
|
| 27 |
[Previous args remain the same...]
|
| 28 |
small_expert_intermediate_ratio (`float`, *optional*, defaults to 64):
|
| 29 |
Ratio of intermediate size for small experts compared to regular experts.
|
| 30 |
+
small_expert_count (`int`, *optional*, defaults to 64):
|
| 31 |
Number of small experts; the last `small_expert_count` experts in the expert list are small.
|
| 32 |
small_expert_load_balancing_coef (`float`, *optional*, defaults to 0.1):
|
| 33 |
Coefficient for small expert load balancing loss.
|
|
|
|
| 63 |
router_aux_loss_coef=0.01,
|
| 64 |
norm_topk_prob=False,
|
| 65 |
small_expert_intermediate_ratio=64,
|
| 66 |
+
small_expert_count=64,
|
| 67 |
small_expert_load_balancing_coef=0.1,
|
| 68 |
**kwargs,
|
| 69 |
):
|
|
|
|
| 96 |
|
| 97 |
# Small expert parameters
|
| 98 |
self.small_expert_intermediate_ratio = small_expert_intermediate_ratio
|
| 99 |
+
self.small_expert_count = small_expert_count
|
| 100 |
self.small_expert_load_balancing_coef = small_expert_load_balancing_coef
|
| 101 |
|
| 102 |
# Validate the correctness of rotary position embeddings parameters
|
|
|
|
| 558 |
self.n_step = getattr(config, "nth_step", 2)
|
| 559 |
|
| 560 |
# Track which experts are small
|
| 561 |
+
self.small_expert_indices = list(range(config.num_experts - config.small_expert_count, config.num_experts))
|
| 562 |
self.experts = nn.ModuleList()
|
| 563 |
|
| 564 |
for i in range(self.num_experts):
|
| 565 |
+
# Small experts are now at the end indices
|
| 566 |
+
is_small = i in self.small_expert_indices
|
|
|
|
| 567 |
self.experts.append(OlmoeMLP(config, is_small=is_small))
|
| 568 |
|
| 569 |
self.gate = nn.Linear(config.hidden_size, self.num_experts, bias=False)
|
scripts/train.py
CHANGED
|
@@ -95,7 +95,7 @@ def main():
|
|
| 95 |
# Unfreeze only the small experts and their gating networks
|
| 96 |
for name, param in model.named_parameters():
|
| 97 |
# Unfreeze small expert layers
|
| 98 |
-
if "mlp.experts" in name and any(f"mlp.experts.{i}." in name for i in range(0, config.num_experts, config.
|
| 99 |
param.requires_grad = True
|
| 100 |
print(f"Unfreezing small expert parameter: {name}")
|
| 101 |
|
|
@@ -103,7 +103,7 @@ def main():
|
|
| 103 |
if "mlp.gate" in name:
|
| 104 |
param.requires_grad = True
|
| 105 |
print(f"Unfreezing gating network parameter: {name}")
|
| 106 |
-
|
| 107 |
# Trainer
|
| 108 |
trainer = Trainer(
|
| 109 |
model=model,
|
|
|
|
| 95 |
# Unfreeze only the small experts and their gating networks
|
| 96 |
for name, param in model.named_parameters():
|
| 97 |
# Unfreeze small expert layers
|
| 98 |
+
if "mlp.experts" in name and any(f"mlp.experts.{i}." in name for i in range(config.num_experts - config.small_expert_count, config.num_experts)):
|
| 99 |
param.requires_grad = True
|
| 100 |
print(f"Unfreezing small expert parameter: {name}")
|
| 101 |
|
|
|
|
| 103 |
if "mlp.gate" in name:
|
| 104 |
param.requires_grad = True
|
| 105 |
print(f"Unfreezing gating network parameter: {name}")
|
| 106 |
+
|
| 107 |
# Trainer
|
| 108 |
trainer = Trainer(
|
| 109 |
model=model,
|