ikaganacar committed on
Commit
b70422e
·
1 Parent(s): 1b2e4da
Model_Architecture/config.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "model": {
3
- "max_batch_size": 8,
4
- "max_seq_len": 2048,
5
  "dtype": "bf16",
6
  "scale_fmt": null,
7
  "vocab_size": 32768,
8
- "dim": 1024,
9
  "inter_dim": 4096,
10
- "moe_inter_dim": 1024,
11
  "n_layers": 20,
12
  "n_dense_layers": 3,
13
  "n_heads": 12,
 
1
  {
2
  "model": {
3
+ "max_batch_size": 4,
4
+ "max_seq_len": 1024,
5
  "dtype": "bf16",
6
  "scale_fmt": null,
7
  "vocab_size": 32768,
8
+ "dim": 768,
9
  "inter_dim": 4096,
10
+ "moe_inter_dim": 768,
11
  "n_layers": 20,
12
  "n_dense_layers": 3,
13
  "n_heads": 12,
Model_Architecture/model.py CHANGED
@@ -394,29 +394,31 @@ class MoE(nn.Module):
394
  weights = weights / weights.sum(dim=-1, keepdim=True)
395
  weights = weights * self.gate.route_scale
396
 
397
- # Sequential Training Mode
 
398
  if self.training and self.active_expert_idx is not None:
399
  y = torch.zeros_like(x)
400
-
401
- # Only compute gradients for active expert
402
- for i in range(self.n_routed_experts):
403
- idx, top = torch.where(indices == i)
404
- if idx.numel() == 0:
405
- continue
406
-
407
- # Use gradient context manager
408
- grad_context = nullcontext() if i == self.active_expert_idx else torch.no_grad()
409
-
410
- with grad_context:
411
- expert_out = self.experts[i](x[idx])
412
- y[idx] += expert_out * weights[idx, top, None]
413
-
 
414
  # Load balance loss (still needed for gate training)
415
  lb_loss = self.compute_load_balance_loss(router_probs, indices)
416
-
417
- # Shared experts always train
418
  z = self.shared_experts(x)
419
-
420
  return (y + z).view(original_shape), lb_loss
421
 
422
  # Normal MoE Mode (inference or full training)
 
394
  weights = weights / weights.sum(dim=-1, keepdim=True)
395
  weights = weights * self.gate.route_scale
396
 
397
+ # Sequential Training Mode - MEMORY EFFICIENT
398
+ # ONLY compute forward pass for the active expert to save GPU memory
399
  if self.training and self.active_expert_idx is not None:
400
  y = torch.zeros_like(x)
401
+
402
+ # Run forward pass ONLY for the active expert
403
+ i = self.active_expert_idx
404
+ idx, top = torch.where(indices == i)
405
+
406
+ if idx.numel() > 0:
407
+ # Only this expert gets gradients and forward pass
408
+ expert_out = self.experts[i](x[idx])
409
+ y[idx] = expert_out * weights[idx, top, None]
410
+
411
+ # Inactive experts: Skip forward pass entirely (save memory!)
412
+ # Note: This means the model output will be degraded during training,
413
+ # but it's acceptable since we're training experts sequentially.
414
+ # The shared experts + active expert still provide reasonable outputs.
415
+
416
  # Load balance loss (still needed for gate training)
417
  lb_loss = self.compute_load_balance_loss(router_probs, indices)
418
+
419
+ # Shared experts always train (provides baseline performance)
420
  z = self.shared_experts(x)
421
+
422
  return (y + z).view(original_shape), lb_loss
423
 
424
  # Normal MoE Mode (inference or full training)