Motif-Technologies
/

optimizer

Model card Files Files and versions

ca1207 committed on Sep 24, 2025

Commit

6e9baad

·

1 Parent(s): 2a8631f

use in-place op in update_g

Files changed (1) hide show

torch-ext/optimizer/muon.py +9 -12

torch-ext/optimizer/muon.py CHANGED Viewed

@@ -650,15 +650,12 @@ class Muon(torch.optim.Optimizer):
     def _update_g(self, p, g, group, momentum):
         # calc update
         state = self.state[p]
-        if "momentum_buffer" not in state:
-            state["momentum_buffer"] = torch.zeros_like(g)
-        buf = state["momentum_buffer"]
-        buf.mul_(momentum).add_(g)
         if group["nesterov"]:
-            g = g.add(buf, alpha=momentum)
-        else:
-            g = buf
-        return g
     @staticmethod
     def _update_p(p, u, lr, adjusted_lr, weight_decay):
@@ -704,10 +701,10 @@ class Muon(torch.optim.Optimizer):
                 new_scale = math.sqrt(threshold / v_ele)
                 if new_scale < scales_full[head_idx]:
                     scales_full[head_idx] = new_scale
-                    logger.info(
-                        f"[{kind}] Head {head_idx} exceeded threshold "
-                        f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
-                    )
                     scaling += 1
         return scales_full if scaling > 0 else None

     def _update_g(self, p, g, group, momentum):
         # calc update
         state = self.state[p]
+        buf = state.setdefault("momentum_buffer", torch.zeros_like(g))
+        torch.add(g, buf, alpha=momentum, out=buf)
         if group["nesterov"]:
+            g.add_(buf, alpha=momentum)
+            return g
+        return buf
     @staticmethod
     def _update_p(p, u, lr, adjusted_lr, weight_decay):
                 new_scale = math.sqrt(threshold / v_ele)
                 if new_scale < scales_full[head_idx]:
                     scales_full[head_idx] = new_scale
+                    #logger.info(
+                    #    f"[{kind}] Head {head_idx} exceeded threshold "
+                    #    f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
+                    #)
                     scaling += 1
         return scales_full if scaling > 0 else None