iamwyldecat committed on Jun 15, 2025

Commit

bdd2678

1 Parent(s): 8535e80

fix(muon): delete intermediate tensors immediately to lower peak mem usage

Browse files

Files changed (36) hide show

build/torch26-cxx11-cu118-x86_64-linux/optimizer/_ops.py +3 -3
build/torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +3 -0
build/torch26-cxx11-cu118-x86_64-linux/optimizer/muon.py +10 -19
build/torch26-cxx11-cu124-x86_64-linux/optimizer/_ops.py +3 -3
build/torch26-cxx11-cu124-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +3 -0
build/torch26-cxx11-cu124-x86_64-linux/optimizer/muon.py +10 -19
build/torch26-cxx11-cu126-x86_64-linux/optimizer/_ops.py +3 -3
build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +3 -0
build/torch26-cxx11-cu126-x86_64-linux/optimizer/muon.py +10 -19
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_ops.py +3 -3
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +3 -0
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/muon.py +10 -19
build/torch26-cxx98-cu118-x86_64-linux/optimizer/_ops.py +3 -3
build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +3 -0
build/torch26-cxx98-cu118-x86_64-linux/optimizer/muon.py +10 -19
build/torch26-cxx98-cu124-x86_64-linux/optimizer/_ops.py +3 -3
build/torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +3 -0
build/torch26-cxx98-cu124-x86_64-linux/optimizer/muon.py +10 -19
build/torch26-cxx98-cu126-x86_64-linux/optimizer/_ops.py +3 -3
build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +3 -0
build/torch26-cxx98-cu126-x86_64-linux/optimizer/muon.py +10 -19
build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py +3 -3
build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +3 -0
build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py +10 -19
build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py +3 -3
build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +3 -0
build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py +10 -19
build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py +3 -3
build/torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +3 -0
build/torch27-cxx11-cu128-x86_64-linux/optimizer/muon.py +10 -19
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-310.pyc +0 -0
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-310.pyc +0 -0
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py +3 -3
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +3 -0
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/muon.py +10 -19
torch-ext/optimizer/muon.py +10 -19

build/torch26-cxx11-cu118-x86_64-linux/optimizer/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_b4b3752_dirty
-ops = torch.ops._optimizer_b4b3752_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_b4b3752_dirty::{op_name}"

 import torch
+from . import _optimizer_8535e80_dirty
+ops = torch.ops._optimizer_8535e80_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_optimizer_8535e80_dirty::{op_name}"

build/torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a46d9e65efcfa82522950d9ebf2b2b4594d9ed5abc28704352a1f7de2dae707a
+size 1787272

build/torch26-cxx11-cu118-x86_64-linux/optimizer/muon.py CHANGED Viewed

@@ -48,7 +48,6 @@ class _muon_state:
     worker_rank: int | None = None
     gathered_grad: torch.Tensor | None = None
     computed_u: torch.Tensor | None = None
-    scattered_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
@@ -93,12 +92,14 @@ def _compute_u(state, steps, rank, compute_stream):
             state.computed_u = u
             state.compute_event = torch.cuda.Event()
             state.compute_event.record()
         else:
             state.computed_u = None
             state.compute_event = None
-def _scatter(p, state, rank, comm_stream):
     u = state.computed_u
     mesh = p.device_mesh
@@ -118,13 +119,16 @@ def _scatter(p, state, rank, comm_stream):
             src=state.worker_rank,
             group=mesh.get_group(),
         )
         u = DTensor.from_local(
             u,
             placements=p.placements,
             device_mesh=mesh,
         )
-        state.scattered_u = u
 class Muon(torch.optim.Optimizer):
@@ -353,7 +357,8 @@ class Muon(torch.optim.Optimizer):
         def enqueue_scatters(start_idx, chunk_size):
             for p in ordered_params[start_idx : start_idx + chunk_size]:
                 state = param_to_state[id(p)]
-                _scatter(p, state, self.rank, self.comm_stream)
         chunk_size = params[0].device_mesh.mesh.numel()
@@ -368,20 +373,6 @@ class Muon(torch.optim.Optimizer):
         torch.cuda.current_stream().wait_stream(self.comm_stream)
-        for p in params:
-            g = p.grad
-            if g is None:
-                continue
-            # Update p with sharded u
-            state = param_to_state[id(p)]
-            self._update_p(
-                p,
-                state.scattered_u,
-                lr=lr,
-                wd=wd,
-            )
     def step(self, closure=None):
         """Perform a single optimization step.

     worker_rank: int | None = None
     gathered_grad: torch.Tensor | None = None
     computed_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
             state.computed_u = u
             state.compute_event = torch.cuda.Event()
             state.compute_event.record()
+            state.gathered_grad.record_stream(compute_stream)
+            del state.gathered_grad
         else:
             state.computed_u = None
             state.compute_event = None
+def _scatter(p, state, lr, wd, rank, comm_stream):
     u = state.computed_u
     mesh = p.device_mesh
             src=state.worker_rank,
             group=mesh.get_group(),
         )
+        if rank == state.worker_rank:
+            state.computed_u.record_stream(comm_stream)
+            del state.computed_u
         u = DTensor.from_local(
             u,
             placements=p.placements,
             device_mesh=mesh,
         )
+        p.data.mul_(1 - lr * wd)
+        p.data.add_(u, alpha=-lr)
 class Muon(torch.optim.Optimizer):
         def enqueue_scatters(start_idx, chunk_size):
             for p in ordered_params[start_idx : start_idx + chunk_size]:
                 state = param_to_state[id(p)]
+                adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
+                _scatter(p, state, adjusted_lr, wd, self.rank, self.comm_stream)
         chunk_size = params[0].device_mesh.mesh.numel()
         torch.cuda.current_stream().wait_stream(self.comm_stream)
     def step(self, closure=None):
         """Perform a single optimization step.

build/torch26-cxx11-cu124-x86_64-linux/optimizer/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_b4b3752_dirty
-ops = torch.ops._optimizer_b4b3752_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_b4b3752_dirty::{op_name}"

 import torch
+from . import _optimizer_8535e80_dirty
+ops = torch.ops._optimizer_8535e80_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_optimizer_8535e80_dirty::{op_name}"

build/torch26-cxx11-cu124-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d351a600884b7378f546a345afe65c176e1399bb42fb7dfe4333b0e90975803b
+size 1824224

build/torch26-cxx11-cu124-x86_64-linux/optimizer/muon.py CHANGED Viewed

@@ -48,7 +48,6 @@ class _muon_state:
     worker_rank: int | None = None
     gathered_grad: torch.Tensor | None = None
     computed_u: torch.Tensor | None = None
-    scattered_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
@@ -93,12 +92,14 @@ def _compute_u(state, steps, rank, compute_stream):
             state.computed_u = u
             state.compute_event = torch.cuda.Event()
             state.compute_event.record()
         else:
             state.computed_u = None
             state.compute_event = None
-def _scatter(p, state, rank, comm_stream):
     u = state.computed_u
     mesh = p.device_mesh
@@ -118,13 +119,16 @@ def _scatter(p, state, rank, comm_stream):
             src=state.worker_rank,
             group=mesh.get_group(),
         )
         u = DTensor.from_local(
             u,
             placements=p.placements,
             device_mesh=mesh,
         )
-        state.scattered_u = u
 class Muon(torch.optim.Optimizer):
@@ -353,7 +357,8 @@ class Muon(torch.optim.Optimizer):
         def enqueue_scatters(start_idx, chunk_size):
             for p in ordered_params[start_idx : start_idx + chunk_size]:
                 state = param_to_state[id(p)]
-                _scatter(p, state, self.rank, self.comm_stream)
         chunk_size = params[0].device_mesh.mesh.numel()
@@ -368,20 +373,6 @@ class Muon(torch.optim.Optimizer):
         torch.cuda.current_stream().wait_stream(self.comm_stream)
-        for p in params:
-            g = p.grad
-            if g is None:
-                continue
-            # Update p with sharded u
-            state = param_to_state[id(p)]
-            self._update_p(
-                p,
-                state.scattered_u,
-                lr=lr,
-                wd=wd,
-            )
     def step(self, closure=None):
         """Perform a single optimization step.

     worker_rank: int | None = None
     gathered_grad: torch.Tensor | None = None
     computed_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
             state.computed_u = u
             state.compute_event = torch.cuda.Event()
             state.compute_event.record()
+            state.gathered_grad.record_stream(compute_stream)
+            del state.gathered_grad
         else:
             state.computed_u = None
             state.compute_event = None
+def _scatter(p, state, lr, wd, rank, comm_stream):
     u = state.computed_u
     mesh = p.device_mesh
             src=state.worker_rank,
             group=mesh.get_group(),
         )
+        if rank == state.worker_rank:
+            state.computed_u.record_stream(comm_stream)
+            del state.computed_u
         u = DTensor.from_local(
             u,
             placements=p.placements,
             device_mesh=mesh,
         )
+        p.data.mul_(1 - lr * wd)
+        p.data.add_(u, alpha=-lr)
 class Muon(torch.optim.Optimizer):
         def enqueue_scatters(start_idx, chunk_size):
             for p in ordered_params[start_idx : start_idx + chunk_size]:
                 state = param_to_state[id(p)]
+                adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
+                _scatter(p, state, adjusted_lr, wd, self.rank, self.comm_stream)
         chunk_size = params[0].device_mesh.mesh.numel()
         torch.cuda.current_stream().wait_stream(self.comm_stream)
     def step(self, closure=None):
         """Perform a single optimization step.

build/torch26-cxx11-cu126-x86_64-linux/optimizer/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_b4b3752_dirty
-ops = torch.ops._optimizer_b4b3752_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_b4b3752_dirty::{op_name}"

 import torch
+from . import _optimizer_8535e80_dirty
+ops = torch.ops._optimizer_8535e80_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_optimizer_8535e80_dirty::{op_name}"

build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2c0843f38cee494b7a5939eb62d27039d76dc3f69401d411efbacaa25cb0d67a
+size 1824224

build/torch26-cxx11-cu126-x86_64-linux/optimizer/muon.py CHANGED Viewed

@@ -48,7 +48,6 @@ class _muon_state:
     worker_rank: int | None = None
     gathered_grad: torch.Tensor | None = None
     computed_u: torch.Tensor | None = None
-    scattered_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
@@ -93,12 +92,14 @@ def _compute_u(state, steps, rank, compute_stream):
             state.computed_u = u
             state.compute_event = torch.cuda.Event()
             state.compute_event.record()
         else:
             state.computed_u = None
             state.compute_event = None
-def _scatter(p, state, rank, comm_stream):
     u = state.computed_u
     mesh = p.device_mesh
@@ -118,13 +119,16 @@ def _scatter(p, state, rank, comm_stream):
             src=state.worker_rank,
             group=mesh.get_group(),
         )
         u = DTensor.from_local(
             u,
             placements=p.placements,
             device_mesh=mesh,
         )
-        state.scattered_u = u
 class Muon(torch.optim.Optimizer):
@@ -353,7 +357,8 @@ class Muon(torch.optim.Optimizer):
         def enqueue_scatters(start_idx, chunk_size):
             for p in ordered_params[start_idx : start_idx + chunk_size]:
                 state = param_to_state[id(p)]
-                _scatter(p, state, self.rank, self.comm_stream)
         chunk_size = params[0].device_mesh.mesh.numel()
@@ -368,20 +373,6 @@ class Muon(torch.optim.Optimizer):
         torch.cuda.current_stream().wait_stream(self.comm_stream)
-        for p in params:
-            g = p.grad
-            if g is None:
-                continue
-            # Update p with sharded u
-            state = param_to_state[id(p)]
-            self._update_p(
-                p,
-                state.scattered_u,
-                lr=lr,
-                wd=wd,
-            )
     def step(self, closure=None):
         """Perform a single optimization step.

     worker_rank: int | None = None
     gathered_grad: torch.Tensor | None = None
     computed_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
             state.computed_u = u
             state.compute_event = torch.cuda.Event()
             state.compute_event.record()
+            state.gathered_grad.record_stream(compute_stream)
+            del state.gathered_grad
         else:
             state.computed_u = None
             state.compute_event = None
+def _scatter(p, state, lr, wd, rank, comm_stream):
     u = state.computed_u
     mesh = p.device_mesh
             src=state.worker_rank,
             group=mesh.get_group(),
         )
+        if rank == state.worker_rank:
+            state.computed_u.record_stream(comm_stream)
+            del state.computed_u
         u = DTensor.from_local(
             u,
             placements=p.placements,
             device_mesh=mesh,
         )
+        p.data.mul_(1 - lr * wd)
+        p.data.add_(u, alpha=-lr)
 class Muon(torch.optim.Optimizer):
         def enqueue_scatters(start_idx, chunk_size):
             for p in ordered_params[start_idx : start_idx + chunk_size]:
                 state = param_to_state[id(p)]
+                adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
+                _scatter(p, state, adjusted_lr, wd, self.rank, self.comm_stream)
         chunk_size = params[0].device_mesh.mesh.numel()
         torch.cuda.current_stream().wait_stream(self.comm_stream)
     def step(self, closure=None):
         """Perform a single optimization step.

build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_b4b3752_dirty
-ops = torch.ops._optimizer_b4b3752_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_b4b3752_dirty::{op_name}"

 import torch
+from . import _optimizer_8535e80_dirty
+ops = torch.ops._optimizer_8535e80_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_optimizer_8535e80_dirty::{op_name}"

build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:acdba99ce95532a9ca6a8987a7ab61a257657872f2cc672c91e8e5fe809aa24e
+size 1749744

build/torch26-cxx11-rocm62-x86_64-linux/optimizer/muon.py CHANGED Viewed

@@ -48,7 +48,6 @@ class _muon_state:
     worker_rank: int | None = None
     gathered_grad: torch.Tensor | None = None
     computed_u: torch.Tensor | None = None
-    scattered_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
@@ -93,12 +92,14 @@ def _compute_u(state, steps, rank, compute_stream):
             state.computed_u = u
             state.compute_event = torch.cuda.Event()
             state.compute_event.record()
         else:
             state.computed_u = None
             state.compute_event = None
-def _scatter(p, state, rank, comm_stream):
     u = state.computed_u
     mesh = p.device_mesh
@@ -118,13 +119,16 @@ def _scatter(p, state, rank, comm_stream):
             src=state.worker_rank,
             group=mesh.get_group(),
         )
         u = DTensor.from_local(
             u,
             placements=p.placements,
             device_mesh=mesh,
         )
-        state.scattered_u = u
 class Muon(torch.optim.Optimizer):
@@ -353,7 +357,8 @@ class Muon(torch.optim.Optimizer):
         def enqueue_scatters(start_idx, chunk_size):
             for p in ordered_params[start_idx : start_idx + chunk_size]:
                 state = param_to_state[id(p)]
-                _scatter(p, state, self.rank, self.comm_stream)
         chunk_size = params[0].device_mesh.mesh.numel()
@@ -368,20 +373,6 @@ class Muon(torch.optim.Optimizer):
         torch.cuda.current_stream().wait_stream(self.comm_stream)
-        for p in params:
-            g = p.grad
-            if g is None:
-                continue
-            # Update p with sharded u
-            state = param_to_state[id(p)]
-            self._update_p(
-                p,
-                state.scattered_u,
-                lr=lr,
-                wd=wd,
-            )
     def step(self, closure=None):
         """Perform a single optimization step.

     worker_rank: int | None = None
     gathered_grad: torch.Tensor | None = None
     computed_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
             state.computed_u = u
             state.compute_event = torch.cuda.Event()
             state.compute_event.record()
+            state.gathered_grad.record_stream(compute_stream)
+            del state.gathered_grad
         else:
             state.computed_u = None
             state.compute_event = None
+def _scatter(p, state, lr, wd, rank, comm_stream):
     u = state.computed_u
     mesh = p.device_mesh
             src=state.worker_rank,
             group=mesh.get_group(),
         )
+        if rank == state.worker_rank:
+            state.computed_u.record_stream(comm_stream)
+            del state.computed_u
         u = DTensor.from_local(
             u,
             placements=p.placements,
             device_mesh=mesh,
         )
+        p.data.mul_(1 - lr * wd)
+        p.data.add_(u, alpha=-lr)
 class Muon(torch.optim.Optimizer):
         def enqueue_scatters(start_idx, chunk_size):
             for p in ordered_params[start_idx : start_idx + chunk_size]:
                 state = param_to_state[id(p)]
+                adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
+                _scatter(p, state, adjusted_lr, wd, self.rank, self.comm_stream)
         chunk_size = params[0].device_mesh.mesh.numel()
         torch.cuda.current_stream().wait_stream(self.comm_stream)
     def step(self, closure=None):
         """Perform a single optimization step.

build/torch26-cxx98-cu118-x86_64-linux/optimizer/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_b4b3752_dirty
-ops = torch.ops._optimizer_b4b3752_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_b4b3752_dirty::{op_name}"

 import torch
+from . import _optimizer_8535e80_dirty
+ops = torch.ops._optimizer_8535e80_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_optimizer_8535e80_dirty::{op_name}"

build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f7d5e76c002507f66f2a227d02c2b11aa3fdc3f07a2a0b82faaa34133adb77ef
+size 1787192

build/torch26-cxx98-cu118-x86_64-linux/optimizer/muon.py CHANGED Viewed

@@ -48,7 +48,6 @@ class _muon_state:
     worker_rank: int | None = None
     gathered_grad: torch.Tensor | None = None
     computed_u: torch.Tensor | None = None
-    scattered_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
@@ -93,12 +92,14 @@ def _compute_u(state, steps, rank, compute_stream):
             state.computed_u = u
             state.compute_event = torch.cuda.Event()
             state.compute_event.record()
         else:
             state.computed_u = None
             state.compute_event = None
-def _scatter(p, state, rank, comm_stream):
     u = state.computed_u
     mesh = p.device_mesh
@@ -118,13 +119,16 @@ def _scatter(p, state, rank, comm_stream):
             src=state.worker_rank,
             group=mesh.get_group(),
         )
         u = DTensor.from_local(
             u,
             placements=p.placements,
             device_mesh=mesh,
         )
-        state.scattered_u = u
 class Muon(torch.optim.Optimizer):
@@ -353,7 +357,8 @@ class Muon(torch.optim.Optimizer):
         def enqueue_scatters(start_idx, chunk_size):
             for p in ordered_params[start_idx : start_idx + chunk_size]:
                 state = param_to_state[id(p)]
-                _scatter(p, state, self.rank, self.comm_stream)
         chunk_size = params[0].device_mesh.mesh.numel()
@@ -368,20 +373,6 @@ class Muon(torch.optim.Optimizer):
         torch.cuda.current_stream().wait_stream(self.comm_stream)
-        for p in params:
-            g = p.grad
-            if g is None:
-                continue
-            # Update p with sharded u
-            state = param_to_state[id(p)]
-            self._update_p(
-                p,
-                state.scattered_u,
-                lr=lr,
-                wd=wd,
-            )
     def step(self, closure=None):
         """Perform a single optimization step.

     worker_rank: int | None = None
     gathered_grad: torch.Tensor | None = None
     computed_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
             state.computed_u = u
             state.compute_event = torch.cuda.Event()
             state.compute_event.record()
+            state.gathered_grad.record_stream(compute_stream)
+            del state.gathered_grad
         else:
             state.computed_u = None
             state.compute_event = None
+def _scatter(p, state, lr, wd, rank, comm_stream):
     u = state.computed_u
     mesh = p.device_mesh
             src=state.worker_rank,
             group=mesh.get_group(),
         )
+        if rank == state.worker_rank:
+            state.computed_u.record_stream(comm_stream)
+            del state.computed_u
         u = DTensor.from_local(
             u,
             placements=p.placements,
             device_mesh=mesh,
         )
+        p.data.mul_(1 - lr * wd)
+        p.data.add_(u, alpha=-lr)
 class Muon(torch.optim.Optimizer):
         def enqueue_scatters(start_idx, chunk_size):
             for p in ordered_params[start_idx : start_idx + chunk_size]:
                 state = param_to_state[id(p)]
+                adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
+                _scatter(p, state, adjusted_lr, wd, self.rank, self.comm_stream)
         chunk_size = params[0].device_mesh.mesh.numel()
         torch.cuda.current_stream().wait_stream(self.comm_stream)
     def step(self, closure=None):
         """Perform a single optimization step.

build/torch26-cxx98-cu124-x86_64-linux/optimizer/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_b4b3752_dirty
-ops = torch.ops._optimizer_b4b3752_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_b4b3752_dirty::{op_name}"

 import torch
+from . import _optimizer_8535e80_dirty
+ops = torch.ops._optimizer_8535e80_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_optimizer_8535e80_dirty::{op_name}"

build/torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:becccd250f38a84803350cfb5fac3a6682b1e594968a714642724cbc71246b4a
+size 1824184

build/torch26-cxx98-cu124-x86_64-linux/optimizer/muon.py CHANGED Viewed

@@ -48,7 +48,6 @@ class _muon_state:
     worker_rank: int | None = None
     gathered_grad: torch.Tensor | None = None
     computed_u: torch.Tensor | None = None
-    scattered_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
@@ -93,12 +92,14 @@ def _compute_u(state, steps, rank, compute_stream):
             state.computed_u = u
             state.compute_event = torch.cuda.Event()
             state.compute_event.record()
         else:
             state.computed_u = None
             state.compute_event = None
-def _scatter(p, state, rank, comm_stream):
     u = state.computed_u
     mesh = p.device_mesh
@@ -118,13 +119,16 @@ def _scatter(p, state, rank, comm_stream):
             src=state.worker_rank,
             group=mesh.get_group(),
         )
         u = DTensor.from_local(
             u,
             placements=p.placements,
             device_mesh=mesh,
         )
-        state.scattered_u = u
 class Muon(torch.optim.Optimizer):
@@ -353,7 +357,8 @@ class Muon(torch.optim.Optimizer):
         def enqueue_scatters(start_idx, chunk_size):
             for p in ordered_params[start_idx : start_idx + chunk_size]:
                 state = param_to_state[id(p)]
-                _scatter(p, state, self.rank, self.comm_stream)
         chunk_size = params[0].device_mesh.mesh.numel()
@@ -368,20 +373,6 @@ class Muon(torch.optim.Optimizer):
         torch.cuda.current_stream().wait_stream(self.comm_stream)
-        for p in params:
-            g = p.grad
-            if g is None:
-                continue
-            # Update p with sharded u
-            state = param_to_state[id(p)]
-            self._update_p(
-                p,
-                state.scattered_u,
-                lr=lr,
-                wd=wd,
-            )
     def step(self, closure=None):
         """Perform a single optimization step.

     worker_rank: int | None = None
     gathered_grad: torch.Tensor | None = None
     computed_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
             state.computed_u = u
             state.compute_event = torch.cuda.Event()
             state.compute_event.record()
+            state.gathered_grad.record_stream(compute_stream)
+            del state.gathered_grad
         else:
             state.computed_u = None
             state.compute_event = None
+def _scatter(p, state, lr, wd, rank, comm_stream):
     u = state.computed_u
     mesh = p.device_mesh
             src=state.worker_rank,
             group=mesh.get_group(),
         )
+        if rank == state.worker_rank:
+            state.computed_u.record_stream(comm_stream)
+            del state.computed_u
         u = DTensor.from_local(
             u,
             placements=p.placements,
             device_mesh=mesh,
         )
+        p.data.mul_(1 - lr * wd)
+        p.data.add_(u, alpha=-lr)
 class Muon(torch.optim.Optimizer):
         def enqueue_scatters(start_idx, chunk_size):
             for p in ordered_params[start_idx : start_idx + chunk_size]:
                 state = param_to_state[id(p)]
+                adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
+                _scatter(p, state, adjusted_lr, wd, self.rank, self.comm_stream)
         chunk_size = params[0].device_mesh.mesh.numel()
         torch.cuda.current_stream().wait_stream(self.comm_stream)
     def step(self, closure=None):
         """Perform a single optimization step.

build/torch26-cxx98-cu126-x86_64-linux/optimizer/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_b4b3752_dirty
-ops = torch.ops._optimizer_b4b3752_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_b4b3752_dirty::{op_name}"

 import torch
+from . import _optimizer_8535e80_dirty
+ops = torch.ops._optimizer_8535e80_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_optimizer_8535e80_dirty::{op_name}"

build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:34215ecc274ef516967962c8457dad214e9bbf618bf5eee8f467371f4f620284
+size 1824184

build/torch26-cxx98-cu126-x86_64-linux/optimizer/muon.py CHANGED Viewed

@@ -48,7 +48,6 @@ class _muon_state:
     worker_rank: int | None = None
     gathered_grad: torch.Tensor | None = None
     computed_u: torch.Tensor | None = None
-    scattered_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
@@ -93,12 +92,14 @@ def _compute_u(state, steps, rank, compute_stream):
             state.computed_u = u
             state.compute_event = torch.cuda.Event()
             state.compute_event.record()
         else:
             state.computed_u = None
             state.compute_event = None
-def _scatter(p, state, rank, comm_stream):
     u = state.computed_u
     mesh = p.device_mesh
@@ -118,13 +119,16 @@ def _scatter(p, state, rank, comm_stream):
             src=state.worker_rank,
             group=mesh.get_group(),
         )
         u = DTensor.from_local(
             u,
             placements=p.placements,
             device_mesh=mesh,
         )
-        state.scattered_u = u
 class Muon(torch.optim.Optimizer):
@@ -353,7 +357,8 @@ class Muon(torch.optim.Optimizer):
         def enqueue_scatters(start_idx, chunk_size):
             for p in ordered_params[start_idx : start_idx + chunk_size]:
                 state = param_to_state[id(p)]
-                _scatter(p, state, self.rank, self.comm_stream)
         chunk_size = params[0].device_mesh.mesh.numel()
@@ -368,20 +373,6 @@ class Muon(torch.optim.Optimizer):
         torch.cuda.current_stream().wait_stream(self.comm_stream)
-        for p in params:
-            g = p.grad
-            if g is None:
-                continue
-            # Update p with sharded u
-            state = param_to_state[id(p)]
-            self._update_p(
-                p,
-                state.scattered_u,
-                lr=lr,
-                wd=wd,
-            )
     def step(self, closure=None):
         """Perform a single optimization step.

     worker_rank: int | None = None
     gathered_grad: torch.Tensor | None = None
     computed_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
             state.computed_u = u
             state.compute_event = torch.cuda.Event()
             state.compute_event.record()
+            state.gathered_grad.record_stream(compute_stream)
+            del state.gathered_grad
         else:
             state.computed_u = None
             state.compute_event = None
+def _scatter(p, state, lr, wd, rank, comm_stream):
     u = state.computed_u
     mesh = p.device_mesh
             src=state.worker_rank,
             group=mesh.get_group(),
         )
+        if rank == state.worker_rank:
+            state.computed_u.record_stream(comm_stream)
+            del state.computed_u
         u = DTensor.from_local(
             u,
             placements=p.placements,
             device_mesh=mesh,
         )
+        p.data.mul_(1 - lr * wd)
+        p.data.add_(u, alpha=-lr)
 class Muon(torch.optim.Optimizer):
         def enqueue_scatters(start_idx, chunk_size):
             for p in ordered_params[start_idx : start_idx + chunk_size]:
                 state = param_to_state[id(p)]
+                adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
+                _scatter(p, state, adjusted_lr, wd, self.rank, self.comm_stream)
         chunk_size = params[0].device_mesh.mesh.numel()
         torch.cuda.current_stream().wait_stream(self.comm_stream)
     def step(self, closure=None):
         """Perform a single optimization step.

build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_b4b3752_dirty
-ops = torch.ops._optimizer_b4b3752_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_b4b3752_dirty::{op_name}"

 import torch
+from . import _optimizer_8535e80_dirty
+ops = torch.ops._optimizer_8535e80_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_optimizer_8535e80_dirty::{op_name}"

build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c23a3adbe4dc1a64b4851a9f8e4aed0e3e1eeeded27322c54f5b942282a2a332
+size 1787368

build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py CHANGED Viewed

@@ -48,7 +48,6 @@ class _muon_state:
     worker_rank: int | None = None
     gathered_grad: torch.Tensor | None = None
     computed_u: torch.Tensor | None = None
-    scattered_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
@@ -93,12 +92,14 @@ def _compute_u(state, steps, rank, compute_stream):
             state.computed_u = u
             state.compute_event = torch.cuda.Event()
             state.compute_event.record()
         else:
             state.computed_u = None
             state.compute_event = None
-def _scatter(p, state, rank, comm_stream):
     u = state.computed_u
     mesh = p.device_mesh
@@ -118,13 +119,16 @@ def _scatter(p, state, rank, comm_stream):
             src=state.worker_rank,
             group=mesh.get_group(),
         )
         u = DTensor.from_local(
             u,
             placements=p.placements,
             device_mesh=mesh,
         )
-        state.scattered_u = u
 class Muon(torch.optim.Optimizer):
@@ -353,7 +357,8 @@ class Muon(torch.optim.Optimizer):
         def enqueue_scatters(start_idx, chunk_size):
             for p in ordered_params[start_idx : start_idx + chunk_size]:
                 state = param_to_state[id(p)]
-                _scatter(p, state, self.rank, self.comm_stream)
         chunk_size = params[0].device_mesh.mesh.numel()
@@ -368,20 +373,6 @@ class Muon(torch.optim.Optimizer):
         torch.cuda.current_stream().wait_stream(self.comm_stream)
-        for p in params:
-            g = p.grad
-            if g is None:
-                continue
-            # Update p with sharded u
-            state = param_to_state[id(p)]
-            self._update_p(
-                p,
-                state.scattered_u,
-                lr=lr,
-                wd=wd,
-            )
     def step(self, closure=None):
         """Perform a single optimization step.

     worker_rank: int | None = None
     gathered_grad: torch.Tensor | None = None
     computed_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
             state.computed_u = u
             state.compute_event = torch.cuda.Event()
             state.compute_event.record()
+            state.gathered_grad.record_stream(compute_stream)
+            del state.gathered_grad
         else:
             state.computed_u = None
             state.compute_event = None
+def _scatter(p, state, lr, wd, rank, comm_stream):
     u = state.computed_u
     mesh = p.device_mesh
             src=state.worker_rank,
             group=mesh.get_group(),
         )
+        if rank == state.worker_rank:
+            state.computed_u.record_stream(comm_stream)
+            del state.computed_u
         u = DTensor.from_local(
             u,
             placements=p.placements,
             device_mesh=mesh,
         )
+        p.data.mul_(1 - lr * wd)
+        p.data.add_(u, alpha=-lr)
 class Muon(torch.optim.Optimizer):
         def enqueue_scatters(start_idx, chunk_size):
             for p in ordered_params[start_idx : start_idx + chunk_size]:
                 state = param_to_state[id(p)]
+                adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
+                _scatter(p, state, adjusted_lr, wd, self.rank, self.comm_stream)
         chunk_size = params[0].device_mesh.mesh.numel()
         torch.cuda.current_stream().wait_stream(self.comm_stream)
     def step(self, closure=None):
         """Perform a single optimization step.

build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_b4b3752_dirty
-ops = torch.ops._optimizer_b4b3752_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_b4b3752_dirty::{op_name}"

 import torch
+from . import _optimizer_8535e80_dirty
+ops = torch.ops._optimizer_8535e80_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_optimizer_8535e80_dirty::{op_name}"

build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d4aa09c22745d5efe1ef0669c4ca05615f67595dc90cabeee6e878301fa9bd22
+size 1824256

build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py CHANGED Viewed

@@ -48,7 +48,6 @@ class _muon_state:
     worker_rank: int | None = None
     gathered_grad: torch.Tensor | None = None
     computed_u: torch.Tensor | None = None
-    scattered_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
@@ -93,12 +92,14 @@ def _compute_u(state, steps, rank, compute_stream):
             state.computed_u = u
             state.compute_event = torch.cuda.Event()
             state.compute_event.record()
         else:
             state.computed_u = None
             state.compute_event = None
-def _scatter(p, state, rank, comm_stream):
     u = state.computed_u
     mesh = p.device_mesh
@@ -118,13 +119,16 @@ def _scatter(p, state, rank, comm_stream):
             src=state.worker_rank,
             group=mesh.get_group(),
         )
         u = DTensor.from_local(
             u,
             placements=p.placements,
             device_mesh=mesh,
         )
-        state.scattered_u = u
 class Muon(torch.optim.Optimizer):
@@ -353,7 +357,8 @@ class Muon(torch.optim.Optimizer):
         def enqueue_scatters(start_idx, chunk_size):
             for p in ordered_params[start_idx : start_idx + chunk_size]:
                 state = param_to_state[id(p)]
-                _scatter(p, state, self.rank, self.comm_stream)
         chunk_size = params[0].device_mesh.mesh.numel()
@@ -368,20 +373,6 @@ class Muon(torch.optim.Optimizer):
         torch.cuda.current_stream().wait_stream(self.comm_stream)
-        for p in params:
-            g = p.grad
-            if g is None:
-                continue
-            # Update p with sharded u
-            state = param_to_state[id(p)]
-            self._update_p(
-                p,
-                state.scattered_u,
-                lr=lr,
-                wd=wd,
-            )
     def step(self, closure=None):
         """Perform a single optimization step.

     worker_rank: int | None = None
     gathered_grad: torch.Tensor | None = None
     computed_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
             state.computed_u = u
             state.compute_event = torch.cuda.Event()
             state.compute_event.record()
+            state.gathered_grad.record_stream(compute_stream)
+            del state.gathered_grad
         else:
             state.computed_u = None
             state.compute_event = None
+def _scatter(p, state, lr, wd, rank, comm_stream):
     u = state.computed_u
     mesh = p.device_mesh
             src=state.worker_rank,
             group=mesh.get_group(),
         )
+        if rank == state.worker_rank:
+            state.computed_u.record_stream(comm_stream)
+            del state.computed_u
         u = DTensor.from_local(
             u,
             placements=p.placements,
             device_mesh=mesh,
         )
+        p.data.mul_(1 - lr * wd)
+        p.data.add_(u, alpha=-lr)
 class Muon(torch.optim.Optimizer):
         def enqueue_scatters(start_idx, chunk_size):
             for p in ordered_params[start_idx : start_idx + chunk_size]:
                 state = param_to_state[id(p)]
+                adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
+                _scatter(p, state, adjusted_lr, wd, self.rank, self.comm_stream)
         chunk_size = params[0].device_mesh.mesh.numel()
         torch.cuda.current_stream().wait_stream(self.comm_stream)
     def step(self, closure=None):
         """Perform a single optimization step.

build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_b4b3752_dirty
-ops = torch.ops._optimizer_b4b3752_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_b4b3752_dirty::{op_name}"

 import torch
+from . import _optimizer_8535e80_dirty
+ops = torch.ops._optimizer_8535e80_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_optimizer_8535e80_dirty::{op_name}"

build/torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b4baf569b70749c4657062fb0f56943fc486adb0c482e50c7aa8e31ddf5cc870
+size 1883352

build/torch27-cxx11-cu128-x86_64-linux/optimizer/muon.py CHANGED Viewed

@@ -48,7 +48,6 @@ class _muon_state:
     worker_rank: int | None = None
     gathered_grad: torch.Tensor | None = None
     computed_u: torch.Tensor | None = None
-    scattered_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
@@ -93,12 +92,14 @@ def _compute_u(state, steps, rank, compute_stream):
             state.computed_u = u
             state.compute_event = torch.cuda.Event()
             state.compute_event.record()
         else:
             state.computed_u = None
             state.compute_event = None
-def _scatter(p, state, rank, comm_stream):
     u = state.computed_u
     mesh = p.device_mesh
@@ -118,13 +119,16 @@ def _scatter(p, state, rank, comm_stream):
             src=state.worker_rank,
             group=mesh.get_group(),
         )
         u = DTensor.from_local(
             u,
             placements=p.placements,
             device_mesh=mesh,
         )
-        state.scattered_u = u
 class Muon(torch.optim.Optimizer):
@@ -353,7 +357,8 @@ class Muon(torch.optim.Optimizer):
         def enqueue_scatters(start_idx, chunk_size):
             for p in ordered_params[start_idx : start_idx + chunk_size]:
                 state = param_to_state[id(p)]
-                _scatter(p, state, self.rank, self.comm_stream)
         chunk_size = params[0].device_mesh.mesh.numel()
@@ -368,20 +373,6 @@ class Muon(torch.optim.Optimizer):
         torch.cuda.current_stream().wait_stream(self.comm_stream)
-        for p in params:
-            g = p.grad
-            if g is None:
-                continue
-            # Update p with sharded u
-            state = param_to_state[id(p)]
-            self._update_p(
-                p,
-                state.scattered_u,
-                lr=lr,
-                wd=wd,
-            )
     def step(self, closure=None):
         """Perform a single optimization step.

     worker_rank: int | None = None
     gathered_grad: torch.Tensor | None = None
     computed_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
             state.computed_u = u
             state.compute_event = torch.cuda.Event()
             state.compute_event.record()
+            state.gathered_grad.record_stream(compute_stream)
+            del state.gathered_grad
         else:
             state.computed_u = None
             state.compute_event = None
+def _scatter(p, state, lr, wd, rank, comm_stream):
     u = state.computed_u
     mesh = p.device_mesh
             src=state.worker_rank,
             group=mesh.get_group(),
         )
+        if rank == state.worker_rank:
+            state.computed_u.record_stream(comm_stream)
+            del state.computed_u
         u = DTensor.from_local(
             u,
             placements=p.placements,
             device_mesh=mesh,
         )
+        p.data.mul_(1 - lr * wd)
+        p.data.add_(u, alpha=-lr)
 class Muon(torch.optim.Optimizer):
         def enqueue_scatters(start_idx, chunk_size):
             for p in ordered_params[start_idx : start_idx + chunk_size]:
                 state = param_to_state[id(p)]
+                adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
+                _scatter(p, state, adjusted_lr, wd, self.rank, self.comm_stream)
         chunk_size = params[0].device_mesh.mesh.numel()
         torch.cuda.current_stream().wait_stream(self.comm_stream)
     def step(self, closure=None):
         """Perform a single optimization step.

build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-310.pyc CHANGED Viewed

Binary files a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-310.pyc and b/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-310.pyc differ

build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-310.pyc CHANGED Viewed

Binary files a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-310.pyc and b/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-310.pyc differ

build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_b4b3752_dirty
-ops = torch.ops._optimizer_b4b3752_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_b4b3752_dirty::{op_name}"

 import torch
+from . import _optimizer_8535e80_dirty
+ops = torch.ops._optimizer_8535e80_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_optimizer_8535e80_dirty::{op_name}"

build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8566c9bc05e13c9394572f9f9c6bac24c31932548be485f49eb49fb249880832
+size 1749648

build/torch27-cxx11-rocm63-x86_64-linux/optimizer/muon.py CHANGED Viewed

@@ -48,7 +48,6 @@ class _muon_state:
     worker_rank: int | None = None
     gathered_grad: torch.Tensor | None = None
     computed_u: torch.Tensor | None = None
-    scattered_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
@@ -93,12 +92,14 @@ def _compute_u(state, steps, rank, compute_stream):
             state.computed_u = u
             state.compute_event = torch.cuda.Event()
             state.compute_event.record()
         else:
             state.computed_u = None
             state.compute_event = None
-def _scatter(p, state, rank, comm_stream):
     u = state.computed_u
     mesh = p.device_mesh
@@ -118,13 +119,16 @@ def _scatter(p, state, rank, comm_stream):
             src=state.worker_rank,
             group=mesh.get_group(),
         )
         u = DTensor.from_local(
             u,
             placements=p.placements,
             device_mesh=mesh,
         )
-        state.scattered_u = u
 class Muon(torch.optim.Optimizer):
@@ -353,7 +357,8 @@ class Muon(torch.optim.Optimizer):
         def enqueue_scatters(start_idx, chunk_size):
             for p in ordered_params[start_idx : start_idx + chunk_size]:
                 state = param_to_state[id(p)]
-                _scatter(p, state, self.rank, self.comm_stream)
         chunk_size = params[0].device_mesh.mesh.numel()
@@ -368,20 +373,6 @@ class Muon(torch.optim.Optimizer):
         torch.cuda.current_stream().wait_stream(self.comm_stream)
-        for p in params:
-            g = p.grad
-            if g is None:
-                continue
-            # Update p with sharded u
-            state = param_to_state[id(p)]
-            self._update_p(
-                p,
-                state.scattered_u,
-                lr=lr,
-                wd=wd,
-            )
     def step(self, closure=None):
         """Perform a single optimization step.

     worker_rank: int | None = None
     gathered_grad: torch.Tensor | None = None
     computed_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
             state.computed_u = u
             state.compute_event = torch.cuda.Event()
             state.compute_event.record()
+            state.gathered_grad.record_stream(compute_stream)
+            del state.gathered_grad
         else:
             state.computed_u = None
             state.compute_event = None
+def _scatter(p, state, lr, wd, rank, comm_stream):
     u = state.computed_u
     mesh = p.device_mesh
             src=state.worker_rank,
             group=mesh.get_group(),
         )
+        if rank == state.worker_rank:
+            state.computed_u.record_stream(comm_stream)
+            del state.computed_u
         u = DTensor.from_local(
             u,
             placements=p.placements,
             device_mesh=mesh,
         )
+        p.data.mul_(1 - lr * wd)
+        p.data.add_(u, alpha=-lr)
 class Muon(torch.optim.Optimizer):
         def enqueue_scatters(start_idx, chunk_size):
             for p in ordered_params[start_idx : start_idx + chunk_size]:
                 state = param_to_state[id(p)]
+                adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
+                _scatter(p, state, adjusted_lr, wd, self.rank, self.comm_stream)
         chunk_size = params[0].device_mesh.mesh.numel()
         torch.cuda.current_stream().wait_stream(self.comm_stream)
     def step(self, closure=None):
         """Perform a single optimization step.

torch-ext/optimizer/muon.py CHANGED Viewed

@@ -48,7 +48,6 @@ class _muon_state:
     worker_rank: int | None = None
     gathered_grad: torch.Tensor | None = None
     computed_u: torch.Tensor | None = None
-    scattered_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
@@ -93,12 +92,14 @@ def _compute_u(state, steps, rank, compute_stream):
             state.computed_u = u
             state.compute_event = torch.cuda.Event()
             state.compute_event.record()
         else:
             state.computed_u = None
             state.compute_event = None
-def _scatter(p, state, rank, comm_stream):
     u = state.computed_u
     mesh = p.device_mesh
@@ -118,13 +119,16 @@ def _scatter(p, state, rank, comm_stream):
             src=state.worker_rank,
             group=mesh.get_group(),
         )
         u = DTensor.from_local(
             u,
             placements=p.placements,
             device_mesh=mesh,
         )
-        state.scattered_u = u
 class Muon(torch.optim.Optimizer):
@@ -353,7 +357,8 @@ class Muon(torch.optim.Optimizer):
         def enqueue_scatters(start_idx, chunk_size):
             for p in ordered_params[start_idx : start_idx + chunk_size]:
                 state = param_to_state[id(p)]
-                _scatter(p, state, self.rank, self.comm_stream)
         chunk_size = params[0].device_mesh.mesh.numel()
@@ -368,20 +373,6 @@ class Muon(torch.optim.Optimizer):
         torch.cuda.current_stream().wait_stream(self.comm_stream)
-        for p in params:
-            g = p.grad
-            if g is None:
-                continue
-            # Update p with sharded u
-            state = param_to_state[id(p)]
-            self._update_p(
-                p,
-                state.scattered_u,
-                lr=lr,
-                wd=wd,
-            )
     def step(self, closure=None):
         """Perform a single optimization step.

     worker_rank: int | None = None
     gathered_grad: torch.Tensor | None = None
     computed_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
             state.computed_u = u
             state.compute_event = torch.cuda.Event()
             state.compute_event.record()
+            state.gathered_grad.record_stream(compute_stream)
+            del state.gathered_grad
         else:
             state.computed_u = None
             state.compute_event = None
+def _scatter(p, state, lr, wd, rank, comm_stream):
     u = state.computed_u
     mesh = p.device_mesh
             src=state.worker_rank,
             group=mesh.get_group(),
         )
+        if rank == state.worker_rank:
+            state.computed_u.record_stream(comm_stream)
+            del state.computed_u
         u = DTensor.from_local(
             u,
             placements=p.placements,
             device_mesh=mesh,
         )
+        p.data.mul_(1 - lr * wd)
+        p.data.add_(u, alpha=-lr)
 class Muon(torch.optim.Optimizer):
         def enqueue_scatters(start_idx, chunk_size):
             for p in ordered_params[start_idx : start_idx + chunk_size]:
                 state = param_to_state[id(p)]
+                adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
+                _scatter(p, state, adjusted_lr, wd, self.rank, self.comm_stream)
         chunk_size = params[0].device_mesh.mesh.numel()
         torch.cuda.current_stream().wait_stream(self.comm_stream)
     def step(self, closure=None):
         """Perform a single optimization step.