Kernels
ca1207 committed on
Commit
2a8631f
·
1 Parent(s): ff6d675

use COMM_DTYPE instead of hardcoded dtype

Browse files
torch-ext/optimizer/matmul_transpose_triton.py CHANGED
@@ -28,7 +28,7 @@ def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
28
  GROUP_SIZE_M: tl.constexpr):
29
  """
30
  Core kernel jit function of matmul_transpose that computes y = x @ x.T
31
- The code is a simple adaptation from the triton `matmul` tutorial:
32
  https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html
33
  """
34
  pid = tl.program_id(axis=0)
 
28
  GROUP_SIZE_M: tl.constexpr):
29
  """
30
  Core kernel jit function of matmul_transpose that computes y = x @ x.T
31
+ The code is a simple adaptation from the triton `matmul` tutorial:
32
  https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html
33
  """
34
  pid = tl.program_id(axis=0)
torch-ext/optimizer/muon.py CHANGED
@@ -12,6 +12,8 @@ from .matmul_transpose_triton import matmul_transpose_assign
12
 
13
  logger = logging.getLogger(__name__)
14
 
 
 
15
 
16
  # This code snippet is a modified version adapted from the following GitHub repositories:
17
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
@@ -30,8 +32,7 @@ def _zeropower_via_newtonschulz5(G, steps):
30
  performance at all relative to UV^T, where USV^T = G is the SVD.
31
  """
32
  assert len(G.shape) == 2
33
- assert G.dtype == torch.bfloat16
34
- G = G.to(thorch.float32)
35
  X = G # no manual typecast
36
 
37
  if G.size(0) > G.size(1):
@@ -55,7 +56,6 @@ def _zeropower_via_newtonschulz5(G, steps):
55
 
56
  if G.size(0) > G.size(1):
57
  X = X.T
58
- X = X.to(torch.bfloat16)
59
  return X
60
 
61
 
@@ -89,7 +89,7 @@ def _alloc_gathered_grad(params, param_to_state, rank, compute_stream):
89
  if rank == state.worker_rank:
90
  num_ranks = dist.get_world_size(group=state.process_group)
91
  state.gathered_grad = torch.empty(p.grad.numel(),
92
- dtype=torch.bfloat16,
93
  device="cuda")
94
  else:
95
  state.gathered_grad = None
@@ -114,7 +114,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
114
  dst = state.worker_rank
115
  shard_elems = split_elems_for_src(p, state, rank, num_ranks)
116
  g = p.grad
117
- g = g.to_local().to(torch.bfloat16).contiguous().view(-1)
118
  assert g.numel() == shard_elems
119
  per_dst[dst].append(g)
120
  send_counts[dst] += shard_elems
@@ -139,7 +139,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
139
  recv_counts[src] = total
140
 
141
  recv_total = sum(recv_counts)
142
- recv_buf = torch.empty(recv_total, dtype=torch.bfloat16, device="cuda")
143
  dist.all_to_all_single(
144
  recv_buf,
145
  send_buf,
@@ -225,7 +225,7 @@ def _alloc_scattered_u(params, param_to_state, rank, compute_stream):
225
  for p in params:
226
  state = param_to_state[id(p)]
227
  state.scattered_u = torch.empty_like(p.to_local(),
228
- dtype=torch.bfloat16)
229
 
230
  alloc_event = torch.cuda.Event()
231
  alloc_event.record(compute_stream)
@@ -254,8 +254,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
254
 
255
  assert state.computed_u is not None
256
 
257
- u_full = state.computed_u.to(
258
- torch.bfloat16).contiguous().view(-1)
259
 
260
  offset = 0
261
  for dst in range(num_ranks):
@@ -274,7 +273,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
274
  else:
275
  # all_to_all requires participation from all ranks
276
  # Even non-owner ranks must join the collective call
277
- send_buf = torch.empty(0, dtype=torch.bfloat16, device="cuda")
278
 
279
  recv_counts = [0] * num_ranks
280
  for src in range(num_ranks):
@@ -288,7 +287,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
288
 
289
  recv_total = sum(recv_counts)
290
  assert recv_total > 0
291
- recv_buf = torch.empty(recv_total, dtype=torch.bfloat16, device="cuda")
292
 
293
  dist.all_to_all_single(
294
  recv_buf,
@@ -636,7 +635,7 @@ class Muon(torch.optim.Optimizer):
636
  else:
637
  g = buf
638
 
639
- u = _zeropower_via_newtonschulz5(g.bfloat16(),
640
  steps=group["ns_steps"])
641
 
642
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
 
12
 
13
  logger = logging.getLogger(__name__)
14
 
15
+ COMM_DTYPE = torch.bfloat16
16
+
17
 
18
  # This code snippet is a modified version adapted from the following GitHub repositories:
19
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
 
32
  performance at all relative to UV^T, where USV^T = G is the SVD.
33
  """
34
  assert len(G.shape) == 2
35
+ assert G.dtype == COMM_DTYPE
 
36
  X = G # no manual typecast
37
 
38
  if G.size(0) > G.size(1):
 
56
 
57
  if G.size(0) > G.size(1):
58
  X = X.T
 
59
  return X
60
 
61
 
 
89
  if rank == state.worker_rank:
90
  num_ranks = dist.get_world_size(group=state.process_group)
91
  state.gathered_grad = torch.empty(p.grad.numel(),
92
+ dtype=COMM_DTYPE,
93
  device="cuda")
94
  else:
95
  state.gathered_grad = None
 
114
  dst = state.worker_rank
115
  shard_elems = split_elems_for_src(p, state, rank, num_ranks)
116
  g = p.grad
117
+ g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
118
  assert g.numel() == shard_elems
119
  per_dst[dst].append(g)
120
  send_counts[dst] += shard_elems
 
139
  recv_counts[src] = total
140
 
141
  recv_total = sum(recv_counts)
142
+ recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
143
  dist.all_to_all_single(
144
  recv_buf,
145
  send_buf,
 
225
  for p in params:
226
  state = param_to_state[id(p)]
227
  state.scattered_u = torch.empty_like(p.to_local(),
228
+ dtype=COMM_DTYPE)
229
 
230
  alloc_event = torch.cuda.Event()
231
  alloc_event.record(compute_stream)
 
254
 
255
  assert state.computed_u is not None
256
 
257
+ u_full = state.computed_u.to(COMM_DTYPE).contiguous().view(-1)
 
258
 
259
  offset = 0
260
  for dst in range(num_ranks):
 
273
  else:
274
  # all_to_all requires participation from all ranks
275
  # Even non-owner ranks must join the collective call
276
+ send_buf = torch.empty(0, dtype=COMM_DTYPE, device="cuda")
277
 
278
  recv_counts = [0] * num_ranks
279
  for src in range(num_ranks):
 
287
 
288
  recv_total = sum(recv_counts)
289
  assert recv_total > 0
290
+ recv_buf = torch.empty(recv_total, dtype=COMM_DTYPE, device="cuda")
291
 
292
  dist.all_to_all_single(
293
  recv_buf,
 
635
  else:
636
  g = buf
637
 
638
+ u = _zeropower_via_newtonschulz5(g.to(COMM_DTYPE),
639
  steps=group["ns_steps"])
640
 
641
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)