Kernels
ca1207 committed on
Commit
15336dc
·
1 Parent(s): 678578a

delete state in split_func

Browse files
torch-ext/optimizer/matmul_transpose_triton.py CHANGED
@@ -1,17 +1,17 @@
1
  # MIT License
2
- #
3
  # Copyright (c) 2025 Tianyang Lin
4
- #
5
  # Permission is hereby granted, free of charge, to any person obtaining a copy
6
  # of this software and associated documentation files (the "Software"), to deal
7
  # in the Software without restriction, including without limitation the rights
8
  # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
  # copies of the Software, and to permit persons to whom the Software is
10
  # furnished to do so, subject to the following conditions:
11
- #
12
  # The above copyright notice and this permission notice shall be included in all
13
  # copies or substantial portions of the Software.
14
- #
15
  # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
  # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
  # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 
1
  # MIT License
2
+ #
3
  # Copyright (c) 2025 Tianyang Lin
4
+ #
5
  # Permission is hereby granted, free of charge, to any person obtaining a copy
6
  # of this software and associated documentation files (the "Software"), to deal
7
  # in the Software without restriction, including without limitation the rights
8
  # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
  # copies of the Software, and to permit persons to whom the Software is
10
  # furnished to do so, subject to the following conditions:
11
+ #
12
  # The above copyright notice and this permission notice shall be included in all
13
  # copies or substantial portions of the Software.
14
+ #
15
  # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
  # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
  # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
torch-ext/optimizer/muon.py CHANGED
@@ -121,7 +121,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
121
  state = param_to_state[id(p)]
122
  dst = state.worker_rank
123
  assert dst < num_ranks
124
- shard_elems = split_elems_for_src(p, state, rank, num_ranks)
125
  g = p.grad
126
  g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
127
  assert g.numel() == shard_elems
@@ -145,7 +145,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
145
  for p in owned_params:
146
  state = param_to_state[id(p)]
147
  assert state.worker_rank == rank
148
- total += split_elems_for_src(p, state, src, num_ranks)
149
  recv_counts[src] = total
150
 
151
  recv_total = sum(recv_counts)
@@ -186,7 +186,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
186
  for p in owned_params:
187
  state = param_to_state[id(p)]
188
  assert state.worker_rank == rank
189
- n = split_elems_for_src(p, state, src, num_ranks)
190
  assert n > 0
191
 
192
  sg = recv_buf.narrow(0, off + inner_off, n)
@@ -278,7 +278,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
278
 
279
  offset = 0
280
  for dst in range(num_ranks):
281
- n = split_elems_for_src(p, state, dst, num_ranks)
282
  assert n > 0
283
 
284
  su = u_full.narrow(0, offset, n)
@@ -304,7 +304,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
304
  state = param_to_state[id(p)]
305
  if state.worker_rank != src:
306
  continue
307
- total += split_elems_for_src(p, state, rank, num_ranks)
308
  recv_counts[src] = total
309
 
310
  recv_total = sum(recv_counts)
@@ -348,7 +348,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
348
  state = param_to_state[id(p)]
349
  if state.worker_rank != src:
350
  continue
351
- n = split_elems_for_src(p, state, rank, num_ranks)
352
  assert n > 0
353
 
354
  flat_local = recv_buf.narrow(0, off + inner_off,
 
121
  state = param_to_state[id(p)]
122
  dst = state.worker_rank
123
  assert dst < num_ranks
124
+ shard_elems = split_elems_for_src(p, rank, num_ranks)
125
  g = p.grad
126
  g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
127
  assert g.numel() == shard_elems
 
145
  for p in owned_params:
146
  state = param_to_state[id(p)]
147
  assert state.worker_rank == rank
148
+ total += split_elems_for_src(p, src, num_ranks)
149
  recv_counts[src] = total
150
 
151
  recv_total = sum(recv_counts)
 
186
  for p in owned_params:
187
  state = param_to_state[id(p)]
188
  assert state.worker_rank == rank
189
+ n = split_elems_for_src(p, src, num_ranks)
190
  assert n > 0
191
 
192
  sg = recv_buf.narrow(0, off + inner_off, n)
 
278
 
279
  offset = 0
280
  for dst in range(num_ranks):
281
+ n = split_elems_for_src(p, dst, num_ranks)
282
  assert n > 0
283
 
284
  su = u_full.narrow(0, offset, n)
 
304
  state = param_to_state[id(p)]
305
  if state.worker_rank != src:
306
  continue
307
+ total += split_elems_for_src(p, rank, num_ranks)
308
  recv_counts[src] = total
309
 
310
  recv_total = sum(recv_counts)
 
348
  state = param_to_state[id(p)]
349
  if state.worker_rank != src:
350
  continue
351
+ n = split_elems_for_src(p, rank, num_ranks)
352
  assert n > 0
353
 
354
  flat_local = recv_buf.narrow(0, off + inner_off,