Kernels

Commit e93bd1e by github-actions[bot]
Parent: 15336dc

Add built binary [ci skip]

Files changed (36):
  1. build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py +3 -3
  2. build/torch27-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_6943c45_dirty.abi3.so → _optimizer_15336dc_dirty.abi3.so} +1 -1
  3. build/torch27-cxx11-cu118-x86_64-linux/optimizer/matmul_transpose_triton.py +4 -4
  4. build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py +6 -6
  5. build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py +3 -3
  6. build/{torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_6943c45_dirty.abi3.so → torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so} +1 -1
  7. build/torch27-cxx11-cu126-x86_64-linux/optimizer/matmul_transpose_triton.py +4 -4
  8. build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py +6 -6
  9. build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py +3 -3
  10. build/{torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_6943c45_dirty.abi3.so → torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so} +1 -1
  11. build/torch27-cxx11-cu128-x86_64-linux/optimizer/matmul_transpose_triton.py +4 -4
  12. build/torch27-cxx11-cu128-x86_64-linux/optimizer/muon.py +6 -6
  13. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py +3 -3
  14. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_6943c45_dirty.abi3.so → _optimizer_15336dc_dirty.abi3.so} +1 -1
  15. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/matmul_transpose_triton.py +4 -4
  16. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/muon.py +6 -6
  17. build/torch28-cxx11-cu126-x86_64-linux/optimizer/_ops.py +3 -3
  18. build/{torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_6943c45_dirty.abi3.so → torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so} +1 -1
  19. build/torch28-cxx11-cu126-x86_64-linux/optimizer/matmul_transpose_triton.py +4 -4
  20. build/torch28-cxx11-cu126-x86_64-linux/optimizer/muon.py +6 -6
  21. build/torch28-cxx11-cu128-x86_64-linux/optimizer/_ops.py +3 -3
  22. build/{torch28-cxx11-cu129-x86_64-linux/optimizer/_optimizer_6943c45_dirty.abi3.so → torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so} +1 -1
  23. build/torch28-cxx11-cu128-x86_64-linux/optimizer/matmul_transpose_triton.py +4 -4
  24. build/torch28-cxx11-cu128-x86_64-linux/optimizer/muon.py +6 -6
  25. build/torch28-cxx11-cu129-x86_64-linux/optimizer/_ops.py +3 -3
  26. build/{torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_6943c45_dirty.abi3.so → torch28-cxx11-cu129-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so} +1 -1
  27. build/torch28-cxx11-cu129-x86_64-linux/optimizer/matmul_transpose_triton.py +4 -4
  28. build/torch28-cxx11-cu129-x86_64-linux/optimizer/muon.py +6 -6
  29. build/torch28-cxx11-rocm63-x86_64-linux/optimizer/_ops.py +3 -3
  30. build/torch28-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_6943c45_dirty.abi3.so → _optimizer_15336dc_dirty.abi3.so} +1 -1
  31. build/torch28-cxx11-rocm63-x86_64-linux/optimizer/matmul_transpose_triton.py +4 -4
  32. build/torch28-cxx11-rocm63-x86_64-linux/optimizer/muon.py +6 -6
  33. build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_ops.py +3 -3
  34. build/torch28-cxx11-rocm64-x86_64-linux/optimizer/{_optimizer_6943c45_dirty.abi3.so → _optimizer_15336dc_dirty.abi3.so} +1 -1
  35. build/torch28-cxx11-rocm64-x86_64-linux/optimizer/matmul_transpose_triton.py +4 -4
  36. build/torch28-cxx11-rocm64-x86_64-linux/optimizer/muon.py +6 -6
build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_6943c45_dirty
-ops = torch.ops._optimizer_6943c45_dirty
+from . import _optimizer_15336dc_dirty
+ops = torch.ops._optimizer_15336dc_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_6943c45_dirty::{op_name}"
+    return f"_optimizer_15336dc_dirty::{op_name}"
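Note: this generated _ops.py shim pins the Python package to one specific compiled extension. The build revision (here 15336dc, with a _dirty suffix indicating uncommitted changes in the build tree) is baked into the module name, the torch.ops namespace, and the op-name prefix, which is why the rebuilt .abi3.so is renamed in lockstep below. A minimal sketch of how downstream code would consume these bindings — the op name some_op is purely hypothetical, standing in for whatever the extension actually registers:

# Sketch only; assumes the wheel's `optimizer` package is importable and that
# the extension registers an op we call `some_op` (a made-up name).
import torch
from optimizer import _ops

# Qualify an op name with the build-specific namespace, e.g. when wiring up
# torch.library registrations:
qualified = _ops.add_op_namespace_prefix("some_op")
# -> "_optimizer_15336dc_dirty::some_op"

# Ops exported by the compiled .abi3.so are then reachable through the `ops`
# handle that _ops re-exports from torch.ops:
# out = _ops.ops.some_op(x)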
build/torch27-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_6943c45_dirty.abi3.so → _optimizer_15336dc_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:939122c6c19779ad52d51d68a870c547d59b40d57f71464f8c85904078863c45
+oid sha256:94a28c3602d8c7a6b216976b1fb09cdd1e9f61bfc9359a80f41b5b628efdfc28
 size 1787368
build/torch27-cxx11-cu118-x86_64-linux/optimizer/matmul_transpose_triton.py CHANGED
@@ -1,17 +1,17 @@
 # MIT License
-#
+#
 # Copyright (c) 2025 Tianyang Lin
-#
+#
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
-#
+#
 # The above copyright notice and this permission notice shall be included in all
 # copies or substantial portions of the Software.
-#
+#
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py CHANGED
@@ -121,7 +121,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         state = param_to_state[id(p)]
         dst = state.worker_rank
         assert dst < num_ranks
-        shard_elems = split_elems_for_src(p, state, rank, num_ranks)
+        shard_elems = split_elems_for_src(p, rank, num_ranks)
         g = p.grad
         g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
         assert g.numel() == shard_elems
@@ -145,7 +145,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            total += split_elems_for_src(p, state, src, num_ranks)
+            total += split_elems_for_src(p, src, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -186,7 +186,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            n = split_elems_for_src(p, state, src, num_ranks)
+            n = split_elems_for_src(p, src, num_ranks)
             assert n > 0
 
             sg = recv_buf.narrow(0, off + inner_off, n)
@@ -278,7 +278,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
 
         offset = 0
         for dst in range(num_ranks):
-            n = split_elems_for_src(p, state, dst, num_ranks)
+            n = split_elems_for_src(p, dst, num_ranks)
             assert n > 0
 
             su = u_full.narrow(0, offset, n)
@@ -304,7 +304,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
             state = param_to_state[id(p)]
             if state.worker_rank != src:
                 continue
-            total += split_elems_for_src(p, state, rank, num_ranks)
+            total += split_elems_for_src(p, rank, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -348,7 +348,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
            state = param_to_state[id(p)]
            if state.worker_rank != src:
                continue
-           n = split_elems_for_src(p, state, rank, num_ranks)
+           n = split_elems_for_src(p, rank, num_ranks)
            assert n > 0
 
            flat_local = recv_buf.narrow(0, off + inner_off,
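Note: the only substantive source change in this rebuild is the one visible in the muon.py hunks above: split_elems_for_src dropped its per-parameter state argument, so every call site in _all2all_gather and _all2all_scatter now passes just the parameter, a rank index, and num_ranks. The helper's body is not part of this diff; as orientation only, here is one plausible shape under the assumption of an even element split across ranks (the real implementation may instead follow the parameter's DTensor sharding layout):

# Hypothetical reconstruction, NOT the code shipped in muon.py.
def split_elems_for_src(p, src, num_ranks):
    # Elements of the flattened parameter owned by rank `src` under an even
    # split, with the remainder spread over the leading ranks. This keeps
    # every shard non-empty whenever p.numel() >= num_ranks, consistent with
    # the `assert n > 0` checks at the call sites.
    base, rem = divmod(p.numel(), num_ranks)
    return base + (1 if src < rem else 0)

Whatever the actual definition, removing state implies the shard size is now a pure function of the parameter and the rank geometry rather than of per-parameter optimizer state.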
build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_6943c45_dirty
-ops = torch.ops._optimizer_6943c45_dirty
+from . import _optimizer_15336dc_dirty
+ops = torch.ops._optimizer_15336dc_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_6943c45_dirty::{op_name}"
+    return f"_optimizer_15336dc_dirty::{op_name}"
build/{torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_6943c45_dirty.abi3.so → torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:10905a024d5fa31fe31b8370747790b2f0ce60d06881831efb7cc07e1c5e5436
+oid sha256:8ca6ca8225dc9b7888566f5c7fd824234a3b4ac76718a5d18e6c75ca7acd488d
 size 1824256
build/torch27-cxx11-cu126-x86_64-linux/optimizer/matmul_transpose_triton.py CHANGED
@@ -1,17 +1,17 @@
 # MIT License
-#
+#
 # Copyright (c) 2025 Tianyang Lin
-#
+#
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
-#
+#
 # The above copyright notice and this permission notice shall be included in all
 # copies or substantial portions of the Software.
-#
+#
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py CHANGED
@@ -121,7 +121,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         state = param_to_state[id(p)]
         dst = state.worker_rank
         assert dst < num_ranks
-        shard_elems = split_elems_for_src(p, state, rank, num_ranks)
+        shard_elems = split_elems_for_src(p, rank, num_ranks)
         g = p.grad
         g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
         assert g.numel() == shard_elems
@@ -145,7 +145,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            total += split_elems_for_src(p, state, src, num_ranks)
+            total += split_elems_for_src(p, src, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -186,7 +186,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            n = split_elems_for_src(p, state, src, num_ranks)
+            n = split_elems_for_src(p, src, num_ranks)
             assert n > 0
 
             sg = recv_buf.narrow(0, off + inner_off, n)
@@ -278,7 +278,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
 
         offset = 0
         for dst in range(num_ranks):
-            n = split_elems_for_src(p, state, dst, num_ranks)
+            n = split_elems_for_src(p, dst, num_ranks)
             assert n > 0
 
             su = u_full.narrow(0, offset, n)
@@ -304,7 +304,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
             state = param_to_state[id(p)]
             if state.worker_rank != src:
                 continue
-            total += split_elems_for_src(p, state, rank, num_ranks)
+            total += split_elems_for_src(p, rank, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -348,7 +348,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
            state = param_to_state[id(p)]
            if state.worker_rank != src:
                continue
-           n = split_elems_for_src(p, state, rank, num_ranks)
+           n = split_elems_for_src(p, rank, num_ranks)
            assert n > 0
 
            flat_local = recv_buf.narrow(0, off + inner_off,
build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_6943c45_dirty
-ops = torch.ops._optimizer_6943c45_dirty
+from . import _optimizer_15336dc_dirty
+ops = torch.ops._optimizer_15336dc_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_6943c45_dirty::{op_name}"
+    return f"_optimizer_15336dc_dirty::{op_name}"
build/{torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_6943c45_dirty.abi3.so → torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:058aa5ff9f1e974cb5b52e4a7af074cef2092c457b5498356fc0fbdd86adf5f3
+oid sha256:e06baa32b0950126ee192654bd9f7adc79cc05d8ec39d2078c70d62ee81fdcd5
 size 1883344
build/torch27-cxx11-cu128-x86_64-linux/optimizer/matmul_transpose_triton.py CHANGED
@@ -1,17 +1,17 @@
 # MIT License
-#
+#
 # Copyright (c) 2025 Tianyang Lin
-#
+#
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
-#
+#
 # The above copyright notice and this permission notice shall be included in all
 # copies or substantial portions of the Software.
-#
+#
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
build/torch27-cxx11-cu128-x86_64-linux/optimizer/muon.py CHANGED
@@ -121,7 +121,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         state = param_to_state[id(p)]
         dst = state.worker_rank
         assert dst < num_ranks
-        shard_elems = split_elems_for_src(p, state, rank, num_ranks)
+        shard_elems = split_elems_for_src(p, rank, num_ranks)
         g = p.grad
         g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
         assert g.numel() == shard_elems
@@ -145,7 +145,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            total += split_elems_for_src(p, state, src, num_ranks)
+            total += split_elems_for_src(p, src, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -186,7 +186,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            n = split_elems_for_src(p, state, src, num_ranks)
+            n = split_elems_for_src(p, src, num_ranks)
             assert n > 0
 
             sg = recv_buf.narrow(0, off + inner_off, n)
@@ -278,7 +278,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
 
         offset = 0
         for dst in range(num_ranks):
-            n = split_elems_for_src(p, state, dst, num_ranks)
+            n = split_elems_for_src(p, dst, num_ranks)
             assert n > 0
 
             su = u_full.narrow(0, offset, n)
@@ -304,7 +304,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
             state = param_to_state[id(p)]
             if state.worker_rank != src:
                 continue
-            total += split_elems_for_src(p, state, rank, num_ranks)
+            total += split_elems_for_src(p, rank, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -348,7 +348,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
            state = param_to_state[id(p)]
            if state.worker_rank != src:
                continue
-           n = split_elems_for_src(p, state, rank, num_ranks)
+           n = split_elems_for_src(p, rank, num_ranks)
            assert n > 0
 
            flat_local = recv_buf.narrow(0, off + inner_off,
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_6943c45_dirty
-ops = torch.ops._optimizer_6943c45_dirty
+from . import _optimizer_15336dc_dirty
+ops = torch.ops._optimizer_15336dc_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_6943c45_dirty::{op_name}"
+    return f"_optimizer_15336dc_dirty::{op_name}"
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_6943c45_dirty.abi3.so → _optimizer_15336dc_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f63251067a8472a98f754fc603e51a85b44fb51777a40d229af67b173f2a7b2c
+oid sha256:c7cf2f7b8519dbc3f20e9d151914b55e56d10c012e2232d550b7c8d262746d71
 size 1749776
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/matmul_transpose_triton.py CHANGED
@@ -1,17 +1,17 @@
 # MIT License
-#
+#
 # Copyright (c) 2025 Tianyang Lin
-#
+#
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
-#
+#
 # The above copyright notice and this permission notice shall be included in all
 # copies or substantial portions of the Software.
-#
+#
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/muon.py CHANGED
@@ -121,7 +121,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         state = param_to_state[id(p)]
         dst = state.worker_rank
         assert dst < num_ranks
-        shard_elems = split_elems_for_src(p, state, rank, num_ranks)
+        shard_elems = split_elems_for_src(p, rank, num_ranks)
         g = p.grad
         g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
         assert g.numel() == shard_elems
@@ -145,7 +145,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            total += split_elems_for_src(p, state, src, num_ranks)
+            total += split_elems_for_src(p, src, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -186,7 +186,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            n = split_elems_for_src(p, state, src, num_ranks)
+            n = split_elems_for_src(p, src, num_ranks)
             assert n > 0
 
             sg = recv_buf.narrow(0, off + inner_off, n)
@@ -278,7 +278,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
 
         offset = 0
         for dst in range(num_ranks):
-            n = split_elems_for_src(p, state, dst, num_ranks)
+            n = split_elems_for_src(p, dst, num_ranks)
             assert n > 0
 
             su = u_full.narrow(0, offset, n)
@@ -304,7 +304,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
             state = param_to_state[id(p)]
             if state.worker_rank != src:
                 continue
-            total += split_elems_for_src(p, state, rank, num_ranks)
+            total += split_elems_for_src(p, rank, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -348,7 +348,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
            state = param_to_state[id(p)]
            if state.worker_rank != src:
                continue
-           n = split_elems_for_src(p, state, rank, num_ranks)
+           n = split_elems_for_src(p, rank, num_ranks)
            assert n > 0
 
            flat_local = recv_buf.narrow(0, off + inner_off,
build/torch28-cxx11-cu126-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_6943c45_dirty
-ops = torch.ops._optimizer_6943c45_dirty
+from . import _optimizer_15336dc_dirty
+ops = torch.ops._optimizer_15336dc_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_6943c45_dirty::{op_name}"
+    return f"_optimizer_15336dc_dirty::{op_name}"
build/{torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_6943c45_dirty.abi3.so → torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:10905a024d5fa31fe31b8370747790b2f0ce60d06881831efb7cc07e1c5e5436
+oid sha256:8ca6ca8225dc9b7888566f5c7fd824234a3b4ac76718a5d18e6c75ca7acd488d
 size 1824256
build/torch28-cxx11-cu126-x86_64-linux/optimizer/matmul_transpose_triton.py CHANGED
@@ -1,17 +1,17 @@
 # MIT License
-#
+#
 # Copyright (c) 2025 Tianyang Lin
-#
+#
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
-#
+#
 # The above copyright notice and this permission notice shall be included in all
 # copies or substantial portions of the Software.
-#
+#
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
build/torch28-cxx11-cu126-x86_64-linux/optimizer/muon.py CHANGED
@@ -121,7 +121,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         state = param_to_state[id(p)]
         dst = state.worker_rank
         assert dst < num_ranks
-        shard_elems = split_elems_for_src(p, state, rank, num_ranks)
+        shard_elems = split_elems_for_src(p, rank, num_ranks)
         g = p.grad
         g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
         assert g.numel() == shard_elems
@@ -145,7 +145,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            total += split_elems_for_src(p, state, src, num_ranks)
+            total += split_elems_for_src(p, src, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -186,7 +186,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            n = split_elems_for_src(p, state, src, num_ranks)
+            n = split_elems_for_src(p, src, num_ranks)
             assert n > 0
 
             sg = recv_buf.narrow(0, off + inner_off, n)
@@ -278,7 +278,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
 
         offset = 0
         for dst in range(num_ranks):
-            n = split_elems_for_src(p, state, dst, num_ranks)
+            n = split_elems_for_src(p, dst, num_ranks)
             assert n > 0
 
             su = u_full.narrow(0, offset, n)
@@ -304,7 +304,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
             state = param_to_state[id(p)]
             if state.worker_rank != src:
                 continue
-            total += split_elems_for_src(p, state, rank, num_ranks)
+            total += split_elems_for_src(p, rank, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -348,7 +348,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
            state = param_to_state[id(p)]
            if state.worker_rank != src:
                continue
-           n = split_elems_for_src(p, state, rank, num_ranks)
+           n = split_elems_for_src(p, rank, num_ranks)
            assert n > 0
 
            flat_local = recv_buf.narrow(0, off + inner_off,
build/torch28-cxx11-cu128-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_6943c45_dirty
-ops = torch.ops._optimizer_6943c45_dirty
+from . import _optimizer_15336dc_dirty
+ops = torch.ops._optimizer_15336dc_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_6943c45_dirty::{op_name}"
+    return f"_optimizer_15336dc_dirty::{op_name}"
build/{torch28-cxx11-cu129-x86_64-linux/optimizer/_optimizer_6943c45_dirty.abi3.so → torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6fc3515c8f7a60854606e596524c768196bd424219ff9fe80d1c69bfe2803bcd
+oid sha256:e06baa32b0950126ee192654bd9f7adc79cc05d8ec39d2078c70d62ee81fdcd5
 size 1883344
build/torch28-cxx11-cu128-x86_64-linux/optimizer/matmul_transpose_triton.py CHANGED
@@ -1,17 +1,17 @@
 # MIT License
-#
+#
 # Copyright (c) 2025 Tianyang Lin
-#
+#
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
-#
+#
 # The above copyright notice and this permission notice shall be included in all
 # copies or substantial portions of the Software.
-#
+#
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
build/torch28-cxx11-cu128-x86_64-linux/optimizer/muon.py CHANGED
@@ -121,7 +121,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         state = param_to_state[id(p)]
         dst = state.worker_rank
         assert dst < num_ranks
-        shard_elems = split_elems_for_src(p, state, rank, num_ranks)
+        shard_elems = split_elems_for_src(p, rank, num_ranks)
         g = p.grad
         g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
         assert g.numel() == shard_elems
@@ -145,7 +145,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            total += split_elems_for_src(p, state, src, num_ranks)
+            total += split_elems_for_src(p, src, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -186,7 +186,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            n = split_elems_for_src(p, state, src, num_ranks)
+            n = split_elems_for_src(p, src, num_ranks)
             assert n > 0
 
             sg = recv_buf.narrow(0, off + inner_off, n)
@@ -278,7 +278,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
 
         offset = 0
         for dst in range(num_ranks):
-            n = split_elems_for_src(p, state, dst, num_ranks)
+            n = split_elems_for_src(p, dst, num_ranks)
             assert n > 0
 
             su = u_full.narrow(0, offset, n)
@@ -304,7 +304,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
             state = param_to_state[id(p)]
             if state.worker_rank != src:
                 continue
-            total += split_elems_for_src(p, state, rank, num_ranks)
+            total += split_elems_for_src(p, rank, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -348,7 +348,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
            state = param_to_state[id(p)]
            if state.worker_rank != src:
                continue
-           n = split_elems_for_src(p, state, rank, num_ranks)
+           n = split_elems_for_src(p, rank, num_ranks)
            assert n > 0
 
            flat_local = recv_buf.narrow(0, off + inner_off,
build/torch28-cxx11-cu129-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_6943c45_dirty
-ops = torch.ops._optimizer_6943c45_dirty
+from . import _optimizer_15336dc_dirty
+ops = torch.ops._optimizer_15336dc_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_6943c45_dirty::{op_name}"
+    return f"_optimizer_15336dc_dirty::{op_name}"
build/{torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_6943c45_dirty.abi3.so → torch28-cxx11-cu129-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:058aa5ff9f1e974cb5b52e4a7af074cef2092c457b5498356fc0fbdd86adf5f3
+oid sha256:6880c22f63ccd66e8ac62792a564d1ade58325b47369a1773c7753d4243893b9
 size 1883344
build/torch28-cxx11-cu129-x86_64-linux/optimizer/matmul_transpose_triton.py CHANGED
@@ -1,17 +1,17 @@
 # MIT License
-#
+#
 # Copyright (c) 2025 Tianyang Lin
-#
+#
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
-#
+#
 # The above copyright notice and this permission notice shall be included in all
 # copies or substantial portions of the Software.
-#
+#
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
build/torch28-cxx11-cu129-x86_64-linux/optimizer/muon.py CHANGED
@@ -121,7 +121,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         state = param_to_state[id(p)]
         dst = state.worker_rank
         assert dst < num_ranks
-        shard_elems = split_elems_for_src(p, state, rank, num_ranks)
+        shard_elems = split_elems_for_src(p, rank, num_ranks)
         g = p.grad
         g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
         assert g.numel() == shard_elems
@@ -145,7 +145,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            total += split_elems_for_src(p, state, src, num_ranks)
+            total += split_elems_for_src(p, src, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -186,7 +186,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            n = split_elems_for_src(p, state, src, num_ranks)
+            n = split_elems_for_src(p, src, num_ranks)
             assert n > 0
 
             sg = recv_buf.narrow(0, off + inner_off, n)
@@ -278,7 +278,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
 
         offset = 0
         for dst in range(num_ranks):
-            n = split_elems_for_src(p, state, dst, num_ranks)
+            n = split_elems_for_src(p, dst, num_ranks)
             assert n > 0
 
             su = u_full.narrow(0, offset, n)
@@ -304,7 +304,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
             state = param_to_state[id(p)]
             if state.worker_rank != src:
                 continue
-            total += split_elems_for_src(p, state, rank, num_ranks)
+            total += split_elems_for_src(p, rank, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -348,7 +348,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
            state = param_to_state[id(p)]
            if state.worker_rank != src:
                continue
-           n = split_elems_for_src(p, state, rank, num_ranks)
+           n = split_elems_for_src(p, rank, num_ranks)
            assert n > 0
 
            flat_local = recv_buf.narrow(0, off + inner_off,
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_6943c45_dirty
-ops = torch.ops._optimizer_6943c45_dirty
+from . import _optimizer_15336dc_dirty
+ops = torch.ops._optimizer_15336dc_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_6943c45_dirty::{op_name}"
+    return f"_optimizer_15336dc_dirty::{op_name}"
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_6943c45_dirty.abi3.so → _optimizer_15336dc_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d572269f12f1092080e5efeb914e5cb18bbe491d3561ab077016eaec2be7fe55
+oid sha256:ae22a3afdffd54435c6e5b145fc0b7772d03eb8c8bad0d388d9b2d1c8d2f60d5
 size 1749936
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/matmul_transpose_triton.py CHANGED
@@ -1,17 +1,17 @@
 # MIT License
-#
+#
 # Copyright (c) 2025 Tianyang Lin
-#
+#
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
-#
+#
 # The above copyright notice and this permission notice shall be included in all
 # copies or substantial portions of the Software.
-#
+#
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/muon.py CHANGED
@@ -121,7 +121,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         state = param_to_state[id(p)]
         dst = state.worker_rank
         assert dst < num_ranks
-        shard_elems = split_elems_for_src(p, state, rank, num_ranks)
+        shard_elems = split_elems_for_src(p, rank, num_ranks)
         g = p.grad
         g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
         assert g.numel() == shard_elems
@@ -145,7 +145,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            total += split_elems_for_src(p, state, src, num_ranks)
+            total += split_elems_for_src(p, src, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -186,7 +186,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            n = split_elems_for_src(p, state, src, num_ranks)
+            n = split_elems_for_src(p, src, num_ranks)
             assert n > 0
 
             sg = recv_buf.narrow(0, off + inner_off, n)
@@ -278,7 +278,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
 
         offset = 0
         for dst in range(num_ranks):
-            n = split_elems_for_src(p, state, dst, num_ranks)
+            n = split_elems_for_src(p, dst, num_ranks)
             assert n > 0
 
             su = u_full.narrow(0, offset, n)
@@ -304,7 +304,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
             state = param_to_state[id(p)]
             if state.worker_rank != src:
                 continue
-            total += split_elems_for_src(p, state, rank, num_ranks)
+            total += split_elems_for_src(p, rank, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -348,7 +348,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
            state = param_to_state[id(p)]
            if state.worker_rank != src:
                continue
-           n = split_elems_for_src(p, state, rank, num_ranks)
+           n = split_elems_for_src(p, rank, num_ranks)
            assert n > 0
 
            flat_local = recv_buf.narrow(0, off + inner_off,
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_6943c45_dirty
-ops = torch.ops._optimizer_6943c45_dirty
+from . import _optimizer_15336dc_dirty
+ops = torch.ops._optimizer_15336dc_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_6943c45_dirty::{op_name}"
+    return f"_optimizer_15336dc_dirty::{op_name}"
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/{_optimizer_6943c45_dirty.abi3.so → _optimizer_15336dc_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:76ab2b9f9b47115d3ec6fbef5863bd80f3673a95ddf404ae03d8e516c3e3167a
+oid sha256:8092bc6ee3e353b2188f0874bc7f145e4eafd0366a40da9750c225732961f7c7
 size 1750024
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/matmul_transpose_triton.py CHANGED
@@ -1,17 +1,17 @@
 # MIT License
-#
+#
 # Copyright (c) 2025 Tianyang Lin
-#
+#
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
-#
+#
 # The above copyright notice and this permission notice shall be included in all
 # copies or substantial portions of the Software.
-#
+#
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/muon.py CHANGED
@@ -121,7 +121,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         state = param_to_state[id(p)]
         dst = state.worker_rank
         assert dst < num_ranks
-        shard_elems = split_elems_for_src(p, state, rank, num_ranks)
+        shard_elems = split_elems_for_src(p, rank, num_ranks)
        g = p.grad
         g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
         assert g.numel() == shard_elems
@@ -145,7 +145,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            total += split_elems_for_src(p, state, src, num_ranks)
+            total += split_elems_for_src(p, src, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -186,7 +186,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            n = split_elems_for_src(p, state, src, num_ranks)
+            n = split_elems_for_src(p, src, num_ranks)
             assert n > 0
 
             sg = recv_buf.narrow(0, off + inner_off, n)
@@ -278,7 +278,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
 
         offset = 0
         for dst in range(num_ranks):
-            n = split_elems_for_src(p, state, dst, num_ranks)
+            n = split_elems_for_src(p, dst, num_ranks)
             assert n > 0
 
             su = u_full.narrow(0, offset, n)
@@ -304,7 +304,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
             state = param_to_state[id(p)]
             if state.worker_rank != src:
                 continue
-            total += split_elems_for_src(p, state, rank, num_ranks)
+            total += split_elems_for_src(p, rank, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -348,7 +348,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
            state = param_to_state[id(p)]
            if state.worker_rank != src:
                continue
-           n = split_elems_for_src(p, state, rank, num_ranks)
+           n = split_elems_for_src(p, rank, num_ranks)
            assert n > 0
 
            flat_local = recv_buf.narrow(0, off + inner_off,