Kernels

Commit e93bd1e by github-actions[bot]
Parent: 15336dc

Add built binary [ci skip]

Files changed (36):
  1. build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py +3 -3
  2. build/torch27-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_6943c45_dirty.abi3.so → _optimizer_15336dc_dirty.abi3.so} +1 -1
  3. build/torch27-cxx11-cu118-x86_64-linux/optimizer/matmul_transpose_triton.py +4 -4
  4. build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py +6 -6
  5. build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py +3 -3
  6. build/{torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_6943c45_dirty.abi3.so → torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so} +1 -1
  7. build/torch27-cxx11-cu126-x86_64-linux/optimizer/matmul_transpose_triton.py +4 -4
  8. build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py +6 -6
  9. build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py +3 -3
  10. build/{torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_6943c45_dirty.abi3.so → torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so} +1 -1
  11. build/torch27-cxx11-cu128-x86_64-linux/optimizer/matmul_transpose_triton.py +4 -4
  12. build/torch27-cxx11-cu128-x86_64-linux/optimizer/muon.py +6 -6
  13. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py +3 -3
  14. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_6943c45_dirty.abi3.so → _optimizer_15336dc_dirty.abi3.so} +1 -1
  15. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/matmul_transpose_triton.py +4 -4
  16. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/muon.py +6 -6
  17. build/torch28-cxx11-cu126-x86_64-linux/optimizer/_ops.py +3 -3
  18. build/{torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_6943c45_dirty.abi3.so → torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so} +1 -1
  19. build/torch28-cxx11-cu126-x86_64-linux/optimizer/matmul_transpose_triton.py +4 -4
  20. build/torch28-cxx11-cu126-x86_64-linux/optimizer/muon.py +6 -6
  21. build/torch28-cxx11-cu128-x86_64-linux/optimizer/_ops.py +3 -3
  22. build/{torch28-cxx11-cu129-x86_64-linux/optimizer/_optimizer_6943c45_dirty.abi3.so → torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so} +1 -1
  23. build/torch28-cxx11-cu128-x86_64-linux/optimizer/matmul_transpose_triton.py +4 -4
  24. build/torch28-cxx11-cu128-x86_64-linux/optimizer/muon.py +6 -6
  25. build/torch28-cxx11-cu129-x86_64-linux/optimizer/_ops.py +3 -3
  26. build/{torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_6943c45_dirty.abi3.so → torch28-cxx11-cu129-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so} +1 -1
  27. build/torch28-cxx11-cu129-x86_64-linux/optimizer/matmul_transpose_triton.py +4 -4
  28. build/torch28-cxx11-cu129-x86_64-linux/optimizer/muon.py +6 -6
  29. build/torch28-cxx11-rocm63-x86_64-linux/optimizer/_ops.py +3 -3
  30. build/torch28-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_6943c45_dirty.abi3.so → _optimizer_15336dc_dirty.abi3.so} +1 -1
  31. build/torch28-cxx11-rocm63-x86_64-linux/optimizer/matmul_transpose_triton.py +4 -4
  32. build/torch28-cxx11-rocm63-x86_64-linux/optimizer/muon.py +6 -6
  33. build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_ops.py +3 -3
  34. build/torch28-cxx11-rocm64-x86_64-linux/optimizer/{_optimizer_6943c45_dirty.abi3.so → _optimizer_15336dc_dirty.abi3.so} +1 -1
  35. build/torch28-cxx11-rocm64-x86_64-linux/optimizer/matmul_transpose_triton.py +4 -4
  36. build/torch28-cxx11-rocm64-x86_64-linux/optimizer/muon.py +6 -6
build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_6943c45_dirty
-ops = torch.ops._optimizer_6943c45_dirty
+from . import _optimizer_15336dc_dirty
+ops = torch.ops._optimizer_15336dc_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_6943c45_dirty::{op_name}"
+    return f"_optimizer_15336dc_dirty::{op_name}"
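Note: this generated _ops.py shim pins the Python package to one specific compiled extension. The build revision (here 15336dc, with a _dirty suffix indicating uncommitted changes in the build tree) is baked into the module name, the torch.ops namespace, and the op-name prefix, which is why the rebuilt .abi3.so is renamed in lockstep below. A minimal sketch of how downstream code would consume these bindings — the op name some_op is purely hypothetical, standing in for whatever the extension actually registers:

# Sketch only; assumes the wheel's `optimizer` package is importable and that
# the extension registers an op we call `some_op` (a made-up name).
import torch
from optimizer import _ops

# Qualify an op name with the build-specific namespace, e.g. when wiring up
# torch.library registrations:
qualified = _ops.add_op_namespace_prefix("some_op")
# -> "_optimizer_15336dc_dirty::some_op"

# Ops exported by the compiled .abi3.so are then reachable through the `ops`
# handle that _ops re-exports from torch.ops:
# out = _ops.ops.some_op(x)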
build/torch27-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_6943c45_dirty.abi3.so → _optimizer_15336dc_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:939122c6c19779ad52d51d68a870c547d59b40d57f71464f8c85904078863c45
+oid sha256:94a28c3602d8c7a6b216976b1fb09cdd1e9f61bfc9359a80f41b5b628efdfc28
 size 1787368
build/torch27-cxx11-cu118-x86_64-linux/optimizer/matmul_transpose_triton.py CHANGED
@@ -1,17 +1,17 @@
 # MIT License
-#
+#
 # Copyright (c) 2025 Tianyang Lin
-#
+#
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
-#
+#
 # The above copyright notice and this permission notice shall be included in all
 # copies or substantial portions of the Software.
-#
+#
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py CHANGED
@@ -121,7 +121,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         state = param_to_state[id(p)]
         dst = state.worker_rank
         assert dst < num_ranks
-        shard_elems = split_elems_for_src(p, state, rank, num_ranks)
+        shard_elems = split_elems_for_src(p, rank, num_ranks)
         g = p.grad
         g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
         assert g.numel() == shard_elems
@@ -145,7 +145,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            total += split_elems_for_src(p, state, src, num_ranks)
+            total += split_elems_for_src(p, src, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -186,7 +186,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            n = split_elems_for_src(p, state, src, num_ranks)
+            n = split_elems_for_src(p, src, num_ranks)
             assert n > 0
 
             sg = recv_buf.narrow(0, off + inner_off, n)
@@ -278,7 +278,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
 
         offset = 0
         for dst in range(num_ranks):
-            n = split_elems_for_src(p, state, dst, num_ranks)
+            n = split_elems_for_src(p, dst, num_ranks)
             assert n > 0
 
             su = u_full.narrow(0, offset, n)
@@ -304,7 +304,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
             state = param_to_state[id(p)]
             if state.worker_rank != src:
                 continue
-            total += split_elems_for_src(p, state, rank, num_ranks)
+            total += split_elems_for_src(p, rank, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -348,7 +348,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
            state = param_to_state[id(p)]
            if state.worker_rank != src:
                continue
-           n = split_elems_for_src(p, state, rank, num_ranks)
+           n = split_elems_for_src(p, rank, num_ranks)
            assert n > 0
 
            flat_local = recv_buf.narrow(0, off + inner_off,
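Note: the only substantive source change in this rebuild is the one visible in the muon.py hunks above: split_elems_for_src dropped its per-parameter state argument, so every call site in _all2all_gather and _all2all_scatter now passes just the parameter, a rank index, and num_ranks. The helper's body is not part of this diff; as orientation only, here is one plausible shape under the assumption of an even element split across ranks (the real implementation may instead follow the parameter's DTensor sharding layout):

# Hypothetical reconstruction, NOT the code shipped in muon.py.
def split_elems_for_src(p, src, num_ranks):
    # Elements of the flattened parameter owned by rank `src` under an even
    # split, with the remainder spread over the leading ranks. This keeps
    # every shard non-empty whenever p.numel() >= num_ranks, consistent with
    # the `assert n > 0` checks at the call sites.
    base, rem = divmod(p.numel(), num_ranks)
    return base + (1 if src < rem else 0)

Whatever the actual definition, removing state implies the shard size is now a pure function of the parameter and the rank geometry rather than of per-parameter optimizer state.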
build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_6943c45_dirty
-ops = torch.ops._optimizer_6943c45_dirty
+from . import _optimizer_15336dc_dirty
+ops = torch.ops._optimizer_15336dc_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_6943c45_dirty::{op_name}"
+    return f"_optimizer_15336dc_dirty::{op_name}"
build/{torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_6943c45_dirty.abi3.so → torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:10905a024d5fa31fe31b8370747790b2f0ce60d06881831efb7cc07e1c5e5436
+oid sha256:8ca6ca8225dc9b7888566f5c7fd824234a3b4ac76718a5d18e6c75ca7acd488d
 size 1824256
build/torch27-cxx11-cu126-x86_64-linux/optimizer/matmul_transpose_triton.py CHANGED
@@ -1,17 +1,17 @@
 # MIT License
-#
+#
 # Copyright (c) 2025 Tianyang Lin
-#
+#
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
-#
+#
 # The above copyright notice and this permission notice shall be included in all
 # copies or substantial portions of the Software.
-#
+#
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py CHANGED
@@ -121,7 +121,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         state = param_to_state[id(p)]
         dst = state.worker_rank
         assert dst < num_ranks
-        shard_elems = split_elems_for_src(p, state, rank, num_ranks)
+        shard_elems = split_elems_for_src(p, rank, num_ranks)
         g = p.grad
         g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
         assert g.numel() == shard_elems
@@ -145,7 +145,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            total += split_elems_for_src(p, state, src, num_ranks)
+            total += split_elems_for_src(p, src, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -186,7 +186,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            n = split_elems_for_src(p, state, src, num_ranks)
+            n = split_elems_for_src(p, src, num_ranks)
             assert n > 0
 
             sg = recv_buf.narrow(0, off + inner_off, n)
@@ -278,7 +278,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
 
         offset = 0
         for dst in range(num_ranks):
-            n = split_elems_for_src(p, state, dst, num_ranks)
+            n = split_elems_for_src(p, dst, num_ranks)
             assert n > 0
 
             su = u_full.narrow(0, offset, n)
@@ -304,7 +304,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
             state = param_to_state[id(p)]
             if state.worker_rank != src:
                 continue
-            total += split_elems_for_src(p, state, rank, num_ranks)
+            total += split_elems_for_src(p, rank, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -348,7 +348,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
            state = param_to_state[id(p)]
            if state.worker_rank != src:
                continue
-           n = split_elems_for_src(p, state, rank, num_ranks)
+           n = split_elems_for_src(p, rank, num_ranks)
            assert n > 0
 
            flat_local = recv_buf.narrow(0, off + inner_off,
build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_6943c45_dirty
-ops = torch.ops._optimizer_6943c45_dirty
+from . import _optimizer_15336dc_dirty
+ops = torch.ops._optimizer_15336dc_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_6943c45_dirty::{op_name}"
+    return f"_optimizer_15336dc_dirty::{op_name}"
build/{torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_6943c45_dirty.abi3.so → torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:058aa5ff9f1e974cb5b52e4a7af074cef2092c457b5498356fc0fbdd86adf5f3
+oid sha256:e06baa32b0950126ee192654bd9f7adc79cc05d8ec39d2078c70d62ee81fdcd5
 size 1883344
build/torch27-cxx11-cu128-x86_64-linux/optimizer/matmul_transpose_triton.py CHANGED
@@ -1,17 +1,17 @@
 # MIT License
-#
+#
 # Copyright (c) 2025 Tianyang Lin
-#
+#
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
-#
+#
 # The above copyright notice and this permission notice shall be included in all
 # copies or substantial portions of the Software.
-#
+#
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
build/torch27-cxx11-cu128-x86_64-linux/optimizer/muon.py CHANGED
@@ -121,7 +121,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         state = param_to_state[id(p)]
         dst = state.worker_rank
         assert dst < num_ranks
-        shard_elems = split_elems_for_src(p, state, rank, num_ranks)
+        shard_elems = split_elems_for_src(p, rank, num_ranks)
         g = p.grad
         g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
         assert g.numel() == shard_elems
@@ -145,7 +145,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            total += split_elems_for_src(p, state, src, num_ranks)
+            total += split_elems_for_src(p, src, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -186,7 +186,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            n = split_elems_for_src(p, state, src, num_ranks)
+            n = split_elems_for_src(p, src, num_ranks)
             assert n > 0
 
             sg = recv_buf.narrow(0, off + inner_off, n)
@@ -278,7 +278,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
 
         offset = 0
         for dst in range(num_ranks):
-            n = split_elems_for_src(p, state, dst, num_ranks)
+            n = split_elems_for_src(p, dst, num_ranks)
             assert n > 0
 
             su = u_full.narrow(0, offset, n)
@@ -304,7 +304,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
             state = param_to_state[id(p)]
             if state.worker_rank != src:
                 continue
-            total += split_elems_for_src(p, state, rank, num_ranks)
+            total += split_elems_for_src(p, rank, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -348,7 +348,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
            state = param_to_state[id(p)]
            if state.worker_rank != src:
                continue
-           n = split_elems_for_src(p, state, rank, num_ranks)
+           n = split_elems_for_src(p, rank, num_ranks)
            assert n > 0
 
            flat_local = recv_buf.narrow(0, off + inner_off,
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_6943c45_dirty
-ops = torch.ops._optimizer_6943c45_dirty
+from . import _optimizer_15336dc_dirty
+ops = torch.ops._optimizer_15336dc_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_6943c45_dirty::{op_name}"
+    return f"_optimizer_15336dc_dirty::{op_name}"
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_6943c45_dirty.abi3.so → _optimizer_15336dc_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f63251067a8472a98f754fc603e51a85b44fb51777a40d229af67b173f2a7b2c
+oid sha256:c7cf2f7b8519dbc3f20e9d151914b55e56d10c012e2232d550b7c8d262746d71
 size 1749776
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/matmul_transpose_triton.py CHANGED
@@ -1,17 +1,17 @@
 # MIT License
-#
+#
 # Copyright (c) 2025 Tianyang Lin
-#
+#
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
-#
+#
 # The above copyright notice and this permission notice shall be included in all
 # copies or substantial portions of the Software.
-#
+#
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/muon.py CHANGED
@@ -121,7 +121,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         state = param_to_state[id(p)]
         dst = state.worker_rank
         assert dst < num_ranks
-        shard_elems = split_elems_for_src(p, state, rank, num_ranks)
+        shard_elems = split_elems_for_src(p, rank, num_ranks)
         g = p.grad
         g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
         assert g.numel() == shard_elems
@@ -145,7 +145,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            total += split_elems_for_src(p, state, src, num_ranks)
+            total += split_elems_for_src(p, src, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -186,7 +186,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            n = split_elems_for_src(p, state, src, num_ranks)
+            n = split_elems_for_src(p, src, num_ranks)
             assert n > 0
 
             sg = recv_buf.narrow(0, off + inner_off, n)
@@ -278,7 +278,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
 
         offset = 0
         for dst in range(num_ranks):
-            n = split_elems_for_src(p, state, dst, num_ranks)
+            n = split_elems_for_src(p, dst, num_ranks)
             assert n > 0
 
             su = u_full.narrow(0, offset, n)
@@ -304,7 +304,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
             state = param_to_state[id(p)]
             if state.worker_rank != src:
                 continue
-            total += split_elems_for_src(p, state, rank, num_ranks)
+            total += split_elems_for_src(p, rank, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -348,7 +348,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
            state = param_to_state[id(p)]
            if state.worker_rank != src:
                continue
-           n = split_elems_for_src(p, state, rank, num_ranks)
+           n = split_elems_for_src(p, rank, num_ranks)
            assert n > 0
 
            flat_local = recv_buf.narrow(0, off + inner_off,
build/torch28-cxx11-cu126-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_6943c45_dirty
-ops = torch.ops._optimizer_6943c45_dirty
+from . import _optimizer_15336dc_dirty
+ops = torch.ops._optimizer_15336dc_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_6943c45_dirty::{op_name}"
+    return f"_optimizer_15336dc_dirty::{op_name}"
build/{torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_6943c45_dirty.abi3.so → torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:10905a024d5fa31fe31b8370747790b2f0ce60d06881831efb7cc07e1c5e5436
+oid sha256:8ca6ca8225dc9b7888566f5c7fd824234a3b4ac76718a5d18e6c75ca7acd488d
 size 1824256
build/torch28-cxx11-cu126-x86_64-linux/optimizer/matmul_transpose_triton.py CHANGED
@@ -1,17 +1,17 @@
 # MIT License
-#
+#
 # Copyright (c) 2025 Tianyang Lin
-#
+#
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
-#
+#
 # The above copyright notice and this permission notice shall be included in all
 # copies or substantial portions of the Software.
-#
+#
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
build/torch28-cxx11-cu126-x86_64-linux/optimizer/muon.py CHANGED
@@ -121,7 +121,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         state = param_to_state[id(p)]
         dst = state.worker_rank
         assert dst < num_ranks
-        shard_elems = split_elems_for_src(p, state, rank, num_ranks)
+        shard_elems = split_elems_for_src(p, rank, num_ranks)
         g = p.grad
         g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
         assert g.numel() == shard_elems
@@ -145,7 +145,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            total += split_elems_for_src(p, state, src, num_ranks)
+            total += split_elems_for_src(p, src, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -186,7 +186,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            n = split_elems_for_src(p, state, src, num_ranks)
+            n = split_elems_for_src(p, src, num_ranks)
             assert n > 0
 
             sg = recv_buf.narrow(0, off + inner_off, n)
@@ -278,7 +278,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
 
         offset = 0
         for dst in range(num_ranks):
-            n = split_elems_for_src(p, state, dst, num_ranks)
+            n = split_elems_for_src(p, dst, num_ranks)
             assert n > 0
 
             su = u_full.narrow(0, offset, n)
@@ -304,7 +304,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
             state = param_to_state[id(p)]
             if state.worker_rank != src:
                 continue
-            total += split_elems_for_src(p, state, rank, num_ranks)
+            total += split_elems_for_src(p, rank, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -348,7 +348,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
            state = param_to_state[id(p)]
            if state.worker_rank != src:
                continue
-           n = split_elems_for_src(p, state, rank, num_ranks)
+           n = split_elems_for_src(p, rank, num_ranks)
            assert n > 0
 
            flat_local = recv_buf.narrow(0, off + inner_off,
build/torch28-cxx11-cu128-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_6943c45_dirty
-ops = torch.ops._optimizer_6943c45_dirty
+from . import _optimizer_15336dc_dirty
+ops = torch.ops._optimizer_15336dc_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_6943c45_dirty::{op_name}"
+    return f"_optimizer_15336dc_dirty::{op_name}"
build/{torch28-cxx11-cu129-x86_64-linux/optimizer/_optimizer_6943c45_dirty.abi3.so → torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6fc3515c8f7a60854606e596524c768196bd424219ff9fe80d1c69bfe2803bcd
+oid sha256:e06baa32b0950126ee192654bd9f7adc79cc05d8ec39d2078c70d62ee81fdcd5
 size 1883344
build/torch28-cxx11-cu128-x86_64-linux/optimizer/matmul_transpose_triton.py CHANGED
@@ -1,17 +1,17 @@
 # MIT License
-#
+#
 # Copyright (c) 2025 Tianyang Lin
-#
+#
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
-#
+#
 # The above copyright notice and this permission notice shall be included in all
 # copies or substantial portions of the Software.
-#
+#
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
build/torch28-cxx11-cu128-x86_64-linux/optimizer/muon.py CHANGED
@@ -121,7 +121,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         state = param_to_state[id(p)]
         dst = state.worker_rank
         assert dst < num_ranks
-        shard_elems = split_elems_for_src(p, state, rank, num_ranks)
+        shard_elems = split_elems_for_src(p, rank, num_ranks)
         g = p.grad
         g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
         assert g.numel() == shard_elems
@@ -145,7 +145,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            total += split_elems_for_src(p, state, src, num_ranks)
+            total += split_elems_for_src(p, src, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -186,7 +186,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            n = split_elems_for_src(p, state, src, num_ranks)
+            n = split_elems_for_src(p, src, num_ranks)
             assert n > 0
 
             sg = recv_buf.narrow(0, off + inner_off, n)
@@ -278,7 +278,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
 
         offset = 0
         for dst in range(num_ranks):
-            n = split_elems_for_src(p, state, dst, num_ranks)
+            n = split_elems_for_src(p, dst, num_ranks)
             assert n > 0
 
             su = u_full.narrow(0, offset, n)
@@ -304,7 +304,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
             state = param_to_state[id(p)]
             if state.worker_rank != src:
                 continue
-            total += split_elems_for_src(p, state, rank, num_ranks)
+            total += split_elems_for_src(p, rank, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -348,7 +348,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
            state = param_to_state[id(p)]
            if state.worker_rank != src:
                continue
-           n = split_elems_for_src(p, state, rank, num_ranks)
+           n = split_elems_for_src(p, rank, num_ranks)
            assert n > 0
 
            flat_local = recv_buf.narrow(0, off + inner_off,
build/torch28-cxx11-cu129-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_6943c45_dirty
-ops = torch.ops._optimizer_6943c45_dirty
+from . import _optimizer_15336dc_dirty
+ops = torch.ops._optimizer_15336dc_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_6943c45_dirty::{op_name}"
+    return f"_optimizer_15336dc_dirty::{op_name}"
build/{torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_6943c45_dirty.abi3.so → torch28-cxx11-cu129-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:058aa5ff9f1e974cb5b52e4a7af074cef2092c457b5498356fc0fbdd86adf5f3
+oid sha256:6880c22f63ccd66e8ac62792a564d1ade58325b47369a1773c7753d4243893b9
 size 1883344
build/torch28-cxx11-cu129-x86_64-linux/optimizer/matmul_transpose_triton.py CHANGED
@@ -1,17 +1,17 @@
 # MIT License
-#
+#
 # Copyright (c) 2025 Tianyang Lin
-#
+#
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
-#
+#
 # The above copyright notice and this permission notice shall be included in all
 # copies or substantial portions of the Software.
-#
+#
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
build/torch28-cxx11-cu129-x86_64-linux/optimizer/muon.py CHANGED
@@ -121,7 +121,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         state = param_to_state[id(p)]
         dst = state.worker_rank
         assert dst < num_ranks
-        shard_elems = split_elems_for_src(p, state, rank, num_ranks)
+        shard_elems = split_elems_for_src(p, rank, num_ranks)
         g = p.grad
         g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
         assert g.numel() == shard_elems
@@ -145,7 +145,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            total += split_elems_for_src(p, state, src, num_ranks)
+            total += split_elems_for_src(p, src, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -186,7 +186,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            n = split_elems_for_src(p, state, src, num_ranks)
+            n = split_elems_for_src(p, src, num_ranks)
             assert n > 0
 
             sg = recv_buf.narrow(0, off + inner_off, n)
@@ -278,7 +278,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
 
         offset = 0
         for dst in range(num_ranks):
-            n = split_elems_for_src(p, state, dst, num_ranks)
+            n = split_elems_for_src(p, dst, num_ranks)
             assert n > 0
 
             su = u_full.narrow(0, offset, n)
@@ -304,7 +304,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
             state = param_to_state[id(p)]
             if state.worker_rank != src:
                 continue
-            total += split_elems_for_src(p, state, rank, num_ranks)
+            total += split_elems_for_src(p, rank, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -348,7 +348,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
            state = param_to_state[id(p)]
            if state.worker_rank != src:
                continue
-           n = split_elems_for_src(p, state, rank, num_ranks)
+           n = split_elems_for_src(p, rank, num_ranks)
            assert n > 0
 
            flat_local = recv_buf.narrow(0, off + inner_off,
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_6943c45_dirty
-ops = torch.ops._optimizer_6943c45_dirty
+from . import _optimizer_15336dc_dirty
+ops = torch.ops._optimizer_15336dc_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_6943c45_dirty::{op_name}"
+    return f"_optimizer_15336dc_dirty::{op_name}"
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_6943c45_dirty.abi3.so → _optimizer_15336dc_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d572269f12f1092080e5efeb914e5cb18bbe491d3561ab077016eaec2be7fe55
+oid sha256:ae22a3afdffd54435c6e5b145fc0b7772d03eb8c8bad0d388d9b2d1c8d2f60d5
 size 1749936
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/matmul_transpose_triton.py CHANGED
@@ -1,17 +1,17 @@
 # MIT License
-#
+#
 # Copyright (c) 2025 Tianyang Lin
-#
+#
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
-#
+#
 # The above copyright notice and this permission notice shall be included in all
 # copies or substantial portions of the Software.
-#
+#
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/muon.py CHANGED
@@ -121,7 +121,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         state = param_to_state[id(p)]
         dst = state.worker_rank
         assert dst < num_ranks
-        shard_elems = split_elems_for_src(p, state, rank, num_ranks)
+        shard_elems = split_elems_for_src(p, rank, num_ranks)
         g = p.grad
         g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
         assert g.numel() == shard_elems
@@ -145,7 +145,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            total += split_elems_for_src(p, state, src, num_ranks)
+            total += split_elems_for_src(p, src, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -186,7 +186,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            n = split_elems_for_src(p, state, src, num_ranks)
+            n = split_elems_for_src(p, src, num_ranks)
             assert n > 0
 
             sg = recv_buf.narrow(0, off + inner_off, n)
@@ -278,7 +278,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
 
         offset = 0
         for dst in range(num_ranks):
-            n = split_elems_for_src(p, state, dst, num_ranks)
+            n = split_elems_for_src(p, dst, num_ranks)
             assert n > 0
 
             su = u_full.narrow(0, offset, n)
@@ -304,7 +304,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
             state = param_to_state[id(p)]
             if state.worker_rank != src:
                 continue
-            total += split_elems_for_src(p, state, rank, num_ranks)
+            total += split_elems_for_src(p, rank, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -348,7 +348,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
            state = param_to_state[id(p)]
            if state.worker_rank != src:
                continue
-           n = split_elems_for_src(p, state, rank, num_ranks)
+           n = split_elems_for_src(p, rank, num_ranks)
            assert n > 0
 
            flat_local = recv_buf.narrow(0, off + inner_off,
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_6943c45_dirty
-ops = torch.ops._optimizer_6943c45_dirty
+from . import _optimizer_15336dc_dirty
+ops = torch.ops._optimizer_15336dc_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_6943c45_dirty::{op_name}"
+    return f"_optimizer_15336dc_dirty::{op_name}"
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/{_optimizer_6943c45_dirty.abi3.so → _optimizer_15336dc_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:76ab2b9f9b47115d3ec6fbef5863bd80f3673a95ddf404ae03d8e516c3e3167a
+oid sha256:8092bc6ee3e353b2188f0874bc7f145e4eafd0366a40da9750c225732961f7c7
 size 1750024
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/matmul_transpose_triton.py CHANGED
@@ -1,17 +1,17 @@
 # MIT License
-#
+#
 # Copyright (c) 2025 Tianyang Lin
-#
+#
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
-#
+#
 # The above copyright notice and this permission notice shall be included in all
 # copies or substantial portions of the Software.
-#
+#
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/muon.py CHANGED
@@ -121,7 +121,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         state = param_to_state[id(p)]
         dst = state.worker_rank
         assert dst < num_ranks
-        shard_elems = split_elems_for_src(p, state, rank, num_ranks)
+        shard_elems = split_elems_for_src(p, rank, num_ranks)
        g = p.grad
         g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
         assert g.numel() == shard_elems
@@ -145,7 +145,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            total += split_elems_for_src(p, state, src, num_ranks)
+            total += split_elems_for_src(p, src, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -186,7 +186,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
         for p in owned_params:
             state = param_to_state[id(p)]
             assert state.worker_rank == rank
-            n = split_elems_for_src(p, state, src, num_ranks)
+            n = split_elems_for_src(p, src, num_ranks)
             assert n > 0
 
             sg = recv_buf.narrow(0, off + inner_off, n)
@@ -278,7 +278,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
 
         offset = 0
         for dst in range(num_ranks):
-            n = split_elems_for_src(p, state, dst, num_ranks)
+            n = split_elems_for_src(p, dst, num_ranks)
             assert n > 0
 
             su = u_full.narrow(0, offset, n)
@@ -304,7 +304,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
             state = param_to_state[id(p)]
             if state.worker_rank != src:
                 continue
-            total += split_elems_for_src(p, state, rank, num_ranks)
+            total += split_elems_for_src(p, rank, num_ranks)
         recv_counts[src] = total
 
     recv_total = sum(recv_counts)
@@ -348,7 +348,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
            state = param_to_state[id(p)]
            if state.worker_rank != src:
                continue
-           n = split_elems_for_src(p, state, rank, num_ranks)
+           n = split_elems_for_src(p, rank, num_ranks)
            assert n > 0
 
            flat_local = recv_buf.narrow(0, off + inner_off,