github-actions[bot]
committed on
Commit
·
e93bd1e
1
Parent(s):
15336dc
Add built binary [ci skip]
Browse files- build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py +3 -3
- build/torch27-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_6943c45_dirty.abi3.so → _optimizer_15336dc_dirty.abi3.so} +1 -1
- build/torch27-cxx11-cu118-x86_64-linux/optimizer/matmul_transpose_triton.py +4 -4
- build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py +6 -6
- build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py +3 -3
- build/{torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_6943c45_dirty.abi3.so → torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so} +1 -1
- build/torch27-cxx11-cu126-x86_64-linux/optimizer/matmul_transpose_triton.py +4 -4
- build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py +6 -6
- build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py +3 -3
- build/{torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_6943c45_dirty.abi3.so → torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so} +1 -1
- build/torch27-cxx11-cu128-x86_64-linux/optimizer/matmul_transpose_triton.py +4 -4
- build/torch27-cxx11-cu128-x86_64-linux/optimizer/muon.py +6 -6
- build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py +3 -3
- build/torch27-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_6943c45_dirty.abi3.so → _optimizer_15336dc_dirty.abi3.so} +1 -1
- build/torch27-cxx11-rocm63-x86_64-linux/optimizer/matmul_transpose_triton.py +4 -4
- build/torch27-cxx11-rocm63-x86_64-linux/optimizer/muon.py +6 -6
- build/torch28-cxx11-cu126-x86_64-linux/optimizer/_ops.py +3 -3
- build/{torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_6943c45_dirty.abi3.so → torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so} +1 -1
- build/torch28-cxx11-cu126-x86_64-linux/optimizer/matmul_transpose_triton.py +4 -4
- build/torch28-cxx11-cu126-x86_64-linux/optimizer/muon.py +6 -6
- build/torch28-cxx11-cu128-x86_64-linux/optimizer/_ops.py +3 -3
- build/{torch28-cxx11-cu129-x86_64-linux/optimizer/_optimizer_6943c45_dirty.abi3.so → torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so} +1 -1
- build/torch28-cxx11-cu128-x86_64-linux/optimizer/matmul_transpose_triton.py +4 -4
- build/torch28-cxx11-cu128-x86_64-linux/optimizer/muon.py +6 -6
- build/torch28-cxx11-cu129-x86_64-linux/optimizer/_ops.py +3 -3
- build/{torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_6943c45_dirty.abi3.so → torch28-cxx11-cu129-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so} +1 -1
- build/torch28-cxx11-cu129-x86_64-linux/optimizer/matmul_transpose_triton.py +4 -4
- build/torch28-cxx11-cu129-x86_64-linux/optimizer/muon.py +6 -6
- build/torch28-cxx11-rocm63-x86_64-linux/optimizer/_ops.py +3 -3
- build/torch28-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_6943c45_dirty.abi3.so → _optimizer_15336dc_dirty.abi3.so} +1 -1
- build/torch28-cxx11-rocm63-x86_64-linux/optimizer/matmul_transpose_triton.py +4 -4
- build/torch28-cxx11-rocm63-x86_64-linux/optimizer/muon.py +6 -6
- build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_ops.py +3 -3
- build/torch28-cxx11-rocm64-x86_64-linux/optimizer/{_optimizer_6943c45_dirty.abi3.so → _optimizer_15336dc_dirty.abi3.so} +1 -1
- build/torch28-cxx11-rocm64-x86_64-linux/optimizer/matmul_transpose_triton.py +4 -4
- build/torch28-cxx11-rocm64-x86_64-linux/optimizer/muon.py +6 -6
build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
-
from . import
|
| 3 |
-
ops = torch.ops.
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
-
return f"
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from . import _optimizer_15336dc_dirty
|
| 3 |
+
ops = torch.ops._optimizer_15336dc_dirty
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
+
return f"_optimizer_15336dc_dirty::{op_name}"
|
build/torch27-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_6943c45_dirty.abi3.so → _optimizer_15336dc_dirty.abi3.so}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1787368
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:94a28c3602d8c7a6b216976b1fb09cdd1e9f61bfc9359a80f41b5b628efdfc28
|
| 3 |
size 1787368
|
build/torch27-cxx11-cu118-x86_64-linux/optimizer/matmul_transpose_triton.py
CHANGED
|
@@ -1,17 +1,17 @@
|
|
| 1 |
# MIT License
|
| 2 |
-
#
|
| 3 |
# Copyright (c) 2025 Tianyang Lin
|
| 4 |
-
#
|
| 5 |
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
# of this software and associated documentation files (the "Software"), to deal
|
| 7 |
# in the Software without restriction, including without limitation the rights
|
| 8 |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
# copies of the Software, and to permit persons to whom the Software is
|
| 10 |
# furnished to do so, subject to the following conditions:
|
| 11 |
-
#
|
| 12 |
# The above copyright notice and this permission notice shall be included in all
|
| 13 |
# copies or substantial portions of the Software.
|
| 14 |
-
#
|
| 15 |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
|
|
| 1 |
# MIT License
|
| 2 |
+
#
|
| 3 |
# Copyright (c) 2025 Tianyang Lin
|
| 4 |
+
#
|
| 5 |
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
# of this software and associated documentation files (the "Software"), to deal
|
| 7 |
# in the Software without restriction, including without limitation the rights
|
| 8 |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
# copies of the Software, and to permit persons to whom the Software is
|
| 10 |
# furnished to do so, subject to the following conditions:
|
| 11 |
+
#
|
| 12 |
# The above copyright notice and this permission notice shall be included in all
|
| 13 |
# copies or substantial portions of the Software.
|
| 14 |
+
#
|
| 15 |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py
CHANGED
|
@@ -121,7 +121,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
|
|
| 121 |
state = param_to_state[id(p)]
|
| 122 |
dst = state.worker_rank
|
| 123 |
assert dst < num_ranks
|
| 124 |
-
shard_elems = split_elems_for_src(p,
|
| 125 |
g = p.grad
|
| 126 |
g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
|
| 127 |
assert g.numel() == shard_elems
|
|
@@ -145,7 +145,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
|
|
| 145 |
for p in owned_params:
|
| 146 |
state = param_to_state[id(p)]
|
| 147 |
assert state.worker_rank == rank
|
| 148 |
-
total += split_elems_for_src(p,
|
| 149 |
recv_counts[src] = total
|
| 150 |
|
| 151 |
recv_total = sum(recv_counts)
|
|
@@ -186,7 +186,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
|
|
| 186 |
for p in owned_params:
|
| 187 |
state = param_to_state[id(p)]
|
| 188 |
assert state.worker_rank == rank
|
| 189 |
-
n = split_elems_for_src(p,
|
| 190 |
assert n > 0
|
| 191 |
|
| 192 |
sg = recv_buf.narrow(0, off + inner_off, n)
|
|
@@ -278,7 +278,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
|
|
| 278 |
|
| 279 |
offset = 0
|
| 280 |
for dst in range(num_ranks):
|
| 281 |
-
n = split_elems_for_src(p,
|
| 282 |
assert n > 0
|
| 283 |
|
| 284 |
su = u_full.narrow(0, offset, n)
|
|
@@ -304,7 +304,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
|
|
| 304 |
state = param_to_state[id(p)]
|
| 305 |
if state.worker_rank != src:
|
| 306 |
continue
|
| 307 |
-
total += split_elems_for_src(p,
|
| 308 |
recv_counts[src] = total
|
| 309 |
|
| 310 |
recv_total = sum(recv_counts)
|
|
@@ -348,7 +348,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
|
|
| 348 |
state = param_to_state[id(p)]
|
| 349 |
if state.worker_rank != src:
|
| 350 |
continue
|
| 351 |
-
n = split_elems_for_src(p,
|
| 352 |
assert n > 0
|
| 353 |
|
| 354 |
flat_local = recv_buf.narrow(0, off + inner_off,
|
|
|
|
| 121 |
state = param_to_state[id(p)]
|
| 122 |
dst = state.worker_rank
|
| 123 |
assert dst < num_ranks
|
| 124 |
+
shard_elems = split_elems_for_src(p, rank, num_ranks)
|
| 125 |
g = p.grad
|
| 126 |
g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
|
| 127 |
assert g.numel() == shard_elems
|
|
|
|
| 145 |
for p in owned_params:
|
| 146 |
state = param_to_state[id(p)]
|
| 147 |
assert state.worker_rank == rank
|
| 148 |
+
total += split_elems_for_src(p, src, num_ranks)
|
| 149 |
recv_counts[src] = total
|
| 150 |
|
| 151 |
recv_total = sum(recv_counts)
|
|
|
|
| 186 |
for p in owned_params:
|
| 187 |
state = param_to_state[id(p)]
|
| 188 |
assert state.worker_rank == rank
|
| 189 |
+
n = split_elems_for_src(p, src, num_ranks)
|
| 190 |
assert n > 0
|
| 191 |
|
| 192 |
sg = recv_buf.narrow(0, off + inner_off, n)
|
|
|
|
| 278 |
|
| 279 |
offset = 0
|
| 280 |
for dst in range(num_ranks):
|
| 281 |
+
n = split_elems_for_src(p, dst, num_ranks)
|
| 282 |
assert n > 0
|
| 283 |
|
| 284 |
su = u_full.narrow(0, offset, n)
|
|
|
|
| 304 |
state = param_to_state[id(p)]
|
| 305 |
if state.worker_rank != src:
|
| 306 |
continue
|
| 307 |
+
total += split_elems_for_src(p, rank, num_ranks)
|
| 308 |
recv_counts[src] = total
|
| 309 |
|
| 310 |
recv_total = sum(recv_counts)
|
|
|
|
| 348 |
state = param_to_state[id(p)]
|
| 349 |
if state.worker_rank != src:
|
| 350 |
continue
|
| 351 |
+
n = split_elems_for_src(p, rank, num_ranks)
|
| 352 |
assert n > 0
|
| 353 |
|
| 354 |
flat_local = recv_buf.narrow(0, off + inner_off,
|
build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
-
from . import
|
| 3 |
-
ops = torch.ops.
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
-
return f"
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from . import _optimizer_15336dc_dirty
|
| 3 |
+
ops = torch.ops._optimizer_15336dc_dirty
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
+
return f"_optimizer_15336dc_dirty::{op_name}"
|
build/{torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_6943c45_dirty.abi3.so → torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1824256
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8ca6ca8225dc9b7888566f5c7fd824234a3b4ac76718a5d18e6c75ca7acd488d
|
| 3 |
size 1824256
|
build/torch27-cxx11-cu126-x86_64-linux/optimizer/matmul_transpose_triton.py
CHANGED
|
@@ -1,17 +1,17 @@
|
|
| 1 |
# MIT License
|
| 2 |
-
#
|
| 3 |
# Copyright (c) 2025 Tianyang Lin
|
| 4 |
-
#
|
| 5 |
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
# of this software and associated documentation files (the "Software"), to deal
|
| 7 |
# in the Software without restriction, including without limitation the rights
|
| 8 |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
# copies of the Software, and to permit persons to whom the Software is
|
| 10 |
# furnished to do so, subject to the following conditions:
|
| 11 |
-
#
|
| 12 |
# The above copyright notice and this permission notice shall be included in all
|
| 13 |
# copies or substantial portions of the Software.
|
| 14 |
-
#
|
| 15 |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
|
|
| 1 |
# MIT License
|
| 2 |
+
#
|
| 3 |
# Copyright (c) 2025 Tianyang Lin
|
| 4 |
+
#
|
| 5 |
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
# of this software and associated documentation files (the "Software"), to deal
|
| 7 |
# in the Software without restriction, including without limitation the rights
|
| 8 |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
# copies of the Software, and to permit persons to whom the Software is
|
| 10 |
# furnished to do so, subject to the following conditions:
|
| 11 |
+
#
|
| 12 |
# The above copyright notice and this permission notice shall be included in all
|
| 13 |
# copies or substantial portions of the Software.
|
| 14 |
+
#
|
| 15 |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py
CHANGED
|
@@ -121,7 +121,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
|
|
| 121 |
state = param_to_state[id(p)]
|
| 122 |
dst = state.worker_rank
|
| 123 |
assert dst < num_ranks
|
| 124 |
-
shard_elems = split_elems_for_src(p,
|
| 125 |
g = p.grad
|
| 126 |
g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
|
| 127 |
assert g.numel() == shard_elems
|
|
@@ -145,7 +145,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
|
|
| 145 |
for p in owned_params:
|
| 146 |
state = param_to_state[id(p)]
|
| 147 |
assert state.worker_rank == rank
|
| 148 |
-
total += split_elems_for_src(p,
|
| 149 |
recv_counts[src] = total
|
| 150 |
|
| 151 |
recv_total = sum(recv_counts)
|
|
@@ -186,7 +186,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
|
|
| 186 |
for p in owned_params:
|
| 187 |
state = param_to_state[id(p)]
|
| 188 |
assert state.worker_rank == rank
|
| 189 |
-
n = split_elems_for_src(p,
|
| 190 |
assert n > 0
|
| 191 |
|
| 192 |
sg = recv_buf.narrow(0, off + inner_off, n)
|
|
@@ -278,7 +278,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
|
|
| 278 |
|
| 279 |
offset = 0
|
| 280 |
for dst in range(num_ranks):
|
| 281 |
-
n = split_elems_for_src(p,
|
| 282 |
assert n > 0
|
| 283 |
|
| 284 |
su = u_full.narrow(0, offset, n)
|
|
@@ -304,7 +304,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
|
|
| 304 |
state = param_to_state[id(p)]
|
| 305 |
if state.worker_rank != src:
|
| 306 |
continue
|
| 307 |
-
total += split_elems_for_src(p,
|
| 308 |
recv_counts[src] = total
|
| 309 |
|
| 310 |
recv_total = sum(recv_counts)
|
|
@@ -348,7 +348,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
|
|
| 348 |
state = param_to_state[id(p)]
|
| 349 |
if state.worker_rank != src:
|
| 350 |
continue
|
| 351 |
-
n = split_elems_for_src(p,
|
| 352 |
assert n > 0
|
| 353 |
|
| 354 |
flat_local = recv_buf.narrow(0, off + inner_off,
|
|
|
|
| 121 |
state = param_to_state[id(p)]
|
| 122 |
dst = state.worker_rank
|
| 123 |
assert dst < num_ranks
|
| 124 |
+
shard_elems = split_elems_for_src(p, rank, num_ranks)
|
| 125 |
g = p.grad
|
| 126 |
g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
|
| 127 |
assert g.numel() == shard_elems
|
|
|
|
| 145 |
for p in owned_params:
|
| 146 |
state = param_to_state[id(p)]
|
| 147 |
assert state.worker_rank == rank
|
| 148 |
+
total += split_elems_for_src(p, src, num_ranks)
|
| 149 |
recv_counts[src] = total
|
| 150 |
|
| 151 |
recv_total = sum(recv_counts)
|
|
|
|
| 186 |
for p in owned_params:
|
| 187 |
state = param_to_state[id(p)]
|
| 188 |
assert state.worker_rank == rank
|
| 189 |
+
n = split_elems_for_src(p, src, num_ranks)
|
| 190 |
assert n > 0
|
| 191 |
|
| 192 |
sg = recv_buf.narrow(0, off + inner_off, n)
|
|
|
|
| 278 |
|
| 279 |
offset = 0
|
| 280 |
for dst in range(num_ranks):
|
| 281 |
+
n = split_elems_for_src(p, dst, num_ranks)
|
| 282 |
assert n > 0
|
| 283 |
|
| 284 |
su = u_full.narrow(0, offset, n)
|
|
|
|
| 304 |
state = param_to_state[id(p)]
|
| 305 |
if state.worker_rank != src:
|
| 306 |
continue
|
| 307 |
+
total += split_elems_for_src(p, rank, num_ranks)
|
| 308 |
recv_counts[src] = total
|
| 309 |
|
| 310 |
recv_total = sum(recv_counts)
|
|
|
|
| 348 |
state = param_to_state[id(p)]
|
| 349 |
if state.worker_rank != src:
|
| 350 |
continue
|
| 351 |
+
n = split_elems_for_src(p, rank, num_ranks)
|
| 352 |
assert n > 0
|
| 353 |
|
| 354 |
flat_local = recv_buf.narrow(0, off + inner_off,
|
build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
-
from . import
|
| 3 |
-
ops = torch.ops.
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
-
return f"
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from . import _optimizer_15336dc_dirty
|
| 3 |
+
ops = torch.ops._optimizer_15336dc_dirty
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
+
return f"_optimizer_15336dc_dirty::{op_name}"
|
build/{torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_6943c45_dirty.abi3.so → torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1883344
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e06baa32b0950126ee192654bd9f7adc79cc05d8ec39d2078c70d62ee81fdcd5
|
| 3 |
size 1883344
|
build/torch27-cxx11-cu128-x86_64-linux/optimizer/matmul_transpose_triton.py
CHANGED
|
@@ -1,17 +1,17 @@
|
|
| 1 |
# MIT License
|
| 2 |
-
#
|
| 3 |
# Copyright (c) 2025 Tianyang Lin
|
| 4 |
-
#
|
| 5 |
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
# of this software and associated documentation files (the "Software"), to deal
|
| 7 |
# in the Software without restriction, including without limitation the rights
|
| 8 |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
# copies of the Software, and to permit persons to whom the Software is
|
| 10 |
# furnished to do so, subject to the following conditions:
|
| 11 |
-
#
|
| 12 |
# The above copyright notice and this permission notice shall be included in all
|
| 13 |
# copies or substantial portions of the Software.
|
| 14 |
-
#
|
| 15 |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
|
|
| 1 |
# MIT License
|
| 2 |
+
#
|
| 3 |
# Copyright (c) 2025 Tianyang Lin
|
| 4 |
+
#
|
| 5 |
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
# of this software and associated documentation files (the "Software"), to deal
|
| 7 |
# in the Software without restriction, including without limitation the rights
|
| 8 |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
# copies of the Software, and to permit persons to whom the Software is
|
| 10 |
# furnished to do so, subject to the following conditions:
|
| 11 |
+
#
|
| 12 |
# The above copyright notice and this permission notice shall be included in all
|
| 13 |
# copies or substantial portions of the Software.
|
| 14 |
+
#
|
| 15 |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
build/torch27-cxx11-cu128-x86_64-linux/optimizer/muon.py
CHANGED
|
@@ -121,7 +121,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
|
|
| 121 |
state = param_to_state[id(p)]
|
| 122 |
dst = state.worker_rank
|
| 123 |
assert dst < num_ranks
|
| 124 |
-
shard_elems = split_elems_for_src(p,
|
| 125 |
g = p.grad
|
| 126 |
g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
|
| 127 |
assert g.numel() == shard_elems
|
|
@@ -145,7 +145,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
|
|
| 145 |
for p in owned_params:
|
| 146 |
state = param_to_state[id(p)]
|
| 147 |
assert state.worker_rank == rank
|
| 148 |
-
total += split_elems_for_src(p,
|
| 149 |
recv_counts[src] = total
|
| 150 |
|
| 151 |
recv_total = sum(recv_counts)
|
|
@@ -186,7 +186,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
|
|
| 186 |
for p in owned_params:
|
| 187 |
state = param_to_state[id(p)]
|
| 188 |
assert state.worker_rank == rank
|
| 189 |
-
n = split_elems_for_src(p,
|
| 190 |
assert n > 0
|
| 191 |
|
| 192 |
sg = recv_buf.narrow(0, off + inner_off, n)
|
|
@@ -278,7 +278,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
|
|
| 278 |
|
| 279 |
offset = 0
|
| 280 |
for dst in range(num_ranks):
|
| 281 |
-
n = split_elems_for_src(p,
|
| 282 |
assert n > 0
|
| 283 |
|
| 284 |
su = u_full.narrow(0, offset, n)
|
|
@@ -304,7 +304,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
|
|
| 304 |
state = param_to_state[id(p)]
|
| 305 |
if state.worker_rank != src:
|
| 306 |
continue
|
| 307 |
-
total += split_elems_for_src(p,
|
| 308 |
recv_counts[src] = total
|
| 309 |
|
| 310 |
recv_total = sum(recv_counts)
|
|
@@ -348,7 +348,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
|
|
| 348 |
state = param_to_state[id(p)]
|
| 349 |
if state.worker_rank != src:
|
| 350 |
continue
|
| 351 |
-
n = split_elems_for_src(p,
|
| 352 |
assert n > 0
|
| 353 |
|
| 354 |
flat_local = recv_buf.narrow(0, off + inner_off,
|
|
|
|
| 121 |
state = param_to_state[id(p)]
|
| 122 |
dst = state.worker_rank
|
| 123 |
assert dst < num_ranks
|
| 124 |
+
shard_elems = split_elems_for_src(p, rank, num_ranks)
|
| 125 |
g = p.grad
|
| 126 |
g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
|
| 127 |
assert g.numel() == shard_elems
|
|
|
|
| 145 |
for p in owned_params:
|
| 146 |
state = param_to_state[id(p)]
|
| 147 |
assert state.worker_rank == rank
|
| 148 |
+
total += split_elems_for_src(p, src, num_ranks)
|
| 149 |
recv_counts[src] = total
|
| 150 |
|
| 151 |
recv_total = sum(recv_counts)
|
|
|
|
| 186 |
for p in owned_params:
|
| 187 |
state = param_to_state[id(p)]
|
| 188 |
assert state.worker_rank == rank
|
| 189 |
+
n = split_elems_for_src(p, src, num_ranks)
|
| 190 |
assert n > 0
|
| 191 |
|
| 192 |
sg = recv_buf.narrow(0, off + inner_off, n)
|
|
|
|
| 278 |
|
| 279 |
offset = 0
|
| 280 |
for dst in range(num_ranks):
|
| 281 |
+
n = split_elems_for_src(p, dst, num_ranks)
|
| 282 |
assert n > 0
|
| 283 |
|
| 284 |
su = u_full.narrow(0, offset, n)
|
|
|
|
| 304 |
state = param_to_state[id(p)]
|
| 305 |
if state.worker_rank != src:
|
| 306 |
continue
|
| 307 |
+
total += split_elems_for_src(p, rank, num_ranks)
|
| 308 |
recv_counts[src] = total
|
| 309 |
|
| 310 |
recv_total = sum(recv_counts)
|
|
|
|
| 348 |
state = param_to_state[id(p)]
|
| 349 |
if state.worker_rank != src:
|
| 350 |
continue
|
| 351 |
+
n = split_elems_for_src(p, rank, num_ranks)
|
| 352 |
assert n > 0
|
| 353 |
|
| 354 |
flat_local = recv_buf.narrow(0, off + inner_off,
|
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
-
from . import
|
| 3 |
-
ops = torch.ops.
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
-
return f"
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from . import _optimizer_15336dc_dirty
|
| 3 |
+
ops = torch.ops._optimizer_15336dc_dirty
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
+
return f"_optimizer_15336dc_dirty::{op_name}"
|
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_6943c45_dirty.abi3.so → _optimizer_15336dc_dirty.abi3.so}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1749776
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c7cf2f7b8519dbc3f20e9d151914b55e56d10c012e2232d550b7c8d262746d71
|
| 3 |
size 1749776
|
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/matmul_transpose_triton.py
CHANGED
|
@@ -1,17 +1,17 @@
|
|
| 1 |
# MIT License
|
| 2 |
-
#
|
| 3 |
# Copyright (c) 2025 Tianyang Lin
|
| 4 |
-
#
|
| 5 |
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
# of this software and associated documentation files (the "Software"), to deal
|
| 7 |
# in the Software without restriction, including without limitation the rights
|
| 8 |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
# copies of the Software, and to permit persons to whom the Software is
|
| 10 |
# furnished to do so, subject to the following conditions:
|
| 11 |
-
#
|
| 12 |
# The above copyright notice and this permission notice shall be included in all
|
| 13 |
# copies or substantial portions of the Software.
|
| 14 |
-
#
|
| 15 |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
|
|
| 1 |
# MIT License
|
| 2 |
+
#
|
| 3 |
# Copyright (c) 2025 Tianyang Lin
|
| 4 |
+
#
|
| 5 |
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
# of this software and associated documentation files (the "Software"), to deal
|
| 7 |
# in the Software without restriction, including without limitation the rights
|
| 8 |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
# copies of the Software, and to permit persons to whom the Software is
|
| 10 |
# furnished to do so, subject to the following conditions:
|
| 11 |
+
#
|
| 12 |
# The above copyright notice and this permission notice shall be included in all
|
| 13 |
# copies or substantial portions of the Software.
|
| 14 |
+
#
|
| 15 |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/muon.py
CHANGED
|
@@ -121,7 +121,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
|
|
| 121 |
state = param_to_state[id(p)]
|
| 122 |
dst = state.worker_rank
|
| 123 |
assert dst < num_ranks
|
| 124 |
-
shard_elems = split_elems_for_src(p,
|
| 125 |
g = p.grad
|
| 126 |
g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
|
| 127 |
assert g.numel() == shard_elems
|
|
@@ -145,7 +145,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
|
|
| 145 |
for p in owned_params:
|
| 146 |
state = param_to_state[id(p)]
|
| 147 |
assert state.worker_rank == rank
|
| 148 |
-
total += split_elems_for_src(p,
|
| 149 |
recv_counts[src] = total
|
| 150 |
|
| 151 |
recv_total = sum(recv_counts)
|
|
@@ -186,7 +186,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
|
|
| 186 |
for p in owned_params:
|
| 187 |
state = param_to_state[id(p)]
|
| 188 |
assert state.worker_rank == rank
|
| 189 |
-
n = split_elems_for_src(p,
|
| 190 |
assert n > 0
|
| 191 |
|
| 192 |
sg = recv_buf.narrow(0, off + inner_off, n)
|
|
@@ -278,7 +278,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
|
|
| 278 |
|
| 279 |
offset = 0
|
| 280 |
for dst in range(num_ranks):
|
| 281 |
-
n = split_elems_for_src(p,
|
| 282 |
assert n > 0
|
| 283 |
|
| 284 |
su = u_full.narrow(0, offset, n)
|
|
@@ -304,7 +304,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
|
|
| 304 |
state = param_to_state[id(p)]
|
| 305 |
if state.worker_rank != src:
|
| 306 |
continue
|
| 307 |
-
total += split_elems_for_src(p,
|
| 308 |
recv_counts[src] = total
|
| 309 |
|
| 310 |
recv_total = sum(recv_counts)
|
|
@@ -348,7 +348,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
|
|
| 348 |
state = param_to_state[id(p)]
|
| 349 |
if state.worker_rank != src:
|
| 350 |
continue
|
| 351 |
-
n = split_elems_for_src(p,
|
| 352 |
assert n > 0
|
| 353 |
|
| 354 |
flat_local = recv_buf.narrow(0, off + inner_off,
|
|
|
|
| 121 |
state = param_to_state[id(p)]
|
| 122 |
dst = state.worker_rank
|
| 123 |
assert dst < num_ranks
|
| 124 |
+
shard_elems = split_elems_for_src(p, rank, num_ranks)
|
| 125 |
g = p.grad
|
| 126 |
g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
|
| 127 |
assert g.numel() == shard_elems
|
|
|
|
| 145 |
for p in owned_params:
|
| 146 |
state = param_to_state[id(p)]
|
| 147 |
assert state.worker_rank == rank
|
| 148 |
+
total += split_elems_for_src(p, src, num_ranks)
|
| 149 |
recv_counts[src] = total
|
| 150 |
|
| 151 |
recv_total = sum(recv_counts)
|
|
|
|
| 186 |
for p in owned_params:
|
| 187 |
state = param_to_state[id(p)]
|
| 188 |
assert state.worker_rank == rank
|
| 189 |
+
n = split_elems_for_src(p, src, num_ranks)
|
| 190 |
assert n > 0
|
| 191 |
|
| 192 |
sg = recv_buf.narrow(0, off + inner_off, n)
|
|
|
|
| 278 |
|
| 279 |
offset = 0
|
| 280 |
for dst in range(num_ranks):
|
| 281 |
+
n = split_elems_for_src(p, dst, num_ranks)
|
| 282 |
assert n > 0
|
| 283 |
|
| 284 |
su = u_full.narrow(0, offset, n)
|
|
|
|
| 304 |
state = param_to_state[id(p)]
|
| 305 |
if state.worker_rank != src:
|
| 306 |
continue
|
| 307 |
+
total += split_elems_for_src(p, rank, num_ranks)
|
| 308 |
recv_counts[src] = total
|
| 309 |
|
| 310 |
recv_total = sum(recv_counts)
|
|
|
|
| 348 |
state = param_to_state[id(p)]
|
| 349 |
if state.worker_rank != src:
|
| 350 |
continue
|
| 351 |
+
n = split_elems_for_src(p, rank, num_ranks)
|
| 352 |
assert n > 0
|
| 353 |
|
| 354 |
flat_local = recv_buf.narrow(0, off + inner_off,
|
build/torch28-cxx11-cu126-x86_64-linux/optimizer/_ops.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
-
from . import
|
| 3 |
-
ops = torch.ops.
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
-
return f"
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from . import _optimizer_15336dc_dirty
|
| 3 |
+
ops = torch.ops._optimizer_15336dc_dirty
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
+
return f"_optimizer_15336dc_dirty::{op_name}"
|
build/{torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_6943c45_dirty.abi3.so → torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1824256
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8ca6ca8225dc9b7888566f5c7fd824234a3b4ac76718a5d18e6c75ca7acd488d
|
| 3 |
size 1824256
|
build/torch28-cxx11-cu126-x86_64-linux/optimizer/matmul_transpose_triton.py
CHANGED
|
@@ -1,17 +1,17 @@
|
|
| 1 |
# MIT License
|
| 2 |
-
#
|
| 3 |
# Copyright (c) 2025 Tianyang Lin
|
| 4 |
-
#
|
| 5 |
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
# of this software and associated documentation files (the "Software"), to deal
|
| 7 |
# in the Software without restriction, including without limitation the rights
|
| 8 |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
# copies of the Software, and to permit persons to whom the Software is
|
| 10 |
# furnished to do so, subject to the following conditions:
|
| 11 |
-
#
|
| 12 |
# The above copyright notice and this permission notice shall be included in all
|
| 13 |
# copies or substantial portions of the Software.
|
| 14 |
-
#
|
| 15 |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
|
|
| 1 |
# MIT License
|
| 2 |
+
#
|
| 3 |
# Copyright (c) 2025 Tianyang Lin
|
| 4 |
+
#
|
| 5 |
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
# of this software and associated documentation files (the "Software"), to deal
|
| 7 |
# in the Software without restriction, including without limitation the rights
|
| 8 |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
# copies of the Software, and to permit persons to whom the Software is
|
| 10 |
# furnished to do so, subject to the following conditions:
|
| 11 |
+
#
|
| 12 |
# The above copyright notice and this permission notice shall be included in all
|
| 13 |
# copies or substantial portions of the Software.
|
| 14 |
+
#
|
| 15 |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
build/torch28-cxx11-cu126-x86_64-linux/optimizer/muon.py
CHANGED
|
@@ -121,7 +121,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
|
|
| 121 |
state = param_to_state[id(p)]
|
| 122 |
dst = state.worker_rank
|
| 123 |
assert dst < num_ranks
|
| 124 |
-
shard_elems = split_elems_for_src(p,
|
| 125 |
g = p.grad
|
| 126 |
g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
|
| 127 |
assert g.numel() == shard_elems
|
|
@@ -145,7 +145,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
|
|
| 145 |
for p in owned_params:
|
| 146 |
state = param_to_state[id(p)]
|
| 147 |
assert state.worker_rank == rank
|
| 148 |
-
total += split_elems_for_src(p,
|
| 149 |
recv_counts[src] = total
|
| 150 |
|
| 151 |
recv_total = sum(recv_counts)
|
|
@@ -186,7 +186,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
|
|
| 186 |
for p in owned_params:
|
| 187 |
state = param_to_state[id(p)]
|
| 188 |
assert state.worker_rank == rank
|
| 189 |
-
n = split_elems_for_src(p,
|
| 190 |
assert n > 0
|
| 191 |
|
| 192 |
sg = recv_buf.narrow(0, off + inner_off, n)
|
|
@@ -278,7 +278,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
|
|
| 278 |
|
| 279 |
offset = 0
|
| 280 |
for dst in range(num_ranks):
|
| 281 |
-
n = split_elems_for_src(p,
|
| 282 |
assert n > 0
|
| 283 |
|
| 284 |
su = u_full.narrow(0, offset, n)
|
|
@@ -304,7 +304,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
|
|
| 304 |
state = param_to_state[id(p)]
|
| 305 |
if state.worker_rank != src:
|
| 306 |
continue
|
| 307 |
-
total += split_elems_for_src(p,
|
| 308 |
recv_counts[src] = total
|
| 309 |
|
| 310 |
recv_total = sum(recv_counts)
|
|
@@ -348,7 +348,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
|
|
| 348 |
state = param_to_state[id(p)]
|
| 349 |
if state.worker_rank != src:
|
| 350 |
continue
|
| 351 |
-
n = split_elems_for_src(p,
|
| 352 |
assert n > 0
|
| 353 |
|
| 354 |
flat_local = recv_buf.narrow(0, off + inner_off,
|
|
|
|
| 121 |
state = param_to_state[id(p)]
|
| 122 |
dst = state.worker_rank
|
| 123 |
assert dst < num_ranks
|
| 124 |
+
shard_elems = split_elems_for_src(p, rank, num_ranks)
|
| 125 |
g = p.grad
|
| 126 |
g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
|
| 127 |
assert g.numel() == shard_elems
|
|
|
|
| 145 |
for p in owned_params:
|
| 146 |
state = param_to_state[id(p)]
|
| 147 |
assert state.worker_rank == rank
|
| 148 |
+
total += split_elems_for_src(p, src, num_ranks)
|
| 149 |
recv_counts[src] = total
|
| 150 |
|
| 151 |
recv_total = sum(recv_counts)
|
|
|
|
| 186 |
for p in owned_params:
|
| 187 |
state = param_to_state[id(p)]
|
| 188 |
assert state.worker_rank == rank
|
| 189 |
+
n = split_elems_for_src(p, src, num_ranks)
|
| 190 |
assert n > 0
|
| 191 |
|
| 192 |
sg = recv_buf.narrow(0, off + inner_off, n)
|
|
|
|
| 278 |
|
| 279 |
offset = 0
|
| 280 |
for dst in range(num_ranks):
|
| 281 |
+
n = split_elems_for_src(p, dst, num_ranks)
|
| 282 |
assert n > 0
|
| 283 |
|
| 284 |
su = u_full.narrow(0, offset, n)
|
|
|
|
| 304 |
state = param_to_state[id(p)]
|
| 305 |
if state.worker_rank != src:
|
| 306 |
continue
|
| 307 |
+
total += split_elems_for_src(p, rank, num_ranks)
|
| 308 |
recv_counts[src] = total
|
| 309 |
|
| 310 |
recv_total = sum(recv_counts)
|
|
|
|
| 348 |
state = param_to_state[id(p)]
|
| 349 |
if state.worker_rank != src:
|
| 350 |
continue
|
| 351 |
+
n = split_elems_for_src(p, rank, num_ranks)
|
| 352 |
assert n > 0
|
| 353 |
|
| 354 |
flat_local = recv_buf.narrow(0, off + inner_off,
|
build/torch28-cxx11-cu128-x86_64-linux/optimizer/_ops.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
-
from . import
|
| 3 |
-
ops = torch.ops.
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
-
return f"
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from . import _optimizer_15336dc_dirty
|
| 3 |
+
ops = torch.ops._optimizer_15336dc_dirty
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
+
return f"_optimizer_15336dc_dirty::{op_name}"
|
build/{torch28-cxx11-cu129-x86_64-linux/optimizer/_optimizer_6943c45_dirty.abi3.so → torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1883344
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e06baa32b0950126ee192654bd9f7adc79cc05d8ec39d2078c70d62ee81fdcd5
|
| 3 |
size 1883344
|
build/torch28-cxx11-cu128-x86_64-linux/optimizer/matmul_transpose_triton.py
CHANGED
|
@@ -1,17 +1,17 @@
|
|
| 1 |
# MIT License
|
| 2 |
-
#
|
| 3 |
# Copyright (c) 2025 Tianyang Lin
|
| 4 |
-
#
|
| 5 |
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
# of this software and associated documentation files (the "Software"), to deal
|
| 7 |
# in the Software without restriction, including without limitation the rights
|
| 8 |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
# copies of the Software, and to permit persons to whom the Software is
|
| 10 |
# furnished to do so, subject to the following conditions:
|
| 11 |
-
#
|
| 12 |
# The above copyright notice and this permission notice shall be included in all
|
| 13 |
# copies or substantial portions of the Software.
|
| 14 |
-
#
|
| 15 |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
|
|
| 1 |
# MIT License
|
| 2 |
+
#
|
| 3 |
# Copyright (c) 2025 Tianyang Lin
|
| 4 |
+
#
|
| 5 |
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
# of this software and associated documentation files (the "Software"), to deal
|
| 7 |
# in the Software without restriction, including without limitation the rights
|
| 8 |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
# copies of the Software, and to permit persons to whom the Software is
|
| 10 |
# furnished to do so, subject to the following conditions:
|
| 11 |
+
#
|
| 12 |
# The above copyright notice and this permission notice shall be included in all
|
| 13 |
# copies or substantial portions of the Software.
|
| 14 |
+
#
|
| 15 |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
build/torch28-cxx11-cu128-x86_64-linux/optimizer/muon.py
CHANGED
|
@@ -121,7 +121,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
|
|
| 121 |
state = param_to_state[id(p)]
|
| 122 |
dst = state.worker_rank
|
| 123 |
assert dst < num_ranks
|
| 124 |
-
shard_elems = split_elems_for_src(p,
|
| 125 |
g = p.grad
|
| 126 |
g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
|
| 127 |
assert g.numel() == shard_elems
|
|
@@ -145,7 +145,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
|
|
| 145 |
for p in owned_params:
|
| 146 |
state = param_to_state[id(p)]
|
| 147 |
assert state.worker_rank == rank
|
| 148 |
-
total += split_elems_for_src(p,
|
| 149 |
recv_counts[src] = total
|
| 150 |
|
| 151 |
recv_total = sum(recv_counts)
|
|
@@ -186,7 +186,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
|
|
| 186 |
for p in owned_params:
|
| 187 |
state = param_to_state[id(p)]
|
| 188 |
assert state.worker_rank == rank
|
| 189 |
-
n = split_elems_for_src(p,
|
| 190 |
assert n > 0
|
| 191 |
|
| 192 |
sg = recv_buf.narrow(0, off + inner_off, n)
|
|
@@ -278,7 +278,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
|
|
| 278 |
|
| 279 |
offset = 0
|
| 280 |
for dst in range(num_ranks):
|
| 281 |
-
n = split_elems_for_src(p,
|
| 282 |
assert n > 0
|
| 283 |
|
| 284 |
su = u_full.narrow(0, offset, n)
|
|
@@ -304,7 +304,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
|
|
| 304 |
state = param_to_state[id(p)]
|
| 305 |
if state.worker_rank != src:
|
| 306 |
continue
|
| 307 |
-
total += split_elems_for_src(p,
|
| 308 |
recv_counts[src] = total
|
| 309 |
|
| 310 |
recv_total = sum(recv_counts)
|
|
@@ -348,7 +348,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
|
|
| 348 |
state = param_to_state[id(p)]
|
| 349 |
if state.worker_rank != src:
|
| 350 |
continue
|
| 351 |
-
n = split_elems_for_src(p,
|
| 352 |
assert n > 0
|
| 353 |
|
| 354 |
flat_local = recv_buf.narrow(0, off + inner_off,
|
|
|
|
| 121 |
state = param_to_state[id(p)]
|
| 122 |
dst = state.worker_rank
|
| 123 |
assert dst < num_ranks
|
| 124 |
+
shard_elems = split_elems_for_src(p, rank, num_ranks)
|
| 125 |
g = p.grad
|
| 126 |
g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
|
| 127 |
assert g.numel() == shard_elems
|
|
|
|
| 145 |
for p in owned_params:
|
| 146 |
state = param_to_state[id(p)]
|
| 147 |
assert state.worker_rank == rank
|
| 148 |
+
total += split_elems_for_src(p, src, num_ranks)
|
| 149 |
recv_counts[src] = total
|
| 150 |
|
| 151 |
recv_total = sum(recv_counts)
|
|
|
|
| 186 |
for p in owned_params:
|
| 187 |
state = param_to_state[id(p)]
|
| 188 |
assert state.worker_rank == rank
|
| 189 |
+
n = split_elems_for_src(p, src, num_ranks)
|
| 190 |
assert n > 0
|
| 191 |
|
| 192 |
sg = recv_buf.narrow(0, off + inner_off, n)
|
|
|
|
| 278 |
|
| 279 |
offset = 0
|
| 280 |
for dst in range(num_ranks):
|
| 281 |
+
n = split_elems_for_src(p, dst, num_ranks)
|
| 282 |
assert n > 0
|
| 283 |
|
| 284 |
su = u_full.narrow(0, offset, n)
|
|
|
|
| 304 |
state = param_to_state[id(p)]
|
| 305 |
if state.worker_rank != src:
|
| 306 |
continue
|
| 307 |
+
total += split_elems_for_src(p, rank, num_ranks)
|
| 308 |
recv_counts[src] = total
|
| 309 |
|
| 310 |
recv_total = sum(recv_counts)
|
|
|
|
| 348 |
state = param_to_state[id(p)]
|
| 349 |
if state.worker_rank != src:
|
| 350 |
continue
|
| 351 |
+
n = split_elems_for_src(p, rank, num_ranks)
|
| 352 |
assert n > 0
|
| 353 |
|
| 354 |
flat_local = recv_buf.narrow(0, off + inner_off,
|
build/torch28-cxx11-cu129-x86_64-linux/optimizer/_ops.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
-
from . import
|
| 3 |
-
ops = torch.ops.
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
-
return f"
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from . import _optimizer_15336dc_dirty
|
| 3 |
+
ops = torch.ops._optimizer_15336dc_dirty
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
+
return f"_optimizer_15336dc_dirty::{op_name}"
|
build/{torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_6943c45_dirty.abi3.so → torch28-cxx11-cu129-x86_64-linux/optimizer/_optimizer_15336dc_dirty.abi3.so}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1883344
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6880c22f63ccd66e8ac62792a564d1ade58325b47369a1773c7753d4243893b9
|
| 3 |
size 1883344
|
build/torch28-cxx11-cu129-x86_64-linux/optimizer/matmul_transpose_triton.py
CHANGED
|
@@ -1,17 +1,17 @@
|
|
| 1 |
# MIT License
|
| 2 |
-
#
|
| 3 |
# Copyright (c) 2025 Tianyang Lin
|
| 4 |
-
#
|
| 5 |
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
# of this software and associated documentation files (the "Software"), to deal
|
| 7 |
# in the Software without restriction, including without limitation the rights
|
| 8 |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
# copies of the Software, and to permit persons to whom the Software is
|
| 10 |
# furnished to do so, subject to the following conditions:
|
| 11 |
-
#
|
| 12 |
# The above copyright notice and this permission notice shall be included in all
|
| 13 |
# copies or substantial portions of the Software.
|
| 14 |
-
#
|
| 15 |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
|
|
| 1 |
# MIT License
|
| 2 |
+
#
|
| 3 |
# Copyright (c) 2025 Tianyang Lin
|
| 4 |
+
#
|
| 5 |
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
# of this software and associated documentation files (the "Software"), to deal
|
| 7 |
# in the Software without restriction, including without limitation the rights
|
| 8 |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
# copies of the Software, and to permit persons to whom the Software is
|
| 10 |
# furnished to do so, subject to the following conditions:
|
| 11 |
+
#
|
| 12 |
# The above copyright notice and this permission notice shall be included in all
|
| 13 |
# copies or substantial portions of the Software.
|
| 14 |
+
#
|
| 15 |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
build/torch28-cxx11-cu129-x86_64-linux/optimizer/muon.py
CHANGED
|
@@ -121,7 +121,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
|
|
| 121 |
state = param_to_state[id(p)]
|
| 122 |
dst = state.worker_rank
|
| 123 |
assert dst < num_ranks
|
| 124 |
-
shard_elems = split_elems_for_src(p,
|
| 125 |
g = p.grad
|
| 126 |
g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
|
| 127 |
assert g.numel() == shard_elems
|
|
@@ -145,7 +145,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
|
|
| 145 |
for p in owned_params:
|
| 146 |
state = param_to_state[id(p)]
|
| 147 |
assert state.worker_rank == rank
|
| 148 |
-
total += split_elems_for_src(p,
|
| 149 |
recv_counts[src] = total
|
| 150 |
|
| 151 |
recv_total = sum(recv_counts)
|
|
@@ -186,7 +186,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
|
|
| 186 |
for p in owned_params:
|
| 187 |
state = param_to_state[id(p)]
|
| 188 |
assert state.worker_rank == rank
|
| 189 |
-
n = split_elems_for_src(p,
|
| 190 |
assert n > 0
|
| 191 |
|
| 192 |
sg = recv_buf.narrow(0, off + inner_off, n)
|
|
@@ -278,7 +278,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
|
|
| 278 |
|
| 279 |
offset = 0
|
| 280 |
for dst in range(num_ranks):
|
| 281 |
-
n = split_elems_for_src(p,
|
| 282 |
assert n > 0
|
| 283 |
|
| 284 |
su = u_full.narrow(0, offset, n)
|
|
@@ -304,7 +304,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
|
|
| 304 |
state = param_to_state[id(p)]
|
| 305 |
if state.worker_rank != src:
|
| 306 |
continue
|
| 307 |
-
total += split_elems_for_src(p,
|
| 308 |
recv_counts[src] = total
|
| 309 |
|
| 310 |
recv_total = sum(recv_counts)
|
|
@@ -348,7 +348,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
|
|
| 348 |
state = param_to_state[id(p)]
|
| 349 |
if state.worker_rank != src:
|
| 350 |
continue
|
| 351 |
-
n = split_elems_for_src(p,
|
| 352 |
assert n > 0
|
| 353 |
|
| 354 |
flat_local = recv_buf.narrow(0, off + inner_off,
|
|
|
|
| 121 |
state = param_to_state[id(p)]
|
| 122 |
dst = state.worker_rank
|
| 123 |
assert dst < num_ranks
|
| 124 |
+
shard_elems = split_elems_for_src(p, rank, num_ranks)
|
| 125 |
g = p.grad
|
| 126 |
g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
|
| 127 |
assert g.numel() == shard_elems
|
|
|
|
| 145 |
for p in owned_params:
|
| 146 |
state = param_to_state[id(p)]
|
| 147 |
assert state.worker_rank == rank
|
| 148 |
+
total += split_elems_for_src(p, src, num_ranks)
|
| 149 |
recv_counts[src] = total
|
| 150 |
|
| 151 |
recv_total = sum(recv_counts)
|
|
|
|
| 186 |
for p in owned_params:
|
| 187 |
state = param_to_state[id(p)]
|
| 188 |
assert state.worker_rank == rank
|
| 189 |
+
n = split_elems_for_src(p, src, num_ranks)
|
| 190 |
assert n > 0
|
| 191 |
|
| 192 |
sg = recv_buf.narrow(0, off + inner_off, n)
|
|
|
|
| 278 |
|
| 279 |
offset = 0
|
| 280 |
for dst in range(num_ranks):
|
| 281 |
+
n = split_elems_for_src(p, dst, num_ranks)
|
| 282 |
assert n > 0
|
| 283 |
|
| 284 |
su = u_full.narrow(0, offset, n)
|
|
|
|
| 304 |
state = param_to_state[id(p)]
|
| 305 |
if state.worker_rank != src:
|
| 306 |
continue
|
| 307 |
+
total += split_elems_for_src(p, rank, num_ranks)
|
| 308 |
recv_counts[src] = total
|
| 309 |
|
| 310 |
recv_total = sum(recv_counts)
|
|
|
|
| 348 |
state = param_to_state[id(p)]
|
| 349 |
if state.worker_rank != src:
|
| 350 |
continue
|
| 351 |
+
n = split_elems_for_src(p, rank, num_ranks)
|
| 352 |
assert n > 0
|
| 353 |
|
| 354 |
flat_local = recv_buf.narrow(0, off + inner_off,
|
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/_ops.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
-
from . import
|
| 3 |
-
ops = torch.ops.
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
-
return f"
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from . import _optimizer_15336dc_dirty
|
| 3 |
+
ops = torch.ops._optimizer_15336dc_dirty
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
+
return f"_optimizer_15336dc_dirty::{op_name}"
|
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_6943c45_dirty.abi3.so → _optimizer_15336dc_dirty.abi3.so}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1749936
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ae22a3afdffd54435c6e5b145fc0b7772d03eb8c8bad0d388d9b2d1c8d2f60d5
|
| 3 |
size 1749936
|
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/matmul_transpose_triton.py
CHANGED
|
@@ -1,17 +1,17 @@
|
|
| 1 |
# MIT License
|
| 2 |
-
#
|
| 3 |
# Copyright (c) 2025 Tianyang Lin
|
| 4 |
-
#
|
| 5 |
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
# of this software and associated documentation files (the "Software"), to deal
|
| 7 |
# in the Software without restriction, including without limitation the rights
|
| 8 |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
# copies of the Software, and to permit persons to whom the Software is
|
| 10 |
# furnished to do so, subject to the following conditions:
|
| 11 |
-
#
|
| 12 |
# The above copyright notice and this permission notice shall be included in all
|
| 13 |
# copies or substantial portions of the Software.
|
| 14 |
-
#
|
| 15 |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
|
|
| 1 |
# MIT License
|
| 2 |
+
#
|
| 3 |
# Copyright (c) 2025 Tianyang Lin
|
| 4 |
+
#
|
| 5 |
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
# of this software and associated documentation files (the "Software"), to deal
|
| 7 |
# in the Software without restriction, including without limitation the rights
|
| 8 |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
# copies of the Software, and to permit persons to whom the Software is
|
| 10 |
# furnished to do so, subject to the following conditions:
|
| 11 |
+
#
|
| 12 |
# The above copyright notice and this permission notice shall be included in all
|
| 13 |
# copies or substantial portions of the Software.
|
| 14 |
+
#
|
| 15 |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/muon.py
CHANGED
|
@@ -121,7 +121,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
|
|
| 121 |
state = param_to_state[id(p)]
|
| 122 |
dst = state.worker_rank
|
| 123 |
assert dst < num_ranks
|
| 124 |
-
shard_elems = split_elems_for_src(p,
|
| 125 |
g = p.grad
|
| 126 |
g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
|
| 127 |
assert g.numel() == shard_elems
|
|
@@ -145,7 +145,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
|
|
| 145 |
for p in owned_params:
|
| 146 |
state = param_to_state[id(p)]
|
| 147 |
assert state.worker_rank == rank
|
| 148 |
-
total += split_elems_for_src(p,
|
| 149 |
recv_counts[src] = total
|
| 150 |
|
| 151 |
recv_total = sum(recv_counts)
|
|
@@ -186,7 +186,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
|
|
| 186 |
for p in owned_params:
|
| 187 |
state = param_to_state[id(p)]
|
| 188 |
assert state.worker_rank == rank
|
| 189 |
-
n = split_elems_for_src(p,
|
| 190 |
assert n > 0
|
| 191 |
|
| 192 |
sg = recv_buf.narrow(0, off + inner_off, n)
|
|
@@ -278,7 +278,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
|
|
| 278 |
|
| 279 |
offset = 0
|
| 280 |
for dst in range(num_ranks):
|
| 281 |
-
n = split_elems_for_src(p,
|
| 282 |
assert n > 0
|
| 283 |
|
| 284 |
su = u_full.narrow(0, offset, n)
|
|
@@ -304,7 +304,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
|
|
| 304 |
state = param_to_state[id(p)]
|
| 305 |
if state.worker_rank != src:
|
| 306 |
continue
|
| 307 |
-
total += split_elems_for_src(p,
|
| 308 |
recv_counts[src] = total
|
| 309 |
|
| 310 |
recv_total = sum(recv_counts)
|
|
@@ -348,7 +348,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
|
|
| 348 |
state = param_to_state[id(p)]
|
| 349 |
if state.worker_rank != src:
|
| 350 |
continue
|
| 351 |
-
n = split_elems_for_src(p,
|
| 352 |
assert n > 0
|
| 353 |
|
| 354 |
flat_local = recv_buf.narrow(0, off + inner_off,
|
|
|
|
| 121 |
state = param_to_state[id(p)]
|
| 122 |
dst = state.worker_rank
|
| 123 |
assert dst < num_ranks
|
| 124 |
+
shard_elems = split_elems_for_src(p, rank, num_ranks)
|
| 125 |
g = p.grad
|
| 126 |
g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
|
| 127 |
assert g.numel() == shard_elems
|
|
|
|
| 145 |
for p in owned_params:
|
| 146 |
state = param_to_state[id(p)]
|
| 147 |
assert state.worker_rank == rank
|
| 148 |
+
total += split_elems_for_src(p, src, num_ranks)
|
| 149 |
recv_counts[src] = total
|
| 150 |
|
| 151 |
recv_total = sum(recv_counts)
|
|
|
|
| 186 |
for p in owned_params:
|
| 187 |
state = param_to_state[id(p)]
|
| 188 |
assert state.worker_rank == rank
|
| 189 |
+
n = split_elems_for_src(p, src, num_ranks)
|
| 190 |
assert n > 0
|
| 191 |
|
| 192 |
sg = recv_buf.narrow(0, off + inner_off, n)
|
|
|
|
| 278 |
|
| 279 |
offset = 0
|
| 280 |
for dst in range(num_ranks):
|
| 281 |
+
n = split_elems_for_src(p, dst, num_ranks)
|
| 282 |
assert n > 0
|
| 283 |
|
| 284 |
su = u_full.narrow(0, offset, n)
|
|
|
|
| 304 |
state = param_to_state[id(p)]
|
| 305 |
if state.worker_rank != src:
|
| 306 |
continue
|
| 307 |
+
total += split_elems_for_src(p, rank, num_ranks)
|
| 308 |
recv_counts[src] = total
|
| 309 |
|
| 310 |
recv_total = sum(recv_counts)
|
|
|
|
| 348 |
state = param_to_state[id(p)]
|
| 349 |
if state.worker_rank != src:
|
| 350 |
continue
|
| 351 |
+
n = split_elems_for_src(p, rank, num_ranks)
|
| 352 |
assert n > 0
|
| 353 |
|
| 354 |
flat_local = recv_buf.narrow(0, off + inner_off,
|
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_ops.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
-
from . import
|
| 3 |
-
ops = torch.ops.
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
-
return f"
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from . import _optimizer_15336dc_dirty
|
| 3 |
+
ops = torch.ops._optimizer_15336dc_dirty
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
+
return f"_optimizer_15336dc_dirty::{op_name}"
|
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/{_optimizer_6943c45_dirty.abi3.so → _optimizer_15336dc_dirty.abi3.so}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1750024
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8092bc6ee3e353b2188f0874bc7f145e4eafd0366a40da9750c225732961f7c7
|
| 3 |
size 1750024
|
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/matmul_transpose_triton.py
CHANGED
|
@@ -1,17 +1,17 @@
|
|
| 1 |
# MIT License
|
| 2 |
-
#
|
| 3 |
# Copyright (c) 2025 Tianyang Lin
|
| 4 |
-
#
|
| 5 |
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
# of this software and associated documentation files (the "Software"), to deal
|
| 7 |
# in the Software without restriction, including without limitation the rights
|
| 8 |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
# copies of the Software, and to permit persons to whom the Software is
|
| 10 |
# furnished to do so, subject to the following conditions:
|
| 11 |
-
#
|
| 12 |
# The above copyright notice and this permission notice shall be included in all
|
| 13 |
# copies or substantial portions of the Software.
|
| 14 |
-
#
|
| 15 |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
|
|
| 1 |
# MIT License
|
| 2 |
+
#
|
| 3 |
# Copyright (c) 2025 Tianyang Lin
|
| 4 |
+
#
|
| 5 |
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
# of this software and associated documentation files (the "Software"), to deal
|
| 7 |
# in the Software without restriction, including without limitation the rights
|
| 8 |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
# copies of the Software, and to permit persons to whom the Software is
|
| 10 |
# furnished to do so, subject to the following conditions:
|
| 11 |
+
#
|
| 12 |
# The above copyright notice and this permission notice shall be included in all
|
| 13 |
# copies or substantial portions of the Software.
|
| 14 |
+
#
|
| 15 |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/muon.py
CHANGED
|
@@ -121,7 +121,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
|
|
| 121 |
state = param_to_state[id(p)]
|
| 122 |
dst = state.worker_rank
|
| 123 |
assert dst < num_ranks
|
| 124 |
-
shard_elems = split_elems_for_src(p,
|
| 125 |
g = p.grad
|
| 126 |
g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
|
| 127 |
assert g.numel() == shard_elems
|
|
@@ -145,7 +145,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
|
|
| 145 |
for p in owned_params:
|
| 146 |
state = param_to_state[id(p)]
|
| 147 |
assert state.worker_rank == rank
|
| 148 |
-
total += split_elems_for_src(p,
|
| 149 |
recv_counts[src] = total
|
| 150 |
|
| 151 |
recv_total = sum(recv_counts)
|
|
@@ -186,7 +186,7 @@ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
|
|
| 186 |
for p in owned_params:
|
| 187 |
state = param_to_state[id(p)]
|
| 188 |
assert state.worker_rank == rank
|
| 189 |
-
n = split_elems_for_src(p,
|
| 190 |
assert n > 0
|
| 191 |
|
| 192 |
sg = recv_buf.narrow(0, off + inner_off, n)
|
|
@@ -278,7 +278,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
|
|
| 278 |
|
| 279 |
offset = 0
|
| 280 |
for dst in range(num_ranks):
|
| 281 |
-
n = split_elems_for_src(p,
|
| 282 |
assert n > 0
|
| 283 |
|
| 284 |
su = u_full.narrow(0, offset, n)
|
|
@@ -304,7 +304,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
|
|
| 304 |
state = param_to_state[id(p)]
|
| 305 |
if state.worker_rank != src:
|
| 306 |
continue
|
| 307 |
-
total += split_elems_for_src(p,
|
| 308 |
recv_counts[src] = total
|
| 309 |
|
| 310 |
recv_total = sum(recv_counts)
|
|
@@ -348,7 +348,7 @@ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
|
|
| 348 |
state = param_to_state[id(p)]
|
| 349 |
if state.worker_rank != src:
|
| 350 |
continue
|
| 351 |
-
n = split_elems_for_src(p,
|
| 352 |
assert n > 0
|
| 353 |
|
| 354 |
flat_local = recv_buf.narrow(0, off + inner_off,
|
|
|
|
| 121 |
state = param_to_state[id(p)]
|
| 122 |
dst = state.worker_rank
|
| 123 |
assert dst < num_ranks
|
| 124 |
+
shard_elems = split_elems_for_src(p, rank, num_ranks)
|
| 125 |
g = p.grad
|
| 126 |
g = g.to_local().to(COMM_DTYPE).contiguous().view(-1)
|
| 127 |
assert g.numel() == shard_elems
|
|
|
|
| 145 |
for p in owned_params:
|
| 146 |
state = param_to_state[id(p)]
|
| 147 |
assert state.worker_rank == rank
|
| 148 |
+
total += split_elems_for_src(p, src, num_ranks)
|
| 149 |
recv_counts[src] = total
|
| 150 |
|
| 151 |
recv_total = sum(recv_counts)
|
|
|
|
| 186 |
for p in owned_params:
|
| 187 |
state = param_to_state[id(p)]
|
| 188 |
assert state.worker_rank == rank
|
| 189 |
+
n = split_elems_for_src(p, src, num_ranks)
|
| 190 |
assert n > 0
|
| 191 |
|
| 192 |
sg = recv_buf.narrow(0, off + inner_off, n)
|
|
|
|
| 278 |
|
| 279 |
offset = 0
|
| 280 |
for dst in range(num_ranks):
|
| 281 |
+
n = split_elems_for_src(p, dst, num_ranks)
|
| 282 |
assert n > 0
|
| 283 |
|
| 284 |
su = u_full.narrow(0, offset, n)
|
|
|
|
| 304 |
state = param_to_state[id(p)]
|
| 305 |
if state.worker_rank != src:
|
| 306 |
continue
|
| 307 |
+
total += split_elems_for_src(p, rank, num_ranks)
|
| 308 |
recv_counts[src] = total
|
| 309 |
|
| 310 |
recv_total = sum(recv_counts)
|
|
|
|
| 348 |
state = param_to_state[id(p)]
|
| 349 |
if state.worker_rank != src:
|
| 350 |
continue
|
| 351 |
+
n = split_elems_for_src(p, rank, num_ranks)
|
| 352 |
assert n > 0
|
| 353 |
|
| 354 |
flat_local = recv_buf.narrow(0, off + inner_off,
|