Build uploaded using `kernels`.
Browse files
- build/torch210-cxx11-cpu-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} +2 -2
- build/torch210-cxx11-cpu-x86_64-linux/_ops.py +3 -3
- build/torch210-cxx11-cpu-x86_64-linux/cpu_moe_cpp.py +26 -29
- build/torch210-cxx11-cpu-x86_64-linux/layers.py +2 -2
- build/torch210-cxx11-cu126-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} +1 -1
- build/torch210-cxx11-cu126-x86_64-linux/_ops.py +3 -3
- build/torch210-cxx11-cu126-x86_64-linux/cpu_moe_cpp.py +26 -29
- build/torch210-cxx11-cu126-x86_64-linux/layers.py +2 -2
- build/torch210-cxx11-cu128-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} +1 -1
- build/torch210-cxx11-cu128-x86_64-linux/_ops.py +3 -3
- build/torch210-cxx11-cu128-x86_64-linux/cpu_moe_cpp.py +26 -29
- build/torch210-cxx11-cu128-x86_64-linux/layers.py +2 -2
- build/torch210-cxx11-cu130-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} +1 -1
- build/torch210-cxx11-cu130-x86_64-linux/_ops.py +3 -3
- build/torch210-cxx11-cu130-x86_64-linux/cpu_moe_cpp.py +26 -29
- build/torch210-cxx11-cu130-x86_64-linux/layers.py +2 -2
- build/torch210-cxx11-xpu20253-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} +2 -2
- build/torch210-cxx11-xpu20253-x86_64-linux/_ops.py +3 -3
- build/torch210-cxx11-xpu20253-x86_64-linux/cpu_moe_cpp.py +26 -29
- build/torch210-cxx11-xpu20253-x86_64-linux/layers.py +2 -2
- build/torch29-cxx11-cpu-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} +2 -2
- build/torch29-cxx11-cpu-x86_64-linux/_ops.py +3 -3
- build/torch29-cxx11-cpu-x86_64-linux/cpu_moe_cpp.py +26 -29
- build/torch29-cxx11-cpu-x86_64-linux/layers.py +2 -2
- build/torch29-cxx11-cu126-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} +1 -1
- build/torch29-cxx11-cu126-x86_64-linux/_ops.py +3 -3
- build/torch29-cxx11-cu126-x86_64-linux/cpu_moe_cpp.py +26 -29
- build/torch29-cxx11-cu126-x86_64-linux/layers.py +2 -2
- build/torch29-cxx11-cu128-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} +1 -1
- build/torch29-cxx11-cu128-x86_64-linux/_ops.py +3 -3
- build/torch29-cxx11-cu128-x86_64-linux/cpu_moe_cpp.py +26 -29
- build/torch29-cxx11-cu128-x86_64-linux/layers.py +2 -2
- build/torch29-cxx11-cu130-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} +1 -1
- build/torch29-cxx11-cu130-x86_64-linux/_ops.py +3 -3
- build/torch29-cxx11-cu130-x86_64-linux/cpu_moe_cpp.py +26 -29
- build/torch29-cxx11-cu130-x86_64-linux/layers.py +2 -2
- build/torch29-cxx11-xpu20252-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} +2 -2
- build/torch29-cxx11-xpu20252-x86_64-linux/_ops.py +3 -3
- build/torch29-cxx11-xpu20252-x86_64-linux/cpu_moe_cpp.py +26 -29
- build/torch29-cxx11-xpu20252-x86_64-linux/layers.py +2 -2
build/torch210-cxx11-cpu-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3a81c0cc23130a95d05263f0509e8de560183f6472f458f4316c97e6e8d8f533
|
| 3 |
+
size 2219056
|
build/torch210-cxx11-cpu-x86_64-linux/_ops.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
-
from . import _megablocks_db0709c
|
| 3 |
-
ops = torch.ops._megablocks_db0709c
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
-
return f"_megablocks_db0709c::{op_name}"
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from . import _megablocks_099ac3c
|
| 3 |
+
ops = torch.ops._megablocks_099ac3c
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
+
return f"_megablocks_099ac3c::{op_name}"
|
build/torch210-cxx11-cpu-x86_64-linux/cpu_moe_cpp.py
CHANGED
|
@@ -105,7 +105,7 @@ def fused_moe_cpp(
|
|
| 105 |
return output
|
| 106 |
|
| 107 |
|
| 108 |
-
class MegaBlocksMoeMLP(torch.nn.Module):
|
| 109 |
"""
|
| 110 |
C++ optimized MoE MLP using brgemm.
|
| 111 |
Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
|
|
@@ -115,32 +115,6 @@ class MegaBlocksMoeMLP(torch.nn.Module):
|
|
| 115 |
"""
|
| 116 |
can_torch_compile: bool = True
|
| 117 |
|
| 118 |
-
def convert_weight(self, dtype, use_mxfp4: bool = False):
|
| 119 |
-
data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
|
| 120 |
-
data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
|
| 121 |
-
if use_mxfp4:
|
| 122 |
-
self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
|
| 123 |
-
self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
|
| 124 |
-
else:
|
| 125 |
-
# convert_weight_packed onlu supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
|
| 126 |
-
data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
|
| 127 |
-
data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
|
| 128 |
-
self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
|
| 129 |
-
self.experts.down_proj.data = ops.convert_weight_packed(data_2)
|
| 130 |
-
|
| 131 |
-
# C++ kernel does not support float32.
|
| 132 |
-
dtype = torch.bfloat16 if dtype == torch.float32 else dtype
|
| 133 |
-
if getattr(self.experts, "gate_up_proj_bias", None) is not None:
|
| 134 |
-
self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
|
| 135 |
-
if getattr(self.experts, "down_proj_bias", None) is not None:
|
| 136 |
-
self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
|
| 137 |
-
|
| 138 |
-
def convert_scales(self):
|
| 139 |
-
data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 140 |
-
data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 141 |
-
self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
|
| 142 |
-
self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
|
| 143 |
-
|
| 144 |
def forward(self, x: torch.Tensor) -> tuple:
|
| 145 |
"""
|
| 146 |
Forward pass through the MoE layer using C++ kernel.
|
|
@@ -163,14 +137,37 @@ class MegaBlocksMoeMLP(torch.nn.Module):
|
|
| 163 |
and hasattr(self.experts, "gate_up_proj")
|
| 164 |
and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
|
| 165 |
):
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
self.packed_scales = True
|
| 168 |
self.use_mxfp4 = True
|
| 169 |
|
| 170 |
if not getattr(self, "packed_weight", False) and hasattr(
|
| 171 |
self.experts, "gate_up_proj"
|
| 172 |
):
|
| 173 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
self.packed_weight = True
|
| 175 |
|
| 176 |
# Get MoE parameters
|
|
|
|
| 105 |
return output
|
| 106 |
|
| 107 |
|
| 108 |
+
class CPUMegaBlocksMoeMLP(torch.nn.Module):
|
| 109 |
"""
|
| 110 |
C++ optimized MoE MLP using brgemm.
|
| 111 |
Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
|
|
|
|
| 115 |
"""
|
| 116 |
can_torch_compile: bool = True
|
| 117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
def forward(self, x: torch.Tensor) -> tuple:
|
| 119 |
"""
|
| 120 |
Forward pass through the MoE layer using C++ kernel.
|
|
|
|
| 137 |
and hasattr(self.experts, "gate_up_proj")
|
| 138 |
and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
|
| 139 |
):
|
| 140 |
+
# convert scales
|
| 141 |
+
data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 142 |
+
data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 143 |
+
self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
|
| 144 |
+
self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
|
| 145 |
self.packed_scales = True
|
| 146 |
self.use_mxfp4 = True
|
| 147 |
|
| 148 |
if not getattr(self, "packed_weight", False) and hasattr(
|
| 149 |
self.experts, "gate_up_proj"
|
| 150 |
):
|
| 151 |
+
# convert weights
|
| 152 |
+
data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
|
| 153 |
+
data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
|
| 154 |
+
if self.use_mxfp4:
|
| 155 |
+
self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
|
| 156 |
+
self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
|
| 157 |
+
else:
|
| 158 |
+
# convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
|
| 159 |
+
data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
|
| 160 |
+
data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
|
| 161 |
+
self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
|
| 162 |
+
self.experts.down_proj.data = ops.convert_weight_packed(data_2)
|
| 163 |
+
|
| 164 |
+
# C++ kernel does not support float32.
|
| 165 |
+
dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
|
| 166 |
+
if getattr(self.experts, "gate_up_proj_bias", None) is not None:
|
| 167 |
+
self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
|
| 168 |
+
if getattr(self.experts, "down_proj_bias", None) is not None:
|
| 169 |
+
self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
|
| 170 |
+
|
| 171 |
self.packed_weight = True
|
| 172 |
|
| 173 |
# Get MoE parameters
|
build/torch210-cxx11-cpu-x86_64-linux/layers.py
CHANGED
|
@@ -1228,5 +1228,5 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
|
| 1228 |
# Patch for XPU or CPU support
|
| 1229 |
if hasattr(torch, "xpu") and torch.xpu.is_available():
|
| 1230 |
from .xpu_fused_moe import MegaBlocksMoeMLP
|
| 1231 |
-
|
| 1232 |
-
|
|
|
|
| 1228 |
# Patch for XPU or CPU support
|
| 1229 |
if hasattr(torch, "xpu") and torch.xpu.is_available():
|
| 1230 |
from .xpu_fused_moe import MegaBlocksMoeMLP
|
| 1231 |
+
|
| 1232 |
+
from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
|
build/torch210-cxx11-cu126-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 15061032
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d482577c55ffe1abd34983ce45eeeb280a817e55f92d6585b5e92173b2860749
|
| 3 |
size 15061032
|
build/torch210-cxx11-cu126-x86_64-linux/_ops.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
-
from . import _megablocks_db0709c
|
| 3 |
-
ops = torch.ops._megablocks_db0709c
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
-
return f"_megablocks_db0709c::{op_name}"
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from . import _megablocks_099ac3c
|
| 3 |
+
ops = torch.ops._megablocks_099ac3c
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
+
return f"_megablocks_099ac3c::{op_name}"
|
build/torch210-cxx11-cu126-x86_64-linux/cpu_moe_cpp.py
CHANGED
|
@@ -105,7 +105,7 @@ def fused_moe_cpp(
|
|
| 105 |
return output
|
| 106 |
|
| 107 |
|
| 108 |
-
class MegaBlocksMoeMLP(torch.nn.Module):
|
| 109 |
"""
|
| 110 |
C++ optimized MoE MLP using brgemm.
|
| 111 |
Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
|
|
@@ -115,32 +115,6 @@ class MegaBlocksMoeMLP(torch.nn.Module):
|
|
| 115 |
"""
|
| 116 |
can_torch_compile: bool = True
|
| 117 |
|
| 118 |
-
def convert_weight(self, dtype, use_mxfp4: bool = False):
|
| 119 |
-
data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
|
| 120 |
-
data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
|
| 121 |
-
if use_mxfp4:
|
| 122 |
-
self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
|
| 123 |
-
self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
|
| 124 |
-
else:
|
| 125 |
-
# convert_weight_packed onlu supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
|
| 126 |
-
data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
|
| 127 |
-
data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
|
| 128 |
-
self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
|
| 129 |
-
self.experts.down_proj.data = ops.convert_weight_packed(data_2)
|
| 130 |
-
|
| 131 |
-
# C++ kernel does not support float32.
|
| 132 |
-
dtype = torch.bfloat16 if dtype == torch.float32 else dtype
|
| 133 |
-
if getattr(self.experts, "gate_up_proj_bias", None) is not None:
|
| 134 |
-
self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
|
| 135 |
-
if getattr(self.experts, "down_proj_bias", None) is not None:
|
| 136 |
-
self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
|
| 137 |
-
|
| 138 |
-
def convert_scales(self):
|
| 139 |
-
data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 140 |
-
data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 141 |
-
self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
|
| 142 |
-
self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
|
| 143 |
-
|
| 144 |
def forward(self, x: torch.Tensor) -> tuple:
|
| 145 |
"""
|
| 146 |
Forward pass through the MoE layer using C++ kernel.
|
|
@@ -163,14 +137,37 @@ class MegaBlocksMoeMLP(torch.nn.Module):
|
|
| 163 |
and hasattr(self.experts, "gate_up_proj")
|
| 164 |
and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
|
| 165 |
):
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
self.packed_scales = True
|
| 168 |
self.use_mxfp4 = True
|
| 169 |
|
| 170 |
if not getattr(self, "packed_weight", False) and hasattr(
|
| 171 |
self.experts, "gate_up_proj"
|
| 172 |
):
|
| 173 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
self.packed_weight = True
|
| 175 |
|
| 176 |
# Get MoE parameters
|
|
|
|
| 105 |
return output
|
| 106 |
|
| 107 |
|
| 108 |
+
class CPUMegaBlocksMoeMLP(torch.nn.Module):
|
| 109 |
"""
|
| 110 |
C++ optimized MoE MLP using brgemm.
|
| 111 |
Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
|
|
|
|
| 115 |
"""
|
| 116 |
can_torch_compile: bool = True
|
| 117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
def forward(self, x: torch.Tensor) -> tuple:
|
| 119 |
"""
|
| 120 |
Forward pass through the MoE layer using C++ kernel.
|
|
|
|
| 137 |
and hasattr(self.experts, "gate_up_proj")
|
| 138 |
and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
|
| 139 |
):
|
| 140 |
+
# convert scales
|
| 141 |
+
data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 142 |
+
data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 143 |
+
self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
|
| 144 |
+
self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
|
| 145 |
self.packed_scales = True
|
| 146 |
self.use_mxfp4 = True
|
| 147 |
|
| 148 |
if not getattr(self, "packed_weight", False) and hasattr(
|
| 149 |
self.experts, "gate_up_proj"
|
| 150 |
):
|
| 151 |
+
# convert weights
|
| 152 |
+
data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
|
| 153 |
+
data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
|
| 154 |
+
if self.use_mxfp4:
|
| 155 |
+
self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
|
| 156 |
+
self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
|
| 157 |
+
else:
|
| 158 |
+
# convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
|
| 159 |
+
data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
|
| 160 |
+
data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
|
| 161 |
+
self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
|
| 162 |
+
self.experts.down_proj.data = ops.convert_weight_packed(data_2)
|
| 163 |
+
|
| 164 |
+
# C++ kernel does not support float32.
|
| 165 |
+
dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
|
| 166 |
+
if getattr(self.experts, "gate_up_proj_bias", None) is not None:
|
| 167 |
+
self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
|
| 168 |
+
if getattr(self.experts, "down_proj_bias", None) is not None:
|
| 169 |
+
self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
|
| 170 |
+
|
| 171 |
self.packed_weight = True
|
| 172 |
|
| 173 |
# Get MoE parameters
|
build/torch210-cxx11-cu126-x86_64-linux/layers.py
CHANGED
|
@@ -1228,5 +1228,5 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
|
| 1228 |
# Patch for XPU or CPU support
|
| 1229 |
if hasattr(torch, "xpu") and torch.xpu.is_available():
|
| 1230 |
from .xpu_fused_moe import MegaBlocksMoeMLP
|
| 1231 |
-
|
| 1232 |
-
|
|
|
|
| 1228 |
# Patch for XPU or CPU support
|
| 1229 |
if hasattr(torch, "xpu") and torch.xpu.is_available():
|
| 1230 |
from .xpu_fused_moe import MegaBlocksMoeMLP
|
| 1231 |
+
|
| 1232 |
+
from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
|
build/torch210-cxx11-cu128-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21009952
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c0876dbd4267e12fa67f24fac60cedbee8e6dd41b85104c4c241b173729bee9a
|
| 3 |
size 21009952
|
build/torch210-cxx11-cu128-x86_64-linux/_ops.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
-
from . import _megablocks_db0709c
|
| 3 |
-
ops = torch.ops._megablocks_db0709c
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
-
return f"_megablocks_db0709c::{op_name}"
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from . import _megablocks_099ac3c
|
| 3 |
+
ops = torch.ops._megablocks_099ac3c
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
+
return f"_megablocks_099ac3c::{op_name}"
|
build/torch210-cxx11-cu128-x86_64-linux/cpu_moe_cpp.py
CHANGED
|
@@ -105,7 +105,7 @@ def fused_moe_cpp(
|
|
| 105 |
return output
|
| 106 |
|
| 107 |
|
| 108 |
-
class MegaBlocksMoeMLP(torch.nn.Module):
|
| 109 |
"""
|
| 110 |
C++ optimized MoE MLP using brgemm.
|
| 111 |
Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
|
|
@@ -115,32 +115,6 @@ class MegaBlocksMoeMLP(torch.nn.Module):
|
|
| 115 |
"""
|
| 116 |
can_torch_compile: bool = True
|
| 117 |
|
| 118 |
-
def convert_weight(self, dtype, use_mxfp4: bool = False):
|
| 119 |
-
data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
|
| 120 |
-
data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
|
| 121 |
-
if use_mxfp4:
|
| 122 |
-
self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
|
| 123 |
-
self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
|
| 124 |
-
else:
|
| 125 |
-
# convert_weight_packed onlu supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
|
| 126 |
-
data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
|
| 127 |
-
data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
|
| 128 |
-
self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
|
| 129 |
-
self.experts.down_proj.data = ops.convert_weight_packed(data_2)
|
| 130 |
-
|
| 131 |
-
# C++ kernel does not support float32.
|
| 132 |
-
dtype = torch.bfloat16 if dtype == torch.float32 else dtype
|
| 133 |
-
if getattr(self.experts, "gate_up_proj_bias", None) is not None:
|
| 134 |
-
self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
|
| 135 |
-
if getattr(self.experts, "down_proj_bias", None) is not None:
|
| 136 |
-
self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
|
| 137 |
-
|
| 138 |
-
def convert_scales(self):
|
| 139 |
-
data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 140 |
-
data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 141 |
-
self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
|
| 142 |
-
self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
|
| 143 |
-
|
| 144 |
def forward(self, x: torch.Tensor) -> tuple:
|
| 145 |
"""
|
| 146 |
Forward pass through the MoE layer using C++ kernel.
|
|
@@ -163,14 +137,37 @@ class MegaBlocksMoeMLP(torch.nn.Module):
|
|
| 163 |
and hasattr(self.experts, "gate_up_proj")
|
| 164 |
and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
|
| 165 |
):
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
self.packed_scales = True
|
| 168 |
self.use_mxfp4 = True
|
| 169 |
|
| 170 |
if not getattr(self, "packed_weight", False) and hasattr(
|
| 171 |
self.experts, "gate_up_proj"
|
| 172 |
):
|
| 173 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
self.packed_weight = True
|
| 175 |
|
| 176 |
# Get MoE parameters
|
|
|
|
| 105 |
return output
|
| 106 |
|
| 107 |
|
| 108 |
+
class CPUMegaBlocksMoeMLP(torch.nn.Module):
|
| 109 |
"""
|
| 110 |
C++ optimized MoE MLP using brgemm.
|
| 111 |
Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
|
|
|
|
| 115 |
"""
|
| 116 |
can_torch_compile: bool = True
|
| 117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
def forward(self, x: torch.Tensor) -> tuple:
|
| 119 |
"""
|
| 120 |
Forward pass through the MoE layer using C++ kernel.
|
|
|
|
| 137 |
and hasattr(self.experts, "gate_up_proj")
|
| 138 |
and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
|
| 139 |
):
|
| 140 |
+
# convert scales
|
| 141 |
+
data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 142 |
+
data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 143 |
+
self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
|
| 144 |
+
self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
|
| 145 |
self.packed_scales = True
|
| 146 |
self.use_mxfp4 = True
|
| 147 |
|
| 148 |
if not getattr(self, "packed_weight", False) and hasattr(
|
| 149 |
self.experts, "gate_up_proj"
|
| 150 |
):
|
| 151 |
+
# convert weights
|
| 152 |
+
data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
|
| 153 |
+
data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
|
| 154 |
+
if self.use_mxfp4:
|
| 155 |
+
self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
|
| 156 |
+
self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
|
| 157 |
+
else:
|
| 158 |
+
# convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
|
| 159 |
+
data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
|
| 160 |
+
data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
|
| 161 |
+
self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
|
| 162 |
+
self.experts.down_proj.data = ops.convert_weight_packed(data_2)
|
| 163 |
+
|
| 164 |
+
# C++ kernel does not support float32.
|
| 165 |
+
dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
|
| 166 |
+
if getattr(self.experts, "gate_up_proj_bias", None) is not None:
|
| 167 |
+
self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
|
| 168 |
+
if getattr(self.experts, "down_proj_bias", None) is not None:
|
| 169 |
+
self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
|
| 170 |
+
|
| 171 |
self.packed_weight = True
|
| 172 |
|
| 173 |
# Get MoE parameters
|
build/torch210-cxx11-cu128-x86_64-linux/layers.py
CHANGED
|
@@ -1228,5 +1228,5 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
|
| 1228 |
# Patch for XPU or CPU support
|
| 1229 |
if hasattr(torch, "xpu") and torch.xpu.is_available():
|
| 1230 |
from .xpu_fused_moe import MegaBlocksMoeMLP
|
| 1231 |
-
|
| 1232 |
-
|
|
|
|
| 1228 |
# Patch for XPU or CPU support
|
| 1229 |
if hasattr(torch, "xpu") and torch.xpu.is_available():
|
| 1230 |
from .xpu_fused_moe import MegaBlocksMoeMLP
|
| 1231 |
+
|
| 1232 |
+
from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
|
build/torch210-cxx11-cu130-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 12041568
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4c7bc97e0aadcd94b0f6d3d7198269823d894fd5a36f6af9744864211ae0fd71
|
| 3 |
size 12041568
|
build/torch210-cxx11-cu130-x86_64-linux/_ops.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
-
from . import _megablocks_db0709c
|
| 3 |
-
ops = torch.ops._megablocks_db0709c
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
-
return f"_megablocks_db0709c::{op_name}"
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from . import _megablocks_099ac3c
|
| 3 |
+
ops = torch.ops._megablocks_099ac3c
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
+
return f"_megablocks_099ac3c::{op_name}"
|
build/torch210-cxx11-cu130-x86_64-linux/cpu_moe_cpp.py
CHANGED
|
@@ -105,7 +105,7 @@ def fused_moe_cpp(
|
|
| 105 |
return output
|
| 106 |
|
| 107 |
|
| 108 |
-
class MegaBlocksMoeMLP(torch.nn.Module):
|
| 109 |
"""
|
| 110 |
C++ optimized MoE MLP using brgemm.
|
| 111 |
Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
|
|
@@ -115,32 +115,6 @@ class MegaBlocksMoeMLP(torch.nn.Module):
|
|
| 115 |
"""
|
| 116 |
can_torch_compile: bool = True
|
| 117 |
|
| 118 |
-
def convert_weight(self, dtype, use_mxfp4: bool = False):
|
| 119 |
-
data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
|
| 120 |
-
data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
|
| 121 |
-
if use_mxfp4:
|
| 122 |
-
self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
|
| 123 |
-
self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
|
| 124 |
-
else:
|
| 125 |
-
# convert_weight_packed onlu supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
|
| 126 |
-
data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
|
| 127 |
-
data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
|
| 128 |
-
self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
|
| 129 |
-
self.experts.down_proj.data = ops.convert_weight_packed(data_2)
|
| 130 |
-
|
| 131 |
-
# C++ kernel does not support float32.
|
| 132 |
-
dtype = torch.bfloat16 if dtype == torch.float32 else dtype
|
| 133 |
-
if getattr(self.experts, "gate_up_proj_bias", None) is not None:
|
| 134 |
-
self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
|
| 135 |
-
if getattr(self.experts, "down_proj_bias", None) is not None:
|
| 136 |
-
self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
|
| 137 |
-
|
| 138 |
-
def convert_scales(self):
|
| 139 |
-
data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 140 |
-
data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 141 |
-
self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
|
| 142 |
-
self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
|
| 143 |
-
|
| 144 |
def forward(self, x: torch.Tensor) -> tuple:
|
| 145 |
"""
|
| 146 |
Forward pass through the MoE layer using C++ kernel.
|
|
@@ -163,14 +137,37 @@ class MegaBlocksMoeMLP(torch.nn.Module):
|
|
| 163 |
and hasattr(self.experts, "gate_up_proj")
|
| 164 |
and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
|
| 165 |
):
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
self.packed_scales = True
|
| 168 |
self.use_mxfp4 = True
|
| 169 |
|
| 170 |
if not getattr(self, "packed_weight", False) and hasattr(
|
| 171 |
self.experts, "gate_up_proj"
|
| 172 |
):
|
| 173 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
self.packed_weight = True
|
| 175 |
|
| 176 |
# Get MoE parameters
|
|
|
|
| 105 |
return output
|
| 106 |
|
| 107 |
|
| 108 |
+
class CPUMegaBlocksMoeMLP(torch.nn.Module):
|
| 109 |
"""
|
| 110 |
C++ optimized MoE MLP using brgemm.
|
| 111 |
Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
|
|
|
|
| 115 |
"""
|
| 116 |
can_torch_compile: bool = True
|
| 117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
def forward(self, x: torch.Tensor) -> tuple:
|
| 119 |
"""
|
| 120 |
Forward pass through the MoE layer using C++ kernel.
|
|
|
|
| 137 |
and hasattr(self.experts, "gate_up_proj")
|
| 138 |
and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
|
| 139 |
):
|
| 140 |
+
# convert scales
|
| 141 |
+
data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 142 |
+
data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 143 |
+
self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
|
| 144 |
+
self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
|
| 145 |
self.packed_scales = True
|
| 146 |
self.use_mxfp4 = True
|
| 147 |
|
| 148 |
if not getattr(self, "packed_weight", False) and hasattr(
|
| 149 |
self.experts, "gate_up_proj"
|
| 150 |
):
|
| 151 |
+
# convert weights
|
| 152 |
+
data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
|
| 153 |
+
data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
|
| 154 |
+
if self.use_mxfp4:
|
| 155 |
+
self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
|
| 156 |
+
self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
|
| 157 |
+
else:
|
| 158 |
+
# convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
|
| 159 |
+
data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
|
| 160 |
+
data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
|
| 161 |
+
self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
|
| 162 |
+
self.experts.down_proj.data = ops.convert_weight_packed(data_2)
|
| 163 |
+
|
| 164 |
+
# C++ kernel does not support float32.
|
| 165 |
+
dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
|
| 166 |
+
if getattr(self.experts, "gate_up_proj_bias", None) is not None:
|
| 167 |
+
self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
|
| 168 |
+
if getattr(self.experts, "down_proj_bias", None) is not None:
|
| 169 |
+
self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
|
| 170 |
+
|
| 171 |
self.packed_weight = True
|
| 172 |
|
| 173 |
# Get MoE parameters
|
build/torch210-cxx11-cu130-x86_64-linux/layers.py
CHANGED
|
@@ -1228,5 +1228,5 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
|
| 1228 |
# Patch for XPU or CPU support
|
| 1229 |
if hasattr(torch, "xpu") and torch.xpu.is_available():
|
| 1230 |
from .xpu_fused_moe import MegaBlocksMoeMLP
|
| 1231 |
-
|
| 1232 |
-
|
|
|
|
| 1228 |
# Patch for XPU or CPU support
|
| 1229 |
if hasattr(torch, "xpu") and torch.xpu.is_available():
|
| 1230 |
from .xpu_fused_moe import MegaBlocksMoeMLP
|
| 1231 |
+
|
| 1232 |
+
from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
|
build/torch210-cxx11-xpu20253-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dbf6091a3c2622e19367385fb8c82b507f841749bc9c4177421884232856c021
|
| 3 |
+
size 4227888
|
build/torch210-cxx11-xpu20253-x86_64-linux/_ops.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
-
from . import
|
| 3 |
-
ops = torch.ops.
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
-
return f"
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from . import _megablocks_099ac3c
|
| 3 |
+
ops = torch.ops._megablocks_099ac3c
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
+
return f"_megablocks_099ac3c::{op_name}"
|
build/torch210-cxx11-xpu20253-x86_64-linux/cpu_moe_cpp.py
CHANGED
|
@@ -105,7 +105,7 @@ def fused_moe_cpp(
|
|
| 105 |
return output
|
| 106 |
|
| 107 |
|
| 108 |
-
class
|
| 109 |
"""
|
| 110 |
C++ optimized MoE MLP using brgemm.
|
| 111 |
Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
|
|
@@ -115,32 +115,6 @@ class MegaBlocksMoeMLP(torch.nn.Module):
|
|
| 115 |
"""
|
| 116 |
can_torch_compile: bool = True
|
| 117 |
|
| 118 |
-
def convert_weight(self, dtype, use_mxfp4: bool = False):
|
| 119 |
-
data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
|
| 120 |
-
data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
|
| 121 |
-
if use_mxfp4:
|
| 122 |
-
self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
|
| 123 |
-
self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
|
| 124 |
-
else:
|
| 125 |
-
# convert_weight_packed onlu supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
|
| 126 |
-
data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
|
| 127 |
-
data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
|
| 128 |
-
self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
|
| 129 |
-
self.experts.down_proj.data = ops.convert_weight_packed(data_2)
|
| 130 |
-
|
| 131 |
-
# C++ kernel does not support float32.
|
| 132 |
-
dtype = torch.bfloat16 if dtype == torch.float32 else dtype
|
| 133 |
-
if getattr(self.experts, "gate_up_proj_bias", None) is not None:
|
| 134 |
-
self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
|
| 135 |
-
if getattr(self.experts, "down_proj_bias", None) is not None:
|
| 136 |
-
self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
|
| 137 |
-
|
| 138 |
-
def convert_scales(self):
|
| 139 |
-
data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 140 |
-
data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 141 |
-
self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
|
| 142 |
-
self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
|
| 143 |
-
|
| 144 |
def forward(self, x: torch.Tensor) -> tuple:
|
| 145 |
"""
|
| 146 |
Forward pass through the MoE layer using C++ kernel.
|
|
@@ -163,14 +137,37 @@ class MegaBlocksMoeMLP(torch.nn.Module):
|
|
| 163 |
and hasattr(self.experts, "gate_up_proj")
|
| 164 |
and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
|
| 165 |
):
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
self.packed_scales = True
|
| 168 |
self.use_mxfp4 = True
|
| 169 |
|
| 170 |
if not getattr(self, "packed_weight", False) and hasattr(
|
| 171 |
self.experts, "gate_up_proj"
|
| 172 |
):
|
| 173 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
self.packed_weight = True
|
| 175 |
|
| 176 |
# Get MoE parameters
|
|
|
|
| 105 |
return output
|
| 106 |
|
| 107 |
|
| 108 |
+
class CPUMegaBlocksMoeMLP(torch.nn.Module):
|
| 109 |
"""
|
| 110 |
C++ optimized MoE MLP using brgemm.
|
| 111 |
Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
|
|
|
|
| 115 |
"""
|
| 116 |
can_torch_compile: bool = True
|
| 117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
def forward(self, x: torch.Tensor) -> tuple:
|
| 119 |
"""
|
| 120 |
Forward pass through the MoE layer using C++ kernel.
|
|
|
|
| 137 |
and hasattr(self.experts, "gate_up_proj")
|
| 138 |
and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
|
| 139 |
):
|
| 140 |
+
# convert scales
|
| 141 |
+
data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 142 |
+
data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 143 |
+
self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
|
| 144 |
+
self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
|
| 145 |
self.packed_scales = True
|
| 146 |
self.use_mxfp4 = True
|
| 147 |
|
| 148 |
if not getattr(self, "packed_weight", False) and hasattr(
|
| 149 |
self.experts, "gate_up_proj"
|
| 150 |
):
|
| 151 |
+
# convert weights
|
| 152 |
+
data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
|
| 153 |
+
data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
|
| 154 |
+
if self.use_mxfp4:
|
| 155 |
+
self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
|
| 156 |
+
self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
|
| 157 |
+
else:
|
| 158 |
+
# convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
|
| 159 |
+
data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
|
| 160 |
+
data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
|
| 161 |
+
self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
|
| 162 |
+
self.experts.down_proj.data = ops.convert_weight_packed(data_2)
|
| 163 |
+
|
| 164 |
+
# C++ kernel does not support float32.
|
| 165 |
+
dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
|
| 166 |
+
if getattr(self.experts, "gate_up_proj_bias", None) is not None:
|
| 167 |
+
self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
|
| 168 |
+
if getattr(self.experts, "down_proj_bias", None) is not None:
|
| 169 |
+
self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
|
| 170 |
+
|
| 171 |
self.packed_weight = True
|
| 172 |
|
| 173 |
# Get MoE parameters
|
build/torch210-cxx11-xpu20253-x86_64-linux/layers.py
CHANGED
|
@@ -1228,5 +1228,5 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
|
| 1228 |
# Patch for XPU or CPU support
|
| 1229 |
if hasattr(torch, "xpu") and torch.xpu.is_available():
|
| 1230 |
from .xpu_fused_moe import MegaBlocksMoeMLP
|
| 1231 |
-
|
| 1232 |
-
|
|
|
|
| 1228 |
# Patch for XPU or CPU support
|
| 1229 |
if hasattr(torch, "xpu") and torch.xpu.is_available():
|
| 1230 |
from .xpu_fused_moe import MegaBlocksMoeMLP
|
| 1231 |
+
|
| 1232 |
+
from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
|
build/torch29-cxx11-cpu-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8b3f1c2f3058c4c5c08291c7a51be003046657e7567454a779911c7cebfdc3d9
|
| 3 |
+
size 2201176
|
build/torch29-cxx11-cpu-x86_64-linux/_ops.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
-
from . import
|
| 3 |
-
ops = torch.ops.
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
-
return f"
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from . import _megablocks_099ac3c
|
| 3 |
+
ops = torch.ops._megablocks_099ac3c
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
+
return f"_megablocks_099ac3c::{op_name}"
|
build/torch29-cxx11-cpu-x86_64-linux/cpu_moe_cpp.py
CHANGED
|
@@ -105,7 +105,7 @@ def fused_moe_cpp(
|
|
| 105 |
return output
|
| 106 |
|
| 107 |
|
| 108 |
-
class
|
| 109 |
"""
|
| 110 |
C++ optimized MoE MLP using brgemm.
|
| 111 |
Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
|
|
@@ -115,32 +115,6 @@ class MegaBlocksMoeMLP(torch.nn.Module):
|
|
| 115 |
"""
|
| 116 |
can_torch_compile: bool = True
|
| 117 |
|
| 118 |
-
def convert_weight(self, dtype, use_mxfp4: bool = False):
|
| 119 |
-
data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
|
| 120 |
-
data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
|
| 121 |
-
if use_mxfp4:
|
| 122 |
-
self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
|
| 123 |
-
self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
|
| 124 |
-
else:
|
| 125 |
-
# convert_weight_packed onlu supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
|
| 126 |
-
data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
|
| 127 |
-
data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
|
| 128 |
-
self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
|
| 129 |
-
self.experts.down_proj.data = ops.convert_weight_packed(data_2)
|
| 130 |
-
|
| 131 |
-
# C++ kernel does not support float32.
|
| 132 |
-
dtype = torch.bfloat16 if dtype == torch.float32 else dtype
|
| 133 |
-
if getattr(self.experts, "gate_up_proj_bias", None) is not None:
|
| 134 |
-
self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
|
| 135 |
-
if getattr(self.experts, "down_proj_bias", None) is not None:
|
| 136 |
-
self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
|
| 137 |
-
|
| 138 |
-
def convert_scales(self):
|
| 139 |
-
data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 140 |
-
data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 141 |
-
self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
|
| 142 |
-
self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
|
| 143 |
-
|
| 144 |
def forward(self, x: torch.Tensor) -> tuple:
|
| 145 |
"""
|
| 146 |
Forward pass through the MoE layer using C++ kernel.
|
|
@@ -163,14 +137,37 @@ class MegaBlocksMoeMLP(torch.nn.Module):
|
|
| 163 |
and hasattr(self.experts, "gate_up_proj")
|
| 164 |
and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
|
| 165 |
):
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
self.packed_scales = True
|
| 168 |
self.use_mxfp4 = True
|
| 169 |
|
| 170 |
if not getattr(self, "packed_weight", False) and hasattr(
|
| 171 |
self.experts, "gate_up_proj"
|
| 172 |
):
|
| 173 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
self.packed_weight = True
|
| 175 |
|
| 176 |
# Get MoE parameters
|
|
|
|
| 105 |
return output
|
| 106 |
|
| 107 |
|
| 108 |
+
class CPUMegaBlocksMoeMLP(torch.nn.Module):
|
| 109 |
"""
|
| 110 |
C++ optimized MoE MLP using brgemm.
|
| 111 |
Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
|
|
|
|
| 115 |
"""
|
| 116 |
can_torch_compile: bool = True
|
| 117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
def forward(self, x: torch.Tensor) -> tuple:
|
| 119 |
"""
|
| 120 |
Forward pass through the MoE layer using C++ kernel.
|
|
|
|
| 137 |
and hasattr(self.experts, "gate_up_proj")
|
| 138 |
and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
|
| 139 |
):
|
| 140 |
+
# convert scales
|
| 141 |
+
data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 142 |
+
data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 143 |
+
self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
|
| 144 |
+
self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
|
| 145 |
self.packed_scales = True
|
| 146 |
self.use_mxfp4 = True
|
| 147 |
|
| 148 |
if not getattr(self, "packed_weight", False) and hasattr(
|
| 149 |
self.experts, "gate_up_proj"
|
| 150 |
):
|
| 151 |
+
# convert weights
|
| 152 |
+
data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
|
| 153 |
+
data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
|
| 154 |
+
if self.use_mxfp4:
|
| 155 |
+
self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
|
| 156 |
+
self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
|
| 157 |
+
else:
|
| 158 |
+
# convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
|
| 159 |
+
data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
|
| 160 |
+
data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
|
| 161 |
+
self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
|
| 162 |
+
self.experts.down_proj.data = ops.convert_weight_packed(data_2)
|
| 163 |
+
|
| 164 |
+
# C++ kernel does not support float32.
|
| 165 |
+
dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
|
| 166 |
+
if getattr(self.experts, "gate_up_proj_bias", None) is not None:
|
| 167 |
+
self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
|
| 168 |
+
if getattr(self.experts, "down_proj_bias", None) is not None:
|
| 169 |
+
self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
|
| 170 |
+
|
| 171 |
self.packed_weight = True
|
| 172 |
|
| 173 |
# Get MoE parameters
|
build/torch29-cxx11-cpu-x86_64-linux/layers.py
CHANGED
|
@@ -1228,5 +1228,5 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
|
| 1228 |
# Patch for XPU or CPU support
|
| 1229 |
if hasattr(torch, "xpu") and torch.xpu.is_available():
|
| 1230 |
from .xpu_fused_moe import MegaBlocksMoeMLP
|
| 1231 |
-
|
| 1232 |
-
|
|
|
|
| 1228 |
# Patch for XPU or CPU support
|
| 1229 |
if hasattr(torch, "xpu") and torch.xpu.is_available():
|
| 1230 |
from .xpu_fused_moe import MegaBlocksMoeMLP
|
| 1231 |
+
|
| 1232 |
+
from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
|
build/torch29-cxx11-cu126-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 15046808
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4d58bdd86403eaa524fac1db9361b0025a175f4b10dcddd8fa0bf99892172e54
|
| 3 |
size 15046808
|
build/torch29-cxx11-cu126-x86_64-linux/_ops.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
-
from . import
|
| 3 |
-
ops = torch.ops.
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
-
return f"
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from . import _megablocks_099ac3c
|
| 3 |
+
ops = torch.ops._megablocks_099ac3c
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
+
return f"_megablocks_099ac3c::{op_name}"
|
build/torch29-cxx11-cu126-x86_64-linux/cpu_moe_cpp.py
CHANGED
|
@@ -105,7 +105,7 @@ def fused_moe_cpp(
|
|
| 105 |
return output
|
| 106 |
|
| 107 |
|
| 108 |
-
class
|
| 109 |
"""
|
| 110 |
C++ optimized MoE MLP using brgemm.
|
| 111 |
Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
|
|
@@ -115,32 +115,6 @@ class MegaBlocksMoeMLP(torch.nn.Module):
|
|
| 115 |
"""
|
| 116 |
can_torch_compile: bool = True
|
| 117 |
|
| 118 |
-
def convert_weight(self, dtype, use_mxfp4: bool = False):
|
| 119 |
-
data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
|
| 120 |
-
data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
|
| 121 |
-
if use_mxfp4:
|
| 122 |
-
self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
|
| 123 |
-
self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
|
| 124 |
-
else:
|
| 125 |
-
# convert_weight_packed onlu supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
|
| 126 |
-
data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
|
| 127 |
-
data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
|
| 128 |
-
self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
|
| 129 |
-
self.experts.down_proj.data = ops.convert_weight_packed(data_2)
|
| 130 |
-
|
| 131 |
-
# C++ kernel does not support float32.
|
| 132 |
-
dtype = torch.bfloat16 if dtype == torch.float32 else dtype
|
| 133 |
-
if getattr(self.experts, "gate_up_proj_bias", None) is not None:
|
| 134 |
-
self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
|
| 135 |
-
if getattr(self.experts, "down_proj_bias", None) is not None:
|
| 136 |
-
self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
|
| 137 |
-
|
| 138 |
-
def convert_scales(self):
|
| 139 |
-
data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 140 |
-
data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 141 |
-
self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
|
| 142 |
-
self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
|
| 143 |
-
|
| 144 |
def forward(self, x: torch.Tensor) -> tuple:
|
| 145 |
"""
|
| 146 |
Forward pass through the MoE layer using C++ kernel.
|
|
@@ -163,14 +137,37 @@ class MegaBlocksMoeMLP(torch.nn.Module):
|
|
| 163 |
and hasattr(self.experts, "gate_up_proj")
|
| 164 |
and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
|
| 165 |
):
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
self.packed_scales = True
|
| 168 |
self.use_mxfp4 = True
|
| 169 |
|
| 170 |
if not getattr(self, "packed_weight", False) and hasattr(
|
| 171 |
self.experts, "gate_up_proj"
|
| 172 |
):
|
| 173 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
self.packed_weight = True
|
| 175 |
|
| 176 |
# Get MoE parameters
|
|
|
|
| 105 |
return output
|
| 106 |
|
| 107 |
|
| 108 |
+
class CPUMegaBlocksMoeMLP(torch.nn.Module):
|
| 109 |
"""
|
| 110 |
C++ optimized MoE MLP using brgemm.
|
| 111 |
Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
|
|
|
|
| 115 |
"""
|
| 116 |
can_torch_compile: bool = True
|
| 117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
def forward(self, x: torch.Tensor) -> tuple:
|
| 119 |
"""
|
| 120 |
Forward pass through the MoE layer using C++ kernel.
|
|
|
|
| 137 |
and hasattr(self.experts, "gate_up_proj")
|
| 138 |
and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
|
| 139 |
):
|
| 140 |
+
# convert scales
|
| 141 |
+
data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 142 |
+
data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 143 |
+
self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
|
| 144 |
+
self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
|
| 145 |
self.packed_scales = True
|
| 146 |
self.use_mxfp4 = True
|
| 147 |
|
| 148 |
if not getattr(self, "packed_weight", False) and hasattr(
|
| 149 |
self.experts, "gate_up_proj"
|
| 150 |
):
|
| 151 |
+
# convert weights
|
| 152 |
+
data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
|
| 153 |
+
data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
|
| 154 |
+
if self.use_mxfp4:
|
| 155 |
+
self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
|
| 156 |
+
self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
|
| 157 |
+
else:
|
| 158 |
+
# convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
|
| 159 |
+
data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
|
| 160 |
+
data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
|
| 161 |
+
self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
|
| 162 |
+
self.experts.down_proj.data = ops.convert_weight_packed(data_2)
|
| 163 |
+
|
| 164 |
+
# C++ kernel does not support float32.
|
| 165 |
+
dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
|
| 166 |
+
if getattr(self.experts, "gate_up_proj_bias", None) is not None:
|
| 167 |
+
self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
|
| 168 |
+
if getattr(self.experts, "down_proj_bias", None) is not None:
|
| 169 |
+
self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
|
| 170 |
+
|
| 171 |
self.packed_weight = True
|
| 172 |
|
| 173 |
# Get MoE parameters
|
build/torch29-cxx11-cu126-x86_64-linux/layers.py
CHANGED
|
@@ -1228,5 +1228,5 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
|
| 1228 |
# Patch for XPU or CPU support
|
| 1229 |
if hasattr(torch, "xpu") and torch.xpu.is_available():
|
| 1230 |
from .xpu_fused_moe import MegaBlocksMoeMLP
|
| 1231 |
-
|
| 1232 |
-
|
|
|
|
| 1228 |
# Patch for XPU or CPU support
|
| 1229 |
if hasattr(torch, "xpu") and torch.xpu.is_available():
|
| 1230 |
from .xpu_fused_moe import MegaBlocksMoeMLP
|
| 1231 |
+
|
| 1232 |
+
from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
|
build/torch29-cxx11-cu128-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20995680
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a5c3c17f0fa54822f12b05fe5c22f8b61ad1a9711a02de13a706e1e8f63e141b
|
| 3 |
size 20995680
|
build/torch29-cxx11-cu128-x86_64-linux/_ops.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
-
from . import
|
| 3 |
-
ops = torch.ops.
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
-
return f"
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from . import _megablocks_099ac3c
|
| 3 |
+
ops = torch.ops._megablocks_099ac3c
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
+
return f"_megablocks_099ac3c::{op_name}"
|
build/torch29-cxx11-cu128-x86_64-linux/cpu_moe_cpp.py
CHANGED
|
@@ -105,7 +105,7 @@ def fused_moe_cpp(
|
|
| 105 |
return output
|
| 106 |
|
| 107 |
|
| 108 |
-
class
|
| 109 |
"""
|
| 110 |
C++ optimized MoE MLP using brgemm.
|
| 111 |
Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
|
|
@@ -115,32 +115,6 @@ class MegaBlocksMoeMLP(torch.nn.Module):
|
|
| 115 |
"""
|
| 116 |
can_torch_compile: bool = True
|
| 117 |
|
| 118 |
-
def convert_weight(self, dtype, use_mxfp4: bool = False):
|
| 119 |
-
data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
|
| 120 |
-
data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
|
| 121 |
-
if use_mxfp4:
|
| 122 |
-
self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
|
| 123 |
-
self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
|
| 124 |
-
else:
|
| 125 |
-
# convert_weight_packed onlu supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
|
| 126 |
-
data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
|
| 127 |
-
data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
|
| 128 |
-
self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
|
| 129 |
-
self.experts.down_proj.data = ops.convert_weight_packed(data_2)
|
| 130 |
-
|
| 131 |
-
# C++ kernel does not support float32.
|
| 132 |
-
dtype = torch.bfloat16 if dtype == torch.float32 else dtype
|
| 133 |
-
if getattr(self.experts, "gate_up_proj_bias", None) is not None:
|
| 134 |
-
self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
|
| 135 |
-
if getattr(self.experts, "down_proj_bias", None) is not None:
|
| 136 |
-
self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
|
| 137 |
-
|
| 138 |
-
def convert_scales(self):
|
| 139 |
-
data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 140 |
-
data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 141 |
-
self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
|
| 142 |
-
self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
|
| 143 |
-
|
| 144 |
def forward(self, x: torch.Tensor) -> tuple:
|
| 145 |
"""
|
| 146 |
Forward pass through the MoE layer using C++ kernel.
|
|
@@ -163,14 +137,37 @@ class MegaBlocksMoeMLP(torch.nn.Module):
|
|
| 163 |
and hasattr(self.experts, "gate_up_proj")
|
| 164 |
and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
|
| 165 |
):
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
self.packed_scales = True
|
| 168 |
self.use_mxfp4 = True
|
| 169 |
|
| 170 |
if not getattr(self, "packed_weight", False) and hasattr(
|
| 171 |
self.experts, "gate_up_proj"
|
| 172 |
):
|
| 173 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
self.packed_weight = True
|
| 175 |
|
| 176 |
# Get MoE parameters
|
|
|
|
| 105 |
return output
|
| 106 |
|
| 107 |
|
| 108 |
+
class CPUMegaBlocksMoeMLP(torch.nn.Module):
|
| 109 |
"""
|
| 110 |
C++ optimized MoE MLP using brgemm.
|
| 111 |
Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
|
|
|
|
| 115 |
"""
|
| 116 |
can_torch_compile: bool = True
|
| 117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
def forward(self, x: torch.Tensor) -> tuple:
|
| 119 |
"""
|
| 120 |
Forward pass through the MoE layer using C++ kernel.
|
|
|
|
| 137 |
and hasattr(self.experts, "gate_up_proj")
|
| 138 |
and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
|
| 139 |
):
|
| 140 |
+
# convert scales
|
| 141 |
+
data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 142 |
+
data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 143 |
+
self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
|
| 144 |
+
self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
|
| 145 |
self.packed_scales = True
|
| 146 |
self.use_mxfp4 = True
|
| 147 |
|
| 148 |
if not getattr(self, "packed_weight", False) and hasattr(
|
| 149 |
self.experts, "gate_up_proj"
|
| 150 |
):
|
| 151 |
+
# convert weights
|
| 152 |
+
data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
|
| 153 |
+
data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
|
| 154 |
+
if self.use_mxfp4:
|
| 155 |
+
self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
|
| 156 |
+
self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
|
| 157 |
+
else:
|
| 158 |
+
# convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
|
| 159 |
+
data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
|
| 160 |
+
data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
|
| 161 |
+
self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
|
| 162 |
+
self.experts.down_proj.data = ops.convert_weight_packed(data_2)
|
| 163 |
+
|
| 164 |
+
# C++ kernel does not support float32.
|
| 165 |
+
dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
|
| 166 |
+
if getattr(self.experts, "gate_up_proj_bias", None) is not None:
|
| 167 |
+
self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
|
| 168 |
+
if getattr(self.experts, "down_proj_bias", None) is not None:
|
| 169 |
+
self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
|
| 170 |
+
|
| 171 |
self.packed_weight = True
|
| 172 |
|
| 173 |
# Get MoE parameters
|
build/torch29-cxx11-cu128-x86_64-linux/layers.py
CHANGED
|
@@ -1228,5 +1228,5 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
|
| 1228 |
# Patch for XPU or CPU support
|
| 1229 |
if hasattr(torch, "xpu") and torch.xpu.is_available():
|
| 1230 |
from .xpu_fused_moe import MegaBlocksMoeMLP
|
| 1231 |
-
|
| 1232 |
-
|
|
|
|
| 1228 |
# Patch for XPU or CPU support
|
| 1229 |
if hasattr(torch, "xpu") and torch.xpu.is_available():
|
| 1230 |
from .xpu_fused_moe import MegaBlocksMoeMLP
|
| 1231 |
+
|
| 1232 |
+
from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
|
build/torch29-cxx11-cu130-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 12031392
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:609492272ed9672ab824abf87b08f078f409696c8db453ccc5f46dff39d84f98
|
| 3 |
size 12031392
|
build/torch29-cxx11-cu130-x86_64-linux/_ops.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
-
from . import
|
| 3 |
-
ops = torch.ops.
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
-
return f"
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from . import _megablocks_099ac3c
|
| 3 |
+
ops = torch.ops._megablocks_099ac3c
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
+
return f"_megablocks_099ac3c::{op_name}"
|
build/torch29-cxx11-cu130-x86_64-linux/cpu_moe_cpp.py
CHANGED
|
@@ -105,7 +105,7 @@ def fused_moe_cpp(
|
|
| 105 |
return output
|
| 106 |
|
| 107 |
|
| 108 |
-
class
|
| 109 |
"""
|
| 110 |
C++ optimized MoE MLP using brgemm.
|
| 111 |
Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
|
|
@@ -115,32 +115,6 @@ class MegaBlocksMoeMLP(torch.nn.Module):
|
|
| 115 |
"""
|
| 116 |
can_torch_compile: bool = True
|
| 117 |
|
| 118 |
-
def convert_weight(self, dtype, use_mxfp4: bool = False):
|
| 119 |
-
data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
|
| 120 |
-
data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
|
| 121 |
-
if use_mxfp4:
|
| 122 |
-
self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
|
| 123 |
-
self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
|
| 124 |
-
else:
|
| 125 |
-
# convert_weight_packed onlu supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
|
| 126 |
-
data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
|
| 127 |
-
data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
|
| 128 |
-
self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
|
| 129 |
-
self.experts.down_proj.data = ops.convert_weight_packed(data_2)
|
| 130 |
-
|
| 131 |
-
# C++ kernel does not support float32.
|
| 132 |
-
dtype = torch.bfloat16 if dtype == torch.float32 else dtype
|
| 133 |
-
if getattr(self.experts, "gate_up_proj_bias", None) is not None:
|
| 134 |
-
self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
|
| 135 |
-
if getattr(self.experts, "down_proj_bias", None) is not None:
|
| 136 |
-
self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
|
| 137 |
-
|
| 138 |
-
def convert_scales(self):
|
| 139 |
-
data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 140 |
-
data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 141 |
-
self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
|
| 142 |
-
self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
|
| 143 |
-
|
| 144 |
def forward(self, x: torch.Tensor) -> tuple:
|
| 145 |
"""
|
| 146 |
Forward pass through the MoE layer using C++ kernel.
|
|
@@ -163,14 +137,37 @@ class MegaBlocksMoeMLP(torch.nn.Module):
|
|
| 163 |
and hasattr(self.experts, "gate_up_proj")
|
| 164 |
and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
|
| 165 |
):
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
self.packed_scales = True
|
| 168 |
self.use_mxfp4 = True
|
| 169 |
|
| 170 |
if not getattr(self, "packed_weight", False) and hasattr(
|
| 171 |
self.experts, "gate_up_proj"
|
| 172 |
):
|
| 173 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
self.packed_weight = True
|
| 175 |
|
| 176 |
# Get MoE parameters
|
|
|
|
| 105 |
return output
|
| 106 |
|
| 107 |
|
| 108 |
+
class CPUMegaBlocksMoeMLP(torch.nn.Module):
|
| 109 |
"""
|
| 110 |
C++ optimized MoE MLP using brgemm.
|
| 111 |
Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
|
|
|
|
| 115 |
"""
|
| 116 |
can_torch_compile: bool = True
|
| 117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
def forward(self, x: torch.Tensor) -> tuple:
|
| 119 |
"""
|
| 120 |
Forward pass through the MoE layer using C++ kernel.
|
|
|
|
| 137 |
and hasattr(self.experts, "gate_up_proj")
|
| 138 |
and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
|
| 139 |
):
|
| 140 |
+
# convert scales
|
| 141 |
+
data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 142 |
+
data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 143 |
+
self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
|
| 144 |
+
self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
|
| 145 |
self.packed_scales = True
|
| 146 |
self.use_mxfp4 = True
|
| 147 |
|
| 148 |
if not getattr(self, "packed_weight", False) and hasattr(
|
| 149 |
self.experts, "gate_up_proj"
|
| 150 |
):
|
| 151 |
+
# convert weights
|
| 152 |
+
data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
|
| 153 |
+
data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
|
| 154 |
+
if self.use_mxfp4:
|
| 155 |
+
self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
|
| 156 |
+
self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
|
| 157 |
+
else:
|
| 158 |
+
# convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
|
| 159 |
+
data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
|
| 160 |
+
data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
|
| 161 |
+
self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
|
| 162 |
+
self.experts.down_proj.data = ops.convert_weight_packed(data_2)
|
| 163 |
+
|
| 164 |
+
# C++ kernel does not support float32.
|
| 165 |
+
dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
|
| 166 |
+
if getattr(self.experts, "gate_up_proj_bias", None) is not None:
|
| 167 |
+
self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
|
| 168 |
+
if getattr(self.experts, "down_proj_bias", None) is not None:
|
| 169 |
+
self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
|
| 170 |
+
|
| 171 |
self.packed_weight = True
|
| 172 |
|
| 173 |
# Get MoE parameters
|
build/torch29-cxx11-cu130-x86_64-linux/layers.py
CHANGED
|
@@ -1228,5 +1228,5 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
|
| 1228 |
# Patch for XPU or CPU support
|
| 1229 |
if hasattr(torch, "xpu") and torch.xpu.is_available():
|
| 1230 |
from .xpu_fused_moe import MegaBlocksMoeMLP
|
| 1231 |
-
|
| 1232 |
-
|
|
|
|
| 1228 |
# Patch for XPU or CPU support
|
| 1229 |
if hasattr(torch, "xpu") and torch.xpu.is_available():
|
| 1230 |
from .xpu_fused_moe import MegaBlocksMoeMLP
|
| 1231 |
+
|
| 1232 |
+
from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
|
build/torch29-cxx11-xpu20252-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:82d4807a02abe216da87ac6d4fbbf4870fdefa64ef182d09ab3408528107f08b
|
| 3 |
+
size 4075712
|
build/torch29-cxx11-xpu20252-x86_64-linux/_ops.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
-
from . import _megablocks_db0709c
|
| 3 |
-
ops = torch.ops._megablocks_db0709c
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
-
return f"_megablocks_db0709c::{op_name}"
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from . import _megablocks_099ac3c
|
| 3 |
+
ops = torch.ops._megablocks_099ac3c
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
+
return f"_megablocks_099ac3c::{op_name}"
|
build/torch29-cxx11-xpu20252-x86_64-linux/cpu_moe_cpp.py
CHANGED
|
@@ -105,7 +105,7 @@ def fused_moe_cpp(
|
|
| 105 |
return output
|
| 106 |
|
| 107 |
|
| 108 |
-
class MegaBlocksMoeMLP(torch.nn.Module):
|
| 109 |
"""
|
| 110 |
C++ optimized MoE MLP using brgemm.
|
| 111 |
Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
|
|
@@ -115,32 +115,6 @@ class MegaBlocksMoeMLP(torch.nn.Module):
|
|
| 115 |
"""
|
| 116 |
can_torch_compile: bool = True
|
| 117 |
|
| 118 |
-
def convert_weight(self, dtype, use_mxfp4: bool = False):
|
| 119 |
-
data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
|
| 120 |
-
data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
|
| 121 |
-
if use_mxfp4:
|
| 122 |
-
self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
|
| 123 |
-
self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
|
| 124 |
-
else:
|
| 125 |
-
# convert_weight_packed onlu supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
|
| 126 |
-
data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
|
| 127 |
-
data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
|
| 128 |
-
self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
|
| 129 |
-
self.experts.down_proj.data = ops.convert_weight_packed(data_2)
|
| 130 |
-
|
| 131 |
-
# C++ kernel does not support float32.
|
| 132 |
-
dtype = torch.bfloat16 if dtype == torch.float32 else dtype
|
| 133 |
-
if getattr(self.experts, "gate_up_proj_bias", None) is not None:
|
| 134 |
-
self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
|
| 135 |
-
if getattr(self.experts, "down_proj_bias", None) is not None:
|
| 136 |
-
self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
|
| 137 |
-
|
| 138 |
-
def convert_scales(self):
|
| 139 |
-
data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 140 |
-
data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 141 |
-
self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
|
| 142 |
-
self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
|
| 143 |
-
|
| 144 |
def forward(self, x: torch.Tensor) -> tuple:
|
| 145 |
"""
|
| 146 |
Forward pass through the MoE layer using C++ kernel.
|
|
@@ -163,14 +137,37 @@ class MegaBlocksMoeMLP(torch.nn.Module):
|
|
| 163 |
and hasattr(self.experts, "gate_up_proj")
|
| 164 |
and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
|
| 165 |
):
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
self.packed_scales = True
|
| 168 |
self.use_mxfp4 = True
|
| 169 |
|
| 170 |
if not getattr(self, "packed_weight", False) and hasattr(
|
| 171 |
self.experts, "gate_up_proj"
|
| 172 |
):
|
| 173 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
self.packed_weight = True
|
| 175 |
|
| 176 |
# Get MoE parameters
|
|
|
|
| 105 |
return output
|
| 106 |
|
| 107 |
|
| 108 |
+
class CPUMegaBlocksMoeMLP(torch.nn.Module):
|
| 109 |
"""
|
| 110 |
C++ optimized MoE MLP using brgemm.
|
| 111 |
Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
|
|
|
|
| 115 |
"""
|
| 116 |
can_torch_compile: bool = True
|
| 117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
def forward(self, x: torch.Tensor) -> tuple:
|
| 119 |
"""
|
| 120 |
Forward pass through the MoE layer using C++ kernel.
|
|
|
|
| 137 |
and hasattr(self.experts, "gate_up_proj")
|
| 138 |
and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
|
| 139 |
):
|
| 140 |
+
# convert scales
|
| 141 |
+
data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 142 |
+
data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
|
| 143 |
+
self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
|
| 144 |
+
self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
|
| 145 |
self.packed_scales = True
|
| 146 |
self.use_mxfp4 = True
|
| 147 |
|
| 148 |
if not getattr(self, "packed_weight", False) and hasattr(
|
| 149 |
self.experts, "gate_up_proj"
|
| 150 |
):
|
| 151 |
+
# convert weights
|
| 152 |
+
data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
|
| 153 |
+
data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
|
| 154 |
+
if self.use_mxfp4:
|
| 155 |
+
self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
|
| 156 |
+
self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
|
| 157 |
+
else:
|
| 158 |
+
# convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
|
| 159 |
+
data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
|
| 160 |
+
data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
|
| 161 |
+
self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
|
| 162 |
+
self.experts.down_proj.data = ops.convert_weight_packed(data_2)
|
| 163 |
+
|
| 164 |
+
# C++ kernel does not support float32.
|
| 165 |
+
dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
|
| 166 |
+
if getattr(self.experts, "gate_up_proj_bias", None) is not None:
|
| 167 |
+
self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
|
| 168 |
+
if getattr(self.experts, "down_proj_bias", None) is not None:
|
| 169 |
+
self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
|
| 170 |
+
|
| 171 |
self.packed_weight = True
|
| 172 |
|
| 173 |
# Get MoE parameters
|
build/torch29-cxx11-xpu20252-x86_64-linux/layers.py
CHANGED
|
@@ -1228,5 +1228,5 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
|
|
| 1228 |
# Patch for XPU or CPU support
|
| 1229 |
if hasattr(torch, "xpu") and torch.xpu.is_available():
|
| 1230 |
from .xpu_fused_moe import MegaBlocksMoeMLP
|
| 1231 |
-
|
| 1232 |
-
|
|
|
|
| 1228 |
# Patch for XPU or CPU support
|
| 1229 |
if hasattr(torch, "xpu") and torch.xpu.is_available():
|
| 1230 |
from .xpu_fused_moe import MegaBlocksMoeMLP
|
| 1231 |
+
|
| 1232 |
+
from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
|