Kernels
danieldk HF Staff committed on
Commit
4f20330
·
verified ·
1 Parent(s): 82f6f0e

Build uploaded using `kernels`.

Browse files
Files changed (40) hide show
  1. build/torch210-cxx11-cpu-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} +2 -2
  2. build/torch210-cxx11-cpu-x86_64-linux/_ops.py +3 -3
  3. build/torch210-cxx11-cpu-x86_64-linux/cpu_moe_cpp.py +26 -29
  4. build/torch210-cxx11-cpu-x86_64-linux/layers.py +2 -2
  5. build/torch210-cxx11-cu126-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} +1 -1
  6. build/torch210-cxx11-cu126-x86_64-linux/_ops.py +3 -3
  7. build/torch210-cxx11-cu126-x86_64-linux/cpu_moe_cpp.py +26 -29
  8. build/torch210-cxx11-cu126-x86_64-linux/layers.py +2 -2
  9. build/torch210-cxx11-cu128-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} +1 -1
  10. build/torch210-cxx11-cu128-x86_64-linux/_ops.py +3 -3
  11. build/torch210-cxx11-cu128-x86_64-linux/cpu_moe_cpp.py +26 -29
  12. build/torch210-cxx11-cu128-x86_64-linux/layers.py +2 -2
  13. build/torch210-cxx11-cu130-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} +1 -1
  14. build/torch210-cxx11-cu130-x86_64-linux/_ops.py +3 -3
  15. build/torch210-cxx11-cu130-x86_64-linux/cpu_moe_cpp.py +26 -29
  16. build/torch210-cxx11-cu130-x86_64-linux/layers.py +2 -2
  17. build/torch210-cxx11-xpu20253-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} +2 -2
  18. build/torch210-cxx11-xpu20253-x86_64-linux/_ops.py +3 -3
  19. build/torch210-cxx11-xpu20253-x86_64-linux/cpu_moe_cpp.py +26 -29
  20. build/torch210-cxx11-xpu20253-x86_64-linux/layers.py +2 -2
  21. build/torch29-cxx11-cpu-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} +2 -2
  22. build/torch29-cxx11-cpu-x86_64-linux/_ops.py +3 -3
  23. build/torch29-cxx11-cpu-x86_64-linux/cpu_moe_cpp.py +26 -29
  24. build/torch29-cxx11-cpu-x86_64-linux/layers.py +2 -2
  25. build/torch29-cxx11-cu126-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} +1 -1
  26. build/torch29-cxx11-cu126-x86_64-linux/_ops.py +3 -3
  27. build/torch29-cxx11-cu126-x86_64-linux/cpu_moe_cpp.py +26 -29
  28. build/torch29-cxx11-cu126-x86_64-linux/layers.py +2 -2
  29. build/torch29-cxx11-cu128-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} +1 -1
  30. build/torch29-cxx11-cu128-x86_64-linux/_ops.py +3 -3
  31. build/torch29-cxx11-cu128-x86_64-linux/cpu_moe_cpp.py +26 -29
  32. build/torch29-cxx11-cu128-x86_64-linux/layers.py +2 -2
  33. build/torch29-cxx11-cu130-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} +1 -1
  34. build/torch29-cxx11-cu130-x86_64-linux/_ops.py +3 -3
  35. build/torch29-cxx11-cu130-x86_64-linux/cpu_moe_cpp.py +26 -29
  36. build/torch29-cxx11-cu130-x86_64-linux/layers.py +2 -2
  37. build/torch29-cxx11-xpu20252-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} +2 -2
  38. build/torch29-cxx11-xpu20252-x86_64-linux/_ops.py +3 -3
  39. build/torch29-cxx11-xpu20252-x86_64-linux/cpu_moe_cpp.py +26 -29
  40. build/torch29-cxx11-xpu20252-x86_64-linux/layers.py +2 -2
build/torch210-cxx11-cpu-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d106dd5b45ae2a650aba0a07a1e75a0354eb10b68837d0c53dbb628e6d6def9c
3
- size 481440
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a81c0cc23130a95d05263f0509e8de560183f6472f458f4316c97e6e8d8f533
3
+ size 2219056
build/torch210-cxx11-cpu-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _megablocks_db0709c
3
- ops = torch.ops._megablocks_db0709c
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_megablocks_db0709c::{op_name}"
 
1
  import torch
2
+ from . import _megablocks_099ac3c
3
+ ops = torch.ops._megablocks_099ac3c
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_megablocks_099ac3c::{op_name}"
build/torch210-cxx11-cpu-x86_64-linux/cpu_moe_cpp.py CHANGED
@@ -105,7 +105,7 @@ def fused_moe_cpp(
105
  return output
106
 
107
 
108
- class MegaBlocksMoeMLP(torch.nn.Module):
109
  """
110
  C++ optimized MoE MLP using brgemm.
111
  Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
@@ -115,32 +115,6 @@ class MegaBlocksMoeMLP(torch.nn.Module):
115
  """
116
  can_torch_compile: bool = True
117
 
118
- def convert_weight(self, dtype, use_mxfp4: bool = False):
119
- data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
120
- data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
121
- if use_mxfp4:
122
- self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
123
- self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
124
- else:
125
- # convert_weight_packed onlu supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
126
- data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
127
- data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
128
- self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
129
- self.experts.down_proj.data = ops.convert_weight_packed(data_2)
130
-
131
- # C++ kernel does not support float32.
132
- dtype = torch.bfloat16 if dtype == torch.float32 else dtype
133
- if getattr(self.experts, "gate_up_proj_bias", None) is not None:
134
- self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
135
- if getattr(self.experts, "down_proj_bias", None) is not None:
136
- self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
137
-
138
- def convert_scales(self):
139
- data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
140
- data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
141
- self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
142
- self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
143
-
144
  def forward(self, x: torch.Tensor) -> tuple:
145
  """
146
  Forward pass through the MoE layer using C++ kernel.
@@ -163,14 +137,37 @@ class MegaBlocksMoeMLP(torch.nn.Module):
163
  and hasattr(self.experts, "gate_up_proj")
164
  and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
165
  ):
166
- self.convert_scales()
 
 
 
 
167
  self.packed_scales = True
168
  self.use_mxfp4 = True
169
 
170
  if not getattr(self, "packed_weight", False) and hasattr(
171
  self.experts, "gate_up_proj"
172
  ):
173
- self.convert_weight(x.dtype, self.use_mxfp4)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  self.packed_weight = True
175
 
176
  # Get MoE parameters
 
105
  return output
106
 
107
 
108
+ class CPUMegaBlocksMoeMLP(torch.nn.Module):
109
  """
110
  C++ optimized MoE MLP using brgemm.
111
  Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
 
115
  """
116
  can_torch_compile: bool = True
117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  def forward(self, x: torch.Tensor) -> tuple:
119
  """
120
  Forward pass through the MoE layer using C++ kernel.
 
137
  and hasattr(self.experts, "gate_up_proj")
138
  and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
139
  ):
140
+ # convert scales
141
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
142
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
143
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
144
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
145
  self.packed_scales = True
146
  self.use_mxfp4 = True
147
 
148
  if not getattr(self, "packed_weight", False) and hasattr(
149
  self.experts, "gate_up_proj"
150
  ):
151
+ # convert weights
152
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
153
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
154
+ if self.use_mxfp4:
155
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
156
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
157
+ else:
158
+ # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
159
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
160
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
161
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
162
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
163
+
164
+ # C++ kernel does not support float32.
165
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
166
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
167
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
168
+ if getattr(self.experts, "down_proj_bias", None) is not None:
169
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
170
+
171
  self.packed_weight = True
172
 
173
  # Get MoE parameters
build/torch210-cxx11-cpu-x86_64-linux/layers.py CHANGED
@@ -1228,5 +1228,5 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
1228
  # Patch for XPU or CPU support
1229
  if hasattr(torch, "xpu") and torch.xpu.is_available():
1230
  from .xpu_fused_moe import MegaBlocksMoeMLP
1231
- elif not torch.cuda.is_available():
1232
- from .cpu_moe_cpp import MegaBlocksMoeMLP
 
1228
  # Patch for XPU or CPU support
1229
  if hasattr(torch, "xpu") and torch.xpu.is_available():
1230
  from .xpu_fused_moe import MegaBlocksMoeMLP
1231
+
1232
+ from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
build/torch210-cxx11-cu126-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cb14f5ada7b5fcdd840c416b6cf681b7b2a696daaa05fd3433e5b407bfc9ca60
3
  size 15061032
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d482577c55ffe1abd34983ce45eeeb280a817e55f92d6585b5e92173b2860749
3
  size 15061032
build/torch210-cxx11-cu126-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _megablocks_db0709c
3
- ops = torch.ops._megablocks_db0709c
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_megablocks_db0709c::{op_name}"
 
1
  import torch
2
+ from . import _megablocks_099ac3c
3
+ ops = torch.ops._megablocks_099ac3c
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_megablocks_099ac3c::{op_name}"
build/torch210-cxx11-cu126-x86_64-linux/cpu_moe_cpp.py CHANGED
@@ -105,7 +105,7 @@ def fused_moe_cpp(
105
  return output
106
 
107
 
108
- class MegaBlocksMoeMLP(torch.nn.Module):
109
  """
110
  C++ optimized MoE MLP using brgemm.
111
  Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
@@ -115,32 +115,6 @@ class MegaBlocksMoeMLP(torch.nn.Module):
115
  """
116
  can_torch_compile: bool = True
117
 
118
- def convert_weight(self, dtype, use_mxfp4: bool = False):
119
- data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
120
- data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
121
- if use_mxfp4:
122
- self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
123
- self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
124
- else:
125
- # convert_weight_packed onlu supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
126
- data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
127
- data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
128
- self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
129
- self.experts.down_proj.data = ops.convert_weight_packed(data_2)
130
-
131
- # C++ kernel does not support float32.
132
- dtype = torch.bfloat16 if dtype == torch.float32 else dtype
133
- if getattr(self.experts, "gate_up_proj_bias", None) is not None:
134
- self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
135
- if getattr(self.experts, "down_proj_bias", None) is not None:
136
- self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
137
-
138
- def convert_scales(self):
139
- data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
140
- data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
141
- self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
142
- self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
143
-
144
  def forward(self, x: torch.Tensor) -> tuple:
145
  """
146
  Forward pass through the MoE layer using C++ kernel.
@@ -163,14 +137,37 @@ class MegaBlocksMoeMLP(torch.nn.Module):
163
  and hasattr(self.experts, "gate_up_proj")
164
  and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
165
  ):
166
- self.convert_scales()
 
 
 
 
167
  self.packed_scales = True
168
  self.use_mxfp4 = True
169
 
170
  if not getattr(self, "packed_weight", False) and hasattr(
171
  self.experts, "gate_up_proj"
172
  ):
173
- self.convert_weight(x.dtype, self.use_mxfp4)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  self.packed_weight = True
175
 
176
  # Get MoE parameters
 
105
  return output
106
 
107
 
108
+ class CPUMegaBlocksMoeMLP(torch.nn.Module):
109
  """
110
  C++ optimized MoE MLP using brgemm.
111
  Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
 
115
  """
116
  can_torch_compile: bool = True
117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  def forward(self, x: torch.Tensor) -> tuple:
119
  """
120
  Forward pass through the MoE layer using C++ kernel.
 
137
  and hasattr(self.experts, "gate_up_proj")
138
  and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
139
  ):
140
+ # convert scales
141
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
142
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
143
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
144
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
145
  self.packed_scales = True
146
  self.use_mxfp4 = True
147
 
148
  if not getattr(self, "packed_weight", False) and hasattr(
149
  self.experts, "gate_up_proj"
150
  ):
151
+ # convert weights
152
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
153
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
154
+ if self.use_mxfp4:
155
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
156
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
157
+ else:
158
+ # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
159
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
160
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
161
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
162
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
163
+
164
+ # C++ kernel does not support float32.
165
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
166
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
167
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
168
+ if getattr(self.experts, "down_proj_bias", None) is not None:
169
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
170
+
171
  self.packed_weight = True
172
 
173
  # Get MoE parameters
build/torch210-cxx11-cu126-x86_64-linux/layers.py CHANGED
@@ -1228,5 +1228,5 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
1228
  # Patch for XPU or CPU support
1229
  if hasattr(torch, "xpu") and torch.xpu.is_available():
1230
  from .xpu_fused_moe import MegaBlocksMoeMLP
1231
- elif not torch.cuda.is_available():
1232
- from .cpu_moe_cpp import MegaBlocksMoeMLP
 
1228
  # Patch for XPU or CPU support
1229
  if hasattr(torch, "xpu") and torch.xpu.is_available():
1230
  from .xpu_fused_moe import MegaBlocksMoeMLP
1231
+
1232
+ from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
build/torch210-cxx11-cu128-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4bff52333038ec399aeb6a59e6eaa4ab14181a5078f991073e7dc0832d9fd734
3
  size 21009952
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0876dbd4267e12fa67f24fac60cedbee8e6dd41b85104c4c241b173729bee9a
3
  size 21009952
build/torch210-cxx11-cu128-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _megablocks_db0709c
3
- ops = torch.ops._megablocks_db0709c
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_megablocks_db0709c::{op_name}"
 
1
  import torch
2
+ from . import _megablocks_099ac3c
3
+ ops = torch.ops._megablocks_099ac3c
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_megablocks_099ac3c::{op_name}"
build/torch210-cxx11-cu128-x86_64-linux/cpu_moe_cpp.py CHANGED
@@ -105,7 +105,7 @@ def fused_moe_cpp(
105
  return output
106
 
107
 
108
- class MegaBlocksMoeMLP(torch.nn.Module):
109
  """
110
  C++ optimized MoE MLP using brgemm.
111
  Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
@@ -115,32 +115,6 @@ class MegaBlocksMoeMLP(torch.nn.Module):
115
  """
116
  can_torch_compile: bool = True
117
 
118
- def convert_weight(self, dtype, use_mxfp4: bool = False):
119
- data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
120
- data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
121
- if use_mxfp4:
122
- self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
123
- self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
124
- else:
125
- # convert_weight_packed onlu supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
126
- data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
127
- data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
128
- self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
129
- self.experts.down_proj.data = ops.convert_weight_packed(data_2)
130
-
131
- # C++ kernel does not support float32.
132
- dtype = torch.bfloat16 if dtype == torch.float32 else dtype
133
- if getattr(self.experts, "gate_up_proj_bias", None) is not None:
134
- self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
135
- if getattr(self.experts, "down_proj_bias", None) is not None:
136
- self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
137
-
138
- def convert_scales(self):
139
- data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
140
- data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
141
- self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
142
- self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
143
-
144
  def forward(self, x: torch.Tensor) -> tuple:
145
  """
146
  Forward pass through the MoE layer using C++ kernel.
@@ -163,14 +137,37 @@ class MegaBlocksMoeMLP(torch.nn.Module):
163
  and hasattr(self.experts, "gate_up_proj")
164
  and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
165
  ):
166
- self.convert_scales()
 
 
 
 
167
  self.packed_scales = True
168
  self.use_mxfp4 = True
169
 
170
  if not getattr(self, "packed_weight", False) and hasattr(
171
  self.experts, "gate_up_proj"
172
  ):
173
- self.convert_weight(x.dtype, self.use_mxfp4)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  self.packed_weight = True
175
 
176
  # Get MoE parameters
 
105
  return output
106
 
107
 
108
+ class CPUMegaBlocksMoeMLP(torch.nn.Module):
109
  """
110
  C++ optimized MoE MLP using brgemm.
111
  Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
 
115
  """
116
  can_torch_compile: bool = True
117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  def forward(self, x: torch.Tensor) -> tuple:
119
  """
120
  Forward pass through the MoE layer using C++ kernel.
 
137
  and hasattr(self.experts, "gate_up_proj")
138
  and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
139
  ):
140
+ # convert scales
141
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
142
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
143
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
144
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
145
  self.packed_scales = True
146
  self.use_mxfp4 = True
147
 
148
  if not getattr(self, "packed_weight", False) and hasattr(
149
  self.experts, "gate_up_proj"
150
  ):
151
+ # convert weights
152
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
153
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
154
+ if self.use_mxfp4:
155
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
156
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
157
+ else:
158
+ # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
159
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
160
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
161
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
162
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
163
+
164
+ # C++ kernel does not support float32.
165
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
166
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
167
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
168
+ if getattr(self.experts, "down_proj_bias", None) is not None:
169
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
170
+
171
  self.packed_weight = True
172
 
173
  # Get MoE parameters
build/torch210-cxx11-cu128-x86_64-linux/layers.py CHANGED
@@ -1228,5 +1228,5 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
1228
  # Patch for XPU or CPU support
1229
  if hasattr(torch, "xpu") and torch.xpu.is_available():
1230
  from .xpu_fused_moe import MegaBlocksMoeMLP
1231
- elif not torch.cuda.is_available():
1232
- from .cpu_moe_cpp import MegaBlocksMoeMLP
 
1228
  # Patch for XPU or CPU support
1229
  if hasattr(torch, "xpu") and torch.xpu.is_available():
1230
  from .xpu_fused_moe import MegaBlocksMoeMLP
1231
+
1232
+ from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
build/torch210-cxx11-cu130-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:54b7fd567f3b59954adc84c1f59daca1e3aaae0e39ed55eb0dae26b757abec69
3
  size 12041568
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c7bc97e0aadcd94b0f6d3d7198269823d894fd5a36f6af9744864211ae0fd71
3
  size 12041568
build/torch210-cxx11-cu130-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _megablocks_db0709c
3
- ops = torch.ops._megablocks_db0709c
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_megablocks_db0709c::{op_name}"
 
1
  import torch
2
+ from . import _megablocks_099ac3c
3
+ ops = torch.ops._megablocks_099ac3c
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_megablocks_099ac3c::{op_name}"
build/torch210-cxx11-cu130-x86_64-linux/cpu_moe_cpp.py CHANGED
@@ -105,7 +105,7 @@ def fused_moe_cpp(
105
  return output
106
 
107
 
108
- class MegaBlocksMoeMLP(torch.nn.Module):
109
  """
110
  C++ optimized MoE MLP using brgemm.
111
  Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
@@ -115,32 +115,6 @@ class MegaBlocksMoeMLP(torch.nn.Module):
115
  """
116
  can_torch_compile: bool = True
117
 
118
- def convert_weight(self, dtype, use_mxfp4: bool = False):
119
- data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
120
- data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
121
- if use_mxfp4:
122
- self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
123
- self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
124
- else:
125
- # convert_weight_packed onlu supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
126
- data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
127
- data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
128
- self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
129
- self.experts.down_proj.data = ops.convert_weight_packed(data_2)
130
-
131
- # C++ kernel does not support float32.
132
- dtype = torch.bfloat16 if dtype == torch.float32 else dtype
133
- if getattr(self.experts, "gate_up_proj_bias", None) is not None:
134
- self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
135
- if getattr(self.experts, "down_proj_bias", None) is not None:
136
- self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
137
-
138
- def convert_scales(self):
139
- data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
140
- data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
141
- self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
142
- self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
143
-
144
  def forward(self, x: torch.Tensor) -> tuple:
145
  """
146
  Forward pass through the MoE layer using C++ kernel.
@@ -163,14 +137,37 @@ class MegaBlocksMoeMLP(torch.nn.Module):
163
  and hasattr(self.experts, "gate_up_proj")
164
  and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
165
  ):
166
- self.convert_scales()
 
 
 
 
167
  self.packed_scales = True
168
  self.use_mxfp4 = True
169
 
170
  if not getattr(self, "packed_weight", False) and hasattr(
171
  self.experts, "gate_up_proj"
172
  ):
173
- self.convert_weight(x.dtype, self.use_mxfp4)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  self.packed_weight = True
175
 
176
  # Get MoE parameters
 
105
  return output
106
 
107
 
108
+ class CPUMegaBlocksMoeMLP(torch.nn.Module):
109
  """
110
  C++ optimized MoE MLP using brgemm.
111
  Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
 
115
  """
116
  can_torch_compile: bool = True
117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  def forward(self, x: torch.Tensor) -> tuple:
119
  """
120
  Forward pass through the MoE layer using C++ kernel.
 
137
  and hasattr(self.experts, "gate_up_proj")
138
  and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
139
  ):
140
+ # convert scales
141
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
142
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
143
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
144
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
145
  self.packed_scales = True
146
  self.use_mxfp4 = True
147
 
148
  if not getattr(self, "packed_weight", False) and hasattr(
149
  self.experts, "gate_up_proj"
150
  ):
151
+ # convert weights
152
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
153
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
154
+ if self.use_mxfp4:
155
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
156
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
157
+ else:
158
+ # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
159
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
160
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
161
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
162
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
163
+
164
+ # C++ kernel does not support float32.
165
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
166
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
167
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
168
+ if getattr(self.experts, "down_proj_bias", None) is not None:
169
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
170
+
171
  self.packed_weight = True
172
 
173
  # Get MoE parameters
build/torch210-cxx11-cu130-x86_64-linux/layers.py CHANGED
@@ -1228,5 +1228,5 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
1228
  # Patch for XPU or CPU support
1229
  if hasattr(torch, "xpu") and torch.xpu.is_available():
1230
  from .xpu_fused_moe import MegaBlocksMoeMLP
1231
- elif not torch.cuda.is_available():
1232
- from .cpu_moe_cpp import MegaBlocksMoeMLP
 
1228
  # Patch for XPU or CPU support
1229
  if hasattr(torch, "xpu") and torch.xpu.is_available():
1230
  from .xpu_fused_moe import MegaBlocksMoeMLP
1231
+
1232
+ from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
build/torch210-cxx11-xpu20253-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:126a70e85e8005d5a2db89fb2b23fc632280e75e0ae2b0379c9d608bc4e52fac
3
- size 5331944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dbf6091a3c2622e19367385fb8c82b507f841749bc9c4177421884232856c021
3
+ size 4227888
build/torch210-cxx11-xpu20253-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _megablocks_db0709c
3
- ops = torch.ops._megablocks_db0709c
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_megablocks_db0709c::{op_name}"
 
1
  import torch
2
+ from . import _megablocks_099ac3c
3
+ ops = torch.ops._megablocks_099ac3c
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_megablocks_099ac3c::{op_name}"
build/torch210-cxx11-xpu20253-x86_64-linux/cpu_moe_cpp.py CHANGED
@@ -105,7 +105,7 @@ def fused_moe_cpp(
105
  return output
106
 
107
 
108
- class MegaBlocksMoeMLP(torch.nn.Module):
109
  """
110
  C++ optimized MoE MLP using brgemm.
111
  Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
@@ -115,32 +115,6 @@ class MegaBlocksMoeMLP(torch.nn.Module):
115
  """
116
  can_torch_compile: bool = True
117
 
118
- def convert_weight(self, dtype, use_mxfp4: bool = False):
119
- data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
120
- data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
121
- if use_mxfp4:
122
- self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
123
- self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
124
- else:
125
- # convert_weight_packed onlu supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
126
- data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
127
- data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
128
- self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
129
- self.experts.down_proj.data = ops.convert_weight_packed(data_2)
130
-
131
- # C++ kernel does not support float32.
132
- dtype = torch.bfloat16 if dtype == torch.float32 else dtype
133
- if getattr(self.experts, "gate_up_proj_bias", None) is not None:
134
- self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
135
- if getattr(self.experts, "down_proj_bias", None) is not None:
136
- self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
137
-
138
- def convert_scales(self):
139
- data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
140
- data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
141
- self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
142
- self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
143
-
144
  def forward(self, x: torch.Tensor) -> tuple:
145
  """
146
  Forward pass through the MoE layer using C++ kernel.
@@ -163,14 +137,37 @@ class MegaBlocksMoeMLP(torch.nn.Module):
163
  and hasattr(self.experts, "gate_up_proj")
164
  and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
165
  ):
166
- self.convert_scales()
 
 
 
 
167
  self.packed_scales = True
168
  self.use_mxfp4 = True
169
 
170
  if not getattr(self, "packed_weight", False) and hasattr(
171
  self.experts, "gate_up_proj"
172
  ):
173
- self.convert_weight(x.dtype, self.use_mxfp4)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  self.packed_weight = True
175
 
176
  # Get MoE parameters
 
105
  return output
106
 
107
 
108
+ class CPUMegaBlocksMoeMLP(torch.nn.Module):
109
  """
110
  C++ optimized MoE MLP using brgemm.
111
  Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
 
115
  """
116
  can_torch_compile: bool = True
117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  def forward(self, x: torch.Tensor) -> tuple:
119
  """
120
  Forward pass through the MoE layer using C++ kernel.
 
137
  and hasattr(self.experts, "gate_up_proj")
138
  and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
139
  ):
140
+ # convert scales
141
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
142
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
143
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
144
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
145
  self.packed_scales = True
146
  self.use_mxfp4 = True
147
 
148
  if not getattr(self, "packed_weight", False) and hasattr(
149
  self.experts, "gate_up_proj"
150
  ):
151
+ # convert weights
152
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
153
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
154
+ if self.use_mxfp4:
155
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
156
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
157
+ else:
158
+ # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
159
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
160
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
161
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
162
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
163
+
164
+ # C++ kernel does not support float32.
165
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
166
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
167
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
168
+ if getattr(self.experts, "down_proj_bias", None) is not None:
169
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
170
+
171
  self.packed_weight = True
172
 
173
  # Get MoE parameters
build/torch210-cxx11-xpu20253-x86_64-linux/layers.py CHANGED
@@ -1228,5 +1228,5 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
1228
  # Patch for XPU or CPU support
1229
  if hasattr(torch, "xpu") and torch.xpu.is_available():
1230
  from .xpu_fused_moe import MegaBlocksMoeMLP
1231
- elif not torch.cuda.is_available():
1232
- from .cpu_moe_cpp import MegaBlocksMoeMLP
 
1228
  # Patch for XPU or CPU support
1229
  if hasattr(torch, "xpu") and torch.xpu.is_available():
1230
  from .xpu_fused_moe import MegaBlocksMoeMLP
1231
+
1232
+ from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
build/torch29-cxx11-cpu-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:47753740f498270f38d7f4c86788b331cdb1a4f5844e33bbd47c88e2f41018a4
3
- size 463560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b3f1c2f3058c4c5c08291c7a51be003046657e7567454a779911c7cebfdc3d9
3
+ size 2201176
build/torch29-cxx11-cpu-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _megablocks_db0709c
3
- ops = torch.ops._megablocks_db0709c
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_megablocks_db0709c::{op_name}"
 
1
  import torch
2
+ from . import _megablocks_099ac3c
3
+ ops = torch.ops._megablocks_099ac3c
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_megablocks_099ac3c::{op_name}"
build/torch29-cxx11-cpu-x86_64-linux/cpu_moe_cpp.py CHANGED
@@ -105,7 +105,7 @@ def fused_moe_cpp(
105
  return output
106
 
107
 
108
- class MegaBlocksMoeMLP(torch.nn.Module):
109
  """
110
  C++ optimized MoE MLP using brgemm.
111
  Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
@@ -115,32 +115,6 @@ class MegaBlocksMoeMLP(torch.nn.Module):
115
  """
116
  can_torch_compile: bool = True
117
 
118
- def convert_weight(self, dtype, use_mxfp4: bool = False):
119
- data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
120
- data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
121
- if use_mxfp4:
122
- self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
123
- self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
124
- else:
125
- # convert_weight_packed onlu supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
126
- data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
127
- data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
128
- self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
129
- self.experts.down_proj.data = ops.convert_weight_packed(data_2)
130
-
131
- # C++ kernel does not support float32.
132
- dtype = torch.bfloat16 if dtype == torch.float32 else dtype
133
- if getattr(self.experts, "gate_up_proj_bias", None) is not None:
134
- self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
135
- if getattr(self.experts, "down_proj_bias", None) is not None:
136
- self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
137
-
138
- def convert_scales(self):
139
- data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
140
- data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
141
- self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
142
- self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
143
-
144
  def forward(self, x: torch.Tensor) -> tuple:
145
  """
146
  Forward pass through the MoE layer using C++ kernel.
@@ -163,14 +137,37 @@ class MegaBlocksMoeMLP(torch.nn.Module):
163
  and hasattr(self.experts, "gate_up_proj")
164
  and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
165
  ):
166
- self.convert_scales()
 
 
 
 
167
  self.packed_scales = True
168
  self.use_mxfp4 = True
169
 
170
  if not getattr(self, "packed_weight", False) and hasattr(
171
  self.experts, "gate_up_proj"
172
  ):
173
- self.convert_weight(x.dtype, self.use_mxfp4)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  self.packed_weight = True
175
 
176
  # Get MoE parameters
 
105
  return output
106
 
107
 
108
+ class CPUMegaBlocksMoeMLP(torch.nn.Module):
109
  """
110
  C++ optimized MoE MLP using brgemm.
111
  Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
 
115
  """
116
  can_torch_compile: bool = True
117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  def forward(self, x: torch.Tensor) -> tuple:
119
  """
120
  Forward pass through the MoE layer using C++ kernel.
 
137
  and hasattr(self.experts, "gate_up_proj")
138
  and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
139
  ):
140
+ # convert scales
141
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
142
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
143
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
144
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
145
  self.packed_scales = True
146
  self.use_mxfp4 = True
147
 
148
  if not getattr(self, "packed_weight", False) and hasattr(
149
  self.experts, "gate_up_proj"
150
  ):
151
+ # convert weights
152
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
153
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
154
+ if self.use_mxfp4:
155
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
156
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
157
+ else:
158
+ # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
159
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
160
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
161
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
162
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
163
+
164
+ # C++ kernel does not support float32.
165
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
166
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
167
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
168
+ if getattr(self.experts, "down_proj_bias", None) is not None:
169
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
170
+
171
  self.packed_weight = True
172
 
173
  # Get MoE parameters
build/torch29-cxx11-cpu-x86_64-linux/layers.py CHANGED
@@ -1228,5 +1228,5 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
1228
  # Patch for XPU or CPU support
1229
  if hasattr(torch, "xpu") and torch.xpu.is_available():
1230
  from .xpu_fused_moe import MegaBlocksMoeMLP
1231
- elif not torch.cuda.is_available():
1232
- from .cpu_moe_cpp import MegaBlocksMoeMLP
 
1228
  # Patch for XPU or CPU support
1229
  if hasattr(torch, "xpu") and torch.xpu.is_available():
1230
  from .xpu_fused_moe import MegaBlocksMoeMLP
1231
+
1232
+ from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
build/torch29-cxx11-cu126-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:298f12bcf9a7309303c12de19abce186339781436f6434a8ae26b1285532c047
3
  size 15046808
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d58bdd86403eaa524fac1db9361b0025a175f4b10dcddd8fa0bf99892172e54
3
  size 15046808
build/torch29-cxx11-cu126-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _megablocks_db0709c
3
- ops = torch.ops._megablocks_db0709c
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_megablocks_db0709c::{op_name}"
 
1
  import torch
2
+ from . import _megablocks_099ac3c
3
+ ops = torch.ops._megablocks_099ac3c
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_megablocks_099ac3c::{op_name}"
build/torch29-cxx11-cu126-x86_64-linux/cpu_moe_cpp.py CHANGED
@@ -105,7 +105,7 @@ def fused_moe_cpp(
105
  return output
106
 
107
 
108
- class MegaBlocksMoeMLP(torch.nn.Module):
109
  """
110
  C++ optimized MoE MLP using brgemm.
111
  Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
@@ -115,32 +115,6 @@ class MegaBlocksMoeMLP(torch.nn.Module):
115
  """
116
  can_torch_compile: bool = True
117
 
118
- def convert_weight(self, dtype, use_mxfp4: bool = False):
119
- data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
120
- data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
121
- if use_mxfp4:
122
- self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
123
- self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
124
- else:
125
- # convert_weight_packed onlu supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
126
- data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
127
- data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
128
- self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
129
- self.experts.down_proj.data = ops.convert_weight_packed(data_2)
130
-
131
- # C++ kernel does not support float32.
132
- dtype = torch.bfloat16 if dtype == torch.float32 else dtype
133
- if getattr(self.experts, "gate_up_proj_bias", None) is not None:
134
- self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
135
- if getattr(self.experts, "down_proj_bias", None) is not None:
136
- self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
137
-
138
- def convert_scales(self):
139
- data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
140
- data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
141
- self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
142
- self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
143
-
144
  def forward(self, x: torch.Tensor) -> tuple:
145
  """
146
  Forward pass through the MoE layer using C++ kernel.
@@ -163,14 +137,37 @@ class MegaBlocksMoeMLP(torch.nn.Module):
163
  and hasattr(self.experts, "gate_up_proj")
164
  and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
165
  ):
166
- self.convert_scales()
 
 
 
 
167
  self.packed_scales = True
168
  self.use_mxfp4 = True
169
 
170
  if not getattr(self, "packed_weight", False) and hasattr(
171
  self.experts, "gate_up_proj"
172
  ):
173
- self.convert_weight(x.dtype, self.use_mxfp4)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  self.packed_weight = True
175
 
176
  # Get MoE parameters
 
105
  return output
106
 
107
 
108
+ class CPUMegaBlocksMoeMLP(torch.nn.Module):
109
  """
110
  C++ optimized MoE MLP using brgemm.
111
  Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
 
115
  """
116
  can_torch_compile: bool = True
117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  def forward(self, x: torch.Tensor) -> tuple:
119
  """
120
  Forward pass through the MoE layer using C++ kernel.
 
137
  and hasattr(self.experts, "gate_up_proj")
138
  and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
139
  ):
140
+ # convert scales
141
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
142
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
143
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
144
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
145
  self.packed_scales = True
146
  self.use_mxfp4 = True
147
 
148
  if not getattr(self, "packed_weight", False) and hasattr(
149
  self.experts, "gate_up_proj"
150
  ):
151
+ # convert weights
152
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
153
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
154
+ if self.use_mxfp4:
155
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
156
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
157
+ else:
158
+ # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
159
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
160
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
161
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
162
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
163
+
164
+ # C++ kernel does not support float32.
165
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
166
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
167
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
168
+ if getattr(self.experts, "down_proj_bias", None) is not None:
169
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
170
+
171
  self.packed_weight = True
172
 
173
  # Get MoE parameters
build/torch29-cxx11-cu126-x86_64-linux/layers.py CHANGED
@@ -1228,5 +1228,5 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
1228
  # Patch for XPU or CPU support
1229
  if hasattr(torch, "xpu") and torch.xpu.is_available():
1230
  from .xpu_fused_moe import MegaBlocksMoeMLP
1231
- elif not torch.cuda.is_available():
1232
- from .cpu_moe_cpp import MegaBlocksMoeMLP
 
1228
  # Patch for XPU or CPU support
1229
  if hasattr(torch, "xpu") and torch.xpu.is_available():
1230
  from .xpu_fused_moe import MegaBlocksMoeMLP
1231
+
1232
+ from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
build/torch29-cxx11-cu128-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4aaa489ff9216e3e64de07c6df0d848458b4093ba58f34d1982ed23014e28cb9
3
  size 20995680
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5c3c17f0fa54822f12b05fe5c22f8b61ad1a9711a02de13a706e1e8f63e141b
3
  size 20995680
build/torch29-cxx11-cu128-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _megablocks_db0709c
3
- ops = torch.ops._megablocks_db0709c
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_megablocks_db0709c::{op_name}"
 
1
  import torch
2
+ from . import _megablocks_099ac3c
3
+ ops = torch.ops._megablocks_099ac3c
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_megablocks_099ac3c::{op_name}"
build/torch29-cxx11-cu128-x86_64-linux/cpu_moe_cpp.py CHANGED
@@ -105,7 +105,7 @@ def fused_moe_cpp(
105
  return output
106
 
107
 
108
- class MegaBlocksMoeMLP(torch.nn.Module):
109
  """
110
  C++ optimized MoE MLP using brgemm.
111
  Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
@@ -115,32 +115,6 @@ class MegaBlocksMoeMLP(torch.nn.Module):
115
  """
116
  can_torch_compile: bool = True
117
 
118
- def convert_weight(self, dtype, use_mxfp4: bool = False):
119
- data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
120
- data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
121
- if use_mxfp4:
122
- self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
123
- self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
124
- else:
125
- # convert_weight_packed onlu supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
126
- data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
127
- data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
128
- self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
129
- self.experts.down_proj.data = ops.convert_weight_packed(data_2)
130
-
131
- # C++ kernel does not support float32.
132
- dtype = torch.bfloat16 if dtype == torch.float32 else dtype
133
- if getattr(self.experts, "gate_up_proj_bias", None) is not None:
134
- self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
135
- if getattr(self.experts, "down_proj_bias", None) is not None:
136
- self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
137
-
138
- def convert_scales(self):
139
- data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
140
- data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
141
- self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
142
- self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
143
-
144
  def forward(self, x: torch.Tensor) -> tuple:
145
  """
146
  Forward pass through the MoE layer using C++ kernel.
@@ -163,14 +137,37 @@ class MegaBlocksMoeMLP(torch.nn.Module):
163
  and hasattr(self.experts, "gate_up_proj")
164
  and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
165
  ):
166
- self.convert_scales()
 
 
 
 
167
  self.packed_scales = True
168
  self.use_mxfp4 = True
169
 
170
  if not getattr(self, "packed_weight", False) and hasattr(
171
  self.experts, "gate_up_proj"
172
  ):
173
- self.convert_weight(x.dtype, self.use_mxfp4)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  self.packed_weight = True
175
 
176
  # Get MoE parameters
 
105
  return output
106
 
107
 
108
+ class CPUMegaBlocksMoeMLP(torch.nn.Module):
109
  """
110
  C++ optimized MoE MLP using brgemm.
111
  Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
 
115
  """
116
  can_torch_compile: bool = True
117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  def forward(self, x: torch.Tensor) -> tuple:
119
  """
120
  Forward pass through the MoE layer using C++ kernel.
 
137
  and hasattr(self.experts, "gate_up_proj")
138
  and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
139
  ):
140
+ # convert scales
141
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
142
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
143
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
144
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
145
  self.packed_scales = True
146
  self.use_mxfp4 = True
147
 
148
  if not getattr(self, "packed_weight", False) and hasattr(
149
  self.experts, "gate_up_proj"
150
  ):
151
+ # convert weights
152
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
153
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
154
+ if self.use_mxfp4:
155
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
156
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
157
+ else:
158
+ # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
159
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
160
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
161
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
162
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
163
+
164
+ # C++ kernel does not support float32.
165
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
166
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
167
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
168
+ if getattr(self.experts, "down_proj_bias", None) is not None:
169
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
170
+
171
  self.packed_weight = True
172
 
173
  # Get MoE parameters
build/torch29-cxx11-cu128-x86_64-linux/layers.py CHANGED
@@ -1228,5 +1228,5 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
1228
  # Patch for XPU or CPU support
1229
  if hasattr(torch, "xpu") and torch.xpu.is_available():
1230
  from .xpu_fused_moe import MegaBlocksMoeMLP
1231
- elif not torch.cuda.is_available():
1232
- from .cpu_moe_cpp import MegaBlocksMoeMLP
 
1228
  # Patch for XPU or CPU support
1229
  if hasattr(torch, "xpu") and torch.xpu.is_available():
1230
  from .xpu_fused_moe import MegaBlocksMoeMLP
1231
+
1232
+ from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
build/torch29-cxx11-cu130-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8f6d752268e2d2d229f22023130937c39eea95cfa2ac6ee7343aae9f6554d52e
3
  size 12031392
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:609492272ed9672ab824abf87b08f078f409696c8db453ccc5f46dff39d84f98
3
  size 12031392
build/torch29-cxx11-cu130-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _megablocks_db0709c
3
- ops = torch.ops._megablocks_db0709c
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_megablocks_db0709c::{op_name}"
 
1
  import torch
2
+ from . import _megablocks_099ac3c
3
+ ops = torch.ops._megablocks_099ac3c
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_megablocks_099ac3c::{op_name}"
build/torch29-cxx11-cu130-x86_64-linux/cpu_moe_cpp.py CHANGED
@@ -105,7 +105,7 @@ def fused_moe_cpp(
105
  return output
106
 
107
 
108
- class MegaBlocksMoeMLP(torch.nn.Module):
109
  """
110
  C++ optimized MoE MLP using brgemm.
111
  Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
@@ -115,32 +115,6 @@ class MegaBlocksMoeMLP(torch.nn.Module):
115
  """
116
  can_torch_compile: bool = True
117
 
118
- def convert_weight(self, dtype, use_mxfp4: bool = False):
119
- data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
120
- data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
121
- if use_mxfp4:
122
- self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
123
- self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
124
- else:
125
- # convert_weight_packed onlu supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
126
- data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
127
- data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
128
- self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
129
- self.experts.down_proj.data = ops.convert_weight_packed(data_2)
130
-
131
- # C++ kernel does not support float32.
132
- dtype = torch.bfloat16 if dtype == torch.float32 else dtype
133
- if getattr(self.experts, "gate_up_proj_bias", None) is not None:
134
- self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
135
- if getattr(self.experts, "down_proj_bias", None) is not None:
136
- self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
137
-
138
- def convert_scales(self):
139
- data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
140
- data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
141
- self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
142
- self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
143
-
144
  def forward(self, x: torch.Tensor) -> tuple:
145
  """
146
  Forward pass through the MoE layer using C++ kernel.
@@ -163,14 +137,37 @@ class MegaBlocksMoeMLP(torch.nn.Module):
163
  and hasattr(self.experts, "gate_up_proj")
164
  and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
165
  ):
166
- self.convert_scales()
 
 
 
 
167
  self.packed_scales = True
168
  self.use_mxfp4 = True
169
 
170
  if not getattr(self, "packed_weight", False) and hasattr(
171
  self.experts, "gate_up_proj"
172
  ):
173
- self.convert_weight(x.dtype, self.use_mxfp4)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  self.packed_weight = True
175
 
176
  # Get MoE parameters
 
105
  return output
106
 
107
 
108
+ class CPUMegaBlocksMoeMLP(torch.nn.Module):
109
  """
110
  C++ optimized MoE MLP using brgemm.
111
  Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
 
115
  """
116
  can_torch_compile: bool = True
117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  def forward(self, x: torch.Tensor) -> tuple:
119
  """
120
  Forward pass through the MoE layer using C++ kernel.
 
137
  and hasattr(self.experts, "gate_up_proj")
138
  and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
139
  ):
140
+ # convert scales
141
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
142
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
143
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
144
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
145
  self.packed_scales = True
146
  self.use_mxfp4 = True
147
 
148
  if not getattr(self, "packed_weight", False) and hasattr(
149
  self.experts, "gate_up_proj"
150
  ):
151
+ # convert weights
152
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
153
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
154
+ if self.use_mxfp4:
155
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
156
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
157
+ else:
158
+ # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
159
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
160
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
161
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
162
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
163
+
164
+ # C++ kernel does not support float32.
165
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
166
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
167
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
168
+ if getattr(self.experts, "down_proj_bias", None) is not None:
169
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
170
+
171
  self.packed_weight = True
172
 
173
  # Get MoE parameters
build/torch29-cxx11-cu130-x86_64-linux/layers.py CHANGED
@@ -1228,5 +1228,5 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
1228
  # Patch for XPU or CPU support
1229
  if hasattr(torch, "xpu") and torch.xpu.is_available():
1230
  from .xpu_fused_moe import MegaBlocksMoeMLP
1231
- elif not torch.cuda.is_available():
1232
- from .cpu_moe_cpp import MegaBlocksMoeMLP
 
1228
  # Patch for XPU or CPU support
1229
  if hasattr(torch, "xpu") and torch.xpu.is_available():
1230
  from .xpu_fused_moe import MegaBlocksMoeMLP
1231
+
1232
+ from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
build/torch29-cxx11-xpu20252-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dac2f7b352f54ffdcf3e2b6a9487ac305b9cca32ec5f6d6eec140f460378a794
3
- size 5192224
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82d4807a02abe216da87ac6d4fbbf4870fdefa64ef182d09ab3408528107f08b
3
+ size 4075712
build/torch29-cxx11-xpu20252-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _megablocks_db0709c
3
- ops = torch.ops._megablocks_db0709c
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_megablocks_db0709c::{op_name}"
 
1
  import torch
2
+ from . import _megablocks_099ac3c
3
+ ops = torch.ops._megablocks_099ac3c
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_megablocks_099ac3c::{op_name}"
build/torch29-cxx11-xpu20252-x86_64-linux/cpu_moe_cpp.py CHANGED
@@ -105,7 +105,7 @@ def fused_moe_cpp(
105
  return output
106
 
107
 
108
- class MegaBlocksMoeMLP(torch.nn.Module):
109
  """
110
  C++ optimized MoE MLP using brgemm.
111
  Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
@@ -115,32 +115,6 @@ class MegaBlocksMoeMLP(torch.nn.Module):
115
  """
116
  can_torch_compile: bool = True
117
 
118
- def convert_weight(self, dtype, use_mxfp4: bool = False):
119
- data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
120
- data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
121
- if use_mxfp4:
122
- self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
123
- self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
124
- else:
125
- # convert_weight_packed onlu supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
126
- data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
127
- data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
128
- self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
129
- self.experts.down_proj.data = ops.convert_weight_packed(data_2)
130
-
131
- # C++ kernel does not support float32.
132
- dtype = torch.bfloat16 if dtype == torch.float32 else dtype
133
- if getattr(self.experts, "gate_up_proj_bias", None) is not None:
134
- self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
135
- if getattr(self.experts, "down_proj_bias", None) is not None:
136
- self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
137
-
138
- def convert_scales(self):
139
- data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
140
- data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
141
- self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
142
- self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
143
-
144
  def forward(self, x: torch.Tensor) -> tuple:
145
  """
146
  Forward pass through the MoE layer using C++ kernel.
@@ -163,14 +137,37 @@ class MegaBlocksMoeMLP(torch.nn.Module):
163
  and hasattr(self.experts, "gate_up_proj")
164
  and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
165
  ):
166
- self.convert_scales()
 
 
 
 
167
  self.packed_scales = True
168
  self.use_mxfp4 = True
169
 
170
  if not getattr(self, "packed_weight", False) and hasattr(
171
  self.experts, "gate_up_proj"
172
  ):
173
- self.convert_weight(x.dtype, self.use_mxfp4)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  self.packed_weight = True
175
 
176
  # Get MoE parameters
 
105
  return output
106
 
107
 
108
+ class CPUMegaBlocksMoeMLP(torch.nn.Module):
109
  """
110
  C++ optimized MoE MLP using brgemm.
111
  Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
 
115
  """
116
  can_torch_compile: bool = True
117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  def forward(self, x: torch.Tensor) -> tuple:
119
  """
120
  Forward pass through the MoE layer using C++ kernel.
 
137
  and hasattr(self.experts, "gate_up_proj")
138
  and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
139
  ):
140
+ # convert scales
141
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
142
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
143
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
144
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
145
  self.packed_scales = True
146
  self.use_mxfp4 = True
147
 
148
  if not getattr(self, "packed_weight", False) and hasattr(
149
  self.experts, "gate_up_proj"
150
  ):
151
+ # convert weights
152
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
153
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
154
+ if self.use_mxfp4:
155
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
156
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
157
+ else:
158
+ # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
159
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
160
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
161
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
162
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
163
+
164
+ # C++ kernel does not support float32.
165
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
166
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
167
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
168
+ if getattr(self.experts, "down_proj_bias", None) is not None:
169
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
170
+
171
  self.packed_weight = True
172
 
173
  # Get MoE parameters
build/torch29-cxx11-xpu20252-x86_64-linux/layers.py CHANGED
@@ -1228,5 +1228,5 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
1228
  # Patch for XPU or CPU support
1229
  if hasattr(torch, "xpu") and torch.xpu.is_available():
1230
  from .xpu_fused_moe import MegaBlocksMoeMLP
1231
- elif not torch.cuda.is_available():
1232
- from .cpu_moe_cpp import MegaBlocksMoeMLP
 
1228
  # Patch for XPU or CPU support
1229
  if hasattr(torch, "xpu") and torch.xpu.is_available():
1230
  from .xpu_fused_moe import MegaBlocksMoeMLP
1231
+
1232
+ from .cpu_moe_cpp import CPUMegaBlocksMoeMLP