danieldk HF Staff committed on Feb 2

Commit

4f20330

verified ·

1 Parent(s): 82f6f0e

Build uploaded using `kernels`.

Browse files

Files changed (40) hide show

build/torch210-cxx11-cpu-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} +2 -2
build/torch210-cxx11-cpu-x86_64-linux/_ops.py +3 -3
build/torch210-cxx11-cpu-x86_64-linux/cpu_moe_cpp.py +26 -29
build/torch210-cxx11-cpu-x86_64-linux/layers.py +2 -2
build/torch210-cxx11-cu126-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} +1 -1
build/torch210-cxx11-cu126-x86_64-linux/_ops.py +3 -3
build/torch210-cxx11-cu126-x86_64-linux/cpu_moe_cpp.py +26 -29
build/torch210-cxx11-cu126-x86_64-linux/layers.py +2 -2
build/torch210-cxx11-cu128-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} +1 -1
build/torch210-cxx11-cu128-x86_64-linux/_ops.py +3 -3
build/torch210-cxx11-cu128-x86_64-linux/cpu_moe_cpp.py +26 -29
build/torch210-cxx11-cu128-x86_64-linux/layers.py +2 -2
build/torch210-cxx11-cu130-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} +1 -1
build/torch210-cxx11-cu130-x86_64-linux/_ops.py +3 -3
build/torch210-cxx11-cu130-x86_64-linux/cpu_moe_cpp.py +26 -29
build/torch210-cxx11-cu130-x86_64-linux/layers.py +2 -2
build/torch210-cxx11-xpu20253-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} +2 -2
build/torch210-cxx11-xpu20253-x86_64-linux/_ops.py +3 -3
build/torch210-cxx11-xpu20253-x86_64-linux/cpu_moe_cpp.py +26 -29
build/torch210-cxx11-xpu20253-x86_64-linux/layers.py +2 -2
build/torch29-cxx11-cpu-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} +2 -2
build/torch29-cxx11-cpu-x86_64-linux/_ops.py +3 -3
build/torch29-cxx11-cpu-x86_64-linux/cpu_moe_cpp.py +26 -29
build/torch29-cxx11-cpu-x86_64-linux/layers.py +2 -2
build/torch29-cxx11-cu126-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} +1 -1
build/torch29-cxx11-cu126-x86_64-linux/_ops.py +3 -3
build/torch29-cxx11-cu126-x86_64-linux/cpu_moe_cpp.py +26 -29
build/torch29-cxx11-cu126-x86_64-linux/layers.py +2 -2
build/torch29-cxx11-cu128-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} +1 -1
build/torch29-cxx11-cu128-x86_64-linux/_ops.py +3 -3
build/torch29-cxx11-cu128-x86_64-linux/cpu_moe_cpp.py +26 -29
build/torch29-cxx11-cu128-x86_64-linux/layers.py +2 -2
build/torch29-cxx11-cu130-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} +1 -1
build/torch29-cxx11-cu130-x86_64-linux/_ops.py +3 -3
build/torch29-cxx11-cu130-x86_64-linux/cpu_moe_cpp.py +26 -29
build/torch29-cxx11-cu130-x86_64-linux/layers.py +2 -2
build/torch29-cxx11-xpu20252-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} +2 -2
build/torch29-cxx11-xpu20252-x86_64-linux/_ops.py +3 -3
build/torch29-cxx11-xpu20252-x86_64-linux/cpu_moe_cpp.py +26 -29
build/torch29-cxx11-xpu20252-x86_64-linux/layers.py +2 -2

build/torch210-cxx11-cpu-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d106dd5b45ae2a650aba0a07a1e75a0354eb10b68837d0c53dbb628e6d6def9c
-size 481440

 version https://git-lfs.github.com/spec/v1
+oid sha256:3a81c0cc23130a95d05263f0509e8de560183f6472f458f4316c97e6e8d8f533
+size 2219056

build/torch210-cxx11-cpu-x86_64-linux/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_db0709c
-ops = torch.ops._megablocks_db0709c
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_db0709c::{op_name}"

 import torch
+from . import _megablocks_099ac3c
+ops = torch.ops._megablocks_099ac3c
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_megablocks_099ac3c::{op_name}"

build/torch210-cxx11-cpu-x86_64-linux/cpu_moe_cpp.py CHANGED Viewed

@@ -105,7 +105,7 @@ def fused_moe_cpp(
     return output
-class MegaBlocksMoeMLP(torch.nn.Module):
     """
     C++ optimized MoE MLP using brgemm.
     Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
@@ -115,32 +115,6 @@ class MegaBlocksMoeMLP(torch.nn.Module):
     """
     can_torch_compile: bool = True
-    def convert_weight(self, dtype, use_mxfp4: bool = False):
-        data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
-        data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
-        if use_mxfp4:
-            self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
-            self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
-        else:
-            # convert_weight_packed onlu supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
-            data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
-            data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
-            self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
-            self.experts.down_proj.data = ops.convert_weight_packed(data_2)
-        # C++ kernel does not support float32.
-        dtype = torch.bfloat16 if dtype == torch.float32 else dtype
-        if getattr(self.experts, "gate_up_proj_bias", None) is not None:
-            self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
-        if getattr(self.experts, "down_proj_bias", None) is not None:
-            self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
-    def convert_scales(self):
-        data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
-        data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
-        self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
-        self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
     def forward(self, x: torch.Tensor) -> tuple:
         """
         Forward pass through the MoE layer using C++ kernel.
@@ -163,14 +137,37 @@ class MegaBlocksMoeMLP(torch.nn.Module):
             and hasattr(self.experts, "gate_up_proj")
             and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
         ):
-            self.convert_scales()
             self.packed_scales = True
             self.use_mxfp4 = True
         if not getattr(self, "packed_weight", False) and hasattr(
             self.experts, "gate_up_proj"
         ):
-            self.convert_weight(x.dtype, self.use_mxfp4)
             self.packed_weight = True
         # Get MoE parameters

     return output
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
     """
     C++ optimized MoE MLP using brgemm.
     Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
     """
     can_torch_compile: bool = True
     def forward(self, x: torch.Tensor) -> tuple:
         """
         Forward pass through the MoE layer using C++ kernel.
             and hasattr(self.experts, "gate_up_proj")
             and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
         ):
+            # convert scales
+            data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+            data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+            self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+            self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
             self.packed_scales = True
             self.use_mxfp4 = True
         if not getattr(self, "packed_weight", False) and hasattr(
             self.experts, "gate_up_proj"
         ):
+            # convert weights
+            data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+            data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+            if self.use_mxfp4:
+                self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+                self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+            else:
+                # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
+                data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+                data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+                self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+                self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+            # C++ kernel does not support float32.
+            dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+            if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+                self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+            if getattr(self.experts, "down_proj_bias", None) is not None:
+                self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
             self.packed_weight = True
         # Get MoE parameters

build/torch210-cxx11-cpu-x86_64-linux/layers.py CHANGED Viewed

@@ -1228,5 +1228,5 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
 # Patch for XPU or CPU support
 if hasattr(torch, "xpu") and torch.xpu.is_available():
     from .xpu_fused_moe import MegaBlocksMoeMLP
-elif not torch.cuda.is_available():
-    from .cpu_moe_cpp import MegaBlocksMoeMLP

 # Patch for XPU or CPU support
 if hasattr(torch, "xpu") and torch.xpu.is_available():
     from .xpu_fused_moe import MegaBlocksMoeMLP
+from .cpu_moe_cpp import CPUMegaBlocksMoeMLP

build/torch210-cxx11-cu126-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cb14f5ada7b5fcdd840c416b6cf681b7b2a696daaa05fd3433e5b407bfc9ca60
 size 15061032

 version https://git-lfs.github.com/spec/v1
+oid sha256:d482577c55ffe1abd34983ce45eeeb280a817e55f92d6585b5e92173b2860749
 size 15061032

build/torch210-cxx11-cu126-x86_64-linux/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_db0709c
-ops = torch.ops._megablocks_db0709c
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_db0709c::{op_name}"

 import torch
+from . import _megablocks_099ac3c
+ops = torch.ops._megablocks_099ac3c
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_megablocks_099ac3c::{op_name}"

build/torch210-cxx11-cu126-x86_64-linux/cpu_moe_cpp.py CHANGED Viewed

@@ -105,7 +105,7 @@ def fused_moe_cpp(
     return output
-class MegaBlocksMoeMLP(torch.nn.Module):
     """
     C++ optimized MoE MLP using brgemm.
     Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
@@ -115,32 +115,6 @@ class MegaBlocksMoeMLP(torch.nn.Module):
     """
     can_torch_compile: bool = True
-    def convert_weight(self, dtype, use_mxfp4: bool = False):
-        data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
-        data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
-        if use_mxfp4:
-            self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
-            self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
-        else:
-            # convert_weight_packed onlu supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
-            data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
-            data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
-            self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
-            self.experts.down_proj.data = ops.convert_weight_packed(data_2)
-        # C++ kernel does not support float32.
-        dtype = torch.bfloat16 if dtype == torch.float32 else dtype
-        if getattr(self.experts, "gate_up_proj_bias", None) is not None:
-            self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
-        if getattr(self.experts, "down_proj_bias", None) is not None:
-            self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
-    def convert_scales(self):
-        data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
-        data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
-        self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
-        self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
     def forward(self, x: torch.Tensor) -> tuple:
         """
         Forward pass through the MoE layer using C++ kernel.
@@ -163,14 +137,37 @@ class MegaBlocksMoeMLP(torch.nn.Module):
             and hasattr(self.experts, "gate_up_proj")
             and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
         ):
-            self.convert_scales()
             self.packed_scales = True
             self.use_mxfp4 = True
         if not getattr(self, "packed_weight", False) and hasattr(
             self.experts, "gate_up_proj"
         ):
-            self.convert_weight(x.dtype, self.use_mxfp4)
             self.packed_weight = True
         # Get MoE parameters

     return output
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
     """
     C++ optimized MoE MLP using brgemm.
     Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
     """
     can_torch_compile: bool = True
     def forward(self, x: torch.Tensor) -> tuple:
         """
         Forward pass through the MoE layer using C++ kernel.
             and hasattr(self.experts, "gate_up_proj")
             and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
         ):
+            # convert scales
+            data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+            data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+            self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+            self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
             self.packed_scales = True
             self.use_mxfp4 = True
         if not getattr(self, "packed_weight", False) and hasattr(
             self.experts, "gate_up_proj"
         ):
+            # convert weights
+            data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+            data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+            if self.use_mxfp4:
+                self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+                self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+            else:
+                # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
+                data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+                data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+                self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+                self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+            # C++ kernel does not support float32.
+            dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+            if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+                self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+            if getattr(self.experts, "down_proj_bias", None) is not None:
+                self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
             self.packed_weight = True
         # Get MoE parameters

build/torch210-cxx11-cu126-x86_64-linux/layers.py CHANGED Viewed

@@ -1228,5 +1228,5 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
 # Patch for XPU or CPU support
 if hasattr(torch, "xpu") and torch.xpu.is_available():
     from .xpu_fused_moe import MegaBlocksMoeMLP
-elif not torch.cuda.is_available():
-    from .cpu_moe_cpp import MegaBlocksMoeMLP

 # Patch for XPU or CPU support
 if hasattr(torch, "xpu") and torch.xpu.is_available():
     from .xpu_fused_moe import MegaBlocksMoeMLP
+from .cpu_moe_cpp import CPUMegaBlocksMoeMLP

build/torch210-cxx11-cu128-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4bff52333038ec399aeb6a59e6eaa4ab14181a5078f991073e7dc0832d9fd734
 size 21009952

 version https://git-lfs.github.com/spec/v1
+oid sha256:c0876dbd4267e12fa67f24fac60cedbee8e6dd41b85104c4c241b173729bee9a
 size 21009952

build/torch210-cxx11-cu128-x86_64-linux/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_db0709c
-ops = torch.ops._megablocks_db0709c
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_db0709c::{op_name}"

 import torch
+from . import _megablocks_099ac3c
+ops = torch.ops._megablocks_099ac3c
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_megablocks_099ac3c::{op_name}"

build/torch210-cxx11-cu128-x86_64-linux/cpu_moe_cpp.py CHANGED Viewed

@@ -105,7 +105,7 @@ def fused_moe_cpp(
     return output
-class MegaBlocksMoeMLP(torch.nn.Module):
     """
     C++ optimized MoE MLP using brgemm.
     Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
@@ -115,32 +115,6 @@ class MegaBlocksMoeMLP(torch.nn.Module):
     """
     can_torch_compile: bool = True
-    def convert_weight(self, dtype, use_mxfp4: bool = False):
-        data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
-        data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
-        if use_mxfp4:
-            self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
-            self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
-        else:
-            # convert_weight_packed onlu supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
-            data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
-            data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
-            self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
-            self.experts.down_proj.data = ops.convert_weight_packed(data_2)
-        # C++ kernel does not support float32.
-        dtype = torch.bfloat16 if dtype == torch.float32 else dtype
-        if getattr(self.experts, "gate_up_proj_bias", None) is not None:
-            self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
-        if getattr(self.experts, "down_proj_bias", None) is not None:
-            self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
-    def convert_scales(self):
-        data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
-        data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
-        self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
-        self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
     def forward(self, x: torch.Tensor) -> tuple:
         """
         Forward pass through the MoE layer using C++ kernel.
@@ -163,14 +137,37 @@ class MegaBlocksMoeMLP(torch.nn.Module):
             and hasattr(self.experts, "gate_up_proj")
             and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
         ):
-            self.convert_scales()
             self.packed_scales = True
             self.use_mxfp4 = True
         if not getattr(self, "packed_weight", False) and hasattr(
             self.experts, "gate_up_proj"
         ):
-            self.convert_weight(x.dtype, self.use_mxfp4)
             self.packed_weight = True
         # Get MoE parameters

     return output
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
     """
     C++ optimized MoE MLP using brgemm.
     Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
     """
     can_torch_compile: bool = True
     def forward(self, x: torch.Tensor) -> tuple:
         """
         Forward pass through the MoE layer using C++ kernel.
             and hasattr(self.experts, "gate_up_proj")
             and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
         ):
+            # convert scales
+            data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+            data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+            self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+            self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
             self.packed_scales = True
             self.use_mxfp4 = True
         if not getattr(self, "packed_weight", False) and hasattr(
             self.experts, "gate_up_proj"
         ):
+            # convert weights
+            data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+            data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+            if self.use_mxfp4:
+                self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+                self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+            else:
+                # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
+                data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+                data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+                self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+                self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+            # C++ kernel does not support float32.
+            dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+            if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+                self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+            if getattr(self.experts, "down_proj_bias", None) is not None:
+                self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
             self.packed_weight = True
         # Get MoE parameters

build/torch210-cxx11-cu128-x86_64-linux/layers.py CHANGED Viewed

@@ -1228,5 +1228,5 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
 # Patch for XPU or CPU support
 if hasattr(torch, "xpu") and torch.xpu.is_available():
     from .xpu_fused_moe import MegaBlocksMoeMLP
-elif not torch.cuda.is_available():
-    from .cpu_moe_cpp import MegaBlocksMoeMLP

 # Patch for XPU or CPU support
 if hasattr(torch, "xpu") and torch.xpu.is_available():
     from .xpu_fused_moe import MegaBlocksMoeMLP
+from .cpu_moe_cpp import CPUMegaBlocksMoeMLP

build/torch210-cxx11-cu130-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:54b7fd567f3b59954adc84c1f59daca1e3aaae0e39ed55eb0dae26b757abec69
 size 12041568

 version https://git-lfs.github.com/spec/v1
+oid sha256:4c7bc97e0aadcd94b0f6d3d7198269823d894fd5a36f6af9744864211ae0fd71
 size 12041568

build/torch210-cxx11-cu130-x86_64-linux/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_db0709c
-ops = torch.ops._megablocks_db0709c
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_db0709c::{op_name}"

 import torch
+from . import _megablocks_099ac3c
+ops = torch.ops._megablocks_099ac3c
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_megablocks_099ac3c::{op_name}"

build/torch210-cxx11-cu130-x86_64-linux/cpu_moe_cpp.py CHANGED Viewed

@@ -105,7 +105,7 @@ def fused_moe_cpp(
     return output
-class MegaBlocksMoeMLP(torch.nn.Module):
     """
     C++ optimized MoE MLP using brgemm.
     Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
@@ -115,32 +115,6 @@ class MegaBlocksMoeMLP(torch.nn.Module):
     """
     can_torch_compile: bool = True
-    def convert_weight(self, dtype, use_mxfp4: bool = False):
-        data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
-        data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
-        if use_mxfp4:
-            self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
-            self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
-        else:
-            # convert_weight_packed onlu supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
-            data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
-            data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
-            self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
-            self.experts.down_proj.data = ops.convert_weight_packed(data_2)
-        # C++ kernel does not support float32.
-        dtype = torch.bfloat16 if dtype == torch.float32 else dtype
-        if getattr(self.experts, "gate_up_proj_bias", None) is not None:
-            self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
-        if getattr(self.experts, "down_proj_bias", None) is not None:
-            self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
-    def convert_scales(self):
-        data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
-        data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
-        self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
-        self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
     def forward(self, x: torch.Tensor) -> tuple:
         """
         Forward pass through the MoE layer using C++ kernel.
@@ -163,14 +137,37 @@ class MegaBlocksMoeMLP(torch.nn.Module):
             and hasattr(self.experts, "gate_up_proj")
             and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
         ):
-            self.convert_scales()
             self.packed_scales = True
             self.use_mxfp4 = True
         if not getattr(self, "packed_weight", False) and hasattr(
             self.experts, "gate_up_proj"
         ):
-            self.convert_weight(x.dtype, self.use_mxfp4)
             self.packed_weight = True
         # Get MoE parameters

     return output
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
     """
     C++ optimized MoE MLP using brgemm.
     Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
     """
     can_torch_compile: bool = True
     def forward(self, x: torch.Tensor) -> tuple:
         """
         Forward pass through the MoE layer using C++ kernel.
             and hasattr(self.experts, "gate_up_proj")
             and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
         ):
+            # convert scales
+            data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+            data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+            self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+            self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
             self.packed_scales = True
             self.use_mxfp4 = True
         if not getattr(self, "packed_weight", False) and hasattr(
             self.experts, "gate_up_proj"
         ):
+            # convert weights
+            data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+            data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+            if self.use_mxfp4:
+                self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+                self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+            else:
+                # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
+                data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+                data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+                self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+                self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+            # C++ kernel does not support float32.
+            dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+            if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+                self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+            if getattr(self.experts, "down_proj_bias", None) is not None:
+                self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
             self.packed_weight = True
         # Get MoE parameters

build/torch210-cxx11-cu130-x86_64-linux/layers.py CHANGED Viewed

@@ -1228,5 +1228,5 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
 # Patch for XPU or CPU support
 if hasattr(torch, "xpu") and torch.xpu.is_available():
     from .xpu_fused_moe import MegaBlocksMoeMLP
-elif not torch.cuda.is_available():
-    from .cpu_moe_cpp import MegaBlocksMoeMLP

 # Patch for XPU or CPU support
 if hasattr(torch, "xpu") and torch.xpu.is_available():
     from .xpu_fused_moe import MegaBlocksMoeMLP
+from .cpu_moe_cpp import CPUMegaBlocksMoeMLP

build/torch210-cxx11-xpu20253-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:126a70e85e8005d5a2db89fb2b23fc632280e75e0ae2b0379c9d608bc4e52fac
-size 5331944

 version https://git-lfs.github.com/spec/v1
+oid sha256:dbf6091a3c2622e19367385fb8c82b507f841749bc9c4177421884232856c021
+size 4227888

build/torch210-cxx11-xpu20253-x86_64-linux/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_db0709c
-ops = torch.ops._megablocks_db0709c
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_db0709c::{op_name}"

 import torch
+from . import _megablocks_099ac3c
+ops = torch.ops._megablocks_099ac3c
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_megablocks_099ac3c::{op_name}"

build/torch210-cxx11-xpu20253-x86_64-linux/cpu_moe_cpp.py CHANGED Viewed

@@ -105,7 +105,7 @@ def fused_moe_cpp(
     return output
-class MegaBlocksMoeMLP(torch.nn.Module):
     """
     C++ optimized MoE MLP using brgemm.
     Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
@@ -115,32 +115,6 @@ class MegaBlocksMoeMLP(torch.nn.Module):
     """
     can_torch_compile: bool = True
-    def convert_weight(self, dtype, use_mxfp4: bool = False):
-        data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
-        data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
-        if use_mxfp4:
-            self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
-            self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
-        else:
-            # convert_weight_packed onlu supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
-            data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
-            data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
-            self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
-            self.experts.down_proj.data = ops.convert_weight_packed(data_2)
-        # C++ kernel does not support float32.
-        dtype = torch.bfloat16 if dtype == torch.float32 else dtype
-        if getattr(self.experts, "gate_up_proj_bias", None) is not None:
-            self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
-        if getattr(self.experts, "down_proj_bias", None) is not None:
-            self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
-    def convert_scales(self):
-        data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
-        data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
-        self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
-        self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
     def forward(self, x: torch.Tensor) -> tuple:
         """
         Forward pass through the MoE layer using C++ kernel.
@@ -163,14 +137,37 @@ class MegaBlocksMoeMLP(torch.nn.Module):
             and hasattr(self.experts, "gate_up_proj")
             and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
         ):
-            self.convert_scales()
             self.packed_scales = True
             self.use_mxfp4 = True
         if not getattr(self, "packed_weight", False) and hasattr(
             self.experts, "gate_up_proj"
         ):
-            self.convert_weight(x.dtype, self.use_mxfp4)
             self.packed_weight = True
         # Get MoE parameters

     return output
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
     """
     C++ optimized MoE MLP using brgemm.
     Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
     """
     can_torch_compile: bool = True
     def forward(self, x: torch.Tensor) -> tuple:
         """
         Forward pass through the MoE layer using C++ kernel.
             and hasattr(self.experts, "gate_up_proj")
             and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
         ):
+            # convert scales
+            data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+            data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+            self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+            self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
             self.packed_scales = True
             self.use_mxfp4 = True
         if not getattr(self, "packed_weight", False) and hasattr(
             self.experts, "gate_up_proj"
         ):
+            # convert weights
+            data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+            data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+            if self.use_mxfp4:
+                self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+                self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+            else:
+                # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
+                data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+                data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+                self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+                self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+            # C++ kernel does not support float32.
+            dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+            if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+                self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+            if getattr(self.experts, "down_proj_bias", None) is not None:
+                self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
             self.packed_weight = True
         # Get MoE parameters

build/torch210-cxx11-xpu20253-x86_64-linux/layers.py CHANGED Viewed

@@ -1228,5 +1228,5 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
 # Patch for XPU or CPU support
 if hasattr(torch, "xpu") and torch.xpu.is_available():
     from .xpu_fused_moe import MegaBlocksMoeMLP
-elif not torch.cuda.is_available():
-    from .cpu_moe_cpp import MegaBlocksMoeMLP

 # Patch for XPU or CPU support
 if hasattr(torch, "xpu") and torch.xpu.is_available():
     from .xpu_fused_moe import MegaBlocksMoeMLP
+from .cpu_moe_cpp import CPUMegaBlocksMoeMLP

build/torch29-cxx11-cpu-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:47753740f498270f38d7f4c86788b331cdb1a4f5844e33bbd47c88e2f41018a4
-size 463560

 version https://git-lfs.github.com/spec/v1
+oid sha256:8b3f1c2f3058c4c5c08291c7a51be003046657e7567454a779911c7cebfdc3d9
+size 2201176

build/torch29-cxx11-cpu-x86_64-linux/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_db0709c
-ops = torch.ops._megablocks_db0709c
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_db0709c::{op_name}"

 import torch
+from . import _megablocks_099ac3c
+ops = torch.ops._megablocks_099ac3c
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_megablocks_099ac3c::{op_name}"

build/torch29-cxx11-cpu-x86_64-linux/cpu_moe_cpp.py CHANGED Viewed

@@ -105,7 +105,7 @@ def fused_moe_cpp(
     return output
-class MegaBlocksMoeMLP(torch.nn.Module):
     """
     C++ optimized MoE MLP using brgemm.
     Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
@@ -115,32 +115,6 @@ class MegaBlocksMoeMLP(torch.nn.Module):
     """
     can_torch_compile: bool = True
-    def convert_weight(self, dtype, use_mxfp4: bool = False):
-        data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
-        data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
-        if use_mxfp4:
-            self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
-            self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
-        else:
-            # convert_weight_packed onlu supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
-            data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
-            data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
-            self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
-            self.experts.down_proj.data = ops.convert_weight_packed(data_2)
-        # C++ kernel does not support float32.
-        dtype = torch.bfloat16 if dtype == torch.float32 else dtype
-        if getattr(self.experts, "gate_up_proj_bias", None) is not None:
-            self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
-        if getattr(self.experts, "down_proj_bias", None) is not None:
-            self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
-    def convert_scales(self):
-        data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
-        data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
-        self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
-        self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
     def forward(self, x: torch.Tensor) -> tuple:
         """
         Forward pass through the MoE layer using C++ kernel.
@@ -163,14 +137,37 @@ class MegaBlocksMoeMLP(torch.nn.Module):
             and hasattr(self.experts, "gate_up_proj")
             and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
         ):
-            self.convert_scales()
             self.packed_scales = True
             self.use_mxfp4 = True
         if not getattr(self, "packed_weight", False) and hasattr(
             self.experts, "gate_up_proj"
         ):
-            self.convert_weight(x.dtype, self.use_mxfp4)
             self.packed_weight = True
         # Get MoE parameters

     return output
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
     """
     C++ optimized MoE MLP using brgemm.
     Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
     """
     can_torch_compile: bool = True
     def forward(self, x: torch.Tensor) -> tuple:
         """
         Forward pass through the MoE layer using C++ kernel.
             and hasattr(self.experts, "gate_up_proj")
             and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
         ):
+            # convert scales
+            data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+            data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+            self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+            self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
             self.packed_scales = True
             self.use_mxfp4 = True
         if not getattr(self, "packed_weight", False) and hasattr(
             self.experts, "gate_up_proj"
         ):
+            # convert weights
+            data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+            data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+            if self.use_mxfp4:
+                self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+                self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+            else:
+                # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
+                data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+                data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+                self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+                self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+            # C++ kernel does not support float32.
+            dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+            if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+                self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+            if getattr(self.experts, "down_proj_bias", None) is not None:
+                self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
             self.packed_weight = True
         # Get MoE parameters

build/torch29-cxx11-cpu-x86_64-linux/layers.py CHANGED Viewed

@@ -1228,5 +1228,5 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
 # Patch for XPU or CPU support
 if hasattr(torch, "xpu") and torch.xpu.is_available():
     from .xpu_fused_moe import MegaBlocksMoeMLP
-elif not torch.cuda.is_available():
-    from .cpu_moe_cpp import MegaBlocksMoeMLP

 # Patch for XPU or CPU support
 if hasattr(torch, "xpu") and torch.xpu.is_available():
     from .xpu_fused_moe import MegaBlocksMoeMLP
+from .cpu_moe_cpp import CPUMegaBlocksMoeMLP

build/torch29-cxx11-cu126-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:298f12bcf9a7309303c12de19abce186339781436f6434a8ae26b1285532c047
 size 15046808

 version https://git-lfs.github.com/spec/v1
+oid sha256:4d58bdd86403eaa524fac1db9361b0025a175f4b10dcddd8fa0bf99892172e54
 size 15046808

build/torch29-cxx11-cu126-x86_64-linux/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_db0709c
-ops = torch.ops._megablocks_db0709c
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_db0709c::{op_name}"

 import torch
+from . import _megablocks_099ac3c
+ops = torch.ops._megablocks_099ac3c
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_megablocks_099ac3c::{op_name}"

build/torch29-cxx11-cu126-x86_64-linux/cpu_moe_cpp.py CHANGED Viewed

@@ -105,7 +105,7 @@ def fused_moe_cpp(
     return output
-class MegaBlocksMoeMLP(torch.nn.Module):
     """
     C++ optimized MoE MLP using brgemm.
     Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
@@ -115,32 +115,6 @@ class MegaBlocksMoeMLP(torch.nn.Module):
     """
     can_torch_compile: bool = True
-    def convert_weight(self, dtype, use_mxfp4: bool = False):
-        data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
-        data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
-        if use_mxfp4:
-            self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
-            self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
-        else:
-            # convert_weight_packed onlu supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
-            data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
-            data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
-            self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
-            self.experts.down_proj.data = ops.convert_weight_packed(data_2)
-        # C++ kernel does not support float32.
-        dtype = torch.bfloat16 if dtype == torch.float32 else dtype
-        if getattr(self.experts, "gate_up_proj_bias", None) is not None:
-            self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
-        if getattr(self.experts, "down_proj_bias", None) is not None:
-            self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
-    def convert_scales(self):
-        data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
-        data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
-        self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
-        self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
     def forward(self, x: torch.Tensor) -> tuple:
         """
         Forward pass through the MoE layer using C++ kernel.
@@ -163,14 +137,37 @@ class MegaBlocksMoeMLP(torch.nn.Module):
             and hasattr(self.experts, "gate_up_proj")
             and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
         ):
-            self.convert_scales()
             self.packed_scales = True
             self.use_mxfp4 = True
         if not getattr(self, "packed_weight", False) and hasattr(
             self.experts, "gate_up_proj"
         ):
-            self.convert_weight(x.dtype, self.use_mxfp4)
             self.packed_weight = True
         # Get MoE parameters

     return output
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
     """
     C++ optimized MoE MLP using brgemm.
     Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
     """
     can_torch_compile: bool = True
     def forward(self, x: torch.Tensor) -> tuple:
         """
         Forward pass through the MoE layer using C++ kernel.
             and hasattr(self.experts, "gate_up_proj")
             and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
         ):
+            # convert scales
+            data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+            data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+            self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+            self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
             self.packed_scales = True
             self.use_mxfp4 = True
         if not getattr(self, "packed_weight", False) and hasattr(
             self.experts, "gate_up_proj"
         ):
+            # convert weights
+            data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+            data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+            if self.use_mxfp4:
+                self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+                self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+            else:
+                # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
+                data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+                data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+                self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+                self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+            # C++ kernel does not support float32.
+            dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+            if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+                self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+            if getattr(self.experts, "down_proj_bias", None) is not None:
+                self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
             self.packed_weight = True
         # Get MoE parameters

build/torch29-cxx11-cu126-x86_64-linux/layers.py CHANGED Viewed

@@ -1228,5 +1228,5 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
 # Patch for XPU or CPU support
 if hasattr(torch, "xpu") and torch.xpu.is_available():
     from .xpu_fused_moe import MegaBlocksMoeMLP
-elif not torch.cuda.is_available():
-    from .cpu_moe_cpp import MegaBlocksMoeMLP

 # Patch for XPU or CPU support
 if hasattr(torch, "xpu") and torch.xpu.is_available():
     from .xpu_fused_moe import MegaBlocksMoeMLP
+from .cpu_moe_cpp import CPUMegaBlocksMoeMLP

build/torch29-cxx11-cu128-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4aaa489ff9216e3e64de07c6df0d848458b4093ba58f34d1982ed23014e28cb9
 size 20995680

 version https://git-lfs.github.com/spec/v1
+oid sha256:a5c3c17f0fa54822f12b05fe5c22f8b61ad1a9711a02de13a706e1e8f63e141b
 size 20995680

build/torch29-cxx11-cu128-x86_64-linux/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_db0709c
-ops = torch.ops._megablocks_db0709c
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_db0709c::{op_name}"

 import torch
+from . import _megablocks_099ac3c
+ops = torch.ops._megablocks_099ac3c
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_megablocks_099ac3c::{op_name}"

build/torch29-cxx11-cu128-x86_64-linux/cpu_moe_cpp.py CHANGED Viewed

@@ -105,7 +105,7 @@ def fused_moe_cpp(
     return output
-class MegaBlocksMoeMLP(torch.nn.Module):
     """
     C++ optimized MoE MLP using brgemm.
     Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
@@ -115,32 +115,6 @@ class MegaBlocksMoeMLP(torch.nn.Module):
     """
     can_torch_compile: bool = True
-    def convert_weight(self, dtype, use_mxfp4: bool = False):
-        data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
-        data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
-        if use_mxfp4:
-            self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
-            self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
-        else:
-            # convert_weight_packed onlu supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
-            data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
-            data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
-            self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
-            self.experts.down_proj.data = ops.convert_weight_packed(data_2)
-        # C++ kernel does not support float32.
-        dtype = torch.bfloat16 if dtype == torch.float32 else dtype
-        if getattr(self.experts, "gate_up_proj_bias", None) is not None:
-            self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
-        if getattr(self.experts, "down_proj_bias", None) is not None:
-            self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
-    def convert_scales(self):
-        data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
-        data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
-        self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
-        self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
     def forward(self, x: torch.Tensor) -> tuple:
         """
         Forward pass through the MoE layer using C++ kernel.
@@ -163,14 +137,37 @@ class MegaBlocksMoeMLP(torch.nn.Module):
             and hasattr(self.experts, "gate_up_proj")
             and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
         ):
-            self.convert_scales()
             self.packed_scales = True
             self.use_mxfp4 = True
         if not getattr(self, "packed_weight", False) and hasattr(
             self.experts, "gate_up_proj"
         ):
-            self.convert_weight(x.dtype, self.use_mxfp4)
             self.packed_weight = True
         # Get MoE parameters

     return output
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
     """
     C++ optimized MoE MLP using brgemm.
     Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
     """
     can_torch_compile: bool = True
     def forward(self, x: torch.Tensor) -> tuple:
         """
         Forward pass through the MoE layer using C++ kernel.
             and hasattr(self.experts, "gate_up_proj")
             and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
         ):
+            # convert scales
+            data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+            data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+            self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+            self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
             self.packed_scales = True
             self.use_mxfp4 = True
         if not getattr(self, "packed_weight", False) and hasattr(
             self.experts, "gate_up_proj"
         ):
+            # convert weights
+            data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+            data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+            if self.use_mxfp4:
+                self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+                self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+            else:
+                # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
+                data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+                data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+                self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+                self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+            # C++ kernel does not support float32.
+            dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+            if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+                self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+            if getattr(self.experts, "down_proj_bias", None) is not None:
+                self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
             self.packed_weight = True
         # Get MoE parameters

build/torch29-cxx11-cu128-x86_64-linux/layers.py CHANGED Viewed

@@ -1228,5 +1228,5 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
 # Patch for XPU or CPU support
 if hasattr(torch, "xpu") and torch.xpu.is_available():
     from .xpu_fused_moe import MegaBlocksMoeMLP
-elif not torch.cuda.is_available():
-    from .cpu_moe_cpp import MegaBlocksMoeMLP

 # Patch for XPU or CPU support
 if hasattr(torch, "xpu") and torch.xpu.is_available():
     from .xpu_fused_moe import MegaBlocksMoeMLP
+from .cpu_moe_cpp import CPUMegaBlocksMoeMLP

build/torch29-cxx11-cu130-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8f6d752268e2d2d229f22023130937c39eea95cfa2ac6ee7343aae9f6554d52e
 size 12031392

 version https://git-lfs.github.com/spec/v1
+oid sha256:609492272ed9672ab824abf87b08f078f409696c8db453ccc5f46dff39d84f98
 size 12031392

build/torch29-cxx11-cu130-x86_64-linux/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_db0709c
-ops = torch.ops._megablocks_db0709c
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_db0709c::{op_name}"

 import torch
+from . import _megablocks_099ac3c
+ops = torch.ops._megablocks_099ac3c
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_megablocks_099ac3c::{op_name}"

build/torch29-cxx11-cu130-x86_64-linux/cpu_moe_cpp.py CHANGED Viewed

@@ -105,7 +105,7 @@ def fused_moe_cpp(
     return output
-class MegaBlocksMoeMLP(torch.nn.Module):
     """
     C++ optimized MoE MLP using brgemm.
     Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
@@ -115,32 +115,6 @@ class MegaBlocksMoeMLP(torch.nn.Module):
     """
     can_torch_compile: bool = True
-    def convert_weight(self, dtype, use_mxfp4: bool = False):
-        data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
-        data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
-        if use_mxfp4:
-            self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
-            self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
-        else:
-            # convert_weight_packed onlu supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
-            data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
-            data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
-            self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
-            self.experts.down_proj.data = ops.convert_weight_packed(data_2)
-        # C++ kernel does not support float32.
-        dtype = torch.bfloat16 if dtype == torch.float32 else dtype
-        if getattr(self.experts, "gate_up_proj_bias", None) is not None:
-            self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
-        if getattr(self.experts, "down_proj_bias", None) is not None:
-            self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
-    def convert_scales(self):
-        data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
-        data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
-        self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
-        self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
     def forward(self, x: torch.Tensor) -> tuple:
         """
         Forward pass through the MoE layer using C++ kernel.
@@ -163,14 +137,37 @@ class MegaBlocksMoeMLP(torch.nn.Module):
             and hasattr(self.experts, "gate_up_proj")
             and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
         ):
-            self.convert_scales()
             self.packed_scales = True
             self.use_mxfp4 = True
         if not getattr(self, "packed_weight", False) and hasattr(
             self.experts, "gate_up_proj"
         ):
-            self.convert_weight(x.dtype, self.use_mxfp4)
             self.packed_weight = True
         # Get MoE parameters

     return output
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
     """
     C++ optimized MoE MLP using brgemm.
     Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
     """
     can_torch_compile: bool = True
     def forward(self, x: torch.Tensor) -> tuple:
         """
         Forward pass through the MoE layer using C++ kernel.
             and hasattr(self.experts, "gate_up_proj")
             and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
         ):
+            # convert scales
+            data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+            data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+            self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+            self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
             self.packed_scales = True
             self.use_mxfp4 = True
         if not getattr(self, "packed_weight", False) and hasattr(
             self.experts, "gate_up_proj"
         ):
+            # convert weights
+            data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+            data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+            if self.use_mxfp4:
+                self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+                self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+            else:
+                # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
+                data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+                data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+                self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+                self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+            # C++ kernel does not support float32.
+            dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+            if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+                self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+            if getattr(self.experts, "down_proj_bias", None) is not None:
+                self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
             self.packed_weight = True
         # Get MoE parameters

build/torch29-cxx11-cu130-x86_64-linux/layers.py CHANGED Viewed

@@ -1228,5 +1228,5 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
 # Patch for XPU or CPU support
 if hasattr(torch, "xpu") and torch.xpu.is_available():
     from .xpu_fused_moe import MegaBlocksMoeMLP
-elif not torch.cuda.is_available():
-    from .cpu_moe_cpp import MegaBlocksMoeMLP

 # Patch for XPU or CPU support
 if hasattr(torch, "xpu") and torch.xpu.is_available():
     from .xpu_fused_moe import MegaBlocksMoeMLP
+from .cpu_moe_cpp import CPUMegaBlocksMoeMLP

build/torch29-cxx11-xpu20252-x86_64-linux/{_megablocks_db0709c.abi3.so → _megablocks_099ac3c.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:dac2f7b352f54ffdcf3e2b6a9487ac305b9cca32ec5f6d6eec140f460378a794
-size 5192224

 version https://git-lfs.github.com/spec/v1
+oid sha256:82d4807a02abe216da87ac6d4fbbf4870fdefa64ef182d09ab3408528107f08b
+size 4075712

build/torch29-cxx11-xpu20252-x86_64-linux/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_db0709c
-ops = torch.ops._megablocks_db0709c
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_db0709c::{op_name}"

 import torch
+from . import _megablocks_099ac3c
+ops = torch.ops._megablocks_099ac3c
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_megablocks_099ac3c::{op_name}"

build/torch29-cxx11-xpu20252-x86_64-linux/cpu_moe_cpp.py CHANGED Viewed

@@ -105,7 +105,7 @@ def fused_moe_cpp(
     return output
-class MegaBlocksMoeMLP(torch.nn.Module):
     """
     C++ optimized MoE MLP using brgemm.
     Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
@@ -115,32 +115,6 @@ class MegaBlocksMoeMLP(torch.nn.Module):
     """
     can_torch_compile: bool = True
-    def convert_weight(self, dtype, use_mxfp4: bool = False):
-        data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
-        data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
-        if use_mxfp4:
-            self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
-            self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
-        else:
-            # convert_weight_packed onlu supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
-            data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
-            data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
-            self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
-            self.experts.down_proj.data = ops.convert_weight_packed(data_2)
-        # C++ kernel does not support float32.
-        dtype = torch.bfloat16 if dtype == torch.float32 else dtype
-        if getattr(self.experts, "gate_up_proj_bias", None) is not None:
-            self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
-        if getattr(self.experts, "down_proj_bias", None) is not None:
-            self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
-    def convert_scales(self):
-        data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
-        data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
-        self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
-        self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
     def forward(self, x: torch.Tensor) -> tuple:
         """
         Forward pass through the MoE layer using C++ kernel.
@@ -163,14 +137,37 @@ class MegaBlocksMoeMLP(torch.nn.Module):
             and hasattr(self.experts, "gate_up_proj")
             and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
         ):
-            self.convert_scales()
             self.packed_scales = True
             self.use_mxfp4 = True
         if not getattr(self, "packed_weight", False) and hasattr(
             self.experts, "gate_up_proj"
         ):
-            self.convert_weight(x.dtype, self.use_mxfp4)
             self.packed_weight = True
         # Get MoE parameters

     return output
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
     """
     C++ optimized MoE MLP using brgemm.
     Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
     """
     can_torch_compile: bool = True
     def forward(self, x: torch.Tensor) -> tuple:
         """
         Forward pass through the MoE layer using C++ kernel.
             and hasattr(self.experts, "gate_up_proj")
             and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
         ):
+            # convert scales
+            data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+            data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+            self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+            self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
             self.packed_scales = True
             self.use_mxfp4 = True
         if not getattr(self, "packed_weight", False) and hasattr(
             self.experts, "gate_up_proj"
         ):
+            # convert weights
+            data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+            data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+            if self.use_mxfp4:
+                self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+                self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+            else:
+                # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
+                data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+                data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+                self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+                self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+            # C++ kernel does not support float32.
+            dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+            if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+                self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+            if getattr(self.experts, "down_proj_bias", None) is not None:
+                self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
             self.packed_weight = True
         # Get MoE parameters

build/torch29-cxx11-xpu20252-x86_64-linux/layers.py CHANGED Viewed

@@ -1228,5 +1228,5 @@ class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
 # Patch for XPU or CPU support
 if hasattr(torch, "xpu") and torch.xpu.is_available():
     from .xpu_fused_moe import MegaBlocksMoeMLP
-elif not torch.cuda.is_available():
-    from .cpu_moe_cpp import MegaBlocksMoeMLP

 # Patch for XPU or CPU support
 if hasattr(torch, "xpu") and torch.xpu.is_available():
     from .xpu_fused_moe import MegaBlocksMoeMLP
+from .cpu_moe_cpp import CPUMegaBlocksMoeMLP