danieldk HF Staff committed on Mar 5

Commit

af99866

verified ·

1 Parent(s): 4b022f0

Build uploaded using `kernels`.

Browse files

Files changed (18) hide show

build/torch210-cxx11-cu126-aarch64-linux/{_megablocks_cuda_dd32462.abi3.so → _megablocks_cuda_6e04dec.abi3.so} +1 -1
build/torch210-cxx11-cu126-aarch64-linux/_ops.py +3 -3
build/torch210-cxx11-cu126-aarch64-linux/xpu_fused_moe.py +57 -4
build/torch210-cxx11-cu128-aarch64-linux/{_megablocks_cuda_dd32462.abi3.so → _megablocks_cuda_6e04dec.abi3.so} +1 -1
build/torch210-cxx11-cu128-aarch64-linux/_ops.py +3 -3
build/torch210-cxx11-cu128-aarch64-linux/xpu_fused_moe.py +57 -4
build/torch210-cxx11-cu130-aarch64-linux/{_megablocks_cuda_dd32462.abi3.so → _megablocks_cuda_6e04dec.abi3.so} +1 -1
build/torch210-cxx11-cu130-aarch64-linux/_ops.py +3 -3
build/torch210-cxx11-cu130-aarch64-linux/xpu_fused_moe.py +57 -4
build/torch29-cxx11-cu126-aarch64-linux/{_megablocks_cuda_dd32462.abi3.so → _megablocks_cuda_6e04dec.abi3.so} +1 -1
build/torch29-cxx11-cu126-aarch64-linux/_ops.py +3 -3
build/torch29-cxx11-cu126-aarch64-linux/xpu_fused_moe.py +57 -4
build/torch29-cxx11-cu128-aarch64-linux/{_megablocks_cuda_dd32462.abi3.so → _megablocks_cuda_6e04dec.abi3.so} +1 -1
build/torch29-cxx11-cu128-aarch64-linux/_ops.py +3 -3
build/torch29-cxx11-cu128-aarch64-linux/xpu_fused_moe.py +57 -4
build/torch29-cxx11-cu130-aarch64-linux/{_megablocks_cuda_dd32462.abi3.so → _megablocks_cuda_6e04dec.abi3.so} +1 -1
build/torch29-cxx11-cu130-aarch64-linux/_ops.py +3 -3
build/torch29-cxx11-cu130-aarch64-linux/xpu_fused_moe.py +57 -4

build/torch210-cxx11-cu126-aarch64-linux/{_megablocks_cuda_dd32462.abi3.so → _megablocks_cuda_6e04dec.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c22a3296d294dd5d350de36f19583cf331e7cf8a75c4afb2cce263116b149316
 size 15124328

 version https://git-lfs.github.com/spec/v1
+oid sha256:d43ea617155587acccc47750e126596b0438c63c7ada6f3607a2ed4603337f72
 size 15124328

build/torch210-cxx11-cu126-aarch64-linux/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_cuda_dd32462
-ops = torch.ops._megablocks_cuda_dd32462
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_cuda_dd32462::{op_name}"

 import torch
+from . import _megablocks_cuda_6e04dec
+ops = torch.ops._megablocks_cuda_6e04dec
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_megablocks_cuda_6e04dec::{op_name}"

build/torch210-cxx11-cu126-aarch64-linux/xpu_fused_moe.py CHANGED Viewed

@@ -31,12 +31,12 @@ def _register_xpu_fake_kernels():
     _register_if_available(
         "fused_moe_prologue",
-        lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, num_experts_on_rank: None,
     )
     _register_if_available(
         "moe_gather",
-        lambda output, moe_output, topk_weights, unpermuted_row_to_permuted_row, num_experts: None,
     )
     _register_if_available(
@@ -202,6 +202,8 @@ def xpu_fused_moe(hidden_states,
                   n_experts_per_token,
                   activation,
                   num_experts,
                   is_fp8=False,
                   is_int4=False,
                   is_mxfp4=False):
@@ -329,7 +331,7 @@ def xpu_fused_moe(hidden_states,
     config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
     config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
-    workspace = torch.empty(map_offset,
                             dtype=torch.uint8,
                             device=hidden_states.device)
     if topk_ids.dtype == torch.int32:
@@ -341,6 +343,8 @@ def xpu_fused_moe(hidden_states,
         workspace=workspace,
         hidden_size=hidden_size,
         inter_size=inter_size,
         num_experts_on_rank=num_experts_per_node)
     expert_first_token_offset_bytes = workspace[
@@ -351,6 +355,10 @@ def xpu_fused_moe(hidden_states,
         ws_map["unpermuted_row_to_permuted_row"][1]:
         ws_map["unpermuted_row_to_permuted_row"][1] +
         src_to_dest_map_size]
     if torch.compiler.is_compiling():
         expert_first_token_offset = _bytes_to_typed_tensor(
@@ -359,9 +367,13 @@ def xpu_fused_moe(hidden_states,
         unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
             unpermuted_row_to_permuted_row_bytes, torch.int32
         )
     else:
         expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
         unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
     gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
                             ws_map["overlapped_gemm1_gemm2_inputs"][1] +
                             permuted_data_size].view(hidden_states.dtype).view(
@@ -451,7 +463,9 @@ def xpu_fused_moe(hidden_states,
             is_B_mxfp4=is_mxfp4)
     ops.moe_gather(output, gemm2_output, topk_weights,
                                 unpermuted_row_to_permuted_row,
                                 num_experts_per_node)
     return output
@@ -500,6 +514,21 @@ def route_tokens_xpu(
     return logits, expert_weights, expert_indices
 class MegaBlocksMoeMLP(torch.nn.Module):
     can_torch_compile: bool = True
@@ -524,6 +553,23 @@ class MegaBlocksMoeMLP(torch.nn.Module):
             self.experts, "normalize_expert_weights", None
         )
         # Detect activation type - check for GptOss-style swigluoai activation
         # GptOssExperts has alpha and limit attributes for swigluoai
         if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
@@ -598,12 +644,19 @@ class MegaBlocksMoeMLP(torch.nn.Module):
             topk_ids=expert_indices,
             n_experts_per_token=moe_top_k,
             activation=activation,
-            num_experts=moe_num_experts,
             is_fp8=is_fp8,
             is_int4=is_int4,
             is_mxfp4=is_mxfp4,
         )
         # Restore original shape
         output = output.view(in_shape)

     _register_if_available(
         "fused_moe_prologue",
+        lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, ep_rank, ep_size, num_experts_on_rank: None,
     )
     _register_if_available(
         "moe_gather",
+        lambda output, moe_output, topk_weights, permuted_row_to_unpermuted_row, unpermuted_row_to_permuted_row, expert_first_token_offset, num_experts: None,
     )
     _register_if_available(
                   n_experts_per_token,
                   activation,
                   num_experts,
+                  ep_rank=0,
+                  ep_size=1,
                   is_fp8=False,
                   is_int4=False,
                   is_mxfp4=False):
     config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
     config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
+    workspace = torch.zeros(map_offset,
                             dtype=torch.uint8,
                             device=hidden_states.device)
     if topk_ids.dtype == torch.int32:
         workspace=workspace,
         hidden_size=hidden_size,
         inter_size=inter_size,
+        ep_rank=ep_rank,
+        ep_size=ep_size,
         num_experts_on_rank=num_experts_per_node)
     expert_first_token_offset_bytes = workspace[
         ws_map["unpermuted_row_to_permuted_row"][1]:
         ws_map["unpermuted_row_to_permuted_row"][1] +
         src_to_dest_map_size]
+    permuted_row_to_unpermuted_row_bytes = workspace[
+        ws_map["permuted_row_to_unpermuted_row"][1]:
+        ws_map["permuted_row_to_unpermuted_row"][1] +
+        permuted_row_to_unpermuted_row_size]
     if torch.compiler.is_compiling():
         expert_first_token_offset = _bytes_to_typed_tensor(
         unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
             unpermuted_row_to_permuted_row_bytes, torch.int32
         )
+        permuted_row_to_unpermuted_row = _bytes_to_typed_tensor(
+            permuted_row_to_unpermuted_row_bytes, torch.int32
+        )
     else:
         expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
         unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
+        permuted_row_to_unpermuted_row = permuted_row_to_unpermuted_row_bytes.view(torch.int32)
     gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
                             ws_map["overlapped_gemm1_gemm2_inputs"][1] +
                             permuted_data_size].view(hidden_states.dtype).view(
             is_B_mxfp4=is_mxfp4)
     ops.moe_gather(output, gemm2_output, topk_weights,
+                                permuted_row_to_unpermuted_row,
                                 unpermuted_row_to_permuted_row,
+                                expert_first_token_offset,
                                 num_experts_per_node)
     return output
     return logits, expert_weights, expert_indices
+def _get_device_mesh(model):
+    """Extract device_mesh from child's unused pre_hook closure for EP support."""
+    try:
+        hook = next(
+            h
+            for h in model.experts._forward_pre_hooks.values()
+            if "device_mesh" in h.__code__.co_freevars
+        )
+        return hook.__closure__[
+            hook.__code__.co_freevars.index("device_mesh")
+        ].cell_contents
+    except Exception:
+        return None
 class MegaBlocksMoeMLP(torch.nn.Module):
     can_torch_compile: bool = True
             self.experts, "normalize_expert_weights", None
         )
+        # Get EP (Expert Parallelism) parameters
+        ep_size = 1
+        ep_rank = 0
+        expert_parallel_group = getattr(self, "expert_parallel_group", None)
+        if expert_parallel_group is None:
+            device_mesh = _get_device_mesh(self)
+            if device_mesh is not None:
+                expert_parallel_group = device_mesh.get_group()
+        if expert_parallel_group is not None:
+            import torch.distributed as dist
+            if dist.is_initialized():
+                ep_size = dist.get_world_size(expert_parallel_group)
+                ep_rank = dist.get_rank(expert_parallel_group)
+        # Number of experts on this rank
+        num_experts_on_rank = moe_num_experts // ep_size
         # Detect activation type - check for GptOss-style swigluoai activation
         # GptOssExperts has alpha and limit attributes for swigluoai
         if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
             topk_ids=expert_indices,
             n_experts_per_token=moe_top_k,
             activation=activation,
+            num_experts=num_experts_on_rank,
+            ep_rank=ep_rank,
+            ep_size=ep_size,
             is_fp8=is_fp8,
             is_int4=is_int4,
             is_mxfp4=is_mxfp4,
         )
+        # All-reduce across EP group to combine partial expert outputs
+        if ep_size > 1 and expert_parallel_group is not None:
+            import torch.distributed as dist
+            dist.all_reduce(output, op=dist.ReduceOp.SUM, group=expert_parallel_group)
         # Restore original shape
         output = output.view(in_shape)

build/torch210-cxx11-cu128-aarch64-linux/{_megablocks_cuda_dd32462.abi3.so → _megablocks_cuda_6e04dec.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ba9d77e7863a9a03527ddfe47cc818b1089f384930e969c575be2da559c052f5
 size 21088232

 version https://git-lfs.github.com/spec/v1
+oid sha256:12705f4547b6a55442c52e081a303d4407202cdc26522f7269c983b627946ab9
 size 21088232

build/torch210-cxx11-cu128-aarch64-linux/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_cuda_dd32462
-ops = torch.ops._megablocks_cuda_dd32462
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_cuda_dd32462::{op_name}"

 import torch
+from . import _megablocks_cuda_6e04dec
+ops = torch.ops._megablocks_cuda_6e04dec
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_megablocks_cuda_6e04dec::{op_name}"

build/torch210-cxx11-cu128-aarch64-linux/xpu_fused_moe.py CHANGED Viewed

@@ -31,12 +31,12 @@ def _register_xpu_fake_kernels():
     _register_if_available(
         "fused_moe_prologue",
-        lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, num_experts_on_rank: None,
     )
     _register_if_available(
         "moe_gather",
-        lambda output, moe_output, topk_weights, unpermuted_row_to_permuted_row, num_experts: None,
     )
     _register_if_available(
@@ -202,6 +202,8 @@ def xpu_fused_moe(hidden_states,
                   n_experts_per_token,
                   activation,
                   num_experts,
                   is_fp8=False,
                   is_int4=False,
                   is_mxfp4=False):
@@ -329,7 +331,7 @@ def xpu_fused_moe(hidden_states,
     config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
     config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
-    workspace = torch.empty(map_offset,
                             dtype=torch.uint8,
                             device=hidden_states.device)
     if topk_ids.dtype == torch.int32:
@@ -341,6 +343,8 @@ def xpu_fused_moe(hidden_states,
         workspace=workspace,
         hidden_size=hidden_size,
         inter_size=inter_size,
         num_experts_on_rank=num_experts_per_node)
     expert_first_token_offset_bytes = workspace[
@@ -351,6 +355,10 @@ def xpu_fused_moe(hidden_states,
         ws_map["unpermuted_row_to_permuted_row"][1]:
         ws_map["unpermuted_row_to_permuted_row"][1] +
         src_to_dest_map_size]
     if torch.compiler.is_compiling():
         expert_first_token_offset = _bytes_to_typed_tensor(
@@ -359,9 +367,13 @@ def xpu_fused_moe(hidden_states,
         unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
             unpermuted_row_to_permuted_row_bytes, torch.int32
         )
     else:
         expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
         unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
     gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
                             ws_map["overlapped_gemm1_gemm2_inputs"][1] +
                             permuted_data_size].view(hidden_states.dtype).view(
@@ -451,7 +463,9 @@ def xpu_fused_moe(hidden_states,
             is_B_mxfp4=is_mxfp4)
     ops.moe_gather(output, gemm2_output, topk_weights,
                                 unpermuted_row_to_permuted_row,
                                 num_experts_per_node)
     return output
@@ -500,6 +514,21 @@ def route_tokens_xpu(
     return logits, expert_weights, expert_indices
 class MegaBlocksMoeMLP(torch.nn.Module):
     can_torch_compile: bool = True
@@ -524,6 +553,23 @@ class MegaBlocksMoeMLP(torch.nn.Module):
             self.experts, "normalize_expert_weights", None
         )
         # Detect activation type - check for GptOss-style swigluoai activation
         # GptOssExperts has alpha and limit attributes for swigluoai
         if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
@@ -598,12 +644,19 @@ class MegaBlocksMoeMLP(torch.nn.Module):
             topk_ids=expert_indices,
             n_experts_per_token=moe_top_k,
             activation=activation,
-            num_experts=moe_num_experts,
             is_fp8=is_fp8,
             is_int4=is_int4,
             is_mxfp4=is_mxfp4,
         )
         # Restore original shape
         output = output.view(in_shape)

     _register_if_available(
         "fused_moe_prologue",
+        lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, ep_rank, ep_size, num_experts_on_rank: None,
     )
     _register_if_available(
         "moe_gather",
+        lambda output, moe_output, topk_weights, permuted_row_to_unpermuted_row, unpermuted_row_to_permuted_row, expert_first_token_offset, num_experts: None,
     )
     _register_if_available(
                   n_experts_per_token,
                   activation,
                   num_experts,
+                  ep_rank=0,
+                  ep_size=1,
                   is_fp8=False,
                   is_int4=False,
                   is_mxfp4=False):
     config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
     config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
+    workspace = torch.zeros(map_offset,
                             dtype=torch.uint8,
                             device=hidden_states.device)
     if topk_ids.dtype == torch.int32:
         workspace=workspace,
         hidden_size=hidden_size,
         inter_size=inter_size,
+        ep_rank=ep_rank,
+        ep_size=ep_size,
         num_experts_on_rank=num_experts_per_node)
     expert_first_token_offset_bytes = workspace[
         ws_map["unpermuted_row_to_permuted_row"][1]:
         ws_map["unpermuted_row_to_permuted_row"][1] +
         src_to_dest_map_size]
+    permuted_row_to_unpermuted_row_bytes = workspace[
+        ws_map["permuted_row_to_unpermuted_row"][1]:
+        ws_map["permuted_row_to_unpermuted_row"][1] +
+        permuted_row_to_unpermuted_row_size]
     if torch.compiler.is_compiling():
         expert_first_token_offset = _bytes_to_typed_tensor(
         unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
             unpermuted_row_to_permuted_row_bytes, torch.int32
         )
+        permuted_row_to_unpermuted_row = _bytes_to_typed_tensor(
+            permuted_row_to_unpermuted_row_bytes, torch.int32
+        )
     else:
         expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
         unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
+        permuted_row_to_unpermuted_row = permuted_row_to_unpermuted_row_bytes.view(torch.int32)
     gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
                             ws_map["overlapped_gemm1_gemm2_inputs"][1] +
                             permuted_data_size].view(hidden_states.dtype).view(
             is_B_mxfp4=is_mxfp4)
     ops.moe_gather(output, gemm2_output, topk_weights,
+                                permuted_row_to_unpermuted_row,
                                 unpermuted_row_to_permuted_row,
+                                expert_first_token_offset,
                                 num_experts_per_node)
     return output
     return logits, expert_weights, expert_indices
+def _get_device_mesh(model):
+    """Extract device_mesh from child's unused pre_hook closure for EP support."""
+    try:
+        hook = next(
+            h
+            for h in model.experts._forward_pre_hooks.values()
+            if "device_mesh" in h.__code__.co_freevars
+        )
+        return hook.__closure__[
+            hook.__code__.co_freevars.index("device_mesh")
+        ].cell_contents
+    except Exception:
+        return None
 class MegaBlocksMoeMLP(torch.nn.Module):
     can_torch_compile: bool = True
             self.experts, "normalize_expert_weights", None
         )
+        # Get EP (Expert Parallelism) parameters
+        ep_size = 1
+        ep_rank = 0
+        expert_parallel_group = getattr(self, "expert_parallel_group", None)
+        if expert_parallel_group is None:
+            device_mesh = _get_device_mesh(self)
+            if device_mesh is not None:
+                expert_parallel_group = device_mesh.get_group()
+        if expert_parallel_group is not None:
+            import torch.distributed as dist
+            if dist.is_initialized():
+                ep_size = dist.get_world_size(expert_parallel_group)
+                ep_rank = dist.get_rank(expert_parallel_group)
+        # Number of experts on this rank
+        num_experts_on_rank = moe_num_experts // ep_size
         # Detect activation type - check for GptOss-style swigluoai activation
         # GptOssExperts has alpha and limit attributes for swigluoai
         if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
             topk_ids=expert_indices,
             n_experts_per_token=moe_top_k,
             activation=activation,
+            num_experts=num_experts_on_rank,
+            ep_rank=ep_rank,
+            ep_size=ep_size,
             is_fp8=is_fp8,
             is_int4=is_int4,
             is_mxfp4=is_mxfp4,
         )
+        # All-reduce across EP group to combine partial expert outputs
+        if ep_size > 1 and expert_parallel_group is not None:
+            import torch.distributed as dist
+            dist.all_reduce(output, op=dist.ReduceOp.SUM, group=expert_parallel_group)
         # Restore original shape
         output = output.view(in_shape)

build/torch210-cxx11-cu130-aarch64-linux/{_megablocks_cuda_dd32462.abi3.so → _megablocks_cuda_6e04dec.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:71bc5b507fe6153b2bdcd4f54a763bc90cf863baa85f026afd25d4eb1a82adb6
 size 12073200

 version https://git-lfs.github.com/spec/v1
+oid sha256:ca7f2de93adbb930ffecaea6953cb94c870333295d05eade3c9c17296aa766a0
 size 12073200

build/torch210-cxx11-cu130-aarch64-linux/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_cuda_dd32462
-ops = torch.ops._megablocks_cuda_dd32462
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_cuda_dd32462::{op_name}"

 import torch
+from . import _megablocks_cuda_6e04dec
+ops = torch.ops._megablocks_cuda_6e04dec
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_megablocks_cuda_6e04dec::{op_name}"

build/torch210-cxx11-cu130-aarch64-linux/xpu_fused_moe.py CHANGED Viewed

@@ -31,12 +31,12 @@ def _register_xpu_fake_kernels():
     _register_if_available(
         "fused_moe_prologue",
-        lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, num_experts_on_rank: None,
     )
     _register_if_available(
         "moe_gather",
-        lambda output, moe_output, topk_weights, unpermuted_row_to_permuted_row, num_experts: None,
     )
     _register_if_available(
@@ -202,6 +202,8 @@ def xpu_fused_moe(hidden_states,
                   n_experts_per_token,
                   activation,
                   num_experts,
                   is_fp8=False,
                   is_int4=False,
                   is_mxfp4=False):
@@ -329,7 +331,7 @@ def xpu_fused_moe(hidden_states,
     config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
     config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
-    workspace = torch.empty(map_offset,
                             dtype=torch.uint8,
                             device=hidden_states.device)
     if topk_ids.dtype == torch.int32:
@@ -341,6 +343,8 @@ def xpu_fused_moe(hidden_states,
         workspace=workspace,
         hidden_size=hidden_size,
         inter_size=inter_size,
         num_experts_on_rank=num_experts_per_node)
     expert_first_token_offset_bytes = workspace[
@@ -351,6 +355,10 @@ def xpu_fused_moe(hidden_states,
         ws_map["unpermuted_row_to_permuted_row"][1]:
         ws_map["unpermuted_row_to_permuted_row"][1] +
         src_to_dest_map_size]
     if torch.compiler.is_compiling():
         expert_first_token_offset = _bytes_to_typed_tensor(
@@ -359,9 +367,13 @@ def xpu_fused_moe(hidden_states,
         unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
             unpermuted_row_to_permuted_row_bytes, torch.int32
         )
     else:
         expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
         unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
     gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
                             ws_map["overlapped_gemm1_gemm2_inputs"][1] +
                             permuted_data_size].view(hidden_states.dtype).view(
@@ -451,7 +463,9 @@ def xpu_fused_moe(hidden_states,
             is_B_mxfp4=is_mxfp4)
     ops.moe_gather(output, gemm2_output, topk_weights,
                                 unpermuted_row_to_permuted_row,
                                 num_experts_per_node)
     return output
@@ -500,6 +514,21 @@ def route_tokens_xpu(
     return logits, expert_weights, expert_indices
 class MegaBlocksMoeMLP(torch.nn.Module):
     can_torch_compile: bool = True
@@ -524,6 +553,23 @@ class MegaBlocksMoeMLP(torch.nn.Module):
             self.experts, "normalize_expert_weights", None
         )
         # Detect activation type - check for GptOss-style swigluoai activation
         # GptOssExperts has alpha and limit attributes for swigluoai
         if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
@@ -598,12 +644,19 @@ class MegaBlocksMoeMLP(torch.nn.Module):
             topk_ids=expert_indices,
             n_experts_per_token=moe_top_k,
             activation=activation,
-            num_experts=moe_num_experts,
             is_fp8=is_fp8,
             is_int4=is_int4,
             is_mxfp4=is_mxfp4,
         )
         # Restore original shape
         output = output.view(in_shape)

     _register_if_available(
         "fused_moe_prologue",
+        lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, ep_rank, ep_size, num_experts_on_rank: None,
     )
     _register_if_available(
         "moe_gather",
+        lambda output, moe_output, topk_weights, permuted_row_to_unpermuted_row, unpermuted_row_to_permuted_row, expert_first_token_offset, num_experts: None,
     )
     _register_if_available(
                   n_experts_per_token,
                   activation,
                   num_experts,
+                  ep_rank=0,
+                  ep_size=1,
                   is_fp8=False,
                   is_int4=False,
                   is_mxfp4=False):
     config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
     config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
+    workspace = torch.zeros(map_offset,
                             dtype=torch.uint8,
                             device=hidden_states.device)
     if topk_ids.dtype == torch.int32:
         workspace=workspace,
         hidden_size=hidden_size,
         inter_size=inter_size,
+        ep_rank=ep_rank,
+        ep_size=ep_size,
         num_experts_on_rank=num_experts_per_node)
     expert_first_token_offset_bytes = workspace[
         ws_map["unpermuted_row_to_permuted_row"][1]:
         ws_map["unpermuted_row_to_permuted_row"][1] +
         src_to_dest_map_size]
+    permuted_row_to_unpermuted_row_bytes = workspace[
+        ws_map["permuted_row_to_unpermuted_row"][1]:
+        ws_map["permuted_row_to_unpermuted_row"][1] +
+        permuted_row_to_unpermuted_row_size]
     if torch.compiler.is_compiling():
         expert_first_token_offset = _bytes_to_typed_tensor(
         unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
             unpermuted_row_to_permuted_row_bytes, torch.int32
         )
+        permuted_row_to_unpermuted_row = _bytes_to_typed_tensor(
+            permuted_row_to_unpermuted_row_bytes, torch.int32
+        )
     else:
         expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
         unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
+        permuted_row_to_unpermuted_row = permuted_row_to_unpermuted_row_bytes.view(torch.int32)
     gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
                             ws_map["overlapped_gemm1_gemm2_inputs"][1] +
                             permuted_data_size].view(hidden_states.dtype).view(
             is_B_mxfp4=is_mxfp4)
     ops.moe_gather(output, gemm2_output, topk_weights,
+                                permuted_row_to_unpermuted_row,
                                 unpermuted_row_to_permuted_row,
+                                expert_first_token_offset,
                                 num_experts_per_node)
     return output
     return logits, expert_weights, expert_indices
+def _get_device_mesh(model):
+    """Extract device_mesh from child's unused pre_hook closure for EP support."""
+    try:
+        hook = next(
+            h
+            for h in model.experts._forward_pre_hooks.values()
+            if "device_mesh" in h.__code__.co_freevars
+        )
+        return hook.__closure__[
+            hook.__code__.co_freevars.index("device_mesh")
+        ].cell_contents
+    except Exception:
+        return None
 class MegaBlocksMoeMLP(torch.nn.Module):
     can_torch_compile: bool = True
             self.experts, "normalize_expert_weights", None
         )
+        # Get EP (Expert Parallelism) parameters
+        ep_size = 1
+        ep_rank = 0
+        expert_parallel_group = getattr(self, "expert_parallel_group", None)
+        if expert_parallel_group is None:
+            device_mesh = _get_device_mesh(self)
+            if device_mesh is not None:
+                expert_parallel_group = device_mesh.get_group()
+        if expert_parallel_group is not None:
+            import torch.distributed as dist
+            if dist.is_initialized():
+                ep_size = dist.get_world_size(expert_parallel_group)
+                ep_rank = dist.get_rank(expert_parallel_group)
+        # Number of experts on this rank
+        num_experts_on_rank = moe_num_experts // ep_size
         # Detect activation type - check for GptOss-style swigluoai activation
         # GptOssExperts has alpha and limit attributes for swigluoai
         if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
             topk_ids=expert_indices,
             n_experts_per_token=moe_top_k,
             activation=activation,
+            num_experts=num_experts_on_rank,
+            ep_rank=ep_rank,
+            ep_size=ep_size,
             is_fp8=is_fp8,
             is_int4=is_int4,
             is_mxfp4=is_mxfp4,
         )
+        # All-reduce across EP group to combine partial expert outputs
+        if ep_size > 1 and expert_parallel_group is not None:
+            import torch.distributed as dist
+            dist.all_reduce(output, op=dist.ReduceOp.SUM, group=expert_parallel_group)
         # Restore original shape
         output = output.view(in_shape)

build/torch29-cxx11-cu126-aarch64-linux/{_megablocks_cuda_dd32462.abi3.so → _megablocks_cuda_6e04dec.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4fe1c804c13c22f3a6ba6b7d104ca92da9734c3dc463f085384eb83750769a96
 size 15121720

 version https://git-lfs.github.com/spec/v1
+oid sha256:581f5d3cd17031f674e6da22c23430881408630004e4ece5a57f9c36583665b5
 size 15121720

build/torch29-cxx11-cu126-aarch64-linux/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_cuda_dd32462
-ops = torch.ops._megablocks_cuda_dd32462
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_cuda_dd32462::{op_name}"

 import torch
+from . import _megablocks_cuda_6e04dec
+ops = torch.ops._megablocks_cuda_6e04dec
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_megablocks_cuda_6e04dec::{op_name}"

build/torch29-cxx11-cu126-aarch64-linux/xpu_fused_moe.py CHANGED Viewed

@@ -31,12 +31,12 @@ def _register_xpu_fake_kernels():
     _register_if_available(
         "fused_moe_prologue",
-        lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, num_experts_on_rank: None,
     )
     _register_if_available(
         "moe_gather",
-        lambda output, moe_output, topk_weights, unpermuted_row_to_permuted_row, num_experts: None,
     )
     _register_if_available(
@@ -202,6 +202,8 @@ def xpu_fused_moe(hidden_states,
                   n_experts_per_token,
                   activation,
                   num_experts,
                   is_fp8=False,
                   is_int4=False,
                   is_mxfp4=False):
@@ -329,7 +331,7 @@ def xpu_fused_moe(hidden_states,
     config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
     config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
-    workspace = torch.empty(map_offset,
                             dtype=torch.uint8,
                             device=hidden_states.device)
     if topk_ids.dtype == torch.int32:
@@ -341,6 +343,8 @@ def xpu_fused_moe(hidden_states,
         workspace=workspace,
         hidden_size=hidden_size,
         inter_size=inter_size,
         num_experts_on_rank=num_experts_per_node)
     expert_first_token_offset_bytes = workspace[
@@ -351,6 +355,10 @@ def xpu_fused_moe(hidden_states,
         ws_map["unpermuted_row_to_permuted_row"][1]:
         ws_map["unpermuted_row_to_permuted_row"][1] +
         src_to_dest_map_size]
     if torch.compiler.is_compiling():
         expert_first_token_offset = _bytes_to_typed_tensor(
@@ -359,9 +367,13 @@ def xpu_fused_moe(hidden_states,
         unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
             unpermuted_row_to_permuted_row_bytes, torch.int32
         )
     else:
         expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
         unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
     gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
                             ws_map["overlapped_gemm1_gemm2_inputs"][1] +
                             permuted_data_size].view(hidden_states.dtype).view(
@@ -451,7 +463,9 @@ def xpu_fused_moe(hidden_states,
             is_B_mxfp4=is_mxfp4)
     ops.moe_gather(output, gemm2_output, topk_weights,
                                 unpermuted_row_to_permuted_row,
                                 num_experts_per_node)
     return output
@@ -500,6 +514,21 @@ def route_tokens_xpu(
     return logits, expert_weights, expert_indices
 class MegaBlocksMoeMLP(torch.nn.Module):
     can_torch_compile: bool = True
@@ -524,6 +553,23 @@ class MegaBlocksMoeMLP(torch.nn.Module):
             self.experts, "normalize_expert_weights", None
         )
         # Detect activation type - check for GptOss-style swigluoai activation
         # GptOssExperts has alpha and limit attributes for swigluoai
         if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
@@ -598,12 +644,19 @@ class MegaBlocksMoeMLP(torch.nn.Module):
             topk_ids=expert_indices,
             n_experts_per_token=moe_top_k,
             activation=activation,
-            num_experts=moe_num_experts,
             is_fp8=is_fp8,
             is_int4=is_int4,
             is_mxfp4=is_mxfp4,
         )
         # Restore original shape
         output = output.view(in_shape)

     _register_if_available(
         "fused_moe_prologue",
+        lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, ep_rank, ep_size, num_experts_on_rank: None,
     )
     _register_if_available(
         "moe_gather",
+        lambda output, moe_output, topk_weights, permuted_row_to_unpermuted_row, unpermuted_row_to_permuted_row, expert_first_token_offset, num_experts: None,
     )
     _register_if_available(
                   n_experts_per_token,
                   activation,
                   num_experts,
+                  ep_rank=0,
+                  ep_size=1,
                   is_fp8=False,
                   is_int4=False,
                   is_mxfp4=False):
     config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
     config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
+    workspace = torch.zeros(map_offset,
                             dtype=torch.uint8,
                             device=hidden_states.device)
     if topk_ids.dtype == torch.int32:
         workspace=workspace,
         hidden_size=hidden_size,
         inter_size=inter_size,
+        ep_rank=ep_rank,
+        ep_size=ep_size,
         num_experts_on_rank=num_experts_per_node)
     expert_first_token_offset_bytes = workspace[
         ws_map["unpermuted_row_to_permuted_row"][1]:
         ws_map["unpermuted_row_to_permuted_row"][1] +
         src_to_dest_map_size]
+    permuted_row_to_unpermuted_row_bytes = workspace[
+        ws_map["permuted_row_to_unpermuted_row"][1]:
+        ws_map["permuted_row_to_unpermuted_row"][1] +
+        permuted_row_to_unpermuted_row_size]
     if torch.compiler.is_compiling():
         expert_first_token_offset = _bytes_to_typed_tensor(
         unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
             unpermuted_row_to_permuted_row_bytes, torch.int32
         )
+        permuted_row_to_unpermuted_row = _bytes_to_typed_tensor(
+            permuted_row_to_unpermuted_row_bytes, torch.int32
+        )
     else:
         expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
         unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
+        permuted_row_to_unpermuted_row = permuted_row_to_unpermuted_row_bytes.view(torch.int32)
     gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
                             ws_map["overlapped_gemm1_gemm2_inputs"][1] +
                             permuted_data_size].view(hidden_states.dtype).view(
             is_B_mxfp4=is_mxfp4)
     ops.moe_gather(output, gemm2_output, topk_weights,
+                                permuted_row_to_unpermuted_row,
                                 unpermuted_row_to_permuted_row,
+                                expert_first_token_offset,
                                 num_experts_per_node)
     return output
     return logits, expert_weights, expert_indices
+def _get_device_mesh(model):
+    """Extract device_mesh from child's unused pre_hook closure for EP support."""
+    try:
+        hook = next(
+            h
+            for h in model.experts._forward_pre_hooks.values()
+            if "device_mesh" in h.__code__.co_freevars
+        )
+        return hook.__closure__[
+            hook.__code__.co_freevars.index("device_mesh")
+        ].cell_contents
+    except Exception:
+        return None
 class MegaBlocksMoeMLP(torch.nn.Module):
     can_torch_compile: bool = True
             self.experts, "normalize_expert_weights", None
         )
+        # Get EP (Expert Parallelism) parameters
+        ep_size = 1
+        ep_rank = 0
+        expert_parallel_group = getattr(self, "expert_parallel_group", None)
+        if expert_parallel_group is None:
+            device_mesh = _get_device_mesh(self)
+            if device_mesh is not None:
+                expert_parallel_group = device_mesh.get_group()
+        if expert_parallel_group is not None:
+            import torch.distributed as dist
+            if dist.is_initialized():
+                ep_size = dist.get_world_size(expert_parallel_group)
+                ep_rank = dist.get_rank(expert_parallel_group)
+        # Number of experts on this rank
+        num_experts_on_rank = moe_num_experts // ep_size
         # Detect activation type - check for GptOss-style swigluoai activation
         # GptOssExperts has alpha and limit attributes for swigluoai
         if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
             topk_ids=expert_indices,
             n_experts_per_token=moe_top_k,
             activation=activation,
+            num_experts=num_experts_on_rank,
+            ep_rank=ep_rank,
+            ep_size=ep_size,
             is_fp8=is_fp8,
             is_int4=is_int4,
             is_mxfp4=is_mxfp4,
         )
+        # All-reduce across EP group to combine partial expert outputs
+        if ep_size > 1 and expert_parallel_group is not None:
+            import torch.distributed as dist
+            dist.all_reduce(output, op=dist.ReduceOp.SUM, group=expert_parallel_group)
         # Restore original shape
         output = output.view(in_shape)

build/torch29-cxx11-cu128-aarch64-linux/{_megablocks_cuda_dd32462.abi3.so → _megablocks_cuda_6e04dec.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:64e119261f728c64bc971e5f9017b472ddaae621ee0527fabcea6b8e6dd7f815
 size 21085456

 version https://git-lfs.github.com/spec/v1
+oid sha256:81684a3eed6a7fb374cdbba3cf65f1cd46f5392ddc6d4992d37186c3b15f5734
 size 21085456

build/torch29-cxx11-cu128-aarch64-linux/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_cuda_dd32462
-ops = torch.ops._megablocks_cuda_dd32462
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_cuda_dd32462::{op_name}"

 import torch
+from . import _megablocks_cuda_6e04dec
+ops = torch.ops._megablocks_cuda_6e04dec
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_megablocks_cuda_6e04dec::{op_name}"

build/torch29-cxx11-cu128-aarch64-linux/xpu_fused_moe.py CHANGED Viewed

@@ -31,12 +31,12 @@ def _register_xpu_fake_kernels():
     _register_if_available(
         "fused_moe_prologue",
-        lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, num_experts_on_rank: None,
     )
     _register_if_available(
         "moe_gather",
-        lambda output, moe_output, topk_weights, unpermuted_row_to_permuted_row, num_experts: None,
     )
     _register_if_available(
@@ -202,6 +202,8 @@ def xpu_fused_moe(hidden_states,
                   n_experts_per_token,
                   activation,
                   num_experts,
                   is_fp8=False,
                   is_int4=False,
                   is_mxfp4=False):
@@ -329,7 +331,7 @@ def xpu_fused_moe(hidden_states,
     config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
     config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
-    workspace = torch.empty(map_offset,
                             dtype=torch.uint8,
                             device=hidden_states.device)
     if topk_ids.dtype == torch.int32:
@@ -341,6 +343,8 @@ def xpu_fused_moe(hidden_states,
         workspace=workspace,
         hidden_size=hidden_size,
         inter_size=inter_size,
         num_experts_on_rank=num_experts_per_node)
     expert_first_token_offset_bytes = workspace[
@@ -351,6 +355,10 @@ def xpu_fused_moe(hidden_states,
         ws_map["unpermuted_row_to_permuted_row"][1]:
         ws_map["unpermuted_row_to_permuted_row"][1] +
         src_to_dest_map_size]
     if torch.compiler.is_compiling():
         expert_first_token_offset = _bytes_to_typed_tensor(
@@ -359,9 +367,13 @@ def xpu_fused_moe(hidden_states,
         unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
             unpermuted_row_to_permuted_row_bytes, torch.int32
         )
     else:
         expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
         unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
     gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
                             ws_map["overlapped_gemm1_gemm2_inputs"][1] +
                             permuted_data_size].view(hidden_states.dtype).view(
@@ -451,7 +463,9 @@ def xpu_fused_moe(hidden_states,
             is_B_mxfp4=is_mxfp4)
     ops.moe_gather(output, gemm2_output, topk_weights,
                                 unpermuted_row_to_permuted_row,
                                 num_experts_per_node)
     return output
@@ -500,6 +514,21 @@ def route_tokens_xpu(
     return logits, expert_weights, expert_indices
 class MegaBlocksMoeMLP(torch.nn.Module):
     can_torch_compile: bool = True
@@ -524,6 +553,23 @@ class MegaBlocksMoeMLP(torch.nn.Module):
             self.experts, "normalize_expert_weights", None
         )
         # Detect activation type - check for GptOss-style swigluoai activation
         # GptOssExperts has alpha and limit attributes for swigluoai
         if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
@@ -598,12 +644,19 @@ class MegaBlocksMoeMLP(torch.nn.Module):
             topk_ids=expert_indices,
             n_experts_per_token=moe_top_k,
             activation=activation,
-            num_experts=moe_num_experts,
             is_fp8=is_fp8,
             is_int4=is_int4,
             is_mxfp4=is_mxfp4,
         )
         # Restore original shape
         output = output.view(in_shape)

     _register_if_available(
         "fused_moe_prologue",
+        lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, ep_rank, ep_size, num_experts_on_rank: None,
     )
     _register_if_available(
         "moe_gather",
+        lambda output, moe_output, topk_weights, permuted_row_to_unpermuted_row, unpermuted_row_to_permuted_row, expert_first_token_offset, num_experts: None,
     )
     _register_if_available(
                   n_experts_per_token,
                   activation,
                   num_experts,
+                  ep_rank=0,
+                  ep_size=1,
                   is_fp8=False,
                   is_int4=False,
                   is_mxfp4=False):
     config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
     config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
+    workspace = torch.zeros(map_offset,
                             dtype=torch.uint8,
                             device=hidden_states.device)
     if topk_ids.dtype == torch.int32:
         workspace=workspace,
         hidden_size=hidden_size,
         inter_size=inter_size,
+        ep_rank=ep_rank,
+        ep_size=ep_size,
         num_experts_on_rank=num_experts_per_node)
     expert_first_token_offset_bytes = workspace[
         ws_map["unpermuted_row_to_permuted_row"][1]:
         ws_map["unpermuted_row_to_permuted_row"][1] +
         src_to_dest_map_size]
+    permuted_row_to_unpermuted_row_bytes = workspace[
+        ws_map["permuted_row_to_unpermuted_row"][1]:
+        ws_map["permuted_row_to_unpermuted_row"][1] +
+        permuted_row_to_unpermuted_row_size]
     if torch.compiler.is_compiling():
         expert_first_token_offset = _bytes_to_typed_tensor(
         unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
             unpermuted_row_to_permuted_row_bytes, torch.int32
         )
+        permuted_row_to_unpermuted_row = _bytes_to_typed_tensor(
+            permuted_row_to_unpermuted_row_bytes, torch.int32
+        )
     else:
         expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
         unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
+        permuted_row_to_unpermuted_row = permuted_row_to_unpermuted_row_bytes.view(torch.int32)
     gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
                             ws_map["overlapped_gemm1_gemm2_inputs"][1] +
                             permuted_data_size].view(hidden_states.dtype).view(
             is_B_mxfp4=is_mxfp4)
     ops.moe_gather(output, gemm2_output, topk_weights,
+                                permuted_row_to_unpermuted_row,
                                 unpermuted_row_to_permuted_row,
+                                expert_first_token_offset,
                                 num_experts_per_node)
     return output
     return logits, expert_weights, expert_indices
+def _get_device_mesh(model):
+    """Extract device_mesh from child's unused pre_hook closure for EP support."""
+    try:
+        hook = next(
+            h
+            for h in model.experts._forward_pre_hooks.values()
+            if "device_mesh" in h.__code__.co_freevars
+        )
+        return hook.__closure__[
+            hook.__code__.co_freevars.index("device_mesh")
+        ].cell_contents
+    except Exception:
+        return None
 class MegaBlocksMoeMLP(torch.nn.Module):
     can_torch_compile: bool = True
             self.experts, "normalize_expert_weights", None
         )
+        # Get EP (Expert Parallelism) parameters
+        ep_size = 1
+        ep_rank = 0
+        expert_parallel_group = getattr(self, "expert_parallel_group", None)
+        if expert_parallel_group is None:
+            device_mesh = _get_device_mesh(self)
+            if device_mesh is not None:
+                expert_parallel_group = device_mesh.get_group()
+        if expert_parallel_group is not None:
+            import torch.distributed as dist
+            if dist.is_initialized():
+                ep_size = dist.get_world_size(expert_parallel_group)
+                ep_rank = dist.get_rank(expert_parallel_group)
+        # Number of experts on this rank
+        num_experts_on_rank = moe_num_experts // ep_size
         # Detect activation type - check for GptOss-style swigluoai activation
         # GptOssExperts has alpha and limit attributes for swigluoai
         if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
             topk_ids=expert_indices,
             n_experts_per_token=moe_top_k,
             activation=activation,
+            num_experts=num_experts_on_rank,
+            ep_rank=ep_rank,
+            ep_size=ep_size,
             is_fp8=is_fp8,
             is_int4=is_int4,
             is_mxfp4=is_mxfp4,
         )
+        # All-reduce across EP group to combine partial expert outputs
+        if ep_size > 1 and expert_parallel_group is not None:
+            import torch.distributed as dist
+            dist.all_reduce(output, op=dist.ReduceOp.SUM, group=expert_parallel_group)
         # Restore original shape
         output = output.view(in_shape)

build/torch29-cxx11-cu130-aarch64-linux/{_megablocks_cuda_dd32462.abi3.so → _megablocks_cuda_6e04dec.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6ee72001370d93ef391b01bca733d227ba8dad92eb99ce7cb51fd97d5589f0ac
 size 12070448

 version https://git-lfs.github.com/spec/v1
+oid sha256:8669b2a5cf6f36ab1d6c518040d4f4e2874d7b1c5880b4424d21f89c60e77c5f
 size 12070448

build/torch29-cxx11-cu130-aarch64-linux/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_cuda_dd32462
-ops = torch.ops._megablocks_cuda_dd32462
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_cuda_dd32462::{op_name}"

 import torch
+from . import _megablocks_cuda_6e04dec
+ops = torch.ops._megablocks_cuda_6e04dec
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_megablocks_cuda_6e04dec::{op_name}"

build/torch29-cxx11-cu130-aarch64-linux/xpu_fused_moe.py CHANGED Viewed

@@ -31,12 +31,12 @@ def _register_xpu_fake_kernels():
     _register_if_available(
         "fused_moe_prologue",
-        lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, num_experts_on_rank: None,
     )
     _register_if_available(
         "moe_gather",
-        lambda output, moe_output, topk_weights, unpermuted_row_to_permuted_row, num_experts: None,
     )
     _register_if_available(
@@ -202,6 +202,8 @@ def xpu_fused_moe(hidden_states,
                   n_experts_per_token,
                   activation,
                   num_experts,
                   is_fp8=False,
                   is_int4=False,
                   is_mxfp4=False):
@@ -329,7 +331,7 @@ def xpu_fused_moe(hidden_states,
     config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
     config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
-    workspace = torch.empty(map_offset,
                             dtype=torch.uint8,
                             device=hidden_states.device)
     if topk_ids.dtype == torch.int32:
@@ -341,6 +343,8 @@ def xpu_fused_moe(hidden_states,
         workspace=workspace,
         hidden_size=hidden_size,
         inter_size=inter_size,
         num_experts_on_rank=num_experts_per_node)
     expert_first_token_offset_bytes = workspace[
@@ -351,6 +355,10 @@ def xpu_fused_moe(hidden_states,
         ws_map["unpermuted_row_to_permuted_row"][1]:
         ws_map["unpermuted_row_to_permuted_row"][1] +
         src_to_dest_map_size]
     if torch.compiler.is_compiling():
         expert_first_token_offset = _bytes_to_typed_tensor(
@@ -359,9 +367,13 @@ def xpu_fused_moe(hidden_states,
         unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
             unpermuted_row_to_permuted_row_bytes, torch.int32
         )
     else:
         expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
         unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
     gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
                             ws_map["overlapped_gemm1_gemm2_inputs"][1] +
                             permuted_data_size].view(hidden_states.dtype).view(
@@ -451,7 +463,9 @@ def xpu_fused_moe(hidden_states,
             is_B_mxfp4=is_mxfp4)
     ops.moe_gather(output, gemm2_output, topk_weights,
                                 unpermuted_row_to_permuted_row,
                                 num_experts_per_node)
     return output
@@ -500,6 +514,21 @@ def route_tokens_xpu(
     return logits, expert_weights, expert_indices
 class MegaBlocksMoeMLP(torch.nn.Module):
     can_torch_compile: bool = True
@@ -524,6 +553,23 @@ class MegaBlocksMoeMLP(torch.nn.Module):
             self.experts, "normalize_expert_weights", None
         )
         # Detect activation type - check for GptOss-style swigluoai activation
         # GptOssExperts has alpha and limit attributes for swigluoai
         if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
@@ -598,12 +644,19 @@ class MegaBlocksMoeMLP(torch.nn.Module):
             topk_ids=expert_indices,
             n_experts_per_token=moe_top_k,
             activation=activation,
-            num_experts=moe_num_experts,
             is_fp8=is_fp8,
             is_int4=is_int4,
             is_mxfp4=is_mxfp4,
         )
         # Restore original shape
         output = output.view(in_shape)

     _register_if_available(
         "fused_moe_prologue",
+        lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, ep_rank, ep_size, num_experts_on_rank: None,
     )
     _register_if_available(
         "moe_gather",
+        lambda output, moe_output, topk_weights, permuted_row_to_unpermuted_row, unpermuted_row_to_permuted_row, expert_first_token_offset, num_experts: None,
     )
     _register_if_available(
                   n_experts_per_token,
                   activation,
                   num_experts,
+                  ep_rank=0,
+                  ep_size=1,
                   is_fp8=False,
                   is_int4=False,
                   is_mxfp4=False):
     config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
     config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
+    workspace = torch.zeros(map_offset,
                             dtype=torch.uint8,
                             device=hidden_states.device)
     if topk_ids.dtype == torch.int32:
         workspace=workspace,
         hidden_size=hidden_size,
         inter_size=inter_size,
+        ep_rank=ep_rank,
+        ep_size=ep_size,
         num_experts_on_rank=num_experts_per_node)
     expert_first_token_offset_bytes = workspace[
         ws_map["unpermuted_row_to_permuted_row"][1]:
         ws_map["unpermuted_row_to_permuted_row"][1] +
         src_to_dest_map_size]
+    permuted_row_to_unpermuted_row_bytes = workspace[
+        ws_map["permuted_row_to_unpermuted_row"][1]:
+        ws_map["permuted_row_to_unpermuted_row"][1] +
+        permuted_row_to_unpermuted_row_size]
     if torch.compiler.is_compiling():
         expert_first_token_offset = _bytes_to_typed_tensor(
         unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
             unpermuted_row_to_permuted_row_bytes, torch.int32
         )
+        permuted_row_to_unpermuted_row = _bytes_to_typed_tensor(
+            permuted_row_to_unpermuted_row_bytes, torch.int32
+        )
     else:
         expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
         unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
+        permuted_row_to_unpermuted_row = permuted_row_to_unpermuted_row_bytes.view(torch.int32)
     gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
                             ws_map["overlapped_gemm1_gemm2_inputs"][1] +
                             permuted_data_size].view(hidden_states.dtype).view(
             is_B_mxfp4=is_mxfp4)
     ops.moe_gather(output, gemm2_output, topk_weights,
+                                permuted_row_to_unpermuted_row,
                                 unpermuted_row_to_permuted_row,
+                                expert_first_token_offset,
                                 num_experts_per_node)
     return output
     return logits, expert_weights, expert_indices
+def _get_device_mesh(model):
+    """Extract device_mesh from child's unused pre_hook closure for EP support."""
+    try:
+        hook = next(
+            h
+            for h in model.experts._forward_pre_hooks.values()
+            if "device_mesh" in h.__code__.co_freevars
+        )
+        return hook.__closure__[
+            hook.__code__.co_freevars.index("device_mesh")
+        ].cell_contents
+    except Exception:
+        return None
 class MegaBlocksMoeMLP(torch.nn.Module):
     can_torch_compile: bool = True
             self.experts, "normalize_expert_weights", None
         )
+        # Get EP (Expert Parallelism) parameters
+        ep_size = 1
+        ep_rank = 0
+        expert_parallel_group = getattr(self, "expert_parallel_group", None)
+        if expert_parallel_group is None:
+            device_mesh = _get_device_mesh(self)
+            if device_mesh is not None:
+                expert_parallel_group = device_mesh.get_group()
+        if expert_parallel_group is not None:
+            import torch.distributed as dist
+            if dist.is_initialized():
+                ep_size = dist.get_world_size(expert_parallel_group)
+                ep_rank = dist.get_rank(expert_parallel_group)
+        # Number of experts on this rank
+        num_experts_on_rank = moe_num_experts // ep_size
         # Detect activation type - check for GptOss-style swigluoai activation
         # GptOssExperts has alpha and limit attributes for swigluoai
         if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
             topk_ids=expert_indices,
             n_experts_per_token=moe_top_k,
             activation=activation,
+            num_experts=num_experts_on_rank,
+            ep_rank=ep_rank,
+            ep_size=ep_size,
             is_fp8=is_fp8,
             is_int4=is_int4,
             is_mxfp4=is_mxfp4,
         )
+        # All-reduce across EP group to combine partial expert outputs
+        if ep_size > 1 and expert_parallel_group is not None:
+            import torch.distributed as dist
+            dist.all_reduce(output, op=dist.ReduceOp.SUM, group=expert_parallel_group)
         # Restore original shape
         output = output.view(in_shape)