Kernels
danieldk (HF Staff) committed
Commit 1a70d6e · verified · 1 parent: af99866

Build uploaded using `kernels`.

Files changed (30)
  1. build/torch210-cxx11-cpu-x86_64-linux/{_megablocks_cpu_dd32462.abi3.so β†’ _megablocks_cpu_6e04dec.abi3.so} +1 -1
  2. build/torch210-cxx11-cpu-x86_64-linux/_ops.py +3 -3
  3. build/torch210-cxx11-cpu-x86_64-linux/xpu_fused_moe.py +57 -4
  4. build/torch210-cxx11-cu126-x86_64-linux/{_megablocks_cuda_dd32462.abi3.so β†’ _megablocks_cuda_6e04dec.abi3.so} +1 -1
  5. build/torch210-cxx11-cu126-x86_64-linux/_ops.py +3 -3
  6. build/torch210-cxx11-cu126-x86_64-linux/xpu_fused_moe.py +57 -4
  7. build/torch210-cxx11-cu128-x86_64-linux/{_megablocks_cuda_dd32462.abi3.so β†’ _megablocks_cuda_6e04dec.abi3.so} +1 -1
  8. build/torch210-cxx11-cu128-x86_64-linux/_ops.py +3 -3
  9. build/torch210-cxx11-cu128-x86_64-linux/xpu_fused_moe.py +57 -4
  10. build/torch210-cxx11-cu130-x86_64-linux/{_megablocks_cuda_dd32462.abi3.so β†’ _megablocks_cuda_6e04dec.abi3.so} +1 -1
  11. build/torch210-cxx11-cu130-x86_64-linux/_ops.py +3 -3
  12. build/torch210-cxx11-cu130-x86_64-linux/xpu_fused_moe.py +57 -4
  13. build/torch210-cxx11-xpu20253-x86_64-linux/{_megablocks_xpu_dd32462.abi3.so β†’ _megablocks_xpu_6e04dec.abi3.so} +2 -2
  14. build/torch210-cxx11-xpu20253-x86_64-linux/_ops.py +3 -3
  15. build/torch210-cxx11-xpu20253-x86_64-linux/xpu_fused_moe.py +57 -4
  16. build/torch29-cxx11-cpu-x86_64-linux/{_megablocks_cpu_dd32462.abi3.so β†’ _megablocks_cpu_6e04dec.abi3.so} +1 -1
  17. build/torch29-cxx11-cpu-x86_64-linux/_ops.py +3 -3
  18. build/torch29-cxx11-cpu-x86_64-linux/xpu_fused_moe.py +57 -4
  19. build/torch29-cxx11-cu126-x86_64-linux/{_megablocks_cuda_dd32462.abi3.so β†’ _megablocks_cuda_6e04dec.abi3.so} +1 -1
  20. build/torch29-cxx11-cu126-x86_64-linux/_ops.py +3 -3
  21. build/torch29-cxx11-cu126-x86_64-linux/xpu_fused_moe.py +57 -4
  22. build/torch29-cxx11-cu128-x86_64-linux/{_megablocks_cuda_dd32462.abi3.so β†’ _megablocks_cuda_6e04dec.abi3.so} +1 -1
  23. build/torch29-cxx11-cu128-x86_64-linux/_ops.py +3 -3
  24. build/torch29-cxx11-cu128-x86_64-linux/xpu_fused_moe.py +57 -4
  25. build/torch29-cxx11-cu130-x86_64-linux/{_megablocks_cuda_dd32462.abi3.so β†’ _megablocks_cuda_6e04dec.abi3.so} +1 -1
  26. build/torch29-cxx11-cu130-x86_64-linux/_ops.py +3 -3
  27. build/torch29-cxx11-cu130-x86_64-linux/xpu_fused_moe.py +57 -4
  28. build/torch29-cxx11-xpu20252-x86_64-linux/{_megablocks_xpu_dd32462.abi3.so β†’ _megablocks_xpu_6e04dec.abi3.so} +2 -2
  29. build/torch29-cxx11-xpu20252-x86_64-linux/_ops.py +3 -3
  30. build/torch29-cxx11-xpu20252-x86_64-linux/xpu_fused_moe.py +57 -4
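Each path above names a prebuilt variant keyed by Torch version, C++ ABI, compute backend, and platform (e.g. `torch29-cxx11-cu126-x86_64-linux`). A minimal consumer-side sketch, assuming the repo is loaded with the Hugging Face `kernels` library (the repo id below is a placeholder; this page does not name the repository):

```python
# Hedged sketch: loading a kernel uploaded with the `kernels` toolchain.
# get_kernel() selects the build/ variant matching the local Torch version,
# C++ ABI, and accelerator, then imports it as a Python module.
from kernels import get_kernel

megablocks = get_kernel("<org>/megablocks")  # placeholder repo id

# The loaded module exposes the Python surface shown in this commit, e.g.
# MegaBlocksMoeMLP from xpu_fused_moe.py and the ops handle from _ops.py.
```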
build/torch210-cxx11-cpu-x86_64-linux/{_megablocks_cpu_dd32462.abi3.so β†’ _megablocks_cpu_6e04dec.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2d456fb003229668826de9c88890c3bd0af4b2bdc313140017b75e3ed2e553c8
+oid sha256:70b79b772262fee7ee79153a54dc208c9166f4c34680f752b7bc2ce8d8ae1f74
 size 2219080
build/torch210-cxx11-cpu-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_cpu_dd32462
-ops = torch.ops._megablocks_cpu_dd32462
+from . import _megablocks_cpu_6e04dec
+ops = torch.ops._megablocks_cpu_6e04dec
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_cpu_dd32462::{op_name}"
+    return f"_megablocks_cpu_6e04dec::{op_name}"
build/torch210-cxx11-cpu-x86_64-linux/xpu_fused_moe.py CHANGED
@@ -31,12 +31,12 @@ def _register_xpu_fake_kernels():
 
     _register_if_available(
         "fused_moe_prologue",
-        lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, num_experts_on_rank: None,
+        lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, ep_rank, ep_size, num_experts_on_rank: None,
     )
 
     _register_if_available(
         "moe_gather",
-        lambda output, moe_output, topk_weights, unpermuted_row_to_permuted_row, num_experts: None,
+        lambda output, moe_output, topk_weights, permuted_row_to_unpermuted_row, unpermuted_row_to_permuted_row, expert_first_token_offset, num_experts: None,
     )
 
     _register_if_available(
@@ -202,6 +202,8 @@ def xpu_fused_moe(hidden_states,
                   n_experts_per_token,
                   activation,
                   num_experts,
+                  ep_rank=0,
+                  ep_size=1,
                   is_fp8=False,
                   is_int4=False,
                   is_mxfp4=False):
@@ -329,7 +331,7 @@ def xpu_fused_moe(hidden_states,
     config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
     config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
 
-    workspace = torch.empty(map_offset,
+    workspace = torch.zeros(map_offset,
                             dtype=torch.uint8,
                             device=hidden_states.device)
     if topk_ids.dtype == torch.int32:
@@ -341,6 +343,8 @@ def xpu_fused_moe(hidden_states,
                             workspace=workspace,
                             hidden_size=hidden_size,
                             inter_size=inter_size,
+                            ep_rank=ep_rank,
+                            ep_size=ep_size,
                             num_experts_on_rank=num_experts_per_node)
 
     expert_first_token_offset_bytes = workspace[
@@ -351,6 +355,10 @@ def xpu_fused_moe(hidden_states,
         ws_map["unpermuted_row_to_permuted_row"][1]:
         ws_map["unpermuted_row_to_permuted_row"][1] +
         src_to_dest_map_size]
+    permuted_row_to_unpermuted_row_bytes = workspace[
+        ws_map["permuted_row_to_unpermuted_row"][1]:
+        ws_map["permuted_row_to_unpermuted_row"][1] +
+        permuted_row_to_unpermuted_row_size]
 
     if torch.compiler.is_compiling():
         expert_first_token_offset = _bytes_to_typed_tensor(
@@ -359,9 +367,13 @@ def xpu_fused_moe(hidden_states,
         unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
             unpermuted_row_to_permuted_row_bytes, torch.int32
         )
+        permuted_row_to_unpermuted_row = _bytes_to_typed_tensor(
+            permuted_row_to_unpermuted_row_bytes, torch.int32
+        )
     else:
         expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
         unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
+        permuted_row_to_unpermuted_row = permuted_row_to_unpermuted_row_bytes.view(torch.int32)
     gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
                             ws_map["overlapped_gemm1_gemm2_inputs"][1] +
                             permuted_data_size].view(hidden_states.dtype).view(
@@ -451,7 +463,9 @@ def xpu_fused_moe(hidden_states,
                       is_B_mxfp4=is_mxfp4)
 
     ops.moe_gather(output, gemm2_output, topk_weights,
+                   permuted_row_to_unpermuted_row,
                    unpermuted_row_to_permuted_row,
+                   expert_first_token_offset,
                    num_experts_per_node)
     return output
 
@@ -500,6 +514,21 @@ def route_tokens_xpu(
     return logits, expert_weights, expert_indices
 
 
+def _get_device_mesh(model):
+    """Extract device_mesh from child's unused pre_hook closure for EP support."""
+    try:
+        hook = next(
+            h
+            for h in model.experts._forward_pre_hooks.values()
+            if "device_mesh" in h.__code__.co_freevars
+        )
+        return hook.__closure__[
+            hook.__code__.co_freevars.index("device_mesh")
+        ].cell_contents
+    except Exception:
+        return None
+
+
 class MegaBlocksMoeMLP(torch.nn.Module):
     can_torch_compile: bool = True
 
@@ -524,6 +553,23 @@ class MegaBlocksMoeMLP(torch.nn.Module):
             self.experts, "normalize_expert_weights", None
         )
 
+        # Get EP (Expert Parallelism) parameters
+        ep_size = 1
+        ep_rank = 0
+        expert_parallel_group = getattr(self, "expert_parallel_group", None)
+        if expert_parallel_group is None:
+            device_mesh = _get_device_mesh(self)
+            if device_mesh is not None:
+                expert_parallel_group = device_mesh.get_group()
+        if expert_parallel_group is not None:
+            import torch.distributed as dist
+            if dist.is_initialized():
+                ep_size = dist.get_world_size(expert_parallel_group)
+                ep_rank = dist.get_rank(expert_parallel_group)
+
+        # Number of experts on this rank
+        num_experts_on_rank = moe_num_experts // ep_size
+
         # Detect activation type - check for GptOss-style swigluoai activation
         # GptOssExperts has alpha and limit attributes for swigluoai
         if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
@@ -598,12 +644,19 @@ class MegaBlocksMoeMLP(torch.nn.Module):
             topk_ids=expert_indices,
             n_experts_per_token=moe_top_k,
             activation=activation,
-            num_experts=moe_num_experts,
+            num_experts=num_experts_on_rank,
+            ep_rank=ep_rank,
+            ep_size=ep_size,
             is_fp8=is_fp8,
             is_int4=is_int4,
             is_mxfp4=is_mxfp4,
         )
 
+        # All-reduce across EP group to combine partial expert outputs
+        if ep_size > 1 and expert_parallel_group is not None:
+            import torch.distributed as dist
+            dist.all_reduce(output, op=dist.ReduceOp.SUM, group=expert_parallel_group)
+
         # Restore original shape
         output = output.view(in_shape)
 
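Two of the additions above deserve a note. `_get_device_mesh` recovers a `DeviceMesh` from a forward pre-hook via CPython closure introspection: a function's captured names are listed in `__code__.co_freevars`, and their values sit in the `__closure__` cells at the same indices. A self-contained toy of just that mechanism (the hook and mesh are stand-ins, not the actual Transformers pre-hook):

```python
# Hedged sketch of the closure introspection used by _get_device_mesh.
# Toy stand-ins only: no real DeviceMesh or forward pre-hook is required.

def make_pre_hook(device_mesh):
    def hook(module, args):
        # Merely referencing device_mesh makes it a free variable of `hook`,
        # so it is listed in hook.__code__.co_freevars and its value is kept
        # alive in the matching hook.__closure__ cell.
        _ = device_mesh
        return args
    return hook

mesh = object()  # stand-in for torch.distributed.device_mesh.DeviceMesh
hook = make_pre_hook(mesh)

assert "device_mesh" in hook.__code__.co_freevars
idx = hook.__code__.co_freevars.index("device_mesh")
recovered = hook.__closure__[idx].cell_contents
assert recovered is mesh
```

The EP combine then follows from each rank computing partial sums over only its local experts, so the added `dist.all_reduce(..., ReduceOp.SUM)` over the EP group reassembles the full MoE output.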
build/torch210-cxx11-cu126-x86_64-linux/{_megablocks_cuda_dd32462.abi3.so β†’ _megablocks_cuda_6e04dec.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a86d0f456034ce972bae452ab95b8fb8fcf24e015985f74678fca4b673fc50dc
+oid sha256:55948eae893317a5e500315e47efd66c4482bb67449caef3f512b2cabffb7dc6
 size 15061056
build/torch210-cxx11-cu126-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_cuda_dd32462
-ops = torch.ops._megablocks_cuda_dd32462
+from . import _megablocks_cuda_6e04dec
+ops = torch.ops._megablocks_cuda_6e04dec
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_cuda_dd32462::{op_name}"
+    return f"_megablocks_cuda_6e04dec::{op_name}"
build/torch210-cxx11-cu126-x86_64-linux/xpu_fused_moe.py CHANGED
(same diff as build/torch210-cxx11-cpu-x86_64-linux/xpu_fused_moe.py above)
build/torch210-cxx11-cu128-x86_64-linux/{_megablocks_cuda_dd32462.abi3.so β†’ _megablocks_cuda_6e04dec.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:000b7e99c0d0afc09167bb819cb7c4de4f2f3de7136744020d58a9f0fd51a24a
+oid sha256:6e66fd44576448dc82e7392db0c935cd8654bfcb51db51ddc044e1c33bc82c60
 size 21009984
build/torch210-cxx11-cu128-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_cuda_dd32462
-ops = torch.ops._megablocks_cuda_dd32462
+from . import _megablocks_cuda_6e04dec
+ops = torch.ops._megablocks_cuda_6e04dec
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_cuda_dd32462::{op_name}"
+    return f"_megablocks_cuda_6e04dec::{op_name}"
build/torch210-cxx11-cu128-x86_64-linux/xpu_fused_moe.py CHANGED
(same diff as build/torch210-cxx11-cpu-x86_64-linux/xpu_fused_moe.py above)
build/torch210-cxx11-cu130-x86_64-linux/{_megablocks_cuda_dd32462.abi3.so β†’ _megablocks_cuda_6e04dec.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e79c036505112f43afa8658e1e04dcc5bf536a297560fd03e7070ebbe21e2b54
+oid sha256:4ed503a781293a9d6150e0362edbe9360ef6e58590b511ee23596649ee9a437d
 size 12041592
build/torch210-cxx11-cu130-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_cuda_dd32462
-ops = torch.ops._megablocks_cuda_dd32462
+from . import _megablocks_cuda_6e04dec
+ops = torch.ops._megablocks_cuda_6e04dec
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_cuda_dd32462::{op_name}"
+    return f"_megablocks_cuda_6e04dec::{op_name}"
build/torch210-cxx11-cu130-x86_64-linux/xpu_fused_moe.py CHANGED
(same diff as build/torch210-cxx11-cpu-x86_64-linux/xpu_fused_moe.py above)
build/torch210-cxx11-xpu20253-x86_64-linux/{_megablocks_xpu_dd32462.abi3.so β†’ _megablocks_xpu_6e04dec.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1eafe2cfbec6a1c65fc7b523e3abdc3270f2e85b4f4bc64b88b9aeb41698484c
-size 5331960
+oid sha256:46cfa6050944b0bd6daeaf4848fe5393a68397ae29a5c7f0a04280e287cb0e7d
+size 5381760
build/torch210-cxx11-xpu20253-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_xpu_dd32462
-ops = torch.ops._megablocks_xpu_dd32462
+from . import _megablocks_xpu_6e04dec
+ops = torch.ops._megablocks_xpu_6e04dec
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_xpu_dd32462::{op_name}"
+    return f"_megablocks_xpu_6e04dec::{op_name}"
build/torch210-cxx11-xpu20253-x86_64-linux/xpu_fused_moe.py CHANGED
(same diff as build/torch210-cxx11-cpu-x86_64-linux/xpu_fused_moe.py above)
build/torch29-cxx11-cpu-x86_64-linux/{_megablocks_cpu_dd32462.abi3.so β†’ _megablocks_cpu_6e04dec.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e97831d51919986a68bcb622208e131d3b66fa4a83b99da941b77708dd522edc
+oid sha256:18348238274eb1b281afe628b09ca6a4a5b8267370aaed7bf34a2bd91c9b815b
 size 2201200
build/torch29-cxx11-cpu-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_cpu_dd32462
-ops = torch.ops._megablocks_cpu_dd32462
+from . import _megablocks_cpu_6e04dec
+ops = torch.ops._megablocks_cpu_6e04dec
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_cpu_dd32462::{op_name}"
+    return f"_megablocks_cpu_6e04dec::{op_name}"
build/torch29-cxx11-cpu-x86_64-linux/xpu_fused_moe.py CHANGED
@@ -31,12 +31,12 @@ def _register_xpu_fake_kernels():
31
 
32
  _register_if_available(
33
  "fused_moe_prologue",
34
- lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, num_experts_on_rank: None,
35
  )
36
 
37
  _register_if_available(
38
  "moe_gather",
39
- lambda output, moe_output, topk_weights, unpermuted_row_to_permuted_row, num_experts: None,
40
  )
41
 
42
  _register_if_available(
@@ -202,6 +202,8 @@ def xpu_fused_moe(hidden_states,
202
  n_experts_per_token,
203
  activation,
204
  num_experts,
 
 
205
  is_fp8=False,
206
  is_int4=False,
207
  is_mxfp4=False):
@@ -329,7 +331,7 @@ def xpu_fused_moe(hidden_states,
329
  config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
330
  config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
331
 
332
- workspace = torch.empty(map_offset,
333
  dtype=torch.uint8,
334
  device=hidden_states.device)
335
  if topk_ids.dtype == torch.int32:
@@ -341,6 +343,8 @@ def xpu_fused_moe(hidden_states,
341
  workspace=workspace,
342
  hidden_size=hidden_size,
343
  inter_size=inter_size,
 
 
344
  num_experts_on_rank=num_experts_per_node)
345
 
346
  expert_first_token_offset_bytes = workspace[
@@ -351,6 +355,10 @@ def xpu_fused_moe(hidden_states,
351
  ws_map["unpermuted_row_to_permuted_row"][1]:
352
  ws_map["unpermuted_row_to_permuted_row"][1] +
353
  src_to_dest_map_size]
 
 
 
 
354
 
355
  if torch.compiler.is_compiling():
356
  expert_first_token_offset = _bytes_to_typed_tensor(
@@ -359,9 +367,13 @@ def xpu_fused_moe(hidden_states,
359
  unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
360
  unpermuted_row_to_permuted_row_bytes, torch.int32
361
  )
 
 
 
362
  else:
363
  expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
364
  unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
 
365
  gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
366
  ws_map["overlapped_gemm1_gemm2_inputs"][1] +
367
  permuted_data_size].view(hidden_states.dtype).view(
@@ -451,7 +463,9 @@ def xpu_fused_moe(hidden_states,
451
  is_B_mxfp4=is_mxfp4)
452
 
453
  ops.moe_gather(output, gemm2_output, topk_weights,
 
454
  unpermuted_row_to_permuted_row,
 
455
  num_experts_per_node)
456
  return output
457
 
@@ -500,6 +514,21 @@ def route_tokens_xpu(
500
  return logits, expert_weights, expert_indices
501
 
502
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
503
  class MegaBlocksMoeMLP(torch.nn.Module):
504
  can_torch_compile: bool = True
505
 
@@ -524,6 +553,23 @@ class MegaBlocksMoeMLP(torch.nn.Module):
524
  self.experts, "normalize_expert_weights", None
525
  )
526
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
527
  # Detect activation type - check for GptOss-style swigluoai activation
528
  # GptOssExperts has alpha and limit attributes for swigluoai
529
  if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
@@ -598,12 +644,19 @@ class MegaBlocksMoeMLP(torch.nn.Module):
598
  topk_ids=expert_indices,
599
  n_experts_per_token=moe_top_k,
600
  activation=activation,
601
- num_experts=moe_num_experts,
 
 
602
  is_fp8=is_fp8,
603
  is_int4=is_int4,
604
  is_mxfp4=is_mxfp4,
605
  )
606
 
 
 
 
 
 
607
  # Restore original shape
608
  output = output.view(in_shape)
609
 
 
31
 
32
  _register_if_available(
33
  "fused_moe_prologue",
34
+ lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, ep_rank, ep_size, num_experts_on_rank: None,
35
  )
36
 
37
  _register_if_available(
38
  "moe_gather",
39
+ lambda output, moe_output, topk_weights, permuted_row_to_unpermuted_row, unpermuted_row_to_permuted_row, expert_first_token_offset, num_experts: None,
40
  )
41
 
42
  _register_if_available(
 
202
  n_experts_per_token,
203
  activation,
204
  num_experts,
205
+ ep_rank=0,
206
+ ep_size=1,
207
  is_fp8=False,
208
  is_int4=False,
209
  is_mxfp4=False):
 
331
  config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
332
  config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
333
 
334
+ workspace = torch.zeros(map_offset,
335
  dtype=torch.uint8,
336
  device=hidden_states.device)
337
     if topk_ids.dtype == torch.int32:
@@ -341,6 +343,8 @@ def xpu_fused_moe(hidden_states,
         workspace=workspace,
         hidden_size=hidden_size,
         inter_size=inter_size,
+        ep_rank=ep_rank,
+        ep_size=ep_size,
         num_experts_on_rank=num_experts_per_node)
 
     expert_first_token_offset_bytes = workspace[
@@ -351,6 +355,10 @@ def xpu_fused_moe(hidden_states,
         ws_map["unpermuted_row_to_permuted_row"][1]:
         ws_map["unpermuted_row_to_permuted_row"][1] +
         src_to_dest_map_size]
+    permuted_row_to_unpermuted_row_bytes = workspace[
+        ws_map["permuted_row_to_unpermuted_row"][1]:
+        ws_map["permuted_row_to_unpermuted_row"][1] +
+        permuted_row_to_unpermuted_row_size]
 
     if torch.compiler.is_compiling():
         expert_first_token_offset = _bytes_to_typed_tensor(
@@ -359,9 +367,13 @@ def xpu_fused_moe(hidden_states,
         unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
             unpermuted_row_to_permuted_row_bytes, torch.int32
         )
+        permuted_row_to_unpermuted_row = _bytes_to_typed_tensor(
+            permuted_row_to_unpermuted_row_bytes, torch.int32
+        )
     else:
         expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
         unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
+        permuted_row_to_unpermuted_row = permuted_row_to_unpermuted_row_bytes.view(torch.int32)
     gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
                             ws_map["overlapped_gemm1_gemm2_inputs"][1] +
                             permuted_data_size].view(hidden_states.dtype).view(
@@ -451,7 +463,9 @@ def xpu_fused_moe(hidden_states,
                       is_B_mxfp4=is_mxfp4)
 
     ops.moe_gather(output, gemm2_output, topk_weights,
+                   permuted_row_to_unpermuted_row,
                    unpermuted_row_to_permuted_row,
+                   expert_first_token_offset,
                    num_experts_per_node)
     return output
 
@@ -500,6 +514,21 @@ def route_tokens_xpu(
     return logits, expert_weights, expert_indices
 
 
+def _get_device_mesh(model):
+    """Extract device_mesh from child's unused pre_hook closure for EP support."""
+    try:
+        hook = next(
+            h
+            for h in model.experts._forward_pre_hooks.values()
+            if "device_mesh" in h.__code__.co_freevars
+        )
+        return hook.__closure__[
+            hook.__code__.co_freevars.index("device_mesh")
+        ].cell_contents
+    except Exception:
+        return None
+
+
 class MegaBlocksMoeMLP(torch.nn.Module):
     can_torch_compile: bool = True
 
@@ -524,6 +553,23 @@ class MegaBlocksMoeMLP(torch.nn.Module):
             self.experts, "normalize_expert_weights", None
         )
 
+        # Get EP (Expert Parallelism) parameters
+        ep_size = 1
+        ep_rank = 0
+        expert_parallel_group = getattr(self, "expert_parallel_group", None)
+        if expert_parallel_group is None:
+            device_mesh = _get_device_mesh(self)
+            if device_mesh is not None:
+                expert_parallel_group = device_mesh.get_group()
+        if expert_parallel_group is not None:
+            import torch.distributed as dist
+            if dist.is_initialized():
+                ep_size = dist.get_world_size(expert_parallel_group)
+                ep_rank = dist.get_rank(expert_parallel_group)
+
+        # Number of experts on this rank
+        num_experts_on_rank = moe_num_experts // ep_size
+
         # Detect activation type - check for GptOss-style swigluoai activation
         # GptOssExperts has alpha and limit attributes for swigluoai
         if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
@@ -598,12 +644,19 @@ class MegaBlocksMoeMLP(torch.nn.Module):
             topk_ids=expert_indices,
             n_experts_per_token=moe_top_k,
             activation=activation,
-            num_experts=moe_num_experts,
+            num_experts=num_experts_on_rank,
+            ep_rank=ep_rank,
+            ep_size=ep_size,
             is_fp8=is_fp8,
             is_int4=is_int4,
             is_mxfp4=is_mxfp4,
         )
 
+        # All-reduce across EP group to combine partial expert outputs
+        if ep_size > 1 and expert_parallel_group is not None:
+            import torch.distributed as dist
+            dist.all_reduce(output, op=dist.ReduceOp.SUM, group=expert_parallel_group)
+
         # Restore original shape
         output = output.view(in_shape)
 
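The tail above shows the whole expert-parallel (EP) path: resolve a process group, derive ep_rank/ep_size from it, compute only the moe_num_experts // ep_size experts owned locally, and SUM-all-reduce the partial outputs. A minimal sketch of that combine step, assuming (as the zero-initialized workspace suggests) that rows routed to off-rank experts contribute zeros locally; the function names here are illustrative, not part of the kernel API:

import torch
import torch.distributed as dist

def local_expert_range(ep_rank: int, ep_size: int, num_experts: int) -> tuple[int, int]:
    # Contiguous shard per rank, matching num_experts_on_rank = moe_num_experts // ep_size.
    per_rank = num_experts // ep_size
    return ep_rank * per_rank, (ep_rank + 1) * per_rank

def ep_combine(partial_output: torch.Tensor, group=None) -> torch.Tensor:
    # Each rank holds the sum over its local experts only; off-rank rows are
    # zero, so a SUM all-reduce reconstructs the full MoE output in place.
    if dist.is_initialized() and dist.get_world_size(group=group) > 1:
        dist.all_reduce(partial_output, op=dist.ReduceOp.SUM, group=group)
    return partial_output

With 8 experts and ep_size=2, rank 0 owns experts 0-3 and rank 1 owns experts 4-7, and the all-reduce is the only cross-rank communication in the forward pass.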
build/torch29-cxx11-cu126-x86_64-linux/{_megablocks_cuda_dd32462.abi3.so → _megablocks_cuda_6e04dec.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9a66815ed63ba2be83feeae47d8255e2d72ce1cd5e6a0e9f92d063e2cb81a522
+oid sha256:fae42809a452f57bb4ef6967a397029f4e557ad73424c1b68fb613070dcd3f0d
 size 15046832
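The .so entries in this commit change only their Git LFS pointer files; the binaries are content-addressed by SHA-256, so a rebuild shows up as a one-line oid swap (plus a size change when the payload actually differs, as in the xpu20252 build below). The pointer format is small enough to reproduce with the standard library:

import hashlib

def lfs_pointer(path: str) -> str:
    # Reproduce the three-line Git LFS pointer that git stores in place of the binary.
    with open(path, "rb") as f:
        data = f.read()
    oid = hashlib.sha256(data).hexdigest()
    return (
        "version https://git-lfs.github.com/spec/v1\n"
        f"oid sha256:{oid}\n"
        f"size {len(data)}\n"
    )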
build/torch29-cxx11-cu126-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_cuda_dd32462
-ops = torch.ops._megablocks_cuda_dd32462
+from . import _megablocks_cuda_6e04dec
+ops = torch.ops._megablocks_cuda_6e04dec
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_cuda_dd32462::{op_name}"
+    return f"_megablocks_cuda_6e04dec::{op_name}"
build/torch29-cxx11-cu126-x86_64-linux/xpu_fused_moe.py CHANGED
@@ -31,12 +31,12 @@ def _register_xpu_fake_kernels():
 
     _register_if_available(
         "fused_moe_prologue",
-        lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, num_experts_on_rank: None,
+        lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, ep_rank, ep_size, num_experts_on_rank: None,
     )
 
     _register_if_available(
         "moe_gather",
-        lambda output, moe_output, topk_weights, unpermuted_row_to_permuted_row, num_experts: None,
+        lambda output, moe_output, topk_weights, permuted_row_to_unpermuted_row, unpermuted_row_to_permuted_row, expert_first_token_offset, num_experts: None,
     )
 
     _register_if_available(
@@ -202,6 +202,8 @@ def xpu_fused_moe(hidden_states,
                   n_experts_per_token,
                   activation,
                   num_experts,
+                  ep_rank=0,
+                  ep_size=1,
                   is_fp8=False,
                   is_int4=False,
                   is_mxfp4=False):
@@ -329,7 +331,7 @@ def xpu_fused_moe(hidden_states,
     config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
     config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
 
-    workspace = torch.empty(map_offset,
+    workspace = torch.zeros(map_offset,
                             dtype=torch.uint8,
                             device=hidden_states.device)
     if topk_ids.dtype == torch.int32:
@@ -341,6 +343,8 @@ def xpu_fused_moe(hidden_states,
         workspace=workspace,
         hidden_size=hidden_size,
         inter_size=inter_size,
+        ep_rank=ep_rank,
+        ep_size=ep_size,
         num_experts_on_rank=num_experts_per_node)
 
     expert_first_token_offset_bytes = workspace[
@@ -351,6 +355,10 @@ def xpu_fused_moe(hidden_states,
         ws_map["unpermuted_row_to_permuted_row"][1]:
         ws_map["unpermuted_row_to_permuted_row"][1] +
         src_to_dest_map_size]
+    permuted_row_to_unpermuted_row_bytes = workspace[
+        ws_map["permuted_row_to_unpermuted_row"][1]:
+        ws_map["permuted_row_to_unpermuted_row"][1] +
+        permuted_row_to_unpermuted_row_size]
 
     if torch.compiler.is_compiling():
         expert_first_token_offset = _bytes_to_typed_tensor(
@@ -359,9 +367,13 @@ def xpu_fused_moe(hidden_states,
         unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
             unpermuted_row_to_permuted_row_bytes, torch.int32
         )
+        permuted_row_to_unpermuted_row = _bytes_to_typed_tensor(
+            permuted_row_to_unpermuted_row_bytes, torch.int32
+        )
     else:
         expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
         unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
+        permuted_row_to_unpermuted_row = permuted_row_to_unpermuted_row_bytes.view(torch.int32)
     gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
                             ws_map["overlapped_gemm1_gemm2_inputs"][1] +
                             permuted_data_size].view(hidden_states.dtype).view(
@@ -451,7 +463,9 @@ def xpu_fused_moe(hidden_states,
                       is_B_mxfp4=is_mxfp4)
 
     ops.moe_gather(output, gemm2_output, topk_weights,
+                   permuted_row_to_unpermuted_row,
                    unpermuted_row_to_permuted_row,
+                   expert_first_token_offset,
                    num_experts_per_node)
     return output
 
@@ -500,6 +514,21 @@ def route_tokens_xpu(
     return logits, expert_weights, expert_indices
 
 
+def _get_device_mesh(model):
+    """Extract device_mesh from child's unused pre_hook closure for EP support."""
+    try:
+        hook = next(
+            h
+            for h in model.experts._forward_pre_hooks.values()
+            if "device_mesh" in h.__code__.co_freevars
+        )
+        return hook.__closure__[
+            hook.__code__.co_freevars.index("device_mesh")
+        ].cell_contents
+    except Exception:
+        return None
+
+
 class MegaBlocksMoeMLP(torch.nn.Module):
     can_torch_compile: bool = True
 
@@ -524,6 +553,23 @@ class MegaBlocksMoeMLP(torch.nn.Module):
             self.experts, "normalize_expert_weights", None
         )
 
+        # Get EP (Expert Parallelism) parameters
+        ep_size = 1
+        ep_rank = 0
+        expert_parallel_group = getattr(self, "expert_parallel_group", None)
+        if expert_parallel_group is None:
+            device_mesh = _get_device_mesh(self)
+            if device_mesh is not None:
+                expert_parallel_group = device_mesh.get_group()
+        if expert_parallel_group is not None:
+            import torch.distributed as dist
+            if dist.is_initialized():
+                ep_size = dist.get_world_size(expert_parallel_group)
+                ep_rank = dist.get_rank(expert_parallel_group)
+
+        # Number of experts on this rank
+        num_experts_on_rank = moe_num_experts // ep_size
+
         # Detect activation type - check for GptOss-style swigluoai activation
         # GptOssExperts has alpha and limit attributes for swigluoai
         if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
@@ -598,12 +644,19 @@ class MegaBlocksMoeMLP(torch.nn.Module):
             topk_ids=expert_indices,
             n_experts_per_token=moe_top_k,
             activation=activation,
-            num_experts=moe_num_experts,
+            num_experts=num_experts_on_rank,
+            ep_rank=ep_rank,
+            ep_size=ep_size,
             is_fp8=is_fp8,
             is_int4=is_int4,
             is_mxfp4=is_mxfp4,
         )
 
+        # All-reduce across EP group to combine partial expert outputs
+        if ep_size > 1 and expert_parallel_group is not None:
+            import torch.distributed as dist
+            dist.all_reduce(output, op=dist.ReduceOp.SUM, group=expert_parallel_group)
+
         # Restore original shape
         output = output.view(in_shape)
 
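Two workspace details in the diff above are easy to miss. The allocation switches from torch.empty to torch.zeros, presumably so the new row-mapping regions start in a defined state for rows this rank never writes; and every typed bookkeeping tensor (expert_first_token_offset and the two row maps) is carved out of that one flat uint8 buffer by byte offset and reinterpreted via Tensor.view(dtype), with _bytes_to_typed_tensor as the torch.compile-friendly spelling. A self-contained illustration of the zero-copy reinterpretation, with made-up offsets:

import torch

workspace = torch.zeros(64, dtype=torch.uint8)   # one flat byte buffer
offset, count = 16, 4                            # 4 int32 values == 16 bytes

int32_bytes = workspace[offset:offset + count * 4]
int32_view = int32_bytes.view(torch.int32)       # reinterpret the bytes, no copy

int32_view.fill_(7)                              # writes through to `workspace`
assert int(workspace[offset]) == 7               # low byte first on little-endian x86_64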
build/torch29-cxx11-cu128-x86_64-linux/{_megablocks_cuda_dd32462.abi3.so → _megablocks_cuda_6e04dec.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:da033fc8fa10230b35ddd1d0a45dc29bc44462739d0bb70ac7373cf5864b6634
+oid sha256:0349d7de015576f9dae76f82c321d491609d1ae84bc5f2cb8053891e167a0aca
 size 20995704
build/torch29-cxx11-cu128-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_cuda_dd32462
-ops = torch.ops._megablocks_cuda_dd32462
+from . import _megablocks_cuda_6e04dec
+ops = torch.ops._megablocks_cuda_6e04dec
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_cuda_dd32462::{op_name}"
+    return f"_megablocks_cuda_6e04dec::{op_name}"
build/torch29-cxx11-cu128-x86_64-linux/xpu_fused_moe.py CHANGED
(Diff identical to the build/torch29-cxx11-cu126-x86_64-linux/xpu_fused_moe.py diff shown above; the same expert-parallel changes apply to every per-build copy of this file.)
build/torch29-cxx11-cu130-x86_64-linux/{_megablocks_cuda_dd32462.abi3.so → _megablocks_cuda_6e04dec.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5e7d5b3f6e60dd4e7a0ada343f1665ccb8d5daf8b535808b9f455efe022a2783
+oid sha256:1e1383adbf7afa208f0769d84a826fcd43de9ee9ce39d676ebce97698759c526
 size 12031416
build/torch29-cxx11-cu130-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_cuda_dd32462
-ops = torch.ops._megablocks_cuda_dd32462
+from . import _megablocks_cuda_6e04dec
+ops = torch.ops._megablocks_cuda_6e04dec
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_cuda_dd32462::{op_name}"
+    return f"_megablocks_cuda_6e04dec::{op_name}"
build/torch29-cxx11-cu130-x86_64-linux/xpu_fused_moe.py CHANGED
(Diff identical to the build/torch29-cxx11-cu126-x86_64-linux/xpu_fused_moe.py diff shown above.)
build/torch29-cxx11-xpu20252-x86_64-linux/{_megablocks_xpu_dd32462.abi3.so → _megablocks_xpu_6e04dec.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b8c5866635254310bb5a96138b0c7e78dc4509730ada6eb0a4c7d8d112c0585e
-size 5192232
+oid sha256:02442f31668da97521b3301b613a9acaa3478b83bfe838213ec690f7412c0157
+size 5197008
build/torch29-cxx11-xpu20252-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_xpu_dd32462
-ops = torch.ops._megablocks_xpu_dd32462
+from . import _megablocks_xpu_6e04dec
+ops = torch.ops._megablocks_xpu_6e04dec
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_xpu_dd32462::{op_name}"
+    return f"_megablocks_xpu_6e04dec::{op_name}"
build/torch29-cxx11-xpu20252-x86_64-linux/xpu_fused_moe.py CHANGED
(Diff identical to the build/torch29-cxx11-cu126-x86_64-linux/xpu_fused_moe.py diff shown above.)
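Finally, the _get_device_mesh helper that each copy of this file gains deserves a note: there is no public attribute to read, so it recovers the device_mesh that an upstream integration captured inside a forward-pre-hook closure, by scanning __code__.co_freevars and reading the matching __closure__ cell, with the except Exception fallback degrading gracefully to single-rank behavior (ep_size = 1). A self-contained demo of the mechanism; the toy module and stand-in mesh object are hypothetical, where real code would find a torch.distributed DeviceMesh captured by someone else's hook:

import torch

class Experts(torch.nn.Module):
    def forward(self, x):
        return x

experts = Experts()
device_mesh = object()  # stand-in for a DeviceMesh

def make_hook(device_mesh):
    def pre_hook(module, args):
        _ = device_mesh  # captured in the closure, never otherwise used
    return pre_hook

experts.register_forward_pre_hook(make_hook(device_mesh))

# The extraction trick used by _get_device_mesh:
hook = next(
    h for h in experts._forward_pre_hooks.values()
    if "device_mesh" in h.__code__.co_freevars
)
recovered = hook.__closure__[
    hook.__code__.co_freevars.index("device_mesh")
].cell_contents
assert recovered is device_mesh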