Kernels: kernels-community/sonic-moe

Trusted publisher

Kernel card · Files and versions

xet

Community

danieldk (HF Staff) committed on Apr 17

Commit

a36a00c

verified ·

1 Parent(s): fd7de3f

Uploaded using `kernel-builder`.

Browse files

Files changed (5) hide show

build/torch-cuda/_ops.py +2 -2
build/torch-cuda/functional/__init__.py +9 -1
build/torch-cuda/functional/backward.py +14 -12
build/torch-cuda/functional/forward.py +7 -6
build/torch-cuda/functional/moe_config.py +41 -3

build/torch-cuda/_ops.py CHANGED Viewed

@@ -1,8 +1,8 @@
 import torch
-ops = torch.ops._sonic_moe_57a1b31
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_sonic_moe_57a1b31::{op_name}"

 import torch
+ops = torch.ops._sonic_moe_75daa46
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_sonic_moe_75daa46::{op_name}"

build/torch-cuda/functional/__init__.py CHANGED Viewed

@@ -70,6 +70,7 @@ class _UpProjection(torch.autograd.Function):
         is_varlen_K: bool,
         activation_type: ActivationType,
         is_inference_mode_enabled: bool,
     ) -> torch.Tensor:
         T, H = x.shape
         I, H, E = w1.shape
@@ -105,6 +106,7 @@ class _UpProjection(torch.autograd.Function):
                 activation_type=activation_type.value,
                 is_glu_activation=is_glu_activation,
                 is_inference_mode_enabled=is_inference_mode_enabled,
             )
         ctx.T = T
@@ -115,6 +117,7 @@ class _UpProjection(torch.autograd.Function):
         ctx.I = I
         ctx.is_varlen_K = is_varlen_K
         ctx.is_glu_activation = is_glu_activation
         ctx.stream_id = stream_id
         ctx.save_for_backward(
@@ -146,6 +149,7 @@ class _UpProjection(torch.autograd.Function):
         K = ctx.K
         H = ctx.H
         is_glu_activation = ctx.is_glu_activation
         is_varlen_K = ctx.is_varlen_K
         stream_id = ctx.stream_id
@@ -190,6 +194,7 @@ class _UpProjection(torch.autograd.Function):
                 s_scatter_idx=s_scatter_idx,
                 is_glu_activation=is_glu_activation,
                 stream_id=stream_id,
             )
             _up_projection_backward_weight(
@@ -201,6 +206,7 @@ class _UpProjection(torch.autograd.Function):
                 x_gather_idx=x_gather_idx,
                 is_glu_activation=is_glu_activation,
                 stream_id=stream_id,
             )
         dx_reduced = torch.empty(T, H, dtype=dz.dtype, device=dz.device)
@@ -215,7 +221,7 @@ class _UpProjection(torch.autograd.Function):
             is_varlen_K=is_varlen_K,
         )
-        return dx_reduced, dw1, db1, *[None] * 12
 class _DownProjection(torch.autograd.Function):
@@ -486,6 +492,7 @@ def moe_general_routing_inputs(
     stream_id: int,
     activation_type: ActivationType,
     is_inference_mode_enabled: bool = False,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     assert ((b1 is None) and (b2 is None)) or (
         (b1 is not None) and (b2 is not None)
@@ -531,6 +538,7 @@ def moe_general_routing_inputs(
         True,  # is_varlen_K
         activation_type,
         is_inference_mode_enabled,
     )
     o = _DownProjection.apply(

         is_varlen_K: bool,
         activation_type: ActivationType,
         is_inference_mode_enabled: bool,
+        is_concatenated_gate_up: bool = False,
     ) -> torch.Tensor:
         T, H = x.shape
         I, H, E = w1.shape
                 activation_type=activation_type.value,
                 is_glu_activation=is_glu_activation,
                 is_inference_mode_enabled=is_inference_mode_enabled,
+                is_concatenated_gate_up=is_concatenated_gate_up,
             )
         ctx.T = T
         ctx.I = I
         ctx.is_varlen_K = is_varlen_K
         ctx.is_glu_activation = is_glu_activation
+        ctx.is_concatenated_gate_up = is_concatenated_gate_up
         ctx.stream_id = stream_id
         ctx.save_for_backward(
         K = ctx.K
         H = ctx.H
         is_glu_activation = ctx.is_glu_activation
+        is_concatenated_gate_up = ctx.is_concatenated_gate_up
         is_varlen_K = ctx.is_varlen_K
         stream_id = ctx.stream_id
                 s_scatter_idx=s_scatter_idx,
                 is_glu_activation=is_glu_activation,
                 stream_id=stream_id,
+                is_concatenated_gate_up=is_concatenated_gate_up,
             )
             _up_projection_backward_weight(
                 x_gather_idx=x_gather_idx,
                 is_glu_activation=is_glu_activation,
                 stream_id=stream_id,
+                is_concatenated_gate_up=is_concatenated_gate_up,
             )
         dx_reduced = torch.empty(T, H, dtype=dz.dtype, device=dz.device)
             is_varlen_K=is_varlen_K,
         )
+        return dx_reduced, dw1, db1, *[None] * 13
 class _DownProjection(torch.autograd.Function):
     stream_id: int,
     activation_type: ActivationType,
     is_inference_mode_enabled: bool = False,
+    is_concatenated_gate_up: bool = False,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     assert ((b1 is None) and (b2 is None)) or (
         (b1 is not None) and (b2 is not None)
         True,  # is_varlen_K
         activation_type,
         is_inference_mode_enabled,
+        is_concatenated_gate_up,
     )
     o = _DownProjection.apply(

build/torch-cuda/functional/backward.py CHANGED Viewed

@@ -206,6 +206,7 @@ def _up_projection_backward_act(
     s_scatter_idx: torch.Tensor,
     is_glu_activation: bool,
     stream_id: int,
 ) -> None:
     I, H, E = w1.size()
     if is_glu_activation:
@@ -228,9 +229,9 @@ def _up_projection_backward_act(
         mE_permute_order = convert_torch_tensor_to_cute_tensor(expert_schedule_order, (0,), 0, 4, 1, stream=stream_id)
     current_stream = cuda.CUstream(stream_id)
-    compile_dx_key = ("dx", E, H, I, is_glu_activation, dx_expanded.dtype)
     if compile_dx_key not in _up_projection_backward_act.compile_cache:
-        dx_module = HopperWgmma_MoE_Up_proj_ActGrad_Bwd(E, H, I, is_glu_activation)
         tensormaps = [dx_module.module.generate_tensormap(None, None, None) for _ in range(2)]
         _up_projection_backward_act.compile_cache[compile_dx_key] = cute.compile(
             dx_module,
@@ -244,9 +245,9 @@ def _up_projection_backward_act(
             mE_permute_order,
             current_stream,
         )
-        _up_projection_backward_act.compile_cache[f"dx-{TENSORMAP}"] = tensormaps
-    dx_tensormaps = _up_projection_backward_act.compile_cache[f"dx-{TENSORMAP}"]
     _up_projection_backward_act.compile_cache[compile_dx_key](
         mDz,
         mW1_trans,
@@ -273,6 +274,7 @@ def _up_projection_backward_weight(
     x_gather_idx: torch.Tensor,
     is_glu_activation: bool,
     stream_id: int,
 ) -> None:
     I, H, E = dw1.size()
     if is_glu_activation:
@@ -293,9 +295,9 @@ def _up_projection_backward_weight(
         mE_permute_order = convert_torch_tensor_to_cute_tensor(expert_schedule_order, (0,), 0, 4, 1, stream=stream_id)
     current_stream = cuda.CUstream(stream_id)
-    compile_dw1_key = ("dw1", E, H, I, is_glu_activation, x.dtype)
     if compile_dw1_key not in _up_projection_backward_weight.compile_cache:
-        dw1_module = HopperWgmma_MoE_Up_proj_WeightGrad_Bwd(E, H, I, is_glu_activation)
         tensormaps = [dw1_module.module.generate_tensormap(None, None, None) for _ in range(1)]
         _up_projection_backward_weight.compile_cache[compile_dw1_key] = cute.compile(
             dw1_module,
@@ -308,9 +310,9 @@ def _up_projection_backward_weight(
             mE_permute_order,
             current_stream,
         )
-        _up_projection_backward_weight.compile_cache[f"dw1-{TENSORMAP}"] = tensormaps
-    dw1_tensormaps = _up_projection_backward_weight.compile_cache[f"dw1-{TENSORMAP}"]
     _up_projection_backward_weight.compile_cache[compile_dw1_key](
         mX_trans,
         mDz_trans,
@@ -406,14 +408,14 @@ def _down_projection_backward_act(
             mE_permute_order,
             current_stream,
         )
-        _down_projection_backward_act.compile_cache[f"dz-{TENSORMAP}"] = tensormaps
     if ds_partial is None:
         ds_partial_N = _down_projection_backward_act.compile_cache["ds_partial_N"]
         ds_partial = torch.empty(TK, ds_partial_N, dtype=torch.float32, device=topk_scores.device)
         mDS_partial = convert_torch_tensor_to_cute_tensor(ds_partial, (0, 1), 1, 4, 1, stream=stream_id)
-    dz_tensormaps = _down_projection_backward_act.compile_cache[f"dz-{TENSORMAP}"]
     _down_projection_backward_act.compile_cache[compile_dz_key](
         mDout,
         mW2_trans,
@@ -520,9 +522,9 @@ def _down_projection_backward_weight(
             mE_permute_order,
             current_stream,
         )
-        _down_projection_backward_weight.compile_cache[f"dw2-{TENSORMAP}"] = tensormaps
-    dw2_tensormaps = _down_projection_backward_weight.compile_cache[f"dw2-{TENSORMAP}"]
     _down_projection_backward_weight.compile_cache[compile_dw2_key](
         mDout_trans, mY1S_trans, mDw2, mE_offset, mX_gather, dw2_tensormaps, mE_permute_order, current_stream
     )

     s_scatter_idx: torch.Tensor,
     is_glu_activation: bool,
     stream_id: int,
+    is_concatenated_gate_up: bool = False,
 ) -> None:
     I, H, E = w1.size()
     if is_glu_activation:
         mE_permute_order = convert_torch_tensor_to_cute_tensor(expert_schedule_order, (0,), 0, 4, 1, stream=stream_id)
     current_stream = cuda.CUstream(stream_id)
+    compile_dx_key = ("dx", E, H, I, is_glu_activation, dx_expanded.dtype, is_concatenated_gate_up)
     if compile_dx_key not in _up_projection_backward_act.compile_cache:
+        dx_module = HopperWgmma_MoE_Up_proj_ActGrad_Bwd(E, H, I, is_glu_activation, is_concatenated_gate_up=is_concatenated_gate_up)
         tensormaps = [dx_module.module.generate_tensormap(None, None, None) for _ in range(2)]
         _up_projection_backward_act.compile_cache[compile_dx_key] = cute.compile(
             dx_module,
             mE_permute_order,
             current_stream,
         )
+        _up_projection_backward_act.compile_cache[(TENSORMAP, compile_dx_key)] = tensormaps
+    dx_tensormaps = _up_projection_backward_act.compile_cache[(TENSORMAP, compile_dx_key)]
     _up_projection_backward_act.compile_cache[compile_dx_key](
         mDz,
         mW1_trans,
     x_gather_idx: torch.Tensor,
     is_glu_activation: bool,
     stream_id: int,
+    is_concatenated_gate_up: bool = False,
 ) -> None:
     I, H, E = dw1.size()
     if is_glu_activation:
         mE_permute_order = convert_torch_tensor_to_cute_tensor(expert_schedule_order, (0,), 0, 4, 1, stream=stream_id)
     current_stream = cuda.CUstream(stream_id)
+    compile_dw1_key = ("dw1", E, H, I, is_glu_activation, x.dtype, is_concatenated_gate_up)
     if compile_dw1_key not in _up_projection_backward_weight.compile_cache:
+        dw1_module = HopperWgmma_MoE_Up_proj_WeightGrad_Bwd(E, H, I, is_glu_activation, is_concatenated_gate_up=is_concatenated_gate_up)
         tensormaps = [dw1_module.module.generate_tensormap(None, None, None) for _ in range(1)]
         _up_projection_backward_weight.compile_cache[compile_dw1_key] = cute.compile(
             dw1_module,
             mE_permute_order,
             current_stream,
         )
+        _up_projection_backward_weight.compile_cache[(TENSORMAP, compile_dw1_key)] = tensormaps
+    dw1_tensormaps = _up_projection_backward_weight.compile_cache[(TENSORMAP, compile_dw1_key)]
     _up_projection_backward_weight.compile_cache[compile_dw1_key](
         mX_trans,
         mDz_trans,
             mE_permute_order,
             current_stream,
         )
+        _down_projection_backward_act.compile_cache[(TENSORMAP, compile_dz_key)] = tensormaps
     if ds_partial is None:
         ds_partial_N = _down_projection_backward_act.compile_cache["ds_partial_N"]
         ds_partial = torch.empty(TK, ds_partial_N, dtype=torch.float32, device=topk_scores.device)
         mDS_partial = convert_torch_tensor_to_cute_tensor(ds_partial, (0, 1), 1, 4, 1, stream=stream_id)
+    dz_tensormaps = _down_projection_backward_act.compile_cache[(TENSORMAP, compile_dz_key)]
     _down_projection_backward_act.compile_cache[compile_dz_key](
         mDout,
         mW2_trans,
             mE_permute_order,
             current_stream,
         )
+        _down_projection_backward_weight.compile_cache[(TENSORMAP, compile_dw2_key)] = tensormaps
+    dw2_tensormaps = _down_projection_backward_weight.compile_cache[(TENSORMAP, compile_dw2_key)]
     _down_projection_backward_weight.compile_cache[compile_dw2_key](
         mDout_trans, mY1S_trans, mDw2, mE_offset, mX_gather, dw2_tensormaps, mE_permute_order, current_stream
     )

build/torch-cuda/functional/forward.py CHANGED Viewed

@@ -65,6 +65,7 @@ def _up_projection_forward(
     activation_type: str,
     is_glu_activation: bool,
     is_inference_mode_enabled: bool = False,
 ) -> None:
     I, H, E = w1.size()
     if is_glu_activation:
@@ -89,10 +90,10 @@ def _up_projection_forward(
     current_stream = cuda.CUstream(stream_id)
-    compile_w1_key = (E, H, I, (b1 is None), x.dtype, activation_type, is_inference_mode_enabled)
     if compile_w1_key not in _up_projection_forward.compile_cache:
         w1_module = HopperWgmma_MoE_Up_proj_Fwd(
-            E, H, I, activation_type=ActivationType(activation_type), inference_mode=is_inference_mode_enabled
         )
         tensormaps = [w1_module.module.generate_tensormap(None, None, None) for _ in range(2)]
         _up_projection_forward.compile_cache[compile_w1_key] = cute.compile(
@@ -109,9 +110,9 @@ def _up_projection_forward(
             mE_permute_order,
             current_stream,
         )
-        _up_projection_forward.compile_cache[TENSORMAP] = tensormaps
-    w1_tensormaps = _up_projection_forward.compile_cache[TENSORMAP]
     _up_projection_forward.compile_cache[compile_w1_key](
         mX,
         mW1,
@@ -168,9 +169,9 @@ def _down_projection_forward(
         _down_projection_forward.compile_cache[compile_w2_key] = cute.compile(
             w2_module, mY1, mW2, mY2, mB2, mE_offset, mX_gather, tensormaps[0], mE_permute_order, current_stream
         )
-        _down_projection_forward.compile_cache[TENSORMAP] = tensormaps
-    w2_tensormaps = _down_projection_forward.compile_cache[TENSORMAP]
     _down_projection_forward.compile_cache[compile_w2_key](
         mY1, mW2, mY2, mB2, mE_offset, mX_gather, w2_tensormaps[0], mE_permute_order, current_stream
     )

     activation_type: str,
     is_glu_activation: bool,
     is_inference_mode_enabled: bool = False,
+    is_concatenated_gate_up: bool = False,
 ) -> None:
     I, H, E = w1.size()
     if is_glu_activation:
     current_stream = cuda.CUstream(stream_id)
+    compile_w1_key = (E, H, I, (b1 is None), x.dtype, activation_type, is_inference_mode_enabled, is_concatenated_gate_up)
     if compile_w1_key not in _up_projection_forward.compile_cache:
         w1_module = HopperWgmma_MoE_Up_proj_Fwd(
+            E, H, I, activation_type=ActivationType(activation_type), inference_mode=is_inference_mode_enabled, is_concatenated_gate_up=is_concatenated_gate_up,
         )
         tensormaps = [w1_module.module.generate_tensormap(None, None, None) for _ in range(2)]
         _up_projection_forward.compile_cache[compile_w1_key] = cute.compile(
             mE_permute_order,
             current_stream,
         )
+        _up_projection_forward.compile_cache[(TENSORMAP, compile_w1_key)] = tensormaps
+    w1_tensormaps = _up_projection_forward.compile_cache[(TENSORMAP, compile_w1_key)]
     _up_projection_forward.compile_cache[compile_w1_key](
         mX,
         mW1,
         _down_projection_forward.compile_cache[compile_w2_key] = cute.compile(
             w2_module, mY1, mW2, mY2, mB2, mE_offset, mX_gather, tensormaps[0], mE_permute_order, current_stream
         )
+        _down_projection_forward.compile_cache[(TENSORMAP, compile_w2_key)] = tensormaps
+    w2_tensormaps = _down_projection_forward.compile_cache[(TENSORMAP, compile_w2_key)]
     _down_projection_forward.compile_cache[compile_w2_key](
         mY1, mW2, mY2, mB2, mE_offset, mX_gather, w2_tensormaps[0], mE_permute_order, current_stream
     )

build/torch-cuda/functional/moe_config.py CHANGED Viewed

@@ -37,9 +37,10 @@ class HopperGEMMConfig:
 class HopperWgmma_MoE_Up_proj_Fwd:
-    def __init__(self, E: int, H: int, I: int, activation_type: ActivationType, inference_mode=False):
         super().__init__()
         is_glu_activation = is_glu(activation_type)
         if is_glu_activation:
             assert (
                 H % 64 == 0 and H >= 512 and I % 64 == 0
@@ -127,6 +128,18 @@ class HopperWgmma_MoE_Up_proj_Fwd:
     def __call__(
         self, mX, mW1, mZ, mY1, mB1, mE_offset, mX_gather, mD_tensormap, mY1_tensormap, mE_permute_order, stream
     ):
         return self.module(
             mX,
             mW1,
@@ -424,7 +437,8 @@ class HopperWgmma_MoE_Down_proj_WeightGrad_Bwd:
 class HopperWgmma_MoE_Up_proj_ActGrad_Bwd:
-    def __init__(self, E: int, H: int, I: int, is_glu_activation: bool):
         super().__init__()
         if is_glu_activation:
             assert (
@@ -478,6 +492,17 @@ class HopperWgmma_MoE_Up_proj_ActGrad_Bwd:
     def __call__(
         self, mDz, mW1_trans, mDx_expanded, mE_offset, mX_gather, mS_scatter, tensormaps, mE_permute_order, stream
     ):
         return self.module(
             mDz,
             mW1_trans,
@@ -504,7 +529,8 @@ class HopperWgmma_MoE_Up_proj_ActGrad_Bwd:
 class HopperWgmma_MoE_Up_proj_WeightGrad_Bwd:
-    def __init__(self, E: int, H: int, I: int, is_glu_activation: bool):
         super().__init__()
         if is_glu_activation:
             assert (
@@ -556,6 +582,18 @@ class HopperWgmma_MoE_Up_proj_WeightGrad_Bwd:
     @cute.jit
     def __call__(self, mX_trans, mDz_trans, mDw1_trans, mE_offset, mX_gather, tensormaps, mE_permute_order, stream):
         return self.module(
             mX_trans,
             mDz_trans,

 class HopperWgmma_MoE_Up_proj_Fwd:
+    def __init__(self, E: int, H: int, I: int, activation_type: ActivationType, inference_mode=False, is_concatenated_gate_up: bool = False):
         super().__init__()
         is_glu_activation = is_glu(activation_type)
+        self.is_concatenated_gate_up = is_concatenated_gate_up
         if is_glu_activation:
             assert (
                 H % 64 == 0 and H >= 512 and I % 64 == 0
     def __call__(
         self, mX, mW1, mZ, mY1, mB1, mE_offset, mX_gather, mD_tensormap, mY1_tensormap, mE_permute_order, stream
     ):
+        if const_expr(self.is_concatenated_gate_up):
+            # mW1 is (2*I, H, E) concatenated [gate; up]. Reshape N dim to ((2, I))
+            # so TMA reads interleaved pairs from the two halves.
+            half_N = mW1.shape[0] // 2
+            mW1 = cute.make_tensor(
+                mW1.iterator,
+                cute.make_layout(
+                    ((2, half_N), mW1.shape[1], mW1.shape[2]),
+                    stride=((half_N * mW1.stride[0], mW1.stride[0]), mW1.stride[1], mW1.stride[2]),
+                ),
+            )
         return self.module(
             mX,
             mW1,
 class HopperWgmma_MoE_Up_proj_ActGrad_Bwd:
+    def __init__(self, E: int, H: int, I: int, is_glu_activation: bool, is_concatenated_gate_up: bool = False):
+        self.is_concatenated_gate_up = is_concatenated_gate_up
         super().__init__()
         if is_glu_activation:
             assert (
     def __call__(
         self, mDz, mW1_trans, mDx_expanded, mE_offset, mX_gather, mS_scatter, tensormaps, mE_permute_order, stream
     ):
+        if const_expr(self.is_concatenated_gate_up):
+            # mW1_trans is (H, 2*I, E) with concatenated N dim (dim 1).
+            # Reshape dim 1 to ((2, I)) so TMA reads interleaved from concatenated memory.
+            half_N = mW1_trans.shape[1] // 2
+            mW1_trans = cute.make_tensor(
+                mW1_trans.iterator,
+                cute.make_layout(
+                    (mW1_trans.shape[0], (2, half_N), mW1_trans.shape[2]),
+                    stride=(mW1_trans.stride[0], (half_N * mW1_trans.stride[1], mW1_trans.stride[1]), mW1_trans.stride[2]),
+                ),
+            )
         return self.module(
             mDz,
             mW1_trans,
 class HopperWgmma_MoE_Up_proj_WeightGrad_Bwd:
+    def __init__(self, E: int, H: int, I: int, is_glu_activation: bool, is_concatenated_gate_up: bool = False):
+        self.is_concatenated_gate_up = is_concatenated_gate_up
         super().__init__()
         if is_glu_activation:
             assert (
     @cute.jit
     def __call__(self, mX_trans, mDz_trans, mDw1_trans, mE_offset, mX_gather, tensormaps, mE_permute_order, stream):
+        if const_expr(self.is_concatenated_gate_up):
+            # mDw1_trans is (H, 2*I, E) — output in concatenated layout.
+            # Reshape dim 1 to ((2, I)) so GEMM writes interleaved results
+            # to the correct concatenated memory positions.
+            half_N = mDw1_trans.shape[1] // 2
+            mDw1_trans = cute.make_tensor(
+                mDw1_trans.iterator,
+                cute.make_layout(
+                    (mDw1_trans.shape[0], (2, half_N), mDw1_trans.shape[2]),
+                    stride=(mDw1_trans.stride[0], (half_N * mDw1_trans.stride[1], mDw1_trans.stride[1]), mDw1_trans.stride[2]),
+                ),
+            )
         return self.module(
             mX_trans,
             mDz_trans,