danieldk (HF Staff) committed
Commit b6db418 · 1 Parent(s): e0e5abe

Revert "Build uploaded using `kernels`."

This reverts commit e0e5abe72d54407c849a04974181545bf03eb02a.

build/torch-cuda/__init__.py DELETED
@@ -1,13 +0,0 @@
- from .parallel_experts import flatten_sort_count, parallel_linear, ParallelExperts
- from . import parallel_experts
- from . import kernels
- from . import layers
-
- __all__ = [
-     "flatten_sort_count",
-     "parallel_linear",
-     "ParallelExperts",
-     "parallel_experts",
-     "kernels",
-     "layers"
- ]
build/torch-cuda/_ops.py DELETED
@@ -1,8 +0,0 @@
- import torch
- ops = torch.ops._scattermoe_05b9d77
-
- def add_op_namespace_prefix(op_name: str):
-     """
-     Prefix op by namespace.
-     """
-     return f"_scattermoe_05b9d77::{op_name}"
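
Note: the deleted helper simply prepends the build-specific namespace. A minimal sketch of what it produces (the op name "gather" is purely illustrative, not an op this build defines):

    qualified = add_op_namespace_prefix("gather")
    assert qualified == "_scattermoe_05b9d77::gather"
    # Strings of this form are how torch.library APIs refer to a registered op.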
build/torch-cuda/kernels/__init__.py DELETED
@@ -1,3 +0,0 @@
- from . import ops
-
- __all__ = ["ops"]
build/torch-cuda/kernels/ops.py DELETED
@@ -1,457 +0,0 @@
- import torch
- import triton
- import triton.language as tl
- from typing import Optional
-
- BLOCK_M = 128
- ALLOW_TF32 = True
-
-
-
- @triton.jit
- def _compute_expert_block(
-     E_idx, E_mask,
-     M_in_idx,
-     N_block, N_mask,
-     X_ptr, stride_xm, stride_xk,
-     W_ptr, stride_we, stride_wk, stride_wn,
-     K,
-     acc,
-     no_k_mask,
-     BLOCK_K,
-     allow_tf32=True,
- ):
-
-     K_block = tl.arange(0, BLOCK_K)
-     X_blk_ptrs = X_ptr + M_in_idx[:, None] * stride_xm + K_block[None, :] * stride_xk
-     W_blk_ptrs = W_ptr + K_block[:, None] * stride_wk + N_block[None, :] * stride_wn + E_idx * stride_we
-     iters = tl.cdiv(K, BLOCK_K)
-
-     for K_block_id in range(iters):
-         if no_k_mask:
-             x = tl.load(X_blk_ptrs, mask=E_mask[:, None])
-             w = tl.load(W_blk_ptrs, mask=N_mask[None, :])
-         else:
-             K_mask = (K_block_id * BLOCK_K + K_block) < K
-             x = tl.load(X_blk_ptrs, mask=E_mask[:, None] & K_mask[None, :])
-             w = tl.load(W_blk_ptrs, mask=K_mask[:, None] & N_mask[None, :])
-
-         X_blk_ptrs += BLOCK_K * stride_xk
-         W_blk_ptrs += BLOCK_K * stride_wk
-         acc = tl.dot(x, w, acc, allow_tf32=allow_tf32)
-     return acc
-
-
- def _scatter2scatter_configs():
-     return [
-         triton.Config({'BLOCK_N': 128, 'BLOCK_K': 32}, num_stages=4, num_warps=4),
-     ]
-
- @triton.autotune(configs=_scatter2scatter_configs(), key=['M', 'N', 'K'], )
- @triton.heuristics({
-     "NO_K_MASK": lambda args: (args['K'] % args['BLOCK_K']) == 0,
-     "NO_N_MASK": lambda args: (args['N'] % args['BLOCK_N']) == 0,
- })
- @triton.jit
- def _scatter2scatter(
-     X_ptr, stride_xm: tl.constexpr, stride_xk: tl.constexpr,
-     W_ptr, stride_we, stride_wk: tl.constexpr, stride_wn: tl.constexpr,
-     Y_ptr, stride_ym: tl.constexpr, stride_yn: tl.constexpr,
-     B_ptr, stride_be: tl.constexpr, stride_bn: tl.constexpr,
-     grouped_idx_ptr, expert_idxs_ptr,
-     # block_start_idx_ptr,
-     FAN_OUT: tl.constexpr,
-     M, K: tl.constexpr, N: tl.constexpr, E: tl.constexpr,
-     BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
-     ACC_TYPE: tl.constexpr,
-     # OUT_M,
-     allow_tf32: tl.constexpr,
-     x_grouped: tl.constexpr, y_grouped: tl.constexpr,
-     NO_K_MASK: tl.constexpr, NO_N_MASK: tl.constexpr
- ):
-     pid = tl.program_id(axis=0)
-
-     N_BLOCK_COUNT = tl.cdiv(N, BLOCK_N)
-     M_block_id = pid // N_BLOCK_COUNT
-     N_block_id = pid % N_BLOCK_COUNT
-
-     M_block = M_block_id * BLOCK_M + tl.arange(0, BLOCK_M)
-     N_block = N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)
-     N_mask = N_block < N
-     M_boundary_mask = M_block < (FAN_OUT * M)
-     E_idxs = tl.load(expert_idxs_ptr + M_block, mask=M_boundary_mask, other=E)
-
-     no_k_mask = K % BLOCK_K == 0
-
-     acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
-     E_first_idx = tl.min(E_idxs)
-     E_last_idx = tl.minimum(tl.max(E_idxs), E - 1)
-     M_idx = tl.load(grouped_idx_ptr + M_block, mask=M_boundary_mask).to(tl.int32)
-     for E_idx in range(E_first_idx, E_last_idx + 1):
-         E_mask = E_idxs == E_idx
-         E_M_idx = M_idx
-         if x_grouped:
-             M_in_idx = M_block
-         else:
-             M_in_idx = E_M_idx // FAN_OUT
-         acc = _compute_expert_block(
-             E_idx, E_mask,
-             M_in_idx, N_block, N_mask,
-             X_ptr, stride_xm, stride_xk,
-             W_ptr, stride_we, stride_wk, stride_wn,
-             K,
-             acc,
-             no_k_mask,
-             BLOCK_K,
-             allow_tf32=allow_tf32,
-         )
-
-     if B_ptr is not None:
-         B_blk_ptrs = B_ptr + E_idxs[:, None] * stride_be + N_block[None, :] * stride_bn
-         acc += tl.load(B_blk_ptrs, mask=M_boundary_mask[:, None] & N_mask[None, :])
-
-     if y_grouped:
-         M_out_idx = M_block
-     else:
-         M_out_idx = M_idx
-     Y_blk_ptrs = Y_ptr + (M_out_idx[:, None] * stride_ym + N_block[None, :] * stride_yn)
-     tl.store(Y_blk_ptrs, acc, mask=M_boundary_mask[:, None] & N_mask[None, :])
-
- def scatter2scatter(X, W, sorted_expert_idxs, sorted_scattered_idxs, k,
-                     b=None,
-                     x_grouped=False, y_grouped=False,
-                     out=None):
-     assert sorted_scattered_idxs.size(0) == sorted_expert_idxs.size(0)
-     assert sorted_scattered_idxs.size(0) == X.size(0) * k
-     # Pre-kernel setup
-     y_dim = W.size(-1)
-     L_scattered = sorted_expert_idxs.size(0)
-     if out is None:
-         output = torch.empty((L_scattered, y_dim), device=X.device, dtype=X.dtype)
-     else:
-         assert out.size(0) == L_scattered and out.size(1) == y_dim
-         output = out
-
-     scatter2scatter_compileable(output, W, X, k, sorted_expert_idxs, sorted_scattered_idxs,
-                                 b, x_grouped, y_grouped)
-     return output
-
-
- @torch.library.custom_op("scattermoe::scatter2scatter", mutates_args={"output"})
- def scatter2scatter_compileable(
-     output: torch.Tensor,
-     W: torch.Tensor,
-     X: torch.Tensor,
-     k: int,
-     sorted_expert_idxs: torch.Tensor,
-     sorted_scattered_idxs: torch.Tensor,
-     b: Optional[torch.Tensor],
-     x_grouped: bool, y_grouped: bool) -> None:
-     def grid(META):
-         grid_num = (
-             triton.cdiv(sorted_expert_idxs.size(0), META["BLOCK_M"]) *
-             triton.cdiv(META['N'], META['BLOCK_N']),
-         )
-         return grid_num
-
-     if b is None:
-         b = None
-         stride_be = stride_bk = 0
-     else:
-         stride_be, stride_bk = b.stride()
-
-     _scatter2scatter[grid](
-         # X_ptr, stride_xm, stride_xk,
-         X, X.stride(0), X.stride(1),
-         # W_ptr, stride_we, stride_wk, stride_wn,
-         W, W.stride(0), W.stride(1), W.stride(2),
-         # Y_ptr, stride_ym, stride_yn,
-         output, output.stride(0), output.stride(1),
-         # B_ptr, stride_be, stride_bk
-         b, stride_be, stride_bk,
-         grouped_idx_ptr=sorted_scattered_idxs,
-         expert_idxs_ptr=sorted_expert_idxs,
-         # block_start_idx_ptr=padded_block_idxs,
-         FAN_OUT=k,
-         M=X.size(0),
-         K=X.size(1),
-         N=output.size(1), E=W.size(0),
-         BLOCK_M=BLOCK_M,
-         ACC_TYPE=tl.float32,
-         allow_tf32=ALLOW_TF32,
-         x_grouped=x_grouped, y_grouped=y_grouped,
-     )
-
-
- def _config_XtY():
-     return [
-         triton.Config({'BLOCK_N': 128, 'BLOCK_K': 128, 'BLOCK_M': 32}, num_stages=4, num_warps=4),
-     ]
-
- def group_bwd_W(DY, X, expert_offsets, E, has_bias=False):
-     DWt = torch.zeros((E, DY.size(-1), X.size(-1)), device=DY.device, dtype=DY.dtype)
-     DW = DWt.permute(0, 2, 1)
-     if has_bias:
-         Db = torch.zeros((E, DY.size(-1)), device=DY.device, dtype=DY.dtype)
-     else:
-         Db = None
-     groupXtY_compileable(E, DW, Db, DY, X, expert_offsets)
-     return DW, Db
-
-
- @torch.library.custom_op("scattermoe::groupXtY", mutates_args={"DW"})
- def groupXtY_compileable(
-     E: int,
-     DW: torch.Tensor,
-     Db: Optional[torch.Tensor],
-     DY: torch.Tensor,
-     X: torch.Tensor,
-     expert_offsets: torch.Tensor) -> None:
-     def grid(META):
-         grid = (
-             E * triton.cdiv(META['K'], META['BLOCK_K']),
-             triton.cdiv(META['N'], META['BLOCK_N']),
-         )
-         return grid
-
-     if Db is None:
-         stride_dbe = 0
-         stride_dbn = 0
-     else:
-         stride_dbe, stride_dbn = Db.stride()
-
-     _groupXtY[grid](
-         # DY_ptr, stride_dym, stride_dyk,
-         DY, DY.stride(0), DY.stride(1),
-         # X_ptr, stride_xm, stride_xn,
-         X, X.stride(0), X.stride(1),
-         # DW_ptr, stride_dwe, stride_dwk, stride_dwn,
-         DW, DW.stride(0), DW.stride(1), DW.stride(2),
-         # Db_ptr, stride_dwe, stride_dbn,
-         Db, stride_dbe, stride_dbn,
-         # expert_offsets_ptr,
-         expert_offsets,
-         # K: tl.constexpr, N: tl.constexpr,
-         M=DY.size(0), N=DY.size(-1), K=X.size(-1),
-         # ACC_TYPE: tl.constexpr,
-         ACC_TYPE=tl.float32,
-         allow_tf32=ALLOW_TF32
-     )
-
-
- @triton.autotune(configs=_config_XtY(), key=['M', 'N', 'K'], )
- @triton.heuristics({
-     "NO_K_MASK": lambda args: (args['K'] % args['BLOCK_K']) == 0,
-     "NO_N_MASK": lambda args: (args['N'] % args['BLOCK_N']) == 0,
- })
- @triton.jit
- def _groupXtY(
-     DY_ptr, stride_dym, stride_dyk,
-     X_ptr, stride_xm, stride_xn,
-     DW_ptr, stride_dwe, stride_dwk, stride_dwn,
-     Db_ptr, stride_dbe, stride_dbn,
-     expert_offsets_ptr,
-     M, K: tl.constexpr, N: tl.constexpr,
-     BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
-     ACC_TYPE: tl.constexpr,
-     allow_tf32: tl.constexpr,
-     NO_K_MASK: tl.constexpr, NO_N_MASK: tl.constexpr
- ):
-     pid0 = tl.program_id(axis=0)
-     pid1 = tl.program_id(axis=1)
-     num0 = tl.num_programs(0)
-     num1 = tl.num_programs(1)
-     # pid1, pid0 = tl.swizzle2d(pid1, pid0, num1, num0, 128)
-     pid0, pid1 = tl.swizzle2d(pid0, pid1, num0, num1, 4)
-
-     K_BLOCK_COUNT = tl.cdiv(K, BLOCK_K)
-     E_idx = pid0 // K_BLOCK_COUNT
-     K_block_id = pid0 % K_BLOCK_COUNT
-     N_block_id = pid1
-
-     if E_idx == 0:
-         start_idx = 0
-     else:
-         start_idx = tl.load(expert_offsets_ptr + E_idx - 1).to(tl.int32)
-     end_idx = tl.load(expert_offsets_ptr + E_idx).to(tl.int32)
-
-
-     if end_idx > start_idx:
-         M_block = tl.max_contiguous(start_idx + tl.arange(0, BLOCK_M), BLOCK_M)
-
-         K_block = K_block_id * BLOCK_K + tl.arange(0, BLOCK_K)
-         K_mask = K_block < K
-         K_block = tl.max_contiguous(tl.multiple_of(K_block % K, BLOCK_K), BLOCK_K)
-
-         N_block = N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)
-         N_mask = N_block < N
-         N_block = tl.max_contiguous(tl.multiple_of(N_block % N, BLOCK_N), BLOCK_N)
-
-         M_idxs = M_block
-         xt_blk_ptrs = X_ptr + K_block[:, None] * stride_xn + M_idxs[None, :] * stride_xm
-         dy_blk_ptrs = DY_ptr + M_idxs[:, None] * stride_dym + N_block[None, :] * stride_dyk
-         if (Db_ptr is not None) and (K_block_id == 0):
-             _xty_and_bias(
-                 E_idx, start_idx, end_idx,
-                 M_block,
-                 K_block, K_mask, N_block, N_mask,
-                 dy_blk_ptrs, stride_dym,
-                 xt_blk_ptrs, stride_xm,
-                 DW_ptr, stride_dwe, stride_dwk, stride_dwn,
-                 Db_ptr, stride_dbe, stride_dbn,
-                 BLOCK_M, BLOCK_N, BLOCK_K, ACC_TYPE,
-                 allow_tf32, NO_K_MASK, NO_N_MASK,
-                 compute_bias=True
-             )
-         else:
-             _xty_and_bias(
-                 E_idx, start_idx, end_idx,
-                 M_block,
-                 K_block, K_mask, N_block, N_mask,
-                 dy_blk_ptrs, stride_dym,
-                 xt_blk_ptrs, stride_xm,
-                 DW_ptr, stride_dwe, stride_dwk, stride_dwn,
-                 Db_ptr, stride_dbe, stride_dbn,
-                 BLOCK_M, BLOCK_N, BLOCK_K, ACC_TYPE,
-                 allow_tf32, NO_K_MASK, NO_N_MASK,
-                 compute_bias=False
-             )
-
-
- @triton.jit
- def _xty_and_bias(
-     E_idx, start_idx, end_idx,
-     M_block,
-     K_block, K_mask, N_block, N_mask,
-     dy_blk_ptrs, stride_dym,
-     xt_blk_ptrs, stride_xm,
-     DW_ptr, stride_dwe, stride_dwk, stride_dwn,
-     Db_ptr, stride_dbe, stride_dbn,
-     BLOCK_M, BLOCK_N, BLOCK_K, ACC_TYPE,
-     allow_tf32, NO_K_MASK, NO_N_MASK,
-     compute_bias: tl.constexpr
- ):
-
-     if compute_bias:
-         db_acc = tl.zeros((BLOCK_N,), dtype=ACC_TYPE)
-     else:
-         db_acc = None
-
-     acc = tl.zeros((BLOCK_K, BLOCK_N), dtype=ACC_TYPE)
-     iters = tl.cdiv(end_idx - start_idx, BLOCK_M)
-     for i in range(0, iters):
-         M_mask = (i * BLOCK_M + M_block) < end_idx
-         if NO_K_MASK:
-             xt = tl.load(xt_blk_ptrs, mask=M_mask[None, :])
-         else:
-             xt = tl.load(xt_blk_ptrs, mask=K_mask[:, None] & M_mask[None, :])
-         if NO_N_MASK:
-             dy = tl.load(dy_blk_ptrs, mask=M_mask[:, None])
-         else:
-             dy = tl.load(dy_blk_ptrs, mask=M_mask[:, None] & N_mask[None, :])
-
-         acc += tl.dot(xt, dy, out_dtype=ACC_TYPE, allow_tf32=allow_tf32)
-
-         xt_blk_ptrs += BLOCK_M * stride_xm
-         dy_blk_ptrs += BLOCK_M * stride_dym
-
-         if compute_bias:
-             db_acc += tl.sum(dy, axis=0)
-
-     DW_blk_ptrs = DW_ptr + E_idx * stride_dwe + K_block[:, None] * stride_dwk + N_block[None, :] * stride_dwn
-     acc = acc.to(DW_blk_ptrs.dtype.element_ty)
-     tl.store(DW_blk_ptrs, acc, mask=K_mask[:, None] & N_mask[None, :])
-     if compute_bias:
-         Db_blk_ptrs = Db_ptr + E_idx * stride_dbe + N_block * stride_dbn
-         tl.store(Db_blk_ptrs, db_acc, mask=N_mask)
-
-
-
- def _config_grouping():
-     return [
-         triton.Config({'BLOCK_N': 256, 'BLOCK_K': 128}, num_stages=4, num_warps=4),
-         # triton.Config({'BLOCK_N': 128, 'BLOCK_K': 64}, num_stages=4, num_warps=4),
-         # triton.Config({'BLOCK_N': 64, 'BLOCK_K': 32}, num_stages=4, num_warps=4),
-     ]
-
- def group(A, sorted_expert_idxs, coeff=None, fan_out=1, out=None):
-     N = sorted_expert_idxs.size(0)
-     K = A.size(1)
-     assert A.size(0) * fan_out == N
-     if out is not None:
-         Y = out
-     else:
-         Y = torch.empty((N, K), dtype=A.dtype, device=A.device)
-     group_compileable(A, K, N, Y, coeff, coeff is not None, fan_out, sorted_expert_idxs)
-     return Y
-
-
- @torch.library.custom_op("scattermoe::group", mutates_args={"Y"})
- def group_compileable(
-     A: torch.Tensor,
-     K: int,
-     N: int,
-     Y: torch.Tensor,
-     coeff: torch.Tensor, has_coeff: bool,
-     fan_out: int,
-     sorted_expert_idxs: torch.Tensor) -> None:
-     def grid(META):
-         grid_num = (triton.cdiv(META['N'], META['BLOCK_N']),)
-         return grid_num
-     _group[grid](
-         # A_ptr, stride_an, stride_ai,
-         A, A.stride(0), A.stride(1), has_coeff, coeff, fan_out,
-         # Y_ptr, stride_yn, stride_yk,
-         Y, Y.stride(0), Y.stride(1),
-         # grouped_idx_ptr,
-         sorted_expert_idxs,
-         # N: tl.constexpr, K: tl.constexpr,
-         N, K
-     )
-
-
- @triton.autotune(configs=_config_grouping(), key=['K'])
- @triton.heuristics({
-     "NO_K_MASK": lambda args: (args['K'] % args['BLOCK_K']) == 0
- })
- @triton.jit
- def _group(
-     src_ptr, stride_sn, stride_sk, has_coeff: tl.constexpr, coeff_ptr, FAN_OUT: tl.constexpr,
-     tgt_ptr, stride_tn, stride_ti,
-     grouped_idx_ptr,
-     N, K: tl.constexpr,
-     BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
-     NO_K_MASK: tl.constexpr
- ):
-     pid = tl.program_id(axis=0)
-
-     N_block_id = pid
-     N_blk = N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)
-     N_mask = N_blk < N
-     N_blk = tl.max_contiguous(tl.multiple_of(N_blk % N, BLOCK_N), BLOCK_N)
-     N_idx = tl.load(grouped_idx_ptr + N_blk, mask=N_mask, other=0)
-
-     K_blk = tl.arange(0, BLOCK_K)
-     src_blk_ptrs = src_ptr + (N_idx // FAN_OUT)[:, None] * stride_sn + K_blk[None, :] * stride_sk
-     tgt_blk_ptrs = tgt_ptr + N_blk[:, None] * stride_tn + K_blk[None, :] * stride_ti
-
-     if has_coeff:
-         c = tl.load(coeff_ptr + N_idx, mask=N_mask)[:, None]
-
-     iters = tl.cdiv(K, BLOCK_K)
-     for i in range(0, iters):
-         if NO_K_MASK or i < iters - 1:
-             block = tl.load(src_blk_ptrs, mask=N_mask[:, None])
-             if has_coeff:
-                 block *= c
-             tl.store(tgt_blk_ptrs, block, mask=N_mask[:, None])
-
-         else:
-             K_mask = (i * BLOCK_K + K_blk) < K
-             mask = N_mask[:, None] & K_mask[None, :]
-             block = tl.load(src_blk_ptrs, mask=mask)
-             if has_coeff:
-                 block *= c
-             tl.store(tgt_blk_ptrs, block, mask=mask)
-         src_blk_ptrs += BLOCK_K * stride_sk
-         tgt_blk_ptrs += BLOCK_K * stride_ti
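
Note: `scatter2scatter` is the grouped-GEMM entry point of this file, and `group` is the gather/scale helper used by the autograd backward pass. A minimal usage sketch, assuming the build is importable as `scattermoe` (tensor sizes, dtype, and the CUDA device are illustrative assumptions, not part of the diff):

    import torch
    from scattermoe.kernels.ops import scatter2scatter, group  # assumed import path

    E, K_dim, N_dim, T, k = 8, 64, 128, 16, 2   # experts, in/out dims, tokens, top-k
    X = torch.randn(T, K_dim, device="cuda")
    W = torch.randn(E, K_dim, N_dim, device="cuda")
    expert_idxs = torch.randint(0, E, (T, k), device="cuda")
    # Sort the flattened assignments, as flatten_sort_count does in parallel_experts.py:
    sorted_expert_idxs, sorted_scattered_idxs = torch.sort(expert_idxs.flatten())

    Y = scatter2scatter(X, W, sorted_expert_idxs, sorted_scattered_idxs, k)
    # Y: (T * k, N_dim), one output row per (token, expert) assignment.
    Xg = group(X, sorted_scattered_idxs, fan_out=k)
    # Xg: (T * k, K_dim), input rows replicated and reordered by expert.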
build/torch-cuda/kernels/single.py DELETED
@@ -1,59 +0,0 @@
- import torch
- import triton
- import triton.language as tl
-
- @triton.jit
- def _single2scatter(
-     X_ptr, stride_xm, stride_xk,
-     W_ptr, stride_we, stride_wk, stride_wn,
-     Y_ptr, stride_ym, stride_yn,
-     expert_idxs_ptr,
-     FAN_OUT: tl.constexpr,
-     K: tl.constexpr, N: tl.constexpr, E: tl.constexpr,
-     BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
-     ACC_TYPE: tl.constexpr,
- ):
-     pid0 = tl.program_id(axis=0)
-     pid1 = tl.program_id(axis=1)
-
-     N_block_id = pid0
-     if FAN_OUT == 1:
-         in_idx = pid1
-     else:
-         in_idx = 0
-     out_idx = pid1
-
-     K_block = tl.arange(0, BLOCK_K)
-     N_block = tl.max_contiguous(tl.multiple_of((N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)) % N, BLOCK_N), BLOCK_N)
-     E_idx = tl.load(expert_idxs_ptr + pid1)
-     X_blk_ptrs = X_ptr + in_idx * stride_xm + K_block[:, None] * stride_xk
-     W_blk_ptrs = W_ptr + E_idx * stride_we + K_block[:, None] * stride_wk + N_block[None, :] * stride_wn
-     acc = tl.zeros((1, BLOCK_N), dtype=ACC_TYPE)
-     for K_block_id in range(0, tl.cdiv(K, BLOCK_K)):
-         x = tl.load(X_blk_ptrs)
-         w = tl.load(W_blk_ptrs)
-         acc += tl.sum(x * w, axis=0)[None, :]
-         X_blk_ptrs += BLOCK_K * stride_xk
-         W_blk_ptrs += BLOCK_K * stride_wk
-     Y_blk_ptrs = Y_ptr + out_idx * stride_ym + N_block[None, :] * stride_yn
-     tl.store(Y_blk_ptrs, acc)
-
- def single2scatter(X, W, expert_idxs):
-     E, xdim, ydim = W.size()
-     k = expert_idxs.size(1)
-     assert X.size(0) == k or X.size(0) == 1
-     Y = torch.empty((k, ydim), device=X.device, dtype=X.dtype)
-     BLOCK_N = 128
-     BLOCK_K = 128
-     grid = ydim // BLOCK_N, k
-     _single2scatter[grid](
-         X, X.stride(0), X.stride(1),
-         W, W.stride(0), W.stride(1), W.stride(2),
-         Y, Y.stride(0), Y.stride(1),
-         expert_idxs,
-         FAN_OUT=Y.size(0) // X.size(0),
-         K=xdim, N=ydim, E=E,
-         BLOCK_N=BLOCK_N, BLOCK_K=BLOCK_K,
-         ACC_TYPE=tl.float32
-     )
-     return Y
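
Note: `_single2scatter` is a fast path for a single input row fanned out to k experts (e.g. batch-1 decoding). A minimal sketch (sizes and the CUDA device are illustrative assumptions; `ydim` should be a multiple of BLOCK_N=128 here, since the grid size uses floor division):

    import torch
    # from scattermoe.kernels.single import single2scatter  # assumed import path

    E, xdim, ydim, k = 8, 128, 256, 2
    X = torch.randn(1, xdim, device="cuda")           # one token, shared by all k experts
    W = torch.randn(E, xdim, ydim, device="cuda")
    expert_idxs = torch.randint(0, E, (1, k), device="cuda")
    Y = single2scatter(X, W, expert_idxs)             # (k, ydim), one row per expert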
build/torch-cuda/layers.py DELETED
@@ -1,52 +0,0 @@
- import torch
- from torch.nn import functional as F
- from torch import nn
-
- from . import parallel_linear, flatten_sort_count
-
- class ScatterMoEGatedMLP(nn.Module):
-     def forward(self, layer_input):
-         """
-         Forward pass of the mixture of experts layer.
-
-         Args:
-             layer_input (Tensor):
-                 Input tensor.
-
-         Returns:
-             Tensor:
-                 Output tensor.
-             Tensor:
-                 Router logits.
-         """
-         bsz, length, emb_size = layer_input.size()
-         layer_input = layer_input.reshape(-1, emb_size)
-         # compute the top_k routing decision
-         router_logits = self.router.layer(layer_input)
-         routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
-         routing_weights, selected_experts = torch.topk(routing_weights, self.router.top_k, dim=-1)
-         routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
-         routing_weights = routing_weights.to(layer_input.dtype)
-         sorted_expert_idxs, sorted_scattered_idxs, expert_offsets = \
-             flatten_sort_count(selected_experts, num_experts=self.router.num_experts)
-
-         # compute experts
-         gates, h = parallel_linear(
-             layer_input, self.input_linear.weight.transpose(2, 1),
-             self.router.top_k,
-             sorted_expert_idxs, sorted_scattered_idxs,
-             expert_offsets,
-             grouped_in=False, grouped_out=True,
-         ).chunk(2, dim=-1)
-         h = self.activation(gates) * h
-         layer_output = parallel_linear(
-             h, self.output_linear.weight.transpose(2, 1),
-             1,
-             sorted_expert_idxs, sorted_scattered_idxs,
-             expert_offsets,
-             grouped_in=True, grouped_out=False,
-             gates=routing_weights
-         )
-         layer_output = layer_output.view(bsz, length, emb_size)
-         return layer_output
-
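
Note: `ScatterMoEGatedMLP` defines only `forward()`; it expects `router`, `input_linear`, `output_linear`, and `activation` to be supplied by the integrating model. A hedged wiring sketch (class shape and sizes are illustrative assumptions; `ParallelExperts` is from the deleted `parallel_experts.py`):

    from torch import nn
    # Assumed imports from this build: ScatterMoEGatedMLP, ParallelExperts

    class Router(nn.Module):
        def __init__(self, emb_size, num_experts, top_k):
            super().__init__()
            self.layer = nn.Linear(emb_size, num_experts, bias=False)
            self.top_k = top_k
            self.num_experts = num_experts

    class MyGatedMLP(ScatterMoEGatedMLP):
        def __init__(self, emb_size, hidden_size, num_experts, top_k):
            super().__init__()
            self.router = Router(emb_size, num_experts, top_k)
            # input_linear emits gate and value halves, hence 2 * hidden_size:
            self.input_linear = ParallelExperts(num_experts, emb_size, 2 * hidden_size)
            self.output_linear = ParallelExperts(num_experts, hidden_size, emb_size)
            self.activation = nn.SiLU()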
build/torch-cuda/metadata.json DELETED
@@ -1 +0,0 @@
- {"python-depends":[]}
build/torch-cuda/parallel_experts.py DELETED
@@ -1,182 +0,0 @@
- import torch
- import torch.nn as nn
- from . import kernels
- from typing import Optional
-
- @torch.library.custom_op("scattermoe::bincount", mutates_args={})
- def compileable_bincount(x: torch.Tensor, minlength: int) -> torch.Tensor:
-     return x.bincount(minlength=minlength)
-
- @compileable_bincount.register_fake
- def _(x: torch.Tensor, minlength: int) -> torch.Tensor:
-     return torch.empty(minlength, dtype=torch.long, device=x.device)
-
- @torch.compile
- def flatten_sort_count(expert_idxs: torch.Tensor, num_experts: int):
-     with torch.no_grad():
-         flattened_expert_idxs = expert_idxs.flatten()
-         sorted_expert_idxs, sorted_scattered_idxs = torch.sort(flattened_expert_idxs)
-         expert_counts = compileable_bincount(flattened_expert_idxs, minlength=num_experts)
-         expert_offsets = expert_counts.cumsum(-1)
-         return sorted_expert_idxs, sorted_scattered_idxs, expert_offsets
-
-
-
- class ParallelLinear(torch.autograd.Function):
-     @staticmethod
-     def forward(
-         ctx,
-         x: torch.Tensor, expert_weights: torch.Tensor, k: int,
-         sorted_expert_idxs: torch.Tensor, sorted_scattered_idxs: torch.Tensor,
-         expert_offsets: torch.Tensor,
-         expert_biases: Optional[torch.Tensor]=None,
-         gates: Optional[torch.Tensor]=None,
-         grouped_in: bool =False, grouped_out: bool=False,
-     ):
-         with torch.device(x.device):
-             output = kernels.ops.scatter2scatter(
-                 X=x, W=expert_weights,
-                 b=expert_biases, k=k,
-                 sorted_expert_idxs=sorted_expert_idxs,
-                 sorted_scattered_idxs=sorted_scattered_idxs,
-                 x_grouped=grouped_in, y_grouped=grouped_out
-             )
-             if gates is not None:
-                 output_expanded = output.view(gates.size(0), gates.size(1), output.size(-1))
-                 output = (gates.unsqueeze(1) @ output_expanded).squeeze(1)
-             else:
-                 output_expanded = None
-
-             ctx.save_for_backward(
-                 x, expert_weights,
-                 expert_biases,
-                 sorted_expert_idxs,
-                 sorted_scattered_idxs,
-                 expert_offsets,
-                 gates,
-                 output_expanded
-             )
-             ctx.grouped_in = grouped_in
-             ctx.grouped_out = grouped_out
-             ctx.k = k
-             return output
-     @staticmethod
-     def backward(ctx, grad_out: torch.Tensor):
-         with torch.device(grad_out.device):
-             (x, expert_weights, expert_biases,
-              sorted_expert_idxs,
-              sorted_scattered_idxs,
-              expert_offsets,
-              gates, output_expanded) = ctx.saved_tensors
-             k = ctx.k
-             grouped_in = ctx.grouped_in
-             grouped_out = ctx.grouped_out
-             # print("backward")
-
-             if gates is not None:
-                 # calculate gates gradient
-                 # d_gates = torch.bmm(output_expanded, grad_out[:, :, None]).squeeze(-1)
-                 d_gates = (output_expanded @ grad_out.unsqueeze(-1)).squeeze(-1)
-                 gates_flat = gates.flatten()
-                 gate_fan = gates.size(1)
-                 grouped_grad_out = output_expanded.flatten(0, 1)  # reuse expanded buffer later
-             else:
-                 d_gates = None
-                 gates_flat = None
-                 gate_fan = 1
-                 grouped_grad_out = None
-
-             if grouped_out:
-                 grouped_grad_out = grad_out
-             else:
-                 grouped_grad_out = kernels.ops.group(grad_out, sorted_scattered_idxs,
-                                                      fan_out=gate_fan, coeff=gates_flat,
-                                                      out=grouped_grad_out)
-             if grouped_in:
-                 grouped_x = x
-                 d_expanded_input = None
-             else:
-                 grouped_x = kernels.ops.group(x, sorted_scattered_idxs, fan_out=k)
-                 d_expanded_input = grouped_x
-
-             d_weights, d_biases = kernels.ops.group_bwd_W(
-                 DY=grouped_grad_out, X=grouped_x,
-                 expert_offsets=expert_offsets,
-                 E=expert_weights.size(0),
-                 has_bias=expert_biases is not None
-             )
-
-
-             d_expanded_input = kernels.ops.scatter2scatter(
-                 X=grouped_grad_out, x_grouped=True,
-                 W=expert_weights.permute(0, 2, 1),
-                 sorted_expert_idxs=sorted_expert_idxs,
-                 sorted_scattered_idxs=sorted_scattered_idxs,
-                 k=1,
-                 y_grouped=grouped_in,
-                 out=d_expanded_input  # Reuse grouped_x buffer
-             )
-
-             if k == 1:
-                 d_input = d_expanded_input
-             else:
-                 d_input = d_expanded_input.view(x.size(0), k, d_expanded_input.size(-1)).sum(-2)
-             # print("backward end.")
-             return (
-                 # x, expert_weights,
-                 d_input, d_weights,
-                 # k, sorted_expert_idxs, sorted_scattered_idxs, expert_offsets,
-                 None, None, None, None,
-                 # bias, gates
-                 d_biases, d_gates,
-                 # grouped_in, grouped_out,
-                 None, None
-             )
-
- def parallel_linear(inputs, expert_weights, k,
-                     sorted_expert_idxs, sorted_scattered_idxs,
-                     expert_offsets,
-                     expert_biases=None,
-                     gates=None, grouped_in=False, grouped_out=False):
-     results = ParallelLinear.apply(inputs, expert_weights, k,
-                                    sorted_expert_idxs, sorted_scattered_idxs,
-                                    expert_offsets,
-                                    expert_biases,
-                                    gates, grouped_in, grouped_out)
-     return results
-
- class ParallelExperts(nn.Module):
-     def __init__(self, num_experts, input_size, output_size, bias=False) -> None:
-         super().__init__()
-         self.weight = nn.Parameter(torch.empty(num_experts, output_size, input_size))
-
-         if bias:
-             self.bias = nn.Parameter(torch.empty(num_experts, output_size))
-         else:
-             self.bias = None
-
-         self.num_experts = num_experts
-         self.input_size = input_size
-         self.output_size = output_size
-         self.reset_parameters()
-
-     def extra_repr(self):
-         return 'num_experts={}, input_size={}, output_size={}'.format(
-             self.num_experts, self.input_size, self.output_size)
-
-     def reset_parameters(self) -> None:
-         nn.init.normal_(self.weight, std=0.02)
-         if self.bias is not None:
-             nn.init.zeros_(self.bias)
-
-     def forward(self, inputs, k, sorted_expert_idxs, sorted_scattered_idxs,
-                 expert_offsets,
-                 gates=None, grouped_in=False, grouped_out=False):
-
-         results = parallel_linear(
-             inputs, self.weight.permute(0, 2, 1), k,
-             sorted_expert_idxs, sorted_scattered_idxs, expert_offsets,
-             expert_biases=self.bias,
-             gates=gates, grouped_in=grouped_in, grouped_out=grouped_out
-         )
-         return results
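
Note: `flatten_sort_count` and `ParallelExperts` above are the public API that the package `__init__.py` re-exports. A minimal end-to-end sketch (sizes and the CUDA device are illustrative assumptions):

    import torch
    # from scattermoe import flatten_sort_count, ParallelExperts  # assumed import

    experts = ParallelExperts(num_experts=8, input_size=64, output_size=128).cuda()
    x = torch.randn(16, 64, device="cuda")
    selected = torch.randint(0, 8, (16, 2), device="cuda")    # top-2 routing decisions
    sorted_e, sorted_s, offsets = flatten_sort_count(selected, num_experts=8)
    y = experts(x, 2, sorted_e, sorted_s, offsets)            # (16 * 2, 128), scattered order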
build/torch-cuda/scattermoe/__init__.py DELETED
@@ -1,26 +0,0 @@
- import ctypes
- import sys
-
- import importlib
- from pathlib import Path
- from types import ModuleType
-
- def _import_from_path(file_path: Path) -> ModuleType:
-     # We cannot use the module name as-is, after adding it to `sys.modules`,
-     # it would also be used for other imports. So, we make a module name that
-     # depends on the path for it to be unique using the hex-encoded hash of
-     # the path.
-     path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
-     module_name = path_hash
-     spec = importlib.util.spec_from_file_location(module_name, file_path)
-     if spec is None:
-         raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
-     module = importlib.util.module_from_spec(spec)
-     if module is None:
-         raise ImportError(f"Cannot load module {module_name} from spec")
-     sys.modules[module_name] = module
-     spec.loader.exec_module(module)  # type: ignore
-     return module
-
-
- globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
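
Note: this shim re-exports the variant's top-level `__init__.py` under a path-hashed module name, so several builds can coexist in `sys.modules`. A small sketch of the resulting behaviour (paths are illustrative):

    m1 = _import_from_path(Path("build/torch-cuda/__init__.py"))
    m2 = _import_from_path(Path("build/torch-rocm/__init__.py"))
    assert m1 is not m2   # distinct module objects, keyed by path hash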
build/torch-rocm/__init__.py DELETED
@@ -1,13 +0,0 @@
- from .parallel_experts import flatten_sort_count, parallel_linear, ParallelExperts
- from . import parallel_experts
- from . import kernels
- from . import layers
-
- __all__ = [
-     "flatten_sort_count",
-     "parallel_linear",
-     "ParallelExperts",
-     "parallel_experts",
-     "kernels",
-     "layers"
- ]
build/torch-rocm/_ops.py DELETED
@@ -1,8 +0,0 @@
- import torch
- ops = torch.ops._scattermoe_05b9d77
-
- def add_op_namespace_prefix(op_name: str):
-     """
-     Prefix op by namespace.
-     """
-     return f"_scattermoe_05b9d77::{op_name}"
build/torch-rocm/kernels/__init__.py DELETED
@@ -1,3 +0,0 @@
- from . import ops
-
- __all__ = ["ops"]
build/torch-rocm/kernels/ops.py DELETED
@@ -1,457 +0,0 @@
- import torch
- import triton
- import triton.language as tl
- from typing import Optional
-
- BLOCK_M = 128
- ALLOW_TF32 = True
-
-
-
- @triton.jit
- def _compute_expert_block(
-     E_idx, E_mask,
-     M_in_idx,
-     N_block, N_mask,
-     X_ptr, stride_xm, stride_xk,
-     W_ptr, stride_we, stride_wk, stride_wn,
-     K,
-     acc,
-     no_k_mask,
-     BLOCK_K,
-     allow_tf32=True,
- ):
-
-     K_block = tl.arange(0, BLOCK_K)
-     X_blk_ptrs = X_ptr + M_in_idx[:, None] * stride_xm + K_block[None, :] * stride_xk
-     W_blk_ptrs = W_ptr + K_block[:, None] * stride_wk + N_block[None, :] * stride_wn + E_idx * stride_we
-     iters = tl.cdiv(K, BLOCK_K)
-
-     for K_block_id in range(iters):
-         if no_k_mask:
-             x = tl.load(X_blk_ptrs, mask=E_mask[:, None])
-             w = tl.load(W_blk_ptrs, mask=N_mask[None, :])
-         else:
-             K_mask = (K_block_id * BLOCK_K + K_block) < K
-             x = tl.load(X_blk_ptrs, mask=E_mask[:, None] & K_mask[None, :])
-             w = tl.load(W_blk_ptrs, mask=K_mask[:, None] & N_mask[None, :])
-
-         X_blk_ptrs += BLOCK_K * stride_xk
-         W_blk_ptrs += BLOCK_K * stride_wk
-         acc = tl.dot(x, w, acc, allow_tf32=allow_tf32)
-     return acc
-
-
- def _scatter2scatter_configs():
-     return [
-         triton.Config({'BLOCK_N': 128, 'BLOCK_K': 32}, num_stages=4, num_warps=4),
-     ]
-
- @triton.autotune(configs=_scatter2scatter_configs(), key=['M', 'N', 'K'], )
- @triton.heuristics({
-     "NO_K_MASK": lambda args: (args['K'] % args['BLOCK_K']) == 0,
-     "NO_N_MASK": lambda args: (args['N'] % args['BLOCK_N']) == 0,
- })
- @triton.jit
- def _scatter2scatter(
-     X_ptr, stride_xm: tl.constexpr, stride_xk: tl.constexpr,
-     W_ptr, stride_we, stride_wk: tl.constexpr, stride_wn: tl.constexpr,
-     Y_ptr, stride_ym: tl.constexpr, stride_yn: tl.constexpr,
-     B_ptr, stride_be: tl.constexpr, stride_bn: tl.constexpr,
-     grouped_idx_ptr, expert_idxs_ptr,
-     # block_start_idx_ptr,
-     FAN_OUT: tl.constexpr,
-     M, K: tl.constexpr, N: tl.constexpr, E: tl.constexpr,
-     BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
-     ACC_TYPE: tl.constexpr,
-     # OUT_M,
-     allow_tf32: tl.constexpr,
-     x_grouped: tl.constexpr, y_grouped: tl.constexpr,
-     NO_K_MASK: tl.constexpr, NO_N_MASK: tl.constexpr
- ):
-     pid = tl.program_id(axis=0)
-
-     N_BLOCK_COUNT = tl.cdiv(N, BLOCK_N)
-     M_block_id = pid // N_BLOCK_COUNT
-     N_block_id = pid % N_BLOCK_COUNT
-
-     M_block = M_block_id * BLOCK_M + tl.arange(0, BLOCK_M)
-     N_block = N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)
-     N_mask = N_block < N
-     M_boundary_mask = M_block < (FAN_OUT * M)
-     E_idxs = tl.load(expert_idxs_ptr + M_block, mask=M_boundary_mask, other=E)
-
-     no_k_mask = K % BLOCK_K == 0
-
-     acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
-     E_first_idx = tl.min(E_idxs)
-     E_last_idx = tl.minimum(tl.max(E_idxs), E - 1)
-     M_idx = tl.load(grouped_idx_ptr + M_block, mask=M_boundary_mask).to(tl.int32)
-     for E_idx in range(E_first_idx, E_last_idx + 1):
-         E_mask = E_idxs == E_idx
-         E_M_idx = M_idx
-         if x_grouped:
-             M_in_idx = M_block
-         else:
-             M_in_idx = E_M_idx // FAN_OUT
-         acc = _compute_expert_block(
-             E_idx, E_mask,
-             M_in_idx, N_block, N_mask,
-             X_ptr, stride_xm, stride_xk,
-             W_ptr, stride_we, stride_wk, stride_wn,
-             K,
-             acc,
-             no_k_mask,
-             BLOCK_K,
-             allow_tf32=allow_tf32,
-         )
-
-     if B_ptr is not None:
-         B_blk_ptrs = B_ptr + E_idxs[:, None] * stride_be + N_block[None, :] * stride_bn
-         acc += tl.load(B_blk_ptrs, mask=M_boundary_mask[:, None] & N_mask[None, :])
-
-     if y_grouped:
-         M_out_idx = M_block
-     else:
-         M_out_idx = M_idx
-     Y_blk_ptrs = Y_ptr + (M_out_idx[:, None] * stride_ym + N_block[None, :] * stride_yn)
-     tl.store(Y_blk_ptrs, acc, mask=M_boundary_mask[:, None] & N_mask[None, :])
-
- def scatter2scatter(X, W, sorted_expert_idxs, sorted_scattered_idxs, k,
-                     b=None,
-                     x_grouped=False, y_grouped=False,
-                     out=None):
-     assert sorted_scattered_idxs.size(0) == sorted_expert_idxs.size(0)
-     assert sorted_scattered_idxs.size(0) == X.size(0) * k
-     # Pre-kernel setup
-     y_dim = W.size(-1)
-     L_scattered = sorted_expert_idxs.size(0)
-     if out is None:
-         output = torch.empty((L_scattered, y_dim), device=X.device, dtype=X.dtype)
-     else:
-         assert out.size(0) == L_scattered and out.size(1) == y_dim
-         output = out
-
-     scatter2scatter_compileable(output, W, X, k, sorted_expert_idxs, sorted_scattered_idxs,
-                                 b, x_grouped, y_grouped)
-     return output
-
-
- @torch.library.custom_op("scattermoe::scatter2scatter", mutates_args={"output"})
- def scatter2scatter_compileable(
-     output: torch.Tensor,
-     W: torch.Tensor,
-     X: torch.Tensor,
-     k: int,
-     sorted_expert_idxs: torch.Tensor,
-     sorted_scattered_idxs: torch.Tensor,
-     b: Optional[torch.Tensor],
-     x_grouped: bool, y_grouped: bool) -> None:
-     def grid(META):
-         grid_num = (
-             triton.cdiv(sorted_expert_idxs.size(0), META["BLOCK_M"]) *
-             triton.cdiv(META['N'], META['BLOCK_N']),
-         )
-         return grid_num
-
-     if b is None:
-         b = None
-         stride_be = stride_bk = 0
-     else:
-         stride_be, stride_bk = b.stride()
-
-     _scatter2scatter[grid](
-         # X_ptr, stride_xm, stride_xk,
-         X, X.stride(0), X.stride(1),
-         # W_ptr, stride_we, stride_wk, stride_wn,
-         W, W.stride(0), W.stride(1), W.stride(2),
-         # Y_ptr, stride_ym, stride_yn,
-         output, output.stride(0), output.stride(1),
-         # B_ptr, stride_be, stride_bk
-         b, stride_be, stride_bk,
-         grouped_idx_ptr=sorted_scattered_idxs,
-         expert_idxs_ptr=sorted_expert_idxs,
-         # block_start_idx_ptr=padded_block_idxs,
-         FAN_OUT=k,
-         M=X.size(0),
-         K=X.size(1),
-         N=output.size(1), E=W.size(0),
-         BLOCK_M=BLOCK_M,
-         ACC_TYPE=tl.float32,
-         allow_tf32=ALLOW_TF32,
-         x_grouped=x_grouped, y_grouped=y_grouped,
-     )
-
-
- def _config_XtY():
-     return [
-         triton.Config({'BLOCK_N': 128, 'BLOCK_K': 128, 'BLOCK_M': 32}, num_stages=4, num_warps=4),
-     ]
-
- def group_bwd_W(DY, X, expert_offsets, E, has_bias=False):
-     DWt = torch.zeros((E, DY.size(-1), X.size(-1)), device=DY.device, dtype=DY.dtype)
-     DW = DWt.permute(0, 2, 1)
-     if has_bias:
-         Db = torch.zeros((E, DY.size(-1)), device=DY.device, dtype=DY.dtype)
-     else:
-         Db = None
-     groupXtY_compileable(E, DW, Db, DY, X, expert_offsets)
-     return DW, Db
-
-
- @torch.library.custom_op("scattermoe::groupXtY", mutates_args={"DW"})
- def groupXtY_compileable(
-     E: int,
-     DW: torch.Tensor,
-     Db: Optional[torch.Tensor],
-     DY: torch.Tensor,
-     X: torch.Tensor,
-     expert_offsets: torch.Tensor) -> None:
-     def grid(META):
-         grid = (
-             E * triton.cdiv(META['K'], META['BLOCK_K']),
-             triton.cdiv(META['N'], META['BLOCK_N']),
-         )
-         return grid
-
-     if Db is None:
-         stride_dbe = 0
-         stride_dbn = 0
-     else:
-         stride_dbe, stride_dbn = Db.stride()
-
-     _groupXtY[grid](
-         # DY_ptr, stride_dym, stride_dyk,
-         DY, DY.stride(0), DY.stride(1),
-         # X_ptr, stride_xm, stride_xn,
-         X, X.stride(0), X.stride(1),
-         # DW_ptr, stride_dwe, stride_dwk, stride_dwn,
-         DW, DW.stride(0), DW.stride(1), DW.stride(2),
-         # Db_ptr, stride_dwe, stride_dbn,
-         Db, stride_dbe, stride_dbn,
-         # expert_offsets_ptr,
-         expert_offsets,
-         # K: tl.constexpr, N: tl.constexpr,
-         M=DY.size(0), N=DY.size(-1), K=X.size(-1),
-         # ACC_TYPE: tl.constexpr,
-         ACC_TYPE=tl.float32,
-         allow_tf32=ALLOW_TF32
-     )
-
-
- @triton.autotune(configs=_config_XtY(), key=['M', 'N', 'K'], )
- @triton.heuristics({
-     "NO_K_MASK": lambda args: (args['K'] % args['BLOCK_K']) == 0,
-     "NO_N_MASK": lambda args: (args['N'] % args['BLOCK_N']) == 0,
- })
- @triton.jit
- def _groupXtY(
-     DY_ptr, stride_dym, stride_dyk,
-     X_ptr, stride_xm, stride_xn,
-     DW_ptr, stride_dwe, stride_dwk, stride_dwn,
-     Db_ptr, stride_dbe, stride_dbn,
-     expert_offsets_ptr,
-     M, K: tl.constexpr, N: tl.constexpr,
-     BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
-     ACC_TYPE: tl.constexpr,
-     allow_tf32: tl.constexpr,
-     NO_K_MASK: tl.constexpr, NO_N_MASK: tl.constexpr
- ):
-     pid0 = tl.program_id(axis=0)
-     pid1 = tl.program_id(axis=1)
-     num0 = tl.num_programs(0)
-     num1 = tl.num_programs(1)
-     # pid1, pid0 = tl.swizzle2d(pid1, pid0, num1, num0, 128)
-     pid0, pid1 = tl.swizzle2d(pid0, pid1, num0, num1, 4)
-
-     K_BLOCK_COUNT = tl.cdiv(K, BLOCK_K)
-     E_idx = pid0 // K_BLOCK_COUNT
-     K_block_id = pid0 % K_BLOCK_COUNT
-     N_block_id = pid1
-
-     if E_idx == 0:
-         start_idx = 0
-     else:
-         start_idx = tl.load(expert_offsets_ptr + E_idx - 1).to(tl.int32)
-     end_idx = tl.load(expert_offsets_ptr + E_idx).to(tl.int32)
-
-
-     if end_idx > start_idx:
-         M_block = tl.max_contiguous(start_idx + tl.arange(0, BLOCK_M), BLOCK_M)
-
-         K_block = K_block_id * BLOCK_K + tl.arange(0, BLOCK_K)
-         K_mask = K_block < K
-         K_block = tl.max_contiguous(tl.multiple_of(K_block % K, BLOCK_K), BLOCK_K)
-
-         N_block = N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)
-         N_mask = N_block < N
-         N_block = tl.max_contiguous(tl.multiple_of(N_block % N, BLOCK_N), BLOCK_N)
-
-         M_idxs = M_block
-         xt_blk_ptrs = X_ptr + K_block[:, None] * stride_xn + M_idxs[None, :] * stride_xm
-         dy_blk_ptrs = DY_ptr + M_idxs[:, None] * stride_dym + N_block[None, :] * stride_dyk
-         if (Db_ptr is not None) and (K_block_id == 0):
-             _xty_and_bias(
-                 E_idx, start_idx, end_idx,
-                 M_block,
-                 K_block, K_mask, N_block, N_mask,
-                 dy_blk_ptrs, stride_dym,
-                 xt_blk_ptrs, stride_xm,
-                 DW_ptr, stride_dwe, stride_dwk, stride_dwn,
-                 Db_ptr, stride_dbe, stride_dbn,
-                 BLOCK_M, BLOCK_N, BLOCK_K, ACC_TYPE,
-                 allow_tf32, NO_K_MASK, NO_N_MASK,
-                 compute_bias=True
-             )
-         else:
-             _xty_and_bias(
-                 E_idx, start_idx, end_idx,
-                 M_block,
-                 K_block, K_mask, N_block, N_mask,
-                 dy_blk_ptrs, stride_dym,
-                 xt_blk_ptrs, stride_xm,
-                 DW_ptr, stride_dwe, stride_dwk, stride_dwn,
-                 Db_ptr, stride_dbe, stride_dbn,
-                 BLOCK_M, BLOCK_N, BLOCK_K, ACC_TYPE,
-                 allow_tf32, NO_K_MASK, NO_N_MASK,
-                 compute_bias=False
-             )
-
-
- @triton.jit
- def _xty_and_bias(
-     E_idx, start_idx, end_idx,
-     M_block,
-     K_block, K_mask, N_block, N_mask,
-     dy_blk_ptrs, stride_dym,
-     xt_blk_ptrs, stride_xm,
-     DW_ptr, stride_dwe, stride_dwk, stride_dwn,
-     Db_ptr, stride_dbe, stride_dbn,
-     BLOCK_M, BLOCK_N, BLOCK_K, ACC_TYPE,
-     allow_tf32, NO_K_MASK, NO_N_MASK,
-     compute_bias: tl.constexpr
- ):
-
-     if compute_bias:
-         db_acc = tl.zeros((BLOCK_N,), dtype=ACC_TYPE)
-     else:
-         db_acc = None
-
-     acc = tl.zeros((BLOCK_K, BLOCK_N), dtype=ACC_TYPE)
-     iters = tl.cdiv(end_idx - start_idx, BLOCK_M)
-     for i in range(0, iters):
-         M_mask = (i * BLOCK_M + M_block) < end_idx
-         if NO_K_MASK:
-             xt = tl.load(xt_blk_ptrs, mask=M_mask[None, :])
-         else:
-             xt = tl.load(xt_blk_ptrs, mask=K_mask[:, None] & M_mask[None, :])
-         if NO_N_MASK:
-             dy = tl.load(dy_blk_ptrs, mask=M_mask[:, None])
-         else:
-             dy = tl.load(dy_blk_ptrs, mask=M_mask[:, None] & N_mask[None, :])
-
-         acc += tl.dot(xt, dy, out_dtype=ACC_TYPE, allow_tf32=allow_tf32)
-
-         xt_blk_ptrs += BLOCK_M * stride_xm
-         dy_blk_ptrs += BLOCK_M * stride_dym
-
-         if compute_bias:
-             db_acc += tl.sum(dy, axis=0)
-
-     DW_blk_ptrs = DW_ptr + E_idx * stride_dwe + K_block[:, None] * stride_dwk + N_block[None, :] * stride_dwn
-     acc = acc.to(DW_blk_ptrs.dtype.element_ty)
-     tl.store(DW_blk_ptrs, acc, mask=K_mask[:, None] & N_mask[None, :])
-     if compute_bias:
-         Db_blk_ptrs = Db_ptr + E_idx * stride_dbe + N_block * stride_dbn
-         tl.store(Db_blk_ptrs, db_acc, mask=N_mask)
-
-
-
- def _config_grouping():
-     return [
-         triton.Config({'BLOCK_N': 256, 'BLOCK_K': 128}, num_stages=4, num_warps=4),
-         # triton.Config({'BLOCK_N': 128, 'BLOCK_K': 64}, num_stages=4, num_warps=4),
-         # triton.Config({'BLOCK_N': 64, 'BLOCK_K': 32}, num_stages=4, num_warps=4),
-     ]
-
- def group(A, sorted_expert_idxs, coeff=None, fan_out=1, out=None):
-     N = sorted_expert_idxs.size(0)
-     K = A.size(1)
-     assert A.size(0) * fan_out == N
-     if out is not None:
-         Y = out
-     else:
-         Y = torch.empty((N, K), dtype=A.dtype, device=A.device)
-     group_compileable(A, K, N, Y, coeff, coeff is not None, fan_out, sorted_expert_idxs)
-     return Y
-
-
- @torch.library.custom_op("scattermoe::group", mutates_args={"Y"})
- def group_compileable(
-     A: torch.Tensor,
-     K: int,
-     N: int,
-     Y: torch.Tensor,
-     coeff: torch.Tensor, has_coeff: bool,
-     fan_out: int,
-     sorted_expert_idxs: torch.Tensor) -> None:
-     def grid(META):
-         grid_num = (triton.cdiv(META['N'], META['BLOCK_N']),)
-         return grid_num
-     _group[grid](
-         # A_ptr, stride_an, stride_ai,
-         A, A.stride(0), A.stride(1), has_coeff, coeff, fan_out,
-         # Y_ptr, stride_yn, stride_yk,
-         Y, Y.stride(0), Y.stride(1),
-         # grouped_idx_ptr,
-         sorted_expert_idxs,
-         # N: tl.constexpr, K: tl.constexpr,
-         N, K
-     )
-
-
- @triton.autotune(configs=_config_grouping(), key=['K'])
- @triton.heuristics({
-     "NO_K_MASK": lambda args: (args['K'] % args['BLOCK_K']) == 0
- })
- @triton.jit
- def _group(
-     src_ptr, stride_sn, stride_sk, has_coeff: tl.constexpr, coeff_ptr, FAN_OUT: tl.constexpr,
-     tgt_ptr, stride_tn, stride_ti,
-     grouped_idx_ptr,
-     N, K: tl.constexpr,
-     BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
-     NO_K_MASK: tl.constexpr
- ):
-     pid = tl.program_id(axis=0)
-
-     N_block_id = pid
-     N_blk = N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)
-     N_mask = N_blk < N
-     N_blk = tl.max_contiguous(tl.multiple_of(N_blk % N, BLOCK_N), BLOCK_N)
-     N_idx = tl.load(grouped_idx_ptr + N_blk, mask=N_mask, other=0)
-
-     K_blk = tl.arange(0, BLOCK_K)
-     src_blk_ptrs = src_ptr + (N_idx // FAN_OUT)[:, None] * stride_sn + K_blk[None, :] * stride_sk
-     tgt_blk_ptrs = tgt_ptr + N_blk[:, None] * stride_tn + K_blk[None, :] * stride_ti
-
-     if has_coeff:
-         c = tl.load(coeff_ptr + N_idx, mask=N_mask)[:, None]
-
-     iters = tl.cdiv(K, BLOCK_K)
-     for i in range(0, iters):
-         if NO_K_MASK or i < iters - 1:
-             block = tl.load(src_blk_ptrs, mask=N_mask[:, None])
-             if has_coeff:
-                 block *= c
-             tl.store(tgt_blk_ptrs, block, mask=N_mask[:, None])
-
-         else:
-             K_mask = (i * BLOCK_K + K_blk) < K
-             mask = N_mask[:, None] & K_mask[None, :]
-             block = tl.load(src_blk_ptrs, mask=mask)
-             if has_coeff:
-                 block *= c
-             tl.store(tgt_blk_ptrs, block, mask=mask)
-         src_blk_ptrs += BLOCK_K * stride_sk
-         tgt_blk_ptrs += BLOCK_K * stride_ti
build/torch-rocm/kernels/single.py DELETED
@@ -1,59 +0,0 @@
- import torch
- import triton
- import triton.language as tl
-
- @triton.jit
- def _single2scatter(
-     X_ptr, stride_xm, stride_xk,
-     W_ptr, stride_we, stride_wk, stride_wn,
-     Y_ptr, stride_ym, stride_yn,
-     expert_idxs_ptr,
-     FAN_OUT: tl.constexpr,
-     K: tl.constexpr, N: tl.constexpr, E: tl.constexpr,
-     BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
-     ACC_TYPE: tl.constexpr,
- ):
-     pid0 = tl.program_id(axis=0)
-     pid1 = tl.program_id(axis=1)
-
-     N_block_id = pid0
-     if FAN_OUT == 1:
-         in_idx = pid1
-     else:
-         in_idx = 0
-     out_idx = pid1
-
-     K_block = tl.arange(0, BLOCK_K)
-     N_block = tl.max_contiguous(tl.multiple_of((N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)) % N, BLOCK_N), BLOCK_N)
-     E_idx = tl.load(expert_idxs_ptr + pid1)
-     X_blk_ptrs = X_ptr + in_idx * stride_xm + K_block[:, None] * stride_xk
-     W_blk_ptrs = W_ptr + E_idx * stride_we + K_block[:, None] * stride_wk + N_block[None, :] * stride_wn
-     acc = tl.zeros((1, BLOCK_N), dtype=ACC_TYPE)
-     for K_block_id in range(0, tl.cdiv(K, BLOCK_K)):
-         x = tl.load(X_blk_ptrs)
-         w = tl.load(W_blk_ptrs)
-         acc += tl.sum(x * w, axis=0)[None, :]
-         X_blk_ptrs += BLOCK_K * stride_xk
-         W_blk_ptrs += BLOCK_K * stride_wk
-     Y_blk_ptrs = Y_ptr + out_idx * stride_ym + N_block[None, :] * stride_yn
-     tl.store(Y_blk_ptrs, acc)
-
- def single2scatter(X, W, expert_idxs):
-     E, xdim, ydim = W.size()
-     k = expert_idxs.size(1)
-     assert X.size(0) == k or X.size(0) == 1
-     Y = torch.empty((k, ydim), device=X.device, dtype=X.dtype)
-     BLOCK_N = 128
-     BLOCK_K = 128
-     grid = ydim // BLOCK_N, k
-     _single2scatter[grid](
-         X, X.stride(0), X.stride(1),
-         W, W.stride(0), W.stride(1), W.stride(2),
-         Y, Y.stride(0), Y.stride(1),
-         expert_idxs,
-         FAN_OUT=Y.size(0) // X.size(0),
-         K=xdim, N=ydim, E=E,
-         BLOCK_N=BLOCK_N, BLOCK_K=BLOCK_K,
-         ACC_TYPE=tl.float32
-     )
-     return Y
build/torch-rocm/layers.py DELETED
@@ -1,52 +0,0 @@
- import torch
- from torch.nn import functional as F
- from torch import nn
-
- from . import parallel_linear, flatten_sort_count
-
- class ScatterMoEGatedMLP(nn.Module):
-     def forward(self, layer_input):
-         """
-         Forward pass of the mixture of experts layer.
-
-         Args:
-             layer_input (Tensor):
-                 Input tensor.
-
-         Returns:
-             Tensor:
-                 Output tensor.
-             Tensor:
-                 Router logits.
-         """
-         bsz, length, emb_size = layer_input.size()
-         layer_input = layer_input.reshape(-1, emb_size)
-         # compute the top_k routing decision
-         router_logits = self.router.layer(layer_input)
-         routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
-         routing_weights, selected_experts = torch.topk(routing_weights, self.router.top_k, dim=-1)
-         routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
-         routing_weights = routing_weights.to(layer_input.dtype)
-         sorted_expert_idxs, sorted_scattered_idxs, expert_offsets = \
-             flatten_sort_count(selected_experts, num_experts=self.router.num_experts)
-
-         # compute experts
-         gates, h = parallel_linear(
-             layer_input, self.input_linear.weight.transpose(2, 1),
-             self.router.top_k,
-             sorted_expert_idxs, sorted_scattered_idxs,
-             expert_offsets,
-             grouped_in=False, grouped_out=True,
-         ).chunk(2, dim=-1)
-         h = self.activation(gates) * h
-         layer_output = parallel_linear(
-             h, self.output_linear.weight.transpose(2, 1),
-             1,
-             sorted_expert_idxs, sorted_scattered_idxs,
-             expert_offsets,
-             grouped_in=True, grouped_out=False,
-             gates=routing_weights
-         )
-         layer_output = layer_output.view(bsz, length, emb_size)
-         return layer_output
-
build/torch-rocm/metadata.json DELETED
@@ -1 +0,0 @@
- {"python-depends":[]}
build/torch-rocm/parallel_experts.py DELETED
@@ -1,182 +0,0 @@
- import torch
- import torch.nn as nn
- from . import kernels
- from typing import Optional
-
- @torch.library.custom_op("scattermoe::bincount", mutates_args={})
- def compileable_bincount(x: torch.Tensor, minlength: int) -> torch.Tensor:
-     return x.bincount(minlength=minlength)
-
- @compileable_bincount.register_fake
- def _(x: torch.Tensor, minlength: int) -> torch.Tensor:
-     return torch.empty(minlength, dtype=torch.long, device=x.device)
-
- @torch.compile
- def flatten_sort_count(expert_idxs: torch.Tensor, num_experts: int):
-     with torch.no_grad():
-         flattened_expert_idxs = expert_idxs.flatten()
-         sorted_expert_idxs, sorted_scattered_idxs = torch.sort(flattened_expert_idxs)
-         expert_counts = compileable_bincount(flattened_expert_idxs, minlength=num_experts)
-         expert_offsets = expert_counts.cumsum(-1)
-         return sorted_expert_idxs, sorted_scattered_idxs, expert_offsets
-
-
-
- class ParallelLinear(torch.autograd.Function):
-     @staticmethod
-     def forward(
-         ctx,
-         x: torch.Tensor, expert_weights: torch.Tensor, k: int,
-         sorted_expert_idxs: torch.Tensor, sorted_scattered_idxs: torch.Tensor,
-         expert_offsets: torch.Tensor,
-         expert_biases: Optional[torch.Tensor]=None,
-         gates: Optional[torch.Tensor]=None,
-         grouped_in: bool =False, grouped_out: bool=False,
-     ):
-         with torch.device(x.device):
-             output = kernels.ops.scatter2scatter(
-                 X=x, W=expert_weights,
-                 b=expert_biases, k=k,
-                 sorted_expert_idxs=sorted_expert_idxs,
-                 sorted_scattered_idxs=sorted_scattered_idxs,
-                 x_grouped=grouped_in, y_grouped=grouped_out
-             )
-             if gates is not None:
-                 output_expanded = output.view(gates.size(0), gates.size(1), output.size(-1))
-                 output = (gates.unsqueeze(1) @ output_expanded).squeeze(1)
-             else:
-                 output_expanded = None
-
-             ctx.save_for_backward(
-                 x, expert_weights,
-                 expert_biases,
-                 sorted_expert_idxs,
-                 sorted_scattered_idxs,
-                 expert_offsets,
-                 gates,
-                 output_expanded
-             )
-             ctx.grouped_in = grouped_in
-             ctx.grouped_out = grouped_out
-             ctx.k = k
-             return output
-     @staticmethod
-     def backward(ctx, grad_out: torch.Tensor):
-         with torch.device(grad_out.device):
-             (x, expert_weights, expert_biases,
-              sorted_expert_idxs,
-              sorted_scattered_idxs,
-              expert_offsets,
-              gates, output_expanded) = ctx.saved_tensors
-             k = ctx.k
-             grouped_in = ctx.grouped_in
-             grouped_out = ctx.grouped_out
-             # print("backward")
-
-             if gates is not None:
-                 # calculate gates gradient
-                 # d_gates = torch.bmm(output_expanded, grad_out[:, :, None]).squeeze(-1)
-                 d_gates = (output_expanded @ grad_out.unsqueeze(-1)).squeeze(-1)
-                 gates_flat = gates.flatten()
-                 gate_fan = gates.size(1)
-                 grouped_grad_out = output_expanded.flatten(0, 1)  # reuse expanded buffer later
-             else:
-                 d_gates = None
-                 gates_flat = None
-                 gate_fan = 1
-                 grouped_grad_out = None
-
-             if grouped_out:
-                 grouped_grad_out = grad_out
-             else:
-                 grouped_grad_out = kernels.ops.group(grad_out, sorted_scattered_idxs,
-                                                      fan_out=gate_fan, coeff=gates_flat,
-                                                      out=grouped_grad_out)
-             if grouped_in:
-                 grouped_x = x
-                 d_expanded_input = None
-             else:
-                 grouped_x = kernels.ops.group(x, sorted_scattered_idxs, fan_out=k)
-                 d_expanded_input = grouped_x
-
-             d_weights, d_biases = kernels.ops.group_bwd_W(
-                 DY=grouped_grad_out, X=grouped_x,
-                 expert_offsets=expert_offsets,
-                 E=expert_weights.size(0),
-                 has_bias=expert_biases is not None
-             )
-
-
-             d_expanded_input = kernels.ops.scatter2scatter(
-                 X=grouped_grad_out, x_grouped=True,
-                 W=expert_weights.permute(0, 2, 1),
-                 sorted_expert_idxs=sorted_expert_idxs,
-                 sorted_scattered_idxs=sorted_scattered_idxs,
-                 k=1,
-                 y_grouped=grouped_in,
-                 out=d_expanded_input  # Reuse grouped_x buffer
-             )
-
-             if k == 1:
-                 d_input = d_expanded_input
-             else:
-                 d_input = d_expanded_input.view(x.size(0), k, d_expanded_input.size(-1)).sum(-2)
-             # print("backward end.")
-             return (
-                 # x, expert_weights,
-                 d_input, d_weights,
-                 # k, sorted_expert_idxs, sorted_scattered_idxs, expert_offsets,
-                 None, None, None, None,
-                 # bias, gates
-                 d_biases, d_gates,
-                 # grouped_in, grouped_out,
-                 None, None
-             )
-
- def parallel_linear(inputs, expert_weights, k,
-                     sorted_expert_idxs, sorted_scattered_idxs,
-                     expert_offsets,
-                     expert_biases=None,
-                     gates=None, grouped_in=False, grouped_out=False):
-     results = ParallelLinear.apply(inputs, expert_weights, k,
-                                    sorted_expert_idxs, sorted_scattered_idxs,
-                                    expert_offsets,
-                                    expert_biases,
-                                    gates, grouped_in, grouped_out)
-     return results
-
- class ParallelExperts(nn.Module):
-     def __init__(self, num_experts, input_size, output_size, bias=False) -> None:
-         super().__init__()
-         self.weight = nn.Parameter(torch.empty(num_experts, output_size, input_size))
-
-         if bias:
-             self.bias = nn.Parameter(torch.empty(num_experts, output_size))
-         else:
-             self.bias = None
-
-         self.num_experts = num_experts
-         self.input_size = input_size
-         self.output_size = output_size
-         self.reset_parameters()
-
-     def extra_repr(self):
-         return 'num_experts={}, input_size={}, output_size={}'.format(
-             self.num_experts, self.input_size, self.output_size)
-
-     def reset_parameters(self) -> None:
-         nn.init.normal_(self.weight, std=0.02)
-         if self.bias is not None:
-             nn.init.zeros_(self.bias)
-
-     def forward(self, inputs, k, sorted_expert_idxs, sorted_scattered_idxs,
-                 expert_offsets,
-                 gates=None, grouped_in=False, grouped_out=False):
-
-         results = parallel_linear(
-             inputs, self.weight.permute(0, 2, 1), k,
-             sorted_expert_idxs, sorted_scattered_idxs, expert_offsets,
-             expert_biases=self.bias,
-             gates=gates, grouped_in=grouped_in, grouped_out=grouped_out
-         )
-         return results
build/torch-rocm/scattermoe/__init__.py DELETED
@@ -1,26 +0,0 @@
- import ctypes
- import sys
-
- import importlib
- from pathlib import Path
- from types import ModuleType
-
- def _import_from_path(file_path: Path) -> ModuleType:
-     # We cannot use the module name as-is: after adding it to `sys.modules`,
-     # it would also be used for other imports. So, we make a module name that
-     # depends on the path for it to be unique, using the hex-encoded hash of
-     # the path.
-     path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
-     module_name = path_hash
-     spec = importlib.util.spec_from_file_location(module_name, file_path)
-     if spec is None:
-         raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
-     module = importlib.util.module_from_spec(spec)
-     if module is None:
-         raise ImportError(f"Cannot load module {module_name} from spec")
-     sys.modules[module_name] = module
-     spec.loader.exec_module(module)  # type: ignore
-     return module
-
-
- globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
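This shim exists so that the same `__init__.py` can be loaded once per build variant: keying `sys.modules` on a hash of the file path keeps each loaded copy distinct. A minimal sketch of the effect, with hypothetical paths:

```python
from pathlib import Path

# Hypothetical paths; each load registers under its own path-hashed name,
# so the two module objects do not alias each other in sys.modules.
mod_a = _import_from_path(Path("build/torch-rocm/__init__.py"))
mod_b = _import_from_path(Path("build/torch-xpu/__init__.py"))
assert mod_a is not mod_b
```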
build/torch-xpu/__init__.py DELETED
@@ -1,13 +0,0 @@
- from .parallel_experts import flatten_sort_count, parallel_linear, ParallelExperts
- from . import parallel_experts
- from . import kernels
- from . import layers
-
- __all__ = [
-     "flatten_sort_count",
-     "parallel_linear",
-     "ParallelExperts",
-     "parallel_experts",
-     "kernels",
-     "layers"
- ]
build/torch-xpu/_ops.py DELETED
@@ -1,8 +0,0 @@
- import torch
- ops = torch.ops._scattermoe_05b9d77
-
- def add_op_namespace_prefix(op_name: str):
-     """
-     Prefix op by namespace.
-     """
-     return f"_scattermoe_05b9d77::{op_name}"
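The helper just qualifies an op name with the build-specific namespace, for example:

```python
add_op_namespace_prefix("scatter2scatter")
# -> "_scattermoe_05b9d77::scatter2scatter"
```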
build/torch-xpu/kernels/__init__.py DELETED
@@ -1,3 +0,0 @@
- from . import ops
-
- __all__ = ["ops"]
build/torch-xpu/kernels/ops.py DELETED
@@ -1,457 +0,0 @@
- import torch
- import triton
- import triton.language as tl
- from typing import Optional
-
- BLOCK_M = 128
- ALLOW_TF32 = True
-
-
-
- @triton.jit
- def _compute_expert_block(
-     E_idx, E_mask,
-     M_in_idx,
-     N_block, N_mask,
-     X_ptr, stride_xm, stride_xk,
-     W_ptr, stride_we, stride_wk, stride_wn,
-     K,
-     acc,
-     no_k_mask,
-     BLOCK_K,
-     allow_tf32=True,
- ):
-
-     K_block = tl.arange(0, BLOCK_K)
-     X_blk_ptrs = X_ptr + M_in_idx[:, None] * stride_xm + K_block[None, :] * stride_xk
-     W_blk_ptrs = W_ptr + K_block[:, None] * stride_wk + N_block[None, :] * stride_wn + E_idx * stride_we
-     iters = tl.cdiv(K, BLOCK_K)
-
-     for K_block_id in range(iters):
-         if no_k_mask:
-             x = tl.load(X_blk_ptrs, mask=E_mask[:, None])
-             w = tl.load(W_blk_ptrs, mask=N_mask[None, :])
-         else:
-             K_mask = (K_block_id * BLOCK_K + K_block) < K
-             x = tl.load(X_blk_ptrs, mask=E_mask[:, None] & K_mask[None, :])
-             w = tl.load(W_blk_ptrs, mask=K_mask[:, None] & N_mask[None, :])
-
-         X_blk_ptrs += BLOCK_K * stride_xk
-         W_blk_ptrs += BLOCK_K * stride_wk
-         acc = tl.dot(x, w, acc, allow_tf32=allow_tf32)
-     return acc
-
-
- def _scatter2scatter_configs():
-     return [
-         triton.Config({'BLOCK_N': 128, 'BLOCK_K': 32}, num_stages=4, num_warps=4),
-     ]
-
- @triton.autotune(configs=_scatter2scatter_configs(), key=['M', 'N', 'K'], )
- @triton.heuristics({
-     "NO_K_MASK": lambda args: (args['K'] % args['BLOCK_K']) == 0,
-     "NO_N_MASK": lambda args: (args['N'] % args['BLOCK_N']) == 0,
- })
- @triton.jit
- def _scatter2scatter(
-     X_ptr, stride_xm: tl.constexpr, stride_xk: tl.constexpr,
-     W_ptr, stride_we, stride_wk: tl.constexpr, stride_wn: tl.constexpr,
-     Y_ptr, stride_ym: tl.constexpr, stride_yn: tl.constexpr,
-     B_ptr, stride_be: tl.constexpr, stride_bn: tl.constexpr,
-     grouped_idx_ptr, expert_idxs_ptr,
-     # block_start_idx_ptr,
-     FAN_OUT: tl.constexpr,
-     M, K: tl.constexpr, N: tl.constexpr, E: tl.constexpr,
-     BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
-     ACC_TYPE: tl.constexpr,
-     # OUT_M,
-     allow_tf32: tl.constexpr,
-     x_grouped: tl.constexpr, y_grouped: tl.constexpr,
-     NO_K_MASK: tl.constexpr, NO_N_MASK: tl.constexpr
- ):
-     pid = tl.program_id(axis=0)
-
-     N_BLOCK_COUNT = tl.cdiv(N, BLOCK_N)
-     M_block_id = pid // N_BLOCK_COUNT
-     N_block_id = pid % N_BLOCK_COUNT
-
-     M_block = M_block_id * BLOCK_M + tl.arange(0, BLOCK_M)
-     N_block = N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)
-     N_mask = N_block < N
-     M_boundary_mask = M_block < (FAN_OUT * M)
-     E_idxs = tl.load(expert_idxs_ptr + M_block, mask=M_boundary_mask, other=E)
-
-     no_k_mask = K % BLOCK_K == 0
-
-     acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
-     E_first_idx = tl.min(E_idxs)
-     E_last_idx = tl.minimum(tl.max(E_idxs), E - 1)
-     M_idx = tl.load(grouped_idx_ptr + M_block, mask=M_boundary_mask).to(tl.int32)
-     for E_idx in range(E_first_idx, E_last_idx + 1):
-         E_mask = E_idxs == E_idx
-         E_M_idx = M_idx
-         if x_grouped:
-             M_in_idx = M_block
-         else:
-             M_in_idx = E_M_idx // FAN_OUT
-         acc = _compute_expert_block(
-             E_idx, E_mask,
-             M_in_idx, N_block, N_mask,
-             X_ptr, stride_xm, stride_xk,
-             W_ptr, stride_we, stride_wk, stride_wn,
-             K,
-             acc,
-             no_k_mask,
-             BLOCK_K,
-             allow_tf32=allow_tf32,
-         )
-
-     if B_ptr is not None:
-         B_blk_ptrs = B_ptr + E_idxs[:, None] * stride_be + N_block[None, :] * stride_bn
-         acc += tl.load(B_blk_ptrs, mask=M_boundary_mask[:, None] & N_mask[None, :])
-
-     if y_grouped:
-         M_out_idx = M_block
-     else:
-         M_out_idx = M_idx
-     Y_blk_ptrs = Y_ptr + (M_out_idx[:, None] * stride_ym + N_block[None, :] * stride_yn)
-     tl.store(Y_blk_ptrs, acc, mask=M_boundary_mask[:, None] & N_mask[None, :])
-
- def scatter2scatter(X, W, sorted_expert_idxs, sorted_scattered_idxs, k,
-                     b=None,
-                     x_grouped=False, y_grouped=False,
-                     out=None):
-     assert sorted_scattered_idxs.size(0) == sorted_expert_idxs.size(0)
-     assert sorted_scattered_idxs.size(0) == X.size(0) * k
-     # Pre-kernel setup
-     y_dim = W.size(-1)
-     L_scattered = sorted_expert_idxs.size(0)
-     if out is None:
-         output = torch.empty((L_scattered, y_dim), device=X.device, dtype=X.dtype)
-     else:
-         assert out.size(0) == L_scattered and out.size(1) == y_dim
-         output = out
-
-     scatter2scatter_compileable(output, W, X, k, sorted_expert_idxs, sorted_scattered_idxs,
-                                 b, x_grouped, y_grouped)
-     return output
-
-
- @torch.library.custom_op("scattermoe::scatter2scatter", mutates_args={"output"})
- def scatter2scatter_compileable(
-         output: torch.Tensor,
-         W: torch.Tensor,
-         X: torch.Tensor,
-         k: int,
-         sorted_expert_idxs: torch.Tensor,
-         sorted_scattered_idxs: torch.Tensor,
-         b: Optional[torch.Tensor],
-         x_grouped: bool, y_grouped: bool) -> None:
-     def grid(META):
-         grid_num = (
-             triton.cdiv(sorted_expert_idxs.size(0), META["BLOCK_M"]) *
-             triton.cdiv(META['N'], META['BLOCK_N']),
-         )
-         return grid_num
-
-     if b is None:
-         b = None
-         stride_be = stride_bk = 0
-     else:
-         stride_be, stride_bk = b.stride()
-
-     _scatter2scatter[grid](
-         # X_ptr, stride_xm, stride_xk,
-         X, X.stride(0), X.stride(1),
-         # W_ptr, stride_we, stride_wk, stride_wn,
-         W, W.stride(0), W.stride(1), W.stride(2),
-         # Y_ptr, stride_ym, stride_yn,
-         output, output.stride(0), output.stride(1),
-         # B_ptr, stride_be, stride_bk
-         b, stride_be, stride_bk,
-         grouped_idx_ptr=sorted_scattered_idxs,
-         expert_idxs_ptr=sorted_expert_idxs,
-         # block_start_idx_ptr=padded_block_idxs,
-         FAN_OUT=k,
-         M=X.size(0),
-         K=X.size(1),
-         N=output.size(1), E=W.size(0),
-         BLOCK_M=BLOCK_M,
-         ACC_TYPE=tl.float32,
-         allow_tf32=ALLOW_TF32,
-         x_grouped=x_grouped, y_grouped=y_grouped,
-     )
-
-
- def _config_XtY():
-     return [
-         triton.Config({'BLOCK_N': 128, 'BLOCK_K': 128, 'BLOCK_M': 32}, num_stages=4, num_warps=4),
-     ]
-
- def group_bwd_W(DY, X, expert_offsets, E, has_bias=False):
-     DWt = torch.zeros((E, DY.size(-1), X.size(-1)), device=DY.device, dtype=DY.dtype)
-     DW = DWt.permute(0, 2, 1)
-     if has_bias:
-         Db = torch.zeros((E, DY.size(-1)), device=DY.device, dtype=DY.dtype)
-     else:
-         Db = None
-     groupXtY_compileable(E, DW, Db, DY, X, expert_offsets)
-     return DW, Db
-
-
- @torch.library.custom_op("scattermoe::groupXtY", mutates_args={"DW"})
- def groupXtY_compileable(
-         E: int,
-         DW: torch.Tensor,
-         Db: Optional[torch.Tensor],
-         DY: torch.Tensor,
-         X: torch.Tensor,
-         expert_offsets: torch.Tensor) -> None:
-     def grid(META):
-         grid = (
-             E * triton.cdiv(META['K'], META['BLOCK_K']),
-             triton.cdiv(META['N'], META['BLOCK_N']),
-         )
-         return grid
-
-     if Db is None:
-         stride_dbe = 0
-         stride_dbn = 0
-     else:
-         stride_dbe, stride_dbn = Db.stride()
-
-     _groupXtY[grid](
-         # DY_ptr, stride_dym, stride_dyk,
-         DY, DY.stride(0), DY.stride(1),
-         # X_ptr, stride_xm, stride_xn,
-         X, X.stride(0), X.stride(1),
-         # DW_ptr, stride_dwe, stride_dwk, stride_dwn,
-         DW, DW.stride(0), DW.stride(1), DW.stride(2),
-         # Db_ptr, stride_dbe, stride_dbn,
-         Db, stride_dbe, stride_dbn,
-         # expert_offsets_ptr,
-         expert_offsets,
-         # K: tl.constexpr, N: tl.constexpr,
-         M=DY.size(0), N=DY.size(-1), K=X.size(-1),
-         # ACC_TYPE: tl.constexpr,
-         ACC_TYPE=tl.float32,
-         allow_tf32=ALLOW_TF32
-     )
-
-
- @triton.autotune(configs=_config_XtY(), key=['M', 'N', 'K'], )
- @triton.heuristics({
-     "NO_K_MASK": lambda args: (args['K'] % args['BLOCK_K']) == 0,
-     "NO_N_MASK": lambda args: (args['N'] % args['BLOCK_N']) == 0,
- })
- @triton.jit
- def _groupXtY(
-     DY_ptr, stride_dym, stride_dyk,
-     X_ptr, stride_xm, stride_xn,
-     DW_ptr, stride_dwe, stride_dwk, stride_dwn,
-     Db_ptr, stride_dbe, stride_dbn,
-     expert_offsets_ptr,
-     M, K: tl.constexpr, N: tl.constexpr,
-     BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
-     ACC_TYPE: tl.constexpr,
-     allow_tf32: tl.constexpr,
-     NO_K_MASK: tl.constexpr, NO_N_MASK: tl.constexpr
- ):
-     pid0 = tl.program_id(axis=0)
-     pid1 = tl.program_id(axis=1)
-     num0 = tl.num_programs(0)
-     num1 = tl.num_programs(1)
-     # pid1, pid0 = tl.swizzle2d(pid1, pid0, num1, num0, 128)
-     pid0, pid1 = tl.swizzle2d(pid0, pid1, num0, num1, 4)
-
-     K_BLOCK_COUNT = tl.cdiv(K, BLOCK_K)
-     E_idx = pid0 // K_BLOCK_COUNT
-     K_block_id = pid0 % K_BLOCK_COUNT
-     N_block_id = pid1
-
-     if E_idx == 0:
-         start_idx = 0
-     else:
-         start_idx = tl.load(expert_offsets_ptr + E_idx - 1).to(tl.int32)
-     end_idx = tl.load(expert_offsets_ptr + E_idx).to(tl.int32)
-
-
-     if end_idx > start_idx:
-         M_block = tl.max_contiguous(start_idx + tl.arange(0, BLOCK_M), BLOCK_M)
-
-         K_block = K_block_id * BLOCK_K + tl.arange(0, BLOCK_K)
-         K_mask = K_block < K
-         K_block = tl.max_contiguous(tl.multiple_of(K_block % K, BLOCK_K), BLOCK_K)
-
-         N_block = N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)
-         N_mask = N_block < N
-         N_block = tl.max_contiguous(tl.multiple_of(N_block % N, BLOCK_N), BLOCK_N)
-
-         M_idxs = M_block
-         xt_blk_ptrs = X_ptr + K_block[:, None] * stride_xn + M_idxs[None, :] * stride_xm
-         dy_blk_ptrs = DY_ptr + M_idxs[:, None] * stride_dym + N_block[None, :] * stride_dyk
-         if (Db_ptr is not None) and (K_block_id == 0):
-             _xty_and_bias(
-                 E_idx, start_idx, end_idx,
-                 M_block,
-                 K_block, K_mask, N_block, N_mask,
-                 dy_blk_ptrs, stride_dym,
-                 xt_blk_ptrs, stride_xm,
-                 DW_ptr, stride_dwe, stride_dwk, stride_dwn,
-                 Db_ptr, stride_dbe, stride_dbn,
-                 BLOCK_M, BLOCK_N, BLOCK_K, ACC_TYPE,
-                 allow_tf32, NO_K_MASK, NO_N_MASK,
-                 compute_bias=True
-             )
-         else:
-             _xty_and_bias(
-                 E_idx, start_idx, end_idx,
-                 M_block,
-                 K_block, K_mask, N_block, N_mask,
-                 dy_blk_ptrs, stride_dym,
-                 xt_blk_ptrs, stride_xm,
-                 DW_ptr, stride_dwe, stride_dwk, stride_dwn,
-                 Db_ptr, stride_dbe, stride_dbn,
-                 BLOCK_M, BLOCK_N, BLOCK_K, ACC_TYPE,
-                 allow_tf32, NO_K_MASK, NO_N_MASK,
-                 compute_bias=False
-             )
-
-
- @triton.jit
- def _xty_and_bias(
-     E_idx, start_idx, end_idx,
-     M_block,
-     K_block, K_mask, N_block, N_mask,
-     dy_blk_ptrs, stride_dym,
-     xt_blk_ptrs, stride_xm,
-     DW_ptr, stride_dwe, stride_dwk, stride_dwn,
-     Db_ptr, stride_dbe, stride_dbn,
-     BLOCK_M, BLOCK_N, BLOCK_K, ACC_TYPE,
-     allow_tf32, NO_K_MASK, NO_N_MASK,
-     compute_bias: tl.constexpr
- ):
-
-     if compute_bias:
-         db_acc = tl.zeros((BLOCK_N,), dtype=ACC_TYPE)
-     else:
-         db_acc = None
-
-     acc = tl.zeros((BLOCK_K, BLOCK_N), dtype=ACC_TYPE)
-     iters = tl.cdiv(end_idx - start_idx, BLOCK_M)
-     for i in range(0, iters):
-         M_mask = (i * BLOCK_M + M_block) < end_idx
-         if NO_K_MASK:
-             xt = tl.load(xt_blk_ptrs, mask=M_mask[None, :])
-         else:
-             xt = tl.load(xt_blk_ptrs, mask=K_mask[:, None] & M_mask[None, :])
-         if NO_N_MASK:
-             dy = tl.load(dy_blk_ptrs, mask=M_mask[:, None])
-         else:
-             dy = tl.load(dy_blk_ptrs, mask=M_mask[:, None] & N_mask[None, :])
-
-         acc += tl.dot(xt, dy, out_dtype=ACC_TYPE, allow_tf32=allow_tf32)
-
-         xt_blk_ptrs += BLOCK_M * stride_xm
-         dy_blk_ptrs += BLOCK_M * stride_dym
-
-         if compute_bias:
-             db_acc += tl.sum(dy, axis=0)
-
-     DW_blk_ptrs = DW_ptr + E_idx * stride_dwe + K_block[:, None] * stride_dwk + N_block[None, :] * stride_dwn
-     acc = acc.to(DW_blk_ptrs.dtype.element_ty)
-     tl.store(DW_blk_ptrs, acc, mask=K_mask[:, None] & N_mask[None, :])
-     if compute_bias:
-         Db_blk_ptrs = Db_ptr + E_idx * stride_dbe + N_block * stride_dbn
-         tl.store(Db_blk_ptrs, db_acc, mask=N_mask)
-
-
-
- def _config_grouping():
-     return [
-         triton.Config({'BLOCK_N': 256, 'BLOCK_K': 128}, num_stages=4, num_warps=4),
-         # triton.Config({'BLOCK_N': 128, 'BLOCK_K': 64}, num_stages=4, num_warps=4),
-         # triton.Config({'BLOCK_N': 64, 'BLOCK_K': 32}, num_stages=4, num_warps=4),
-     ]
-
- def group(A, sorted_expert_idxs, coeff=None, fan_out=1, out=None):
-     N = sorted_expert_idxs.size(0)
-     K = A.size(1)
-     assert A.size(0) * fan_out == N
-     if out is not None:
-         Y = out
-     else:
-         Y = torch.empty((N, K), dtype=A.dtype, device=A.device)
-     group_compileable(A, K, N, Y, coeff, coeff is not None, fan_out, sorted_expert_idxs)
-     return Y
-
-
- @torch.library.custom_op("scattermoe::group", mutates_args={"Y"})
- def group_compileable(
-         A: torch.Tensor,
-         K: int,
-         N: int,
-         Y: torch.Tensor,
-         coeff: torch.Tensor, has_coeff: bool,
-         fan_out: int,
-         sorted_expert_idxs: torch.Tensor) -> None:
-     def grid(META):
-         grid_num = (triton.cdiv(META['N'], META['BLOCK_N']),)
-         return grid_num
-     _group[grid](
-         # A_ptr, stride_an, stride_ai,
-         A, A.stride(0), A.stride(1), has_coeff, coeff, fan_out,
-         # Y_ptr, stride_yn, stride_yk,
-         Y, Y.stride(0), Y.stride(1),
-         # grouped_idx_ptr,
-         sorted_expert_idxs,
-         # N: tl.constexpr, K: tl.constexpr,
-         N, K
-     )
-
-
- @triton.autotune(configs=_config_grouping(), key=['K'])
- @triton.heuristics({
-     "NO_K_MASK": lambda args: (args['K'] % args['BLOCK_K']) == 0
- })
- @triton.jit
- def _group(
-     src_ptr, stride_sn, stride_sk, has_coeff: tl.constexpr, coeff_ptr, FAN_OUT: tl.constexpr,
-     tgt_ptr, stride_tn, stride_ti,
-     grouped_idx_ptr,
-     N, K: tl.constexpr,
-     BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
-     NO_K_MASK: tl.constexpr
- ):
-     pid = tl.program_id(axis=0)
-
-     N_block_id = pid
-     N_blk = N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)
-     N_mask = N_blk < N
-     N_blk = tl.max_contiguous(tl.multiple_of(N_blk % N, BLOCK_N), BLOCK_N)
-     N_idx = tl.load(grouped_idx_ptr + N_blk, mask=N_mask, other=0)
-
-     K_blk = tl.arange(0, BLOCK_K)
-     src_blk_ptrs = src_ptr + (N_idx // FAN_OUT)[:, None] * stride_sn + K_blk[None, :] * stride_sk
-     tgt_blk_ptrs = tgt_ptr + N_blk[:, None] * stride_tn + K_blk[None, :] * stride_ti
-
-     if has_coeff:
-         c = tl.load(coeff_ptr + N_idx, mask=N_mask)[:, None]
-
-     iters = tl.cdiv(K, BLOCK_K)
-     for i in range(0, iters):
-         if NO_K_MASK or i < iters - 1:
-             block = tl.load(src_blk_ptrs, mask=N_mask[:, None])
-             if has_coeff:
-                 block *= c
-             tl.store(tgt_blk_ptrs, block, mask=N_mask[:, None])
-
-         else:
-             K_mask = (i * BLOCK_K + K_blk) < K
-             mask = N_mask[:, None] & K_mask[None, :]
-             block = tl.load(src_blk_ptrs, mask=mask)
-             if has_coeff:
-                 block *= c
-             tl.store(tgt_blk_ptrs, block, mask=mask)
-         src_blk_ptrs += BLOCK_K * stride_sk
-         tgt_blk_ptrs += BLOCK_K * stride_ti
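The shape contract of the three host-side entry points above, as a small sketch (the sizes and the XPU device are assumptions for illustration):

```python
import torch

n, k, e, d_in, d_out = 8, 2, 4, 16, 32                    # tokens, top-k, experts, dims
X = torch.randn(n, d_in, device="xpu")
W = torch.randn(e, d_in, d_out, device="xpu")
flat_idxs = torch.randint(0, e, (n * k,), device="xpu")
sorted_expert_idxs, sorted_scattered_idxs = torch.sort(flat_idxs)
expert_offsets = torch.bincount(sorted_expert_idxs, minlength=e).cumsum(-1)

Y = scatter2scatter(X, W, sorted_expert_idxs, sorted_scattered_idxs, k)
assert Y.shape == (n * k, d_out)                          # one row per (token, expert) pair

Xg = group(X, sorted_scattered_idxs, fan_out=k)           # expert-contiguous copy of X
DW, Db = group_bwd_W(DY=Y, X=Xg, expert_offsets=expert_offsets, E=e, has_bias=False)
assert DW.shape == (e, d_in, d_out) and Db is None
```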
build/torch-xpu/kernels/single.py DELETED
@@ -1,59 +0,0 @@
- import torch
- import triton
- import triton.language as tl
-
- @triton.jit
- def _single2scatter(
-     X_ptr, stride_xm, stride_xk,
-     W_ptr, stride_we, stride_wk, stride_wn,
-     Y_ptr, stride_ym, stride_yn,
-     expert_idxs_ptr,
-     FAN_OUT: tl.constexpr,
-     K: tl.constexpr, N: tl.constexpr, E: tl.constexpr,
-     BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
-     ACC_TYPE: tl.constexpr,
- ):
-     pid0 = tl.program_id(axis=0)
-     pid1 = tl.program_id(axis=1)
-
-     N_block_id = pid0
-     if FAN_OUT == 1:
-         in_idx = pid1
-     else:
-         in_idx = 0
-     out_idx = pid1
-
-     K_block = tl.arange(0, BLOCK_K)
-     N_block = tl.max_contiguous(tl.multiple_of((N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)) % N, BLOCK_N), BLOCK_N)
-     E_idx = tl.load(expert_idxs_ptr + pid1)
-     X_blk_ptrs = X_ptr + in_idx * stride_xm + K_block[:, None] * stride_xk
-     W_blk_ptrs = W_ptr + E_idx * stride_we + K_block[:, None] * stride_wk + N_block[None, :] * stride_wn
-     acc = tl.zeros((1, BLOCK_N), dtype=ACC_TYPE)
-     for K_block_id in range(0, tl.cdiv(K, BLOCK_K)):
-         x = tl.load(X_blk_ptrs)
-         w = tl.load(W_blk_ptrs)
-         acc += tl.sum(x * w, axis=0)[None, :]
-         X_blk_ptrs += BLOCK_K * stride_xk
-         W_blk_ptrs += BLOCK_K * stride_wk
-     Y_blk_ptrs = Y_ptr + out_idx * stride_ym + N_block[None, :] * stride_yn
-     tl.store(Y_blk_ptrs, acc)
-
- def single2scatter(X, W, expert_idxs):
-     E, xdim, ydim = W.size()
-     k = expert_idxs.size(1)
-     assert X.size(0) == k or X.size(0) == 1
-     Y = torch.empty((k, ydim), device=X.device, dtype=X.dtype)
-     BLOCK_N = 128
-     BLOCK_K = 128
-     grid = ydim // BLOCK_N, k
-     _single2scatter[grid](
-         X, X.stride(0), X.stride(1),
-         W, W.stride(0), W.stride(1), W.stride(2),
-         Y, Y.stride(0), Y.stride(1),
-         expert_idxs,
-         FAN_OUT=Y.size(0) // X.size(0),
-         K=xdim, N=ydim, E=E,
-         BLOCK_N=BLOCK_N, BLOCK_K=BLOCK_K,
-         ACC_TYPE=tl.float32
-     )
-     return Y
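`single2scatter` covers the decode-style case of a single input row fanned out to its top-k experts. A call sketch with assumed sizes; note the kernel loads without K/N masks, so `xdim` and `ydim` should be multiples of the 128-wide blocks:

```python
import torch

E, d_in, d_out, k = 4, 128, 256, 2        # multiples of BLOCK_K/BLOCK_N by assumption
X = torch.randn(1, d_in, device="xpu")    # one token (assumed XPU device)
W = torch.randn(E, d_in, d_out, device="xpu")
expert_idxs = torch.randint(0, E, (1, k), device="xpu")

Y = single2scatter(X, W, expert_idxs)     # FAN_OUT = k when X has a single row
assert Y.shape == (k, d_out)              # one output row per selected expert
```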
build/torch-xpu/layers.py DELETED
@@ -1,52 +0,0 @@
- import torch
- from torch.nn import functional as F
- from torch import nn
-
- from . import parallel_linear, flatten_sort_count
-
- class ScatterMoEGatedMLP(nn.Module):
-     def forward(self, layer_input):
-         """
-         Forward pass of the mixture-of-experts layer.
-
-         Args:
-             layer_input (Tensor):
-                 Input tensor.
-
-         Returns:
-             Tensor:
-                 Output tensor.
-         """
-         bsz, length, emb_size = layer_input.size()
-         layer_input = layer_input.reshape(-1, emb_size)
-         # compute the top_k routing decision
-         router_logits = self.router.layer(layer_input)
-         routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
-         routing_weights, selected_experts = torch.topk(routing_weights, self.router.top_k, dim=-1)
-         routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
-         routing_weights = routing_weights.to(layer_input.dtype)
-         sorted_expert_idxs, sorted_scattered_idxs, expert_offsets = \
-             flatten_sort_count(selected_experts, num_experts=self.router.num_experts)
-
-         # compute experts
-         gates, h = parallel_linear(
-             layer_input, self.input_linear.weight.transpose(2, 1),
-             self.router.top_k,
-             sorted_expert_idxs, sorted_scattered_idxs,
-             expert_offsets,
-             grouped_in=False, grouped_out=True,
-         ).chunk(2, dim=-1)
-         h = self.activation(gates) * h
-         layer_output = parallel_linear(
-             h, self.output_linear.weight.transpose(2, 1),
-             1,
-             sorted_expert_idxs, sorted_scattered_idxs,
-             expert_offsets,
-             grouped_in=True, grouped_out=False,
-             gates=routing_weights
-         )
-         layer_output = layer_output.view(bsz, length, emb_size)
-         return layer_output
-
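`ScatterMoEGatedMLP` defines no `__init__`; it expects the subclass (or owner) to provide `self.router` (with `.layer`, `.top_k`, `.num_experts`), `self.input_linear` and `self.output_linear` (`ParallelExperts` modules from the hunks in this commit, holding the fused gate+up and the down projections), and `self.activation`. A minimal subclass sketch under those assumptions, with illustrative sizes:

```python
from torch import nn

class Router(nn.Module):
    # Minimal shim exposing the attributes the forward pass reads.
    def __init__(self, emb_size, num_experts, top_k):
        super().__init__()
        self.layer = nn.Linear(emb_size, num_experts, bias=False)
        self.num_experts = num_experts
        self.top_k = top_k

class GatedMLP(ScatterMoEGatedMLP):
    def __init__(self, emb_size=64, hidden=128, num_experts=4, top_k=2):
        super().__init__()
        self.router = Router(emb_size, num_experts, top_k)
        # fused gate+up projection, hence 2 * hidden output features
        self.input_linear = ParallelExperts(num_experts, emb_size, 2 * hidden)
        self.output_linear = ParallelExperts(num_experts, hidden, emb_size)
        self.activation = nn.SiLU()
```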
build/torch-xpu/metadata.json DELETED
@@ -1 +0,0 @@
- {"python-depends":[]}
 
 
build/torch-xpu/parallel_experts.py DELETED
@@ -1,182 +0,0 @@
- import torch
- import torch.nn as nn
- from . import kernels
- from typing import Optional
-
- @torch.library.custom_op("scattermoe::bincount", mutates_args={})
- def compileable_bincount(x: torch.Tensor, minlength: int) -> torch.Tensor:
-     return x.bincount(minlength=minlength)
-
- @compileable_bincount.register_fake
- def _(x: torch.Tensor, minlength: int) -> torch.Tensor:
-     return torch.empty(minlength, dtype=torch.long, device=x.device)
-
- @torch.compile
- def flatten_sort_count(expert_idxs: torch.Tensor, num_experts: int):
-     with torch.no_grad():
-         flattened_expert_idxs = expert_idxs.flatten()
-         sorted_expert_idxs, sorted_scattered_idxs = torch.sort(flattened_expert_idxs)
-         expert_counts = compileable_bincount(flattened_expert_idxs, minlength=num_experts)
-         expert_offsets = expert_counts.cumsum(-1)
-     return sorted_expert_idxs, sorted_scattered_idxs, expert_offsets
-
-
-
- class ParallelLinear(torch.autograd.Function):
-     @staticmethod
-     def forward(
-         ctx,
-         x: torch.Tensor, expert_weights: torch.Tensor, k: int,
-         sorted_expert_idxs: torch.Tensor, sorted_scattered_idxs: torch.Tensor,
-         expert_offsets: torch.Tensor,
-         expert_biases: Optional[torch.Tensor] = None,
-         gates: Optional[torch.Tensor] = None,
-         grouped_in: bool = False, grouped_out: bool = False,
-     ):
-         with torch.device(x.device):
-             output = kernels.ops.scatter2scatter(
-                 X=x, W=expert_weights,
-                 b=expert_biases, k=k,
-                 sorted_expert_idxs=sorted_expert_idxs,
-                 sorted_scattered_idxs=sorted_scattered_idxs,
-                 x_grouped=grouped_in, y_grouped=grouped_out
-             )
-             if gates is not None:
-                 output_expanded = output.view(gates.size(0), gates.size(1), output.size(-1))
-                 output = (gates.unsqueeze(1) @ output_expanded).squeeze(1)
-             else:
-                 output_expanded = None
-
-             ctx.save_for_backward(
-                 x, expert_weights,
-                 expert_biases,
-                 sorted_expert_idxs,
-                 sorted_scattered_idxs,
-                 expert_offsets,
-                 gates,
-                 output_expanded
-             )
-             ctx.grouped_in = grouped_in
-             ctx.grouped_out = grouped_out
-             ctx.k = k
-         return output
-     @staticmethod
-     def backward(ctx, grad_out: torch.Tensor):
-         with torch.device(grad_out.device):
-             (x, expert_weights, expert_biases,
-              sorted_expert_idxs,
-              sorted_scattered_idxs,
-              expert_offsets,
-              gates, output_expanded) = ctx.saved_tensors
-             k = ctx.k
-             grouped_in = ctx.grouped_in
-             grouped_out = ctx.grouped_out
-             # print("backward")
-
-             if gates is not None:
-                 # calculate gates gradient
-                 # d_gates = torch.bmm(output_expanded, grad_out[:, :, None]).squeeze(-1)
-                 d_gates = (output_expanded @ grad_out.unsqueeze(-1)).squeeze(-1)
-                 gates_flat = gates.flatten()
-                 gate_fan = gates.size(1)
-                 grouped_grad_out = output_expanded.flatten(0, 1) # reuse expanded buffer later
-             else:
-                 d_gates = None
-                 gates_flat = None
-                 gate_fan = 1
-                 grouped_grad_out = None
-
-             if grouped_out:
-                 grouped_grad_out = grad_out
-             else:
-                 grouped_grad_out = kernels.ops.group(grad_out, sorted_scattered_idxs,
-                                                      fan_out=gate_fan, coeff=gates_flat,
-                                                      out=grouped_grad_out)
-             if grouped_in:
-                 grouped_x = x
-                 d_expanded_input = None
-             else:
-                 grouped_x = kernels.ops.group(x, sorted_scattered_idxs, fan_out=k)
-                 d_expanded_input = grouped_x
-
-             d_weights, d_biases = kernels.ops.group_bwd_W(
-                 DY=grouped_grad_out, X=grouped_x,
-                 expert_offsets=expert_offsets,
-                 E=expert_weights.size(0),
-                 has_bias=expert_biases is not None
-             )
-
-
-             d_expanded_input = kernels.ops.scatter2scatter(
-                 X=grouped_grad_out, x_grouped=True,
-                 W=expert_weights.permute(0, 2, 1),
-                 sorted_expert_idxs=sorted_expert_idxs,
-                 sorted_scattered_idxs=sorted_scattered_idxs,
-                 k=1,
-                 y_grouped=grouped_in,
-                 out=d_expanded_input # Reuse grouped_x buffer
-             )
-
-             if k == 1:
-                 d_input = d_expanded_input
-             else:
-                 d_input = d_expanded_input.view(x.size(0), k, d_expanded_input.size(-1)).sum(-2)
-             # print("backward end.")
-         return (
-             # x, expert_weights,
-             d_input, d_weights,
-             # k, sorted_expert_idxs, sorted_scattered_idxs, expert_offsets,
-             None, None, None, None,
-             # bias, gates
-             d_biases, d_gates,
-             # grouped_in, grouped_out,
-             None, None
-         )
-
- def parallel_linear(inputs, expert_weights, k,
-                     sorted_expert_idxs, sorted_scattered_idxs,
-                     expert_offsets,
-                     expert_biases=None,
-                     gates=None, grouped_in=False, grouped_out=False):
-     results = ParallelLinear.apply(inputs, expert_weights, k,
-                                    sorted_expert_idxs, sorted_scattered_idxs,
-                                    expert_offsets,
-                                    expert_biases,
-                                    gates, grouped_in, grouped_out)
-     return results
-
- class ParallelExperts(nn.Module):
-     def __init__(self, num_experts, input_size, output_size, bias=False) -> None:
-         super().__init__()
-         self.weight = nn.Parameter(torch.empty(num_experts, output_size, input_size))
-
-         if bias:
-             self.bias = nn.Parameter(torch.empty(num_experts, output_size))
-         else:
-             self.bias = None
-
-         self.num_experts = num_experts
-         self.input_size = input_size
-         self.output_size = output_size
-         self.reset_parameters()
-
-     def extra_repr(self):
-         return 'num_experts={}, input_size={}, output_size={}'.format(
-             self.num_experts, self.input_size, self.output_size)
-
-     def reset_parameters(self) -> None:
-         nn.init.normal_(self.weight, std=0.02)
-         if self.bias is not None:
-             nn.init.zeros_(self.bias)
-
-     def forward(self, inputs, k, sorted_expert_idxs, sorted_scattered_idxs,
-                 expert_offsets,
-                 gates=None, grouped_in=False, grouped_out=False):
-
-         results = parallel_linear(
-             inputs, self.weight.permute(0, 2, 1), k,
-             sorted_expert_idxs, sorted_scattered_idxs, expert_offsets,
-             expert_biases=self.bias,
-             gates=gates, grouped_in=grouped_in, grouped_out=grouped_out
-         )
-         return results
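A tiny worked example of `flatten_sort_count`, the routing-metadata helper the kernels consume (the order within one expert depends on the sort and may vary):

```python
import torch

expert_idxs = torch.tensor([[1, 0], [2, 1]])   # 2 tokens, top-2 expert choices
s_exp, s_scat, offsets = flatten_sort_count(expert_idxs, num_experts=3)
# s_exp    -> tensor([0, 1, 1, 2])        expert ids in ascending order
# s_scat   -> e.g. tensor([1, 0, 3, 2])   positions in the flattened [1, 0, 2, 1]
# offsets  -> tensor([1, 3, 4])           cumulative token count per expert
```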
build/torch-xpu/scattermoe/__init__.py DELETED
@@ -1,26 +0,0 @@
- import ctypes
- import sys
-
- import importlib
- from pathlib import Path
- from types import ModuleType
-
- def _import_from_path(file_path: Path) -> ModuleType:
-     # We cannot use the module name as-is: after adding it to `sys.modules`,
-     # it would also be used for other imports. So, we make a module name that
-     # depends on the path for it to be unique, using the hex-encoded hash of
-     # the path.
-     path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
-     module_name = path_hash
-     spec = importlib.util.spec_from_file_location(module_name, file_path)
-     if spec is None:
-         raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
-     module = importlib.util.module_from_spec(spec)
-     if module is None:
-         raise ImportError(f"Cannot load module {module_name} from spec")
-     sys.modules[module_name] = module
-     spec.loader.exec_module(module)  # type: ignore
-     return module
-
-
- globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))