File size: 7,833 Bytes

9396a12

import torch
import torch.nn.functional as F
from collections import namedtuple

from kernels.benchmark import Benchmark


def moe_mlp_reference(
    x: torch.Tensor,
    router_weight: torch.Tensor,
    router_bias: torch.Tensor,
    gate_up_proj: torch.Tensor,
    gate_up_proj_bias: torch.Tensor,
    down_proj: torch.Tensor,
    down_proj_bias: torch.Tensor,
    top_k: int = 4,
    alpha: float = 1.702,
    limit: float = 7.0,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Compute a top-k routed mixture-of-experts MLP with plain PyTorch ops.

    Every token (row of ``x`` flattened over all leading dimensions) is scored
    by a linear router, its ``top_k`` highest-scoring experts are selected, and
    the outputs of those experts are summed, weighted by a softmax taken over
    the selected logits only.

    Args:
        x: Input whose last dimension is the hidden size. Any leading shape is
            accepted; the tensor does not need to be contiguous.
        router_weight: Router matrix; its first dimension is the number of
            experts. Passed as the weight of ``F.linear``.
        router_bias: Router bias, passed as the bias of ``F.linear``.
        gate_up_proj: Per-expert up-projection, indexed by expert on dim 0 and
            right-multiplied onto the tokens. Gate and up columns are
            interleaved along the last dimension (even = gate, odd = up).
        gate_up_proj_bias: Per-expert bias added after the up-projection.
        down_proj: Per-expert down-projection, indexed by expert on dim 0 and
            right-multiplied onto the activated values.
        down_proj_bias: Per-expert bias added after the down-projection.
        top_k: Number of experts selected per token.
        alpha: Multiplier applied to the gate inside the sigmoid.
        limit: Clamp bound. The gate is clamped from above only; the up branch
            is clamped to ``[-limit, limit]``.

    Returns:
        A tuple ``(output, routing_weights)``. ``output`` has the same shape as
        ``x``; ``routing_weights`` has shape ``(num_tokens, top_k)``.
    """
    in_shape = x.shape
    num_experts = router_weight.shape[0]
    hidden_size = x.shape[-1]

    # Flatten to (num_tokens, hidden_size). reshape (rather than view) also
    # accepts non-contiguous inputs such as transposed tensors.
    hidden_states = x.reshape(-1, hidden_size)

    # Router: compute logits, keep the top-k experts, and normalise the
    # selected logits among themselves.
    logits = F.linear(hidden_states, router_weight, router_bias)
    expert_weights, router_indices = torch.topk(logits, top_k, dim=-1)
    routing_weights = F.softmax(expert_weights, dim=-1)

    # Accumulator for the weighted expert outputs.
    next_states = torch.zeros_like(hidden_states)

    with torch.no_grad():
        # (num_tokens, top_k, num_experts) -> (num_experts, top_k, num_tokens)
        expert_mask = F.one_hot(router_indices, num_classes=num_experts)
        expert_mask = expert_mask.permute(2, 1, 0)
        # Experts that received at least one token.
        expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()

    # Process each expert that has tokens; indices are pulled out as Python
    # ints in a single transfer.
    for expert_idx in expert_hit.flatten().tolist():
        with torch.no_grad():
            # For this expert: the top-k slot and the token of every assignment.
            top_k_idx, token_idx = torch.where(expert_mask[expert_idx])

        current_state = hidden_states[token_idx]

        # Up projection
        gate_up = (
            current_state @ gate_up_proj[expert_idx] + gate_up_proj_bias[expert_idx]
        )

        # Split the interleaved columns into gate and up
        gate, up = gate_up[..., ::2], gate_up[..., 1::2]

        # Clamp
        gate = gate.clamp(min=None, max=limit)
        up = up.clamp(min=-limit, max=limit)

        # SwiGLU-like activation
        glu = gate * torch.sigmoid(gate * alpha)
        gated_output = (up + 1) * glu

        # Down projection
        out = gated_output @ down_proj[expert_idx] + down_proj_bias[expert_idx]

        # Get the routing weight for this expert at the correct top_k position
        weights_for_expert = routing_weights[token_idx, top_k_idx]
        weighted_output = out * weights_for_expert[:, None]
        next_states.index_add_(0, token_idx, weighted_output.to(hidden_states.dtype))

    return next_states.reshape(in_shape), routing_weights


class MegaBlocksMoeBenchmark(Benchmark):
    """Benchmark cases for the kernel's ``MegaBlocksMoeMLP`` layer.

    Two cases are defined, ``base`` and ``large``. Each one builds random
    router and expert tensors plus an input, runs the kernel layer on them in
    ``benchmark_*``, and returns the output of ``moe_mlp_reference`` on the same
    tensors from ``verify_*``.

    ``self.device`` and ``self.kernel`` are read here but not defined in this
    class; presumably they are provided by ``Benchmark``.
    """

    seed: int = 42

    def _prepare(self, batch: int, seq: int, capacity_factor=None) -> None:
        """Create weights, input and the kernel model for one benchmark case.

        Args:
            batch: Size of the second input dimension.
            seq: Size of the first input dimension.
            capacity_factor: When not ``None``, an extra ``capacity_factor``
                field with this value is added to the experts tuple; when
                ``None`` the tuple has no such field.
        """
        ne, hs, isz = 128, 1152, 3072
        dev = self.device
        f32 = torch.float32

        # Router
        self.router_weight = torch.randn(ne, hs, device=dev, dtype=f32)
        torch.nn.init.kaiming_uniform_(self.router_weight)
        self.router_bias = torch.zeros(ne, device=dev, dtype=f32)

        # Expert weights
        self.gate_up_proj = torch.randn(ne, hs, isz, device=dev, dtype=f32) * 0.02
        self.gate_up_proj_bias = torch.zeros(ne, isz, device=dev, dtype=f32)
        self.down_proj = torch.randn(ne, isz // 2, hs, device=dev, dtype=f32) * 0.02
        self.down_proj_bias = torch.zeros(ne, hs, device=dev, dtype=f32)

        # Input
        self.x = torch.randn(seq, batch, hs, device=dev, dtype=f32) * 0.1

        # Setup the model; the router gets copies of the tensors above so the
        # reference in verify_* sees the same values.
        self.model = self.kernel.layers.MegaBlocksMoeMLP()
        self.model.router = torch.nn.Linear(hs, ne, device=dev)
        self.model.router.weight.data = self.router_weight.clone()
        self.model.router.bias.data = self.router_bias.clone()

        # The experts are handed over as a plain namedtuple; field order
        # follows the insertion order of this dict.
        expert_fields = {
            "gate_up_proj": torch.nn.Parameter(self.gate_up_proj.clone()),
            "gate_up_proj_bias": torch.nn.Parameter(self.gate_up_proj_bias.clone()),
            "down_proj": torch.nn.Parameter(self.down_proj.clone()),
            "down_proj_bias": torch.nn.Parameter(self.down_proj_bias.clone()),
            "hidden_size": hs,
            "num_experts": ne,
        }
        if capacity_factor is not None:
            expert_fields["capacity_factor"] = capacity_factor
        Experts = namedtuple("Experts", list(expert_fields))
        self.model.experts = Experts(**expert_fields)

        self.out = torch.empty(seq, batch, hs, device=dev, dtype=f32)

    def _reference_output(self) -> torch.Tensor:
        """Return the ``moe_mlp_reference`` output for the current tensors."""
        ref_out, _ = moe_mlp_reference(
            self.x,
            self.router_weight,
            self.router_bias,
            self.gate_up_proj,
            self.gate_up_proj_bias,
            self.down_proj,
            self.down_proj_bias,
            top_k=4,
        )
        return ref_out

    def setup(self):
        """Prepare the base case (batch 8, sequence length 1)."""
        # Config matching readme_example.py
        self._prepare(batch=8, seq=1)

    def benchmark_base(self):
        """Run the kernel model on the base input."""
        self.out, self.expert_weights = self.model(self.x)

    def verify_base(self) -> torch.Tensor:
        """Return the reference output for the base case."""
        return self._reference_output()

    def setup_large(self):
        """Prepare the larger case (batch 32, sequence length 16)."""
        # Larger config with more tokens.
        # Higher capacity to avoid token dropping.
        self._prepare(batch=32, seq=16, capacity_factor=4.0)

    def benchmark_large(self):
        """Run the kernel model on the large input."""
        self.out, self.expert_weights = self.model(self.x)

    def verify_large(self) -> torch.Tensor:
        """Return the reference output for the large case."""
        return self._reference_output()