|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import torch |
|
|
import sys |
|
|
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark |
|
|
from kernels import get_kernel |
|
|
|
|
|
|
|
|
# Fetch the pinned yamoe kernel package from the Hugging Face Hub; the
# revision pin keeps benchmark results reproducible across runs.
yamoe = get_kernel("drbh/yamoe", revision="v0.2.0")


# Vendored reference MoE module shipped inside the yamoe package.
GptOssExperts = yamoe.vendored.gpt_oss_mlp.GptOssExperts
|
|
|
|
|
|
|
|
def gpt_oss_openai_moe(
    hidden_states,
    router_indices,
    routing_weights,
    gate_up_proj,
    gate_up_proj_bias,
    down_proj,
    down_proj_bias,
):
    """
    GptOssExperts reference implementation of OpenAI-style MoE.

    This is the reference model implementation from the original GPT OSS codebase.

    Args:
        hidden_states: Input activations of shape (B, S, H).
        router_indices: Expert indices selected by the router, passed through
            to ``GptOssExperts.forward``.
        routing_weights: Router weights of shape (B, S, E); flattened to
            (B*S, E) before the forward call.
        gate_up_proj: Fused gate/up projection weights; last dim is
            2 * intermediate_size.
        gate_up_proj_bias: Bias for the fused gate/up projection.
        down_proj: Down projection weights.
        down_proj_bias: Bias for the down projection.

    Returns:
        The MoE output tensor produced by ``GptOssExperts.forward``.
    """
    _, _, H = hidden_states.shape
    E = routing_weights.shape[2]

    # Minimal ad-hoc config object carrying only the attributes that
    # GptOssExperts reads at construction time.
    config = type("Config", (), {})()
    config.hidden_size = H
    config.intermediate_size = gate_up_proj.shape[2] // 2
    config.num_local_experts = E

    model = GptOssExperts(config)

    # Load the caller-provided weights directly into the module parameters
    # (bypasses state_dict loading; tensors are shared, not copied).
    model.gate_up_proj.data = gate_up_proj
    model.gate_up_proj_bias.data = gate_up_proj_bias
    model.down_proj.data = down_proj
    model.down_proj_bias.data = down_proj_bias

    model = model.to(hidden_states.device)

    # Train mode is set deliberately before the forward pass; the original
    # code toggled eval() -> train() -> eval(), and only the train() call in
    # effect at forward time matters, so the redundant eval() calls are
    # removed. NOTE(review): presumably the training flag selects a
    # per-expert dispatch path inside GptOssExperts rather than changing
    # numerics — confirm against the vendored module.
    model.train()

    # GptOssExperts expects routing weights flattened over batch*sequence.
    routing_weights_flat = routing_weights.view(-1, E)

    # Inference only: no autograd graph is needed for benchmarking.
    with torch.no_grad():
        output = model(hidden_states, router_indices, routing_weights_flat)

    return output
|
|
|
|
|
|
|
|
# Register this reference implementation with the shared benchmark harness
# and execute the OpenAI-style MoE workload in float32.
run_benchmark(


    kernel_type=KernelTypeEnum.OPENAI_MOE,


    impl_name="gpt_oss_experts",


    impl_tags={"family": "reference", "backend": "pytorch"},


    impl_func=gpt_oss_openai_moe,


    dtype="float32",


)