import sys

import torch

import kernels_benchmark_tools as kbt
from kernels import get_kernel
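
# Load the fused activation kernel from the Hugging Face Hub.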
activation = get_kernel("kernels-community/activation")


def hf_kernels_swiglu(input_tensor):
    """HuggingFace Kernels SwiGLU implementation."""
    # The input packs the gate and up projections along the last dim; silu_and_mul
    # applies SiLU to one half, multiplies by the other, and fills `out`.
    hidden_dim = input_tensor.shape[-1] // 2
    out_shape = input_tensor.shape[:-1] + (hidden_dim,)
    out = torch.empty(out_shape, dtype=input_tensor.dtype, device=input_tensor.device)
    return activation.silu_and_mul(out, input_tensor)
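

# Register this implementation with the benchmark harness.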
kbt.add(
    "hf_kernels_swiglu",
    hf_kernels_swiglu,
    tags={"family": "hf-kernels", "backend": "triton", "compile": "none"},
)


if __name__ == "__main__":
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # The kernel requires CUDA, so skip the benchmark on CPU-only machines.
    if device == "cpu":
        print("HF Kernels SwiGLU requires CUDA - skipping benchmark")
        sys.exit(0)

    dtype = "bfloat16"

    # Benchmark only the first three Llama-shaped activation workloads.
    wl = list(kbt.activation.llama_workloads(dtype=dtype))[:3]

    print(f"Running SwiGLU benchmarks on {device} with {dtype}")
    print(f"Testing {len(wl)} workloads")
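
    # Time each workload against the reference SwiGLU (2 warmup iterations,
    # 5 measured reps) and record the results in activation.jsonl.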
    kbt.run(
        wl,
        jsonl="activation.jsonl",
        reps=5,
        warmup=2,
        gen=kbt.activation.gen_inputs,
        ref=kbt.activation.ref_swiglu,
        cmp=kbt.activation.cmp_allclose,
        profile_trace=True,
    )
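
    # Summarize the recorded results.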
    kbt.summarize(["activation.jsonl"])