drbh's picture
drbh HF Staff
Upload folder using huggingface_hub
81fff32 verified
raw
history blame
1.72 kB
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "numpy",
# "torch==2.8.0",
# "kernels-benchmark-tools",
# "kernels",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
import kernels_benchmark_tools as kbt
from kernels import get_kernel
# Load the activation kernel.
# This runs at import time: `get_kernel` is asked for the
# "kernels-community/activation" kernel and the returned object is kept as a
# module-level handle that `hf_kernels_swiglu` calls into (`silu_and_mul`).
# NOTE(review): presumably this fetches the kernel from the Hugging Face Hub on
# first use, so importing this module may need network access - confirm.
activation = get_kernel("kernels-community/activation")
def hf_kernels_swiglu(input_tensor: torch.Tensor) -> torch.Tensor:
    """Run the HuggingFace Kernels SwiGLU implementation on ``input_tensor``.

    An output buffer is allocated with the same dtype and device as the input
    and with the last dimension halved, then handed to
    ``activation.silu_and_mul`` together with the input.

    Args:
        input_tensor: Tensor whose last dimension has an even size.
            Presumably the two halves of that dimension are the gate and the
            value of the SwiGLU - verify against the kernel's documentation.

    Returns:
        Whatever ``activation.silu_and_mul(out, input_tensor)`` returns
        (expected to be the filled ``out`` buffer - TODO confirm).

    Raises:
        ValueError: If the last dimension of ``input_tensor`` is odd.
    """
    last_dim = input_tensor.shape[-1]
    # Floor division would silently drop a column for an odd width and leave
    # the output buffer mismatched with the input, so reject it up front.
    if last_dim % 2 != 0:
        raise ValueError(
            f"hf_kernels_swiglu expects an even last dimension, got {last_dim}"
        )

    hidden_dim = last_dim // 2
    out_shape = input_tensor.shape[:-1] + (hidden_dim,)
    out = torch.empty(
        out_shape,
        dtype=input_tensor.dtype,
        device=input_tensor.device,
    )
    return activation.silu_and_mul(out, input_tensor)
# Register the wrapper with the benchmark tools under the name
# "hf_kernels_swiglu", along with the tags describing this implementation.
# NOTE(review): the backend tag is "triton" - confirm this matches how the
# loaded activation kernel is actually implemented.
kbt.add(
    "hf_kernels_swiglu",
    hf_kernels_swiglu,
    tags=dict(family="hf-kernels", backend="triton", compile="none"),
)
if __name__ == "__main__":
    # Guard clause: without CUDA there is nothing to benchmark, so report it
    # and leave with a success status.
    if not torch.cuda.is_available():
        print("HF Kernels SwiGLU requires CUDA - skipping benchmark")
        sys.exit(0)

    device = "cuda"
    dtype = "bfloat16"

    # Only the first three generated workloads are kept for faster testing.
    every_workload = list(kbt.activation.llama_workloads(dtype=dtype))
    selected_workloads = every_workload[:3]

    print(f"Running SwiGLU benchmarks on {device} with {dtype}")
    print(f"Testing {len(selected_workloads)} workloads")

    # The same JSONL file is written by the run and read back by the summary.
    results_path = "activation.jsonl"
    run_options = {
        "jsonl": results_path,
        "reps": 5,
        "warmup": 2,
        "gen": kbt.activation.gen_inputs,
        "ref": kbt.activation.ref_swiglu,
        "cmp": kbt.activation.cmp_allclose,
        "profile_trace": True,
    }
    kbt.run(selected_workloads, **run_options)

    kbt.summarize([results_path])