# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "numpy",
#     "torch==2.8.0",
#     "kernels-benchmark-tools",
#     "kernels",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import sys

import torch

import kernels_benchmark_tools as kbt
from kernels import get_kernel

# Load the activation kernel
activation = get_kernel("kernels-community/activation")


def hf_kernels_swiglu(input_tensor):
    """HuggingFace Kernels SwiGLU implementation"""
    # The kernel splits the last dimension in half, so the output buffer
    # has half the input's hidden size.
    hidden_dim = input_tensor.shape[-1] // 2
    out_shape = input_tensor.shape[:-1] + (hidden_dim,)
    out = torch.empty(out_shape, dtype=input_tensor.dtype, device=input_tensor.device)
    return activation.silu_and_mul(out, input_tensor)


# Register the implementation
kbt.add(
    "hf_kernels_swiglu",
    hf_kernels_swiglu,
    tags={"family": "hf-kernels", "backend": "triton", "compile": "none"},
)

if __name__ == "__main__":
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if device == "cpu":
        print("HF Kernels SwiGLU requires CUDA - skipping benchmark")
        sys.exit(0)

    dtype = "bfloat16"

    # Generate workloads - using a subset for faster testing
    wl = list(kbt.activation.llama_workloads(dtype=dtype))[:3]  # First 3 workloads

    print(f"Running SwiGLU benchmarks on {device} with {dtype}")
    print(f"Testing {len(wl)} workloads")

    # Run benchmark
    kbt.run(
        wl,
        jsonl="activation.jsonl",
        reps=5,
        warmup=2,
        gen=kbt.activation.gen_inputs,
        ref=kbt.activation.ref_swiglu,
        cmp=kbt.activation.cmp_allclose,
        profile_trace=True,
    )

    kbt.summarize(["activation.jsonl"])
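
# For reference, silu_and_mul computes SwiGLU gating: the input's last
# dimension is split in half, SiLU is applied to the first half, and the
# result is multiplied elementwise by the second half. A minimal eager
# PyTorch sketch of the same math (an illustration for comparison, not the
# kernel itself; the benchmark's actual reference is kbt.activation.ref_swiglu):
#
#     def eager_swiglu(x: torch.Tensor) -> torch.Tensor:
#         gate, up = x.chunk(2, dim=-1)
#         return torch.nn.functional.silu(gate) * up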