# /// script
# requires-python = ">=3.10"
# dependencies = [
# "numpy",
# "torch==2.8.0",
# "kernels",
# "kernels-benchmark-tools",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
from kernels import get_kernel
# Load the layer norm kernel package once at import time via `kernels.get_kernel`.
# The benchmarked wrapper below calls its `dropout_add_ln_fwd` entry point.
# NOTE(review): presumably this downloads/caches the build from the
# "kernels-community/layer-norm" repo on first use — confirm against the
# `kernels` documentation.
layer_norm_kernel = get_kernel("kernels-community/layer-norm")
def hf_kernels_layer_norm(
    x: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor | None,
    eps: float = 1e-5,
) -> torch.Tensor:
    """Run the fused ``dropout_add_ln_fwd`` kernel as a plain layer norm.

    The input is flattened to a 2-D ``[N, D]`` matrix (``D`` being the size of
    the last dimension of ``x``), passed to the kernel with dropout disabled
    (``dropout_p=0.0``), no row/column scaling, no subsetting and
    ``is_rms_norm=False``, and the first tensor the kernel returns is viewed
    back to the shape of ``x``.

    Args:
        x: Input tensor; normalization is over its last dimension. Any number
            of leading dimensions is accepted (the benchmark uses ``[B, S, D]``).
        weight: Passed to the kernel as ``gamma``.
        bias: Passed to the kernel as ``beta``; may be ``None``.
        eps: Passed to the kernel as ``epsilon``.

    Returns:
        The first element of the kernel's result, with the same shape as ``x``.
    """
    hidden = x.shape[-1]
    # The kernel expects [N, D] input. ``reshape`` behaves like ``view`` for
    # inputs that can be viewed as 2-D and falls back to a copy otherwise, so
    # strided (e.g. transposed or sliced) inputs no longer raise here.
    rows = x.reshape(-1, hidden)
    outputs = layer_norm_kernel.dropout_add_ln_fwd(
        input=rows,
        gamma=weight,
        beta=bias,
        rowscale=None,
        colscale=None,
        x0_subset=None,
        z_subset=None,
        dropout_p=0.0,
        epsilon=eps,
        rowscale_const=1.0,
        # Keep the declared output row count consistent with the flattened
        # input. NOTE(review): presumably only consulted when ``z_subset`` is
        # given — confirm against the kernel source.
        z_numrows=rows.shape[0],
        gen=None,
        residual_in_fp32=False,
        is_rms_norm=False,
    )
    # Only the first returned tensor is used; restore the caller's shape.
    return outputs[0].view(x.shape)
# Hand the wrapper to the shared benchmark harness; the tags identify which
# implementation family / kernel repo produced the reported numbers.
run_benchmark(
    kernel_type=KernelTypeEnum.LAYER_NORM,
    impl_name="hf_kernels_layer_norm",
    impl_func=hf_kernels_layer_norm,
    impl_tags={
        "family": "hf-kernels",
        "repo": "kernels-community/layer-norm",
        "op": "layer_norm",
    },
)
Running layer_norm benchmark on cuda with 4 workloads.
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D4096
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 4.95% 198.743us 46.81% 1.878ms 1.878ms 0.000us 0.00% 3.111ms 3.111ms 1
_layer_norm_f8ec252::dropout_add_ln_fwd 1.73% 69.535us 41.21% 1.653ms 550.933us 2.375ms 100.00% 3.111ms 1.037ms 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.376ms 100.07% 2.376ms 2.376ms 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.375ms 100.00% 2.375ms 791.590us 3
Activity Buffer Request 36.98% 1.483ms 36.98% 1.483ms 1.483ms 736.636us 31.02% 736.636us 736.636us 1
aten::view 0.65% 26.132us 0.65% 26.132us 4.355us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 1.22% 49.009us 1.22% 49.009us 5.445us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.22% 8.769us 0.22% 8.769us 2.923us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 1.05% 42.291us 1.05% 42.291us 14.097us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 53.19% 2.133ms 53.19% 2.133ms 2.133ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.011ms
Self CUDA time total: 2.375ms
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D8192
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 1.97% 125.105us 26.88% 1.705ms 1.705ms 0.000us 0.00% 6.375ms 6.375ms 1
_layer_norm_f8ec252::dropout_add_ln_fwd 0.73% 46.170us 24.73% 1.568ms 522.755us 4.809ms 100.00% 6.375ms 2.125ms 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.811ms 100.03% 4.811ms 4.811ms 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.809ms 100.00% 4.809ms 1.603ms 3
Activity Buffer Request 22.98% 1.457ms 22.98% 1.457ms 1.457ms 1.565ms 32.55% 1.565ms 1.565ms 1
aten::view 0.18% 11.529us 0.18% 11.529us 1.922us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 0.46% 29.430us 0.46% 29.430us 3.270us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 4.900us 0.08% 4.900us 1.633us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.48% 30.441us 0.48% 30.441us 10.147us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 73.12% 4.638ms 73.12% 4.638ms 4.638ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 6.343ms
Self CUDA time total: 4.809ms
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D4096
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 1.75% 110.793us 26.94% 1.702ms 1.702ms 0.000us 0.00% 6.331ms 6.331ms 1
_layer_norm_f8ec252::dropout_add_ln_fwd 0.70% 44.248us 25.01% 1.580ms 526.532us 4.779ms 100.00% 6.331ms 2.110ms 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.781ms 100.03% 4.781ms 4.781ms 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.779ms 100.00% 4.779ms 1.593ms 3
Activity Buffer Request 23.30% 1.472ms 23.30% 1.472ms 1.472ms 1.552ms 32.48% 1.552ms 1.552ms 1
aten::view 0.18% 11.190us 0.18% 11.190us 1.865us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 0.49% 30.823us 0.49% 30.823us 3.425us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 4.981us 0.08% 4.981us 1.660us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.44% 28.031us 0.44% 28.031us 9.344us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 73.06% 4.615ms 73.06% 4.615ms 4.615ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 6.317ms
Self CUDA time total: 4.779ms
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D8192
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 1.11% 111.882us 6.14% 619.354us 619.354us 0.000us 0.00% 12.808ms 12.808ms 1
_layer_norm_f8ec252::dropout_add_ln_fwd 0.46% 46.119us 4.92% 496.462us 165.487us 9.625ms 100.00% 12.808ms 4.269ms 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.626ms 100.01% 9.626ms 9.626ms 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.625ms 100.00% 9.625ms 3.208ms 3
Activity Buffer Request 1.38% 138.943us 1.38% 138.943us 138.943us 3.183ms 33.07% 3.183ms 3.183ms 1
aten::view 0.11% 11.010us 0.11% 11.010us 1.835us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 0.31% 31.174us 0.31% 31.174us 3.464us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.05% 5.190us 0.05% 5.190us 1.730us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 2.73% 275.036us 2.73% 275.036us 91.679us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 93.86% 9.465ms 93.86% 9.465ms 9.465ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 10.085ms
Self CUDA time total: 9.625ms
impl wl p50(ms) ok
hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
hf_kernels_layer_norm LN_B16_S2048_D8192 1.66 True
hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True
▶ UV Install Logs
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.30it/s]
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.60it/s]