# /// script
# requires-python = ">=3.10"
# dependencies = [
# "numpy",
# "torch==2.8.0",
# "kernels",
# "kernels-benchmark-tools",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
from kernels import get_kernel
# Load the layer norm kernel from the "kernels-community/layer-norm" repo.
# This runs once at import time; the returned object exposes the
# `dropout_add_ln_fwd` entry point used by `hf_kernels_layer_norm` below.
# NOTE(review): presumably this fetches the kernel from the Hugging Face Hub
# on first use — confirm against the `kernels` documentation.
layer_norm_kernel = get_kernel("kernels-community/layer-norm")
def hf_kernels_layer_norm(x, weight, bias, eps: float = 1e-5):
    """Run the Hub ``layer-norm`` kernel's forward pass on a 3-D input.

    ``x`` is flattened to ``[B * S, D]`` rows before the kernel call and the
    result is restored to ``(B, S, D)`` afterwards.

    Args:
        x: Tensor that unpacks as ``(B, S, D)``.
        weight: Forwarded to the kernel as ``gamma``.
        bias: Forwarded to the kernel as ``beta``.
            NOTE(review): presumably optional (``None`` allowed) — confirm
            against the kernel's signature.
        eps: Forwarded to the kernel as ``epsilon``.

    Returns:
        The first element of the kernel's result, reshaped to ``(B, S, D)``.

    Raises:
        ValueError: If ``x.shape`` does not unpack into exactly three dims.
    """
    B, S, D = x.shape

    # The kernel expects [N, D] input. `reshape` is used instead of `view` so
    # inputs whose strides do not allow a zero-copy flatten (e.g. transposed
    # tensors) are handled too; a copy is made only in that case.
    rows = x.reshape(-1, D)

    # Every optional feature of the fused entry point is switched off
    # (dropout_p=0.0, no row/col scales, no subsets, is_rms_norm=False).
    result = layer_norm_kernel.dropout_add_ln_fwd(
        input=rows,
        gamma=weight,
        beta=bias,
        rowscale=None,
        colscale=None,
        x0_subset=None,
        z_subset=None,
        dropout_p=0.0,
        epsilon=eps,
        rowscale_const=1.0,
        z_numrows=S,
        gen=None,
        residual_in_fp32=False,
        is_rms_norm=False,
    )

    # Only the first returned element is used as the output.
    return result[0].reshape(B, S, D)
# Hand the wrapper to the shared benchmark harness as the LAYER_NORM
# implementation named "hf_kernels_layer_norm", with its identifying tags.
run_benchmark(
    impl_func=hf_kernels_layer_norm,
    impl_name="hf_kernels_layer_norm",
    impl_tags={
        "family": "hf-kernels",
        "repo": "kernels-community/layer-norm",
        "op": "layer_norm",
    },
    kernel_type=KernelTypeEnum.LAYER_NORM,
)
Running layer_norm benchmark on cuda with 4 workloads.
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D4096
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 4.17% 177.304us 48.13% 2.048ms 2.048ms 0.000us 0.00% 3.167ms 3.167ms 1
_layer_norm_f8ec252::dropout_add_ln_fwd 1.47% 62.693us 43.45% 1.849ms 616.229us 2.429ms 100.00% 3.167ms 1.056ms 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.430ms 100.06% 2.430ms 2.430ms 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.429ms 100.00% 2.429ms 809.553us 3
Activity Buffer Request 39.70% 1.689ms 39.70% 1.689ms 1.689ms 738.629us 30.41% 738.629us 738.629us 1
aten::view 0.51% 21.739us 0.51% 21.739us 3.623us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 1.04% 44.400us 1.04% 44.400us 4.933us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.22% 9.310us 0.22% 9.310us 3.103us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 1.01% 43.131us 1.01% 43.131us 14.377us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 51.87% 2.207ms 51.87% 2.207ms 2.207ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.255ms
Self CUDA time total: 2.429ms
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D8192
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 2.14% 140.133us 29.32% 1.923ms 1.923ms 0.000us 0.00% 6.388ms 6.388ms 1
_layer_norm_f8ec252::dropout_add_ln_fwd 0.69% 45.053us 27.01% 1.772ms 590.648us 4.807ms 100.00% 6.388ms 2.129ms 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.808ms 100.03% 4.808ms 4.808ms 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.807ms 100.00% 4.807ms 1.602ms 3
Activity Buffer Request 25.34% 1.663ms 25.34% 1.663ms 1.663ms 1.581ms 32.89% 1.581ms 1.581ms 1
aten::view 0.17% 11.390us 0.17% 11.390us 1.898us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 0.45% 29.620us 0.45% 29.620us 3.291us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.07% 4.820us 0.07% 4.820us 1.607us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.46% 29.860us 0.46% 29.860us 9.953us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 70.68% 4.637ms 70.68% 4.637ms 4.637ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 6.560ms
Self CUDA time total: 4.807ms
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D4096
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 1.98% 129.253us 29.33% 1.919ms 1.919ms 0.000us 0.00% 6.330ms 6.330ms 1
_layer_norm_f8ec252::dropout_add_ln_fwd 0.71% 46.780us 27.18% 1.779ms 592.854us 4.774ms 100.00% 6.330ms 2.110ms 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.775ms 100.03% 4.775ms 4.775ms 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.774ms 100.00% 4.774ms 1.591ms 3
Activity Buffer Request 25.49% 1.668ms 25.49% 1.668ms 1.668ms 1.556ms 32.59% 1.556ms 1.556ms 1
aten::view 0.17% 11.271us 0.17% 11.271us 1.879us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 0.45% 29.221us 0.45% 29.221us 3.247us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 4.980us 0.08% 4.980us 1.660us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.45% 29.470us 0.45% 29.470us 9.823us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 70.67% 4.624ms 70.67% 4.624ms 4.624ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 6.543ms
Self CUDA time total: 4.774ms
======================================================================
PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D8192
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_layer_norm 1.22% 142.314us 18.53% 2.155ms 2.155ms 0.000us 0.00% 12.836ms 12.836ms 1
_layer_norm_f8ec252::dropout_add_ln_fwd 0.38% 44.492us 17.20% 2.000ms 666.802us 9.636ms 100.00% 12.836ms 4.279ms 3
hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.637ms 100.02% 9.637ms 9.637ms 1
void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.636ms 100.00% 9.636ms 3.212ms 3
Activity Buffer Request 14.57% 1.694ms 14.57% 1.694ms 1.694ms 3.200ms 33.21% 3.200ms 3.200ms 1
aten::view 0.10% 12.130us 0.10% 12.130us 2.022us 0.000us 0.00% 0.000us 0.000us 6
aten::empty 0.25% 29.499us 0.25% 29.499us 3.278us 0.000us 0.00% 0.000us 0.000us 9
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.04% 4.820us 0.04% 4.820us 1.607us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 1.96% 227.814us 1.96% 227.814us 75.938us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 81.47% 9.472ms 81.47% 9.472ms 9.472ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 11.627ms
Self CUDA time total: 9.636ms
impl wl p50(ms) ok
hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
hf_kernels_layer_norm LN_B16_S2048_D8192 1.66 True
hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
hf_kernels_layer_norm LN_B16_S4096_D8192 3.25 True
▶ UV Install Logs
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.35it/s]
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.71it/s]