+
+
+
+
+
+
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels",
+# "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+from kernels import get_kernel
+
+# Load the layer norm kernel from the "kernels-community/layer-norm" repo.
+# This runs once at module level; the resulting object is what
+# hf_kernels_layer_norm calls `dropout_add_ln_fwd` on.
+layer_norm_kernel = get_kernel("kernels-community/layer-norm")
+
+
+def hf_kernels_layer_norm(x, weight, bias, eps: float = 1e-5):
+ B, S, D = x.shape
+ # The kernel expects [N, D] input; support beta (bias) if provided.
+ out = layer_norm_kernel.dropout_add_ln_fwd(
+ input=x.view(-1, D),
+ gamma=weight,
+ beta=bias,
+ rowscale=None,
+ colscale=None,
+ x0_subset=None,
+ z_subset=None,
+ dropout_p=0.0,
+ epsilon=eps,
+ rowscale_const=1.0,
+ z_numrows=S,
+ gen=None,
+ residual_in_fp32=False,
+ is_rms_norm=False,
+ )[0].view(B, S, D)
+ return out
+
+
+run_benchmark(
+ kernel_type=KernelTypeEnum.LAYER_NORM,
+ impl_name="hf_kernels_layer_norm",
+ impl_tags={"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"},
+ impl_func=hf_kernels_layer_norm,
+)
+
+
+
+Running layer_norm benchmark on cuda with 4 workloads. + +====================================================================== +PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D4096 +====================================================================== +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + hf_kernels_layer_norm 4.63% 185.406us 46.16% 1.847ms 1.847ms 0.000us 0.00% 3.120ms 3.120ms 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 1.69% 67.562us 40.98% 1.640ms 546.562us 2.384ms 100.00% 3.120ms 1.040ms 3 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.385ms 100.06% 2.385ms 2.385ms 1 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.384ms 100.00% 2.384ms 794.642us 3 + Activity Buffer Request 36.92% 1.477ms 36.92% 1.477ms 1.477ms 735.676us 30.86% 735.676us 735.676us 1 + aten::view 0.54% 21.751us 0.54% 21.751us 3.625us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 1.11% 44.581us 1.11% 44.581us 4.953us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 
0.23% 9.360us 0.23% 9.360us 3.120us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 1.03% 41.042us 1.03% 41.042us 13.681us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 53.84% 2.154ms 53.84% 2.154ms 2.154ms 0.000us 0.00% 0.000us 0.000us 1 +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ +Self CPU time total: 4.001ms +Self CUDA time total: 2.384ms + + + +====================================================================== +PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D8192 +====================================================================== +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + hf_kernels_layer_norm 2.29% 145.447us 26.95% 1.711ms 1.711ms 0.000us 0.00% 6.386ms 6.386ms 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 0.75% 47.652us 24.47% 1.553ms 517.784us 4.812ms 100.00% 6.386ms 2.129ms 3 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.814ms 100.03% 4.814ms 4.814ms 1 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.812ms 100.00% 4.812ms 1.604ms 3 + Activity Buffer Request 22.77% 1.446ms 22.77% 1.446ms 1.446ms 1.574ms 32.71% 1.574ms 1.574ms 1 + aten::view 0.19% 11.759us 0.19% 11.759us 1.960us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 0.46% 29.151us 0.46% 29.151us 3.239us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 
0.08% 4.860us 0.08% 4.860us 1.620us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.41% 26.131us 0.41% 26.131us 8.710us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 73.05% 4.638ms 73.05% 4.638ms 4.638ms 0.000us 0.00% 0.000us 0.000us 1 +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ +Self CPU time total: 6.348ms +Self CUDA time total: 4.812ms + + + +====================================================================== +PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D4096 +====================================================================== +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + hf_kernels_layer_norm 2.00% 126.827us 27.00% 1.712ms 1.712ms 0.000us 0.00% 6.353ms 6.353ms 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 0.76% 48.491us 24.80% 1.572ms 524.088us 4.792ms 100.00% 6.353ms 2.118ms 3 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.793ms 100.03% 4.793ms 4.793ms 1 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.792ms 100.00% 4.792ms 1.597ms 3 + Activity Buffer Request 23.05% 1.462ms 23.05% 1.462ms 1.462ms 1.561ms 32.58% 1.561ms 1.561ms 1 + aten::view 0.20% 12.869us 0.20% 12.869us 2.145us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 0.48% 30.222us 0.48% 30.222us 3.358us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 
0.08% 5.090us 0.08% 5.090us 1.697us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.42% 26.901us 0.42% 26.901us 8.967us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 73.00% 4.628ms 73.00% 4.628ms 4.628ms 0.000us 0.00% 0.000us 0.000us 1 +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ +Self CPU time total: 6.340ms +Self CUDA time total: 4.792ms + + + +====================================================================== +PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D8192 +====================================================================== +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + hf_kernels_layer_norm 1.24% 144.853us 19.15% 2.240ms 2.240ms 0.000us 0.00% 12.815ms 12.815ms 1 + _layer_norm_f8ec252::dropout_add_ln_fwd 0.39% 45.741us 17.80% 2.083ms 694.211us 9.628ms 100.00% 12.815ms 4.272ms 3 + hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.629ms 100.01% 9.629ms 9.629ms 1 +void layer_norm::ln_fwd_kernel<layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.628ms 100.00% 9.628ms 3.209ms 3 + Activity Buffer Request 14.62% 1.710ms 14.62% 1.710ms 1.710ms 3.188ms 33.11% 3.188ms 3.188ms 1 + aten::view 0.11% 12.972us 0.11% 12.972us 2.162us 0.000us 0.00% 0.000us 0.000us 6 + aten::empty 0.26% 30.501us 0.26% 30.501us 3.389us 0.000us 0.00% 0.000us 0.000us 9 +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 
0.04% 5.220us 0.04% 5.220us 1.740us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 2.49% 291.291us 2.49% 291.291us 97.097us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 80.85% 9.456ms 80.85% 9.456ms 9.456ms 0.000us 0.00% 0.000us 0.000us 1 +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ +Self CPU time total: 11.697ms +Self CUDA time total: 9.628ms + + +impl wl p50(ms) ok +hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True +hf_kernels_layer_norm LN_B16_S2048_D8192 1.66 True +hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True +hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True +
+
+▶ UV Install Logs
+
+Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
+Fetching 4 files: 25%|██▌ | 1/4 [00:00<00:00, 6.81it/s]
+Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.12it/s]
+Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.56it/s]
+
+
+