# /// script
# requires-python = ">=3.10"
# dependencies = [
# "numpy",
# "torch==2.8.0",
# "kernels-benchmark-tools",
# "kernels",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
from kernels import get_kernel
# Load the flash attention kernel identified by "kernels-community/flash-attn2".
# This runs once, at import time; the returned object provides the `fwd`
# entry point that hf_flash_attention calls.
# NOTE(review): the captured run shows files being fetched from the HF Hub,
# presumably triggered by this call on first use — confirm caching behaviour.
hf_kernels_flash_attn = get_kernel("kernels-community/flash-attn2")
def hf_flash_attention(query, key, value):
    """Run the loaded flash-attn kernel's forward pass without causal masking.

    Forwards ``query``, ``key`` and ``value`` unchanged to
    ``hf_kernels_flash_attn.fwd`` with ``is_causal=False`` and returns the
    first element of whatever that call returns.
    """
    # `fwd` returns an indexable result; only its first entry is handed back
    # to the caller (assumed to be the attention output — TODO confirm).
    fwd_outputs = hf_kernels_flash_attn.fwd(query, key, value, is_causal=False)
    return fwd_outputs[0]
# Hand the wrapper to the shared benchmark harness as an attention
# implementation. `impl_tags` is descriptive metadata passed through to the
# harness (presumably used when reporting results — confirm against the tool).
run_benchmark(
    kernel_type=KernelTypeEnum.ATTENTION,
    impl_name="hf_kernels_flash_attn",
    impl_tags={
        "family": "hf-kernels",
        "backend": "flash-attn",
        "compile": "none",
    },
    impl_func=hf_flash_attention,
)
Running attention benchmark on cuda with 6 workloads.
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 3.35% 155.232us 44.97% 2.082ms 2.082ms 0.000us 0.00% 3.704ms 3.704ms 1
_flash_attn_9e27194::fwd 1.43% 66.152us 41.62% 1.927ms 642.264us 2.766ms 100.00% 3.704ms 1.235ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.768ms 100.06% 2.768ms 2.768ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.766ms 100.00% 2.766ms 922.153us 3
Activity Buffer Request 37.12% 1.719ms 37.12% 1.719ms 1.719ms 937.630us 33.89% 937.630us 937.630us 1
cudaDeviceGetAttribute 0.12% 5.360us 0.12% 5.360us 0.357us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.39% 18.222us 1.18% 54.592us 18.197us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.79% 36.370us 0.79% 36.370us 12.123us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.56% 25.741us 0.56% 25.741us 2.860us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.30% 13.770us 0.30% 13.770us 4.590us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.92% 42.401us 0.92% 42.401us 14.134us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 55.03% 2.548ms 55.03% 2.548ms 2.548ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.630ms
Self CUDA time total: 2.766ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 1.95% 91.533us 41.78% 1.962ms 1.962ms 0.000us 0.00% 3.856ms 3.856ms 1
_flash_attn_9e27194::fwd 1.04% 49.050us 39.83% 1.870ms 623.350us 2.882ms 100.00% 3.856ms 1.285ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.884ms 100.05% 2.884ms 2.884ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.882ms 100.00% 2.882ms 960.764us 3
Activity Buffer Request 36.88% 1.732ms 36.88% 1.732ms 1.732ms 973.756us 33.78% 973.756us 973.756us 1
cudaDeviceGetAttribute 0.09% 4.030us 0.09% 4.030us 0.269us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.18% 8.460us 0.61% 28.490us 9.497us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.43% 20.030us 0.43% 20.030us 6.677us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.55% 25.961us 0.55% 25.961us 2.885us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.08% 3.700us 0.08% 3.700us 1.233us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.58% 27.091us 0.58% 27.091us 9.030us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 58.22% 2.734ms 58.22% 2.734ms 2.734ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.695ms
Self CUDA time total: 2.882ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 2.18% 107.861us 40.50% 2.008ms 2.008ms 0.000us 0.00% 4.125ms 4.125ms 1
_flash_attn_9e27194::fwd 0.99% 48.872us 38.32% 1.900ms 633.314us 3.094ms 100.00% 4.125ms 1.375ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.095ms 100.05% 3.095ms 3.095ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.094ms 100.00% 3.094ms 1.031ms 3
Activity Buffer Request 35.67% 1.768ms 35.67% 1.768ms 1.768ms 1.032ms 33.35% 1.032ms 1.032ms 1
cudaDeviceGetAttribute 0.09% 4.480us 0.09% 4.480us 0.299us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.13% 6.580us 0.45% 22.520us 7.507us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.32% 15.940us 0.32% 15.940us 5.313us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.47% 23.250us 0.47% 23.250us 2.583us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.08% 3.791us 0.08% 3.791us 1.264us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.58% 28.541us 0.58% 28.541us 9.514us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 59.50% 2.950ms 59.50% 2.950ms 2.950ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.958ms
Self CUDA time total: 3.094ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 2.18% 109.362us 41.99% 2.109ms 2.109ms 0.000us 0.00% 4.102ms 4.102ms 1
_flash_attn_9e27194::fwd 1.01% 50.650us 39.81% 1.999ms 666.498us 3.061ms 100.00% 4.102ms 1.367ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.062ms 100.05% 3.062ms 3.062ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.061ms 100.00% 3.061ms 1.020ms 3
Activity Buffer Request 33.41% 1.678ms 33.41% 1.678ms 1.678ms 1.041ms 34.02% 1.041ms 1.041ms 1
cudaDeviceGetAttribute 0.08% 4.070us 0.08% 4.070us 0.271us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.14% 6.851us 0.49% 24.381us 8.127us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.35% 17.530us 0.35% 17.530us 5.843us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.44% 22.140us 0.44% 22.140us 2.460us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.08% 3.810us 0.08% 3.810us 1.270us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 4.31% 216.396us 4.31% 216.396us 72.132us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 58.01% 2.914ms 58.01% 2.914ms 2.914ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.023ms
Self CUDA time total: 3.061ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 1.91% 108.693us 38.60% 2.193ms 2.193ms 0.000us 0.00% 4.850ms 4.850ms 1
_flash_attn_9e27194::fwd 0.87% 49.481us 36.69% 2.084ms 694.644us 3.635ms 100.00% 4.850ms 1.617ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.637ms 100.05% 3.637ms 3.637ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.635ms 100.00% 3.635ms 1.212ms 3
Activity Buffer Request 31.43% 1.785ms 31.43% 1.785ms 1.785ms 1.215ms 33.41% 1.215ms 1.215ms 1
cudaDeviceGetAttribute 0.07% 3.761us 0.07% 3.761us 0.251us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.12% 6.970us 0.43% 24.340us 8.113us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.31% 17.370us 0.31% 17.370us 5.790us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.43% 24.270us 0.43% 24.270us 2.697us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.07% 3.730us 0.07% 3.730us 1.243us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.40% 193.224us 3.40% 193.224us 64.408us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 61.40% 3.487ms 61.40% 3.487ms 3.487ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.680ms
Self CUDA time total: 3.635ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 1.90% 106.201us 36.85% 2.064ms 2.064ms 0.000us 0.00% 4.915ms 4.915ms 1
_flash_attn_9e27194::fwd 0.89% 50.062us 34.96% 1.958ms 652.751us 3.682ms 100.00% 4.915ms 1.638ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.684ms 100.05% 3.684ms 3.684ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.682ms 100.00% 3.682ms 1.227ms 3
Activity Buffer Request 29.73% 1.666ms 29.73% 1.666ms 1.666ms 1.233ms 33.48% 1.233ms 1.233ms 1
cudaDeviceGetAttribute 0.07% 4.189us 0.07% 4.189us 0.279us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.12% 6.851us 0.45% 25.301us 8.434us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.33% 18.450us 0.33% 18.450us 6.150us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.40% 22.632us 0.40% 22.632us 2.515us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.07% 3.850us 0.07% 3.850us 1.283us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.33% 186.623us 3.33% 186.623us 62.208us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 63.15% 3.537ms 63.15% 3.537ms 3.537ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.602ms
Self CUDA time total: 3.682ms
impl wl p50(ms) ok
hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.95 True
hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.00 True
hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.04 True
hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.06 True
hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.22 True
hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.24 True
Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
Fetching 20 files: 10%|█ | 2/20 [00:01<00:15, 1.16it/s]
Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 11.63it/s]