# /// script
# requires-python = ">=3.10"
# dependencies = [
# "numpy",
# "torch==2.8.0",
# "kernels-benchmark-tools",
# "kernels",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
from kernels import get_kernel
# Resolve the flash attention kernel once at import time so every benchmark
# call reuses the same loaded module.
# NOTE(review): no revision is pinned here, so the build that gets loaded
# presumably follows whatever the repo currently publishes — confirm whether
# that is acceptable for reproducible numbers.
_FLASH_ATTN_REPO_ID = "kernels-community/flash-attn2"
hf_kernels_flash_attn = get_kernel(_FLASH_ATTN_REPO_ID)
def hf_flash_attention(query, key, value):
    """Run the loaded flash attention kernel's forward pass, non-causal.

    The three tensors are forwarded positionally and unchanged to
    ``hf_kernels_flash_attn.fwd`` with ``is_causal=False``.

    Args:
        query: Query tensor handed to the kernel as-is.
        key: Key tensor handed to the kernel as-is.
        value: Value tensor handed to the kernel as-is.

    Returns:
        The first element of the sequence returned by ``fwd``
        (presumably the attention output; the remaining elements are
        discarded — confirm against the kernel's documentation).
    """
    forward_results = hf_kernels_flash_attn.fwd(
        query, key, value, is_causal=False
    )
    return forward_results[0]
# Register the wrapper above with the shared benchmark harness as an
# attention implementation. The name and tags identify this implementation
# to the harness; "compile": "none" is passed as a plain tag value.
run_benchmark(
    impl_func=hf_flash_attention,
    impl_name="hf_kernels_flash_attn",
    impl_tags=dict(family="hf-kernels", backend="flash-attn", compile="none"),
    kernel_type=KernelTypeEnum.ATTENTION,
)
Running attention benchmark on cuda with 6 workloads.
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 3.19% 147.591us 44.59% 2.062ms 2.062ms 0.000us 0.00% 3.719ms 3.719ms 1
_flash_attn_9e27194::fwd 1.32% 60.849us 41.40% 1.914ms 638.151us 2.771ms 100.00% 3.719ms 1.240ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.773ms 100.06% 2.773ms 2.773ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.771ms 100.00% 2.771ms 923.713us 3
Activity Buffer Request 37.16% 1.718ms 37.16% 1.718ms 1.718ms 947.777us 34.20% 947.777us 947.777us 1
cudaDeviceGetAttribute 0.09% 4.211us 0.09% 4.211us 0.281us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.37% 16.891us 1.10% 50.702us 16.901us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.73% 33.811us 0.73% 33.811us 11.270us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.54% 24.922us 0.54% 24.922us 2.769us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.27% 12.349us 0.27% 12.349us 4.116us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.93% 42.971us 0.93% 42.971us 14.324us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 55.41% 2.563ms 55.41% 2.563ms 2.563ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.625ms
Self CUDA time total: 2.771ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 1.95% 91.420us 40.89% 1.916ms 1.916ms 0.000us 0.00% 3.901ms 3.901ms 1
_flash_attn_9e27194::fwd 0.98% 45.792us 38.94% 1.825ms 608.181us 2.914ms 100.00% 3.901ms 1.300ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.916ms 100.05% 2.916ms 2.916ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.914ms 100.00% 2.914ms 971.481us 3
Activity Buffer Request 36.29% 1.700ms 36.29% 1.700ms 1.700ms 986.884us 33.86% 986.884us 986.884us 1
cudaDeviceGetAttribute 0.07% 3.500us 0.07% 3.500us 0.233us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.15% 6.960us 0.52% 24.320us 8.107us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.37% 17.360us 0.37% 17.360us 5.787us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.45% 21.021us 0.45% 21.021us 2.336us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.08% 3.519us 0.08% 3.519us 1.173us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.55% 25.931us 0.55% 25.931us 8.644us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 59.11% 2.770ms 59.11% 2.770ms 2.770ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.686ms
Self CUDA time total: 2.914ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 2.13% 103.462us 40.42% 1.967ms 1.967ms 0.000us 0.00% 4.069ms 4.069ms 1
_flash_attn_9e27194::fwd 0.94% 45.522us 38.30% 1.863ms 621.134us 3.040ms 100.00% 4.069ms 1.356ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.041ms 100.05% 3.041ms 3.041ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.040ms 100.00% 3.040ms 1.013ms 3
Activity Buffer Request 35.70% 1.737ms 35.70% 1.737ms 1.737ms 1.029ms 33.84% 1.029ms 1.029ms 1
cudaDeviceGetAttribute 0.07% 3.488us 0.07% 3.488us 0.233us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.13% 6.550us 0.49% 24.010us 8.003us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.36% 17.460us 0.36% 17.460us 5.820us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.47% 22.651us 0.47% 22.651us 2.517us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.07% 3.621us 0.07% 3.621us 1.207us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.55% 26.960us 0.55% 26.960us 8.987us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 59.58% 2.899ms 59.58% 2.899ms 2.899ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.866ms
Self CUDA time total: 3.040ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 2.03% 100.371us 41.00% 2.032ms 2.032ms 0.000us 0.00% 4.098ms 4.098ms 1
_flash_attn_9e27194::fwd 0.92% 45.401us 38.98% 1.931ms 643.821us 3.066ms 100.00% 4.098ms 1.366ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.068ms 100.05% 3.068ms 3.068ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.066ms 100.00% 3.066ms 1.022ms 3
Activity Buffer Request 32.94% 1.632ms 32.94% 1.632ms 1.632ms 1.032ms 33.68% 1.032ms 1.032ms 1
cudaDeviceGetAttribute 0.07% 3.502us 0.07% 3.502us 0.233us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.14% 6.780us 0.47% 23.270us 7.757us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.33% 16.490us 0.33% 16.490us 5.497us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.45% 22.299us 0.45% 22.299us 2.478us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.09% 4.220us 0.09% 4.220us 1.407us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 4.04% 200.304us 4.04% 200.304us 66.768us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 59.00% 2.924ms 59.00% 2.924ms 2.924ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.956ms
Self CUDA time total: 3.066ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 2.01% 110.531us 38.27% 2.104ms 2.104ms 0.000us 0.00% 4.721ms 4.721ms 1
_flash_attn_9e27194::fwd 0.85% 46.845us 36.26% 1.993ms 664.435us 3.536ms 100.00% 4.721ms 1.574ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.537ms 100.04% 3.537ms 3.537ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.536ms 100.00% 3.536ms 1.179ms 3
Activity Buffer Request 31.52% 1.733ms 31.52% 1.733ms 1.733ms 1.186ms 33.53% 1.186ms 1.186ms 1
cudaDeviceGetAttribute 0.07% 3.850us 0.07% 3.850us 0.257us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.13% 7.081us 0.42% 23.120us 7.707us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.29% 16.039us 0.29% 16.039us 5.346us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.38% 21.099us 0.38% 21.099us 2.344us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.07% 3.738us 0.07% 3.738us 1.246us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 2.95% 161.933us 2.95% 161.933us 53.978us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 61.73% 3.393ms 61.73% 3.393ms 3.393ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.497ms
Self CUDA time total: 3.536ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 1.92% 105.962us 36.83% 2.036ms 2.036ms 0.000us 0.00% 4.864ms 4.864ms 1
_flash_attn_9e27194::fwd 0.86% 47.350us 34.91% 1.930ms 643.481us 3.642ms 100.00% 4.864ms 1.621ms 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.643ms 100.04% 3.643ms 3.643ms 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.642ms 100.00% 3.642ms 1.214ms 3
Activity Buffer Request 30.16% 1.668ms 30.16% 1.668ms 1.668ms 1.222ms 33.55% 1.222ms 1.222ms 1
cudaDeviceGetAttribute 0.06% 3.551us 0.06% 3.551us 0.237us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.12% 6.900us 0.42% 23.180us 7.727us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.29% 16.280us 0.29% 16.280us 5.427us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.40% 21.939us 0.40% 21.939us 2.438us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.07% 3.861us 0.07% 3.861us 1.287us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 2.95% 163.043us 2.95% 163.043us 54.348us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 63.17% 3.493ms 63.17% 3.493ms 3.493ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.529ms
Self CUDA time total: 3.642ms
impl wl p50(ms) ok
hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.96 True
hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.02 True
hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.05 True
hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.06 True
hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.21 True
hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.22 True
Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
Fetching 20 files: 5%|▌ | 1/20 [00:00<00:03, 6.04it/s]
Fetching 20 files: 10%|█ | 2/20 [00:01<00:20, 1.14s/it]
Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 10.05it/s]