# /// script
# requires-python = ">=3.10"
# dependencies = [
# "numpy",
# "torch==2.8.0",
# "kernels-benchmark-tools",
# "kernels",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
import os
import kernels_benchmark_tools as kbt
from kernels import get_kernel
# Resolve the "kernels-community/flash-attn" kernel once, at import time, so that
# every call to the benchmarked wrapper below reuses the same kernel handle.
# NOTE(review): on a cold cache this presumably downloads the kernel files - confirm.
hf_kernels_flash_attn = get_kernel("kernels-community/flash-attn")
def hf_flash_attention(query, key, value):
    """Run non-causal attention through the loaded flash-attn kernel.

    ``query``, ``key`` and ``value`` are forwarded unchanged to the kernel's
    ``fwd`` entry point with ``is_causal=False``.

    Returns:
        The first element of whatever ``fwd`` returns (presumably the
        attention output; any further elements are discarded).
    """
    fwd_result = hf_kernels_flash_attn.fwd(query, key, value, is_causal=False)
    return fwd_result[0]
# Register the wrapper with the benchmark tools under the implementation name
# "hf_kernels_flash_attn". The tags are descriptive metadata attached to this
# implementation (presumably used to group/filter results - confirm in kbt).
kbt.add(
    "hf_kernels_flash_attn",
    hf_flash_attention,
    tags={"family": "hf-kernels", "backend": "flash-attn", "compile": "none"},
)
if __name__ == "__main__":
    # Without CUDA there is nothing to benchmark: report it and exit with
    # status 0 so the skip is not treated as a failure.
    if not torch.cuda.is_available():
        print("HF Kernels Flash Attention requires CUDA - skipping benchmark")
        sys.exit(0)

    device = "cuda"
    dtype = "bfloat16"

    # Flux-like workloads: each sequence length is a fixed base of 1024 plus
    # one entry of `flux_sizes`; batch size, head count and head dim are fixed.
    base = 1024
    flux_sizes = [128, 256, 320, 384, 448, 512]
    heads = 24
    head_dim = 128

    workloads = [
        {
            "name": f"flux_L{extra}",
            "batch": 1,
            "seq_len": base + extra,
            "heads": heads,
            "head_dim": head_dim,
            "dtype": dtype,
            "device": device,
            "seed": 0,
        }
        for extra in flux_sizes
    ]

    # The same JSONL file is written by the run and then read by the summary.
    results_path = "attn.jsonl"

    kbt.run(
        workloads,
        jsonl=results_path,
        reps=5,
        warmup=2,
        gen=kbt.attn.gen_qkv,
        ref=kbt.attn.ref_math,
        cmp=kbt.attn.cmp_allclose,
        profile_trace=True,
    )
    kbt.summarize([results_path])
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | flux_L128
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 8.36% 154.078us 96.88% 1.786ms 1.786ms 0.000us 0.00% 362.493us 362.493us 1
_flash_attn_9e27194::fwd 3.99% 73.523us 88.52% 1.632ms 543.906us 271.102us 100.00% 362.493us 120.831us 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 272.638us 100.57% 272.638us 272.638us 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 271.102us 100.00% 271.102us 90.367us 3
Activity Buffer Request 76.97% 1.419ms 76.97% 1.419ms 1.419ms 91.391us 33.71% 91.391us 91.391us 1
cudaDeviceGetAttribute 0.25% 4.549us 0.25% 4.549us 0.303us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.95% 17.511us 2.83% 52.153us 17.384us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 1.88% 34.642us 1.88% 34.642us 11.547us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 1.44% 26.603us 1.44% 26.603us 2.956us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.78% 14.320us 0.78% 14.320us 4.773us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 2.27% 41.882us 2.27% 41.882us 13.961us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 3.12% 57.433us 3.12% 57.433us 57.433us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.843ms
Self CUDA time total: 271.102us
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | flux_L256
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 6.38% 115.656us 91.71% 1.662ms 1.662ms 0.000us 0.00% 396.671us 396.671us 1
_flash_attn_9e27194::fwd 2.82% 51.131us 85.33% 1.547ms 515.555us 298.303us 100.00% 396.671us 132.224us 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 299.743us 100.48% 299.743us 299.743us 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 298.303us 100.00% 298.303us 99.434us 3
Activity Buffer Request 77.99% 1.414ms 77.99% 1.414ms 1.414ms 98.368us 32.98% 98.368us 98.368us 1
cudaDeviceGetAttribute 0.22% 3.931us 0.22% 3.931us 0.262us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.40% 7.190us 1.33% 24.041us 8.014us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.93% 16.851us 0.93% 16.851us 5.617us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 1.25% 22.681us 1.25% 22.681us 2.520us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.21% 3.730us 0.21% 3.730us 1.243us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 1.51% 27.451us 1.51% 27.451us 9.150us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 8.29% 150.237us 8.29% 150.237us 150.237us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.813ms
Self CUDA time total: 298.303us
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | flux_L320
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 6.16% 112.885us 90.78% 1.663ms 1.663ms 0.000us 0.00% 427.613us 427.613us 1
_flash_attn_9e27194::fwd 2.80% 51.281us 84.62% 1.550ms 516.788us 318.526us 100.00% 427.613us 142.538us 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 319.901us 100.43% 319.901us 319.901us 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 318.526us 100.00% 318.526us 106.175us 3
Activity Buffer Request 77.28% 1.416ms 77.28% 1.416ms 1.416ms 109.087us 34.25% 109.087us 109.087us 1
cudaDeviceGetAttribute 0.21% 3.930us 0.21% 3.930us 0.262us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.41% 7.431us 1.40% 25.731us 8.577us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 1.00% 18.300us 1.00% 18.300us 6.100us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 1.26% 23.051us 1.26% 23.051us 2.561us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.22% 4.001us 0.22% 4.001us 1.334us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 1.45% 26.532us 1.45% 26.532us 8.844us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 9.22% 168.858us 9.22% 168.858us 168.858us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.832ms
Self CUDA time total: 318.526us
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | flux_L384
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 5.43% 111.055us 91.19% 1.866ms 1.866ms 0.000us 0.00% 446.776us 446.776us 1
_flash_attn_9e27194::fwd 2.54% 51.901us 85.76% 1.755ms 584.928us 331.162us 100.00% 446.776us 148.925us 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 332.667us 100.45% 332.667us 332.667us 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 331.162us 100.00% 331.162us 110.387us 3
Activity Buffer Request 69.78% 1.428ms 69.78% 1.428ms 1.428ms 115.614us 34.91% 115.614us 115.614us 1
cudaDeviceGetAttribute 0.19% 3.942us 0.19% 3.942us 0.263us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.39% 8.070us 1.24% 25.461us 8.487us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.85% 17.391us 0.85% 17.391us 5.797us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 1.08% 22.080us 1.08% 22.080us 2.453us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.19% 3.861us 0.19% 3.861us 1.287us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 10.75% 219.880us 10.75% 219.880us 73.293us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 8.81% 180.219us 8.81% 180.219us 180.219us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.046ms
Self CUDA time total: 331.162us
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | flux_L448
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 4.92% 108.784us 84.29% 1.864ms 1.864ms 0.000us 0.00% 663.288us 663.288us 1
_flash_attn_9e27194::fwd 2.26% 49.951us 79.37% 1.755ms 585.135us 493.882us 100.00% 663.288us 221.096us 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 495.418us 100.31% 495.418us 495.418us 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 493.882us 100.00% 493.882us 164.627us 3
Activity Buffer Request 65.22% 1.442ms 65.22% 1.442ms 1.442ms 169.406us 34.30% 169.406us 169.406us 1
cudaDeviceGetAttribute 0.18% 3.990us 0.18% 3.990us 0.266us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.34% 7.522us 1.12% 24.742us 8.247us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.78% 17.220us 0.78% 17.220us 5.740us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.96% 21.140us 0.96% 21.140us 2.349us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.19% 4.121us 0.19% 4.121us 1.374us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 9.45% 209.092us 9.45% 209.092us 69.697us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 15.71% 347.407us 15.71% 347.407us 347.407us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.212ms
Self CUDA time total: 493.882us
======================================================================
PROFILE TRACE: hf_kernels_flash_attn | flux_L512
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn 4.96% 110.355us 83.23% 1.852ms 1.852ms 0.000us 0.00% 697.540us 697.540us 1
_flash_attn_9e27194::fwd 2.27% 50.469us 78.28% 1.742ms 580.665us 518.659us 100.00% 697.540us 232.513us 3
hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 520.068us 100.27% 520.068us 520.068us 1
void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 518.659us 100.00% 518.659us 172.886us 3
Activity Buffer Request 64.27% 1.430ms 64.27% 1.430ms 1.430ms 178.881us 34.49% 178.881us 178.881us 1
cudaDeviceGetAttribute 0.17% 3.832us 0.17% 3.832us 0.255us 0.000us 0.00% 0.000us 0.000us 15
aten::empty_like 0.33% 7.341us 1.15% 25.571us 8.524us 0.000us 0.00% 0.000us 0.000us 3
aten::empty_strided 0.82% 18.230us 0.82% 18.230us 6.077us 0.000us 0.00% 0.000us 0.000us 3
aten::empty 0.94% 20.812us 0.94% 20.812us 2.312us 0.000us 0.00% 0.000us 0.000us 9
cudaFuncSetAttribute 0.19% 4.171us 0.19% 4.171us 1.390us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 9.29% 206.809us 9.29% 206.809us 68.936us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 16.77% 373.119us 16.77% 373.119us 373.119us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.225ms
Self CUDA time total: 518.659us
impl wl p50(ms) ok
hf_kernels_flash_attn flux_L128 0.12 True
hf_kernels_flash_attn flux_L256 0.14 True
hf_kernels_flash_attn flux_L320 0.14 True
hf_kernels_flash_attn flux_L384 0.15 True
hf_kernels_flash_attn flux_L448 0.20 True
hf_kernels_flash_attn flux_L512 0.20 True
Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
Fetching 20 files: 10%|█ | 2/20 [00:01<00:16, 1.08it/s]
Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 10.78it/s]