# /// script
# requires-python = ">=3.10"
# dependencies = [
# "numpy",
# "torch==2.8.0",
# "kernels-benchmark-tools",
# "kernels",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
import os
import kernels_benchmark_tools as kbt
from kernels import get_kernel
# Load the flash-attn3 kernel module from the "kernels-community/flash-attn3"
# repo via `kernels.get_kernel`. This runs at import time, before the CUDA
# check in the `__main__` section below.
# NOTE(review): presumably this downloads/caches the kernel on first use (the
# captured run log shows files being fetched) - confirm against `kernels` docs.
hf_kernels_flash_attn3 = get_kernel("kernels-community/flash-attn3")
def hf_flash_attention3(query, key, value):
    """Run non-causal attention through the loaded flash-attn3 kernel.

    Forwards ``query``, ``key`` and ``value`` unchanged to the kernel
    module's ``flash_attn_func`` with ``causal=False`` and returns element 0
    of whatever that call produces.

    NOTE(review): element 0 is presumably the attention output tensor and the
    remaining elements (if any) auxiliary data - confirm against the kernel's
    interface.
    """
    kernel_result = hf_kernels_flash_attn3.flash_attn_func(
        query,
        key,
        value,
        causal=False,
    )
    return kernel_result[0]
# Register the wrapper with the benchmark harness under the implementation
# name "hf_kernels_flash_attn3"; the tags dict labels its family, backend and
# compile mode.
kbt.add(
"hf_kernels_flash_attn3",
hf_flash_attention3,
tags={"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"},
)
if __name__ == "__main__":
    # The benchmark only runs on a CUDA device; without one, report and exit
    # with status 0 so the run counts as skipped rather than failed.
    if not torch.cuda.is_available():
        print("HF Kernels Flash Attention 3 requires CUDA - skipping benchmark")
        sys.exit(0)

    target_device = "cuda"
    tensor_dtype = "bfloat16"
    results_path = "attn.jsonl"

    # Flux-like workloads: a fixed base sequence length plus a variable
    # extra length, with a single batch and fixed head configuration.
    base_len = 1024
    extra_lens = [128, 256, 320, 384, 448, 512]
    num_heads = 24
    per_head_dim = 128

    workloads = [
        {
            "name": f"flux_L{extra}",
            "batch": 1,
            "seq_len": base_len + extra,
            "heads": num_heads,
            "head_dim": per_head_dim,
            "dtype": tensor_dtype,
            "device": target_device,
            "seed": 0,
        }
        for extra in extra_lens
    ]

    # Run every workload through the harness (2 warmup + 5 measured reps),
    # writing records to the JSONL file with profiler traces enabled.
    kbt.run(
        workloads,
        jsonl=results_path,
        reps=5,
        warmup=2,
        gen=kbt.attn.gen_qkv,
        ref=kbt.attn.ref_math,
        cmp=kbt.attn.cmp_allclose,
        profile_trace=True,
    )

    # Summarize the records that were just written.
    kbt.summarize([results_path])
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | flux_L128
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 9.00% 178.129us 99.63% 1.971ms 1.971ms 0.000us 0.00% 345.823us 345.823us 1
FlashAttnFunc 6.66% 131.797us 90.63% 1.793ms 597.659us 0.000us 0.00% 345.823us 115.274us 3
_flash_attn3_48fe103_dirty::fwd 4.56% 90.256us 83.97% 1.661ms 553.727us 259.583us 100.00% 345.823us 115.274us 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 292.158us 112.55% 292.158us 292.158us 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 259.583us 100.00% 259.583us 86.528us 3
Activity Buffer Request 73.82% 1.460ms 73.82% 1.460ms 1.460ms 86.240us 33.22% 86.240us 86.240us 1
aten::empty 2.53% 50.052us 2.53% 50.052us 8.342us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.86% 16.921us 0.86% 16.921us 5.640us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 2.20% 43.551us 2.20% 43.551us 14.517us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 0.37% 7.311us 0.37% 7.311us 7.311us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.978ms
Self CUDA time total: 259.583us
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | flux_L256
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 7.20% 133.787us 96.41% 1.793ms 1.793ms 0.000us 0.00% 393.753us 393.753us 1
FlashAttnFunc 5.05% 93.854us 89.22% 1.659ms 552.953us 0.000us 0.00% 393.753us 131.251us 3
_flash_attn3_48fe103_dirty::fwd 2.68% 49.913us 84.17% 1.565ms 521.669us 293.595us 100.00% 393.753us 131.251us 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 295.003us 100.48% 295.003us 295.003us 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 293.595us 100.00% 293.595us 97.865us 3
Activity Buffer Request 78.08% 1.452ms 78.08% 1.452ms 1.452ms 100.158us 34.11% 100.158us 100.158us 1
aten::empty 1.44% 26.770us 1.44% 26.770us 4.462us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.31% 5.680us 0.31% 5.680us 1.893us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 1.66% 30.852us 1.66% 30.852us 10.284us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 3.59% 66.713us 3.59% 66.713us 66.713us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.859ms
Self CUDA time total: 293.595us
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | flux_L320
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 6.76% 125.695us 94.13% 1.750ms 1.750ms 0.000us 0.00% 430.748us 430.748us 1
FlashAttnFunc 4.90% 91.016us 87.37% 1.624ms 541.277us 0.000us 0.00% 430.748us 143.583us 3
_flash_attn3_48fe103_dirty::fwd 2.79% 51.770us 82.47% 1.533ms 510.938us 324.541us 100.00% 430.748us 143.583us 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 325.948us 100.43% 325.948us 325.948us 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 324.541us 100.00% 324.541us 108.180us 3
Activity Buffer Request 76.46% 1.421ms 76.46% 1.421ms 1.421ms 106.207us 32.73% 106.207us 106.207us 1
aten::empty 1.41% 26.162us 1.41% 26.162us 4.360us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.27% 5.061us 0.27% 5.061us 1.687us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 1.55% 28.862us 1.55% 28.862us 9.621us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 5.87% 109.015us 5.87% 109.015us 109.015us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.859ms
Self CUDA time total: 324.541us
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | flux_L384
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 6.04% 124.874us 95.07% 1.964ms 1.964ms 0.000us 0.00% 429.567us 429.567us 1
FlashAttnFunc 4.57% 94.345us 89.03% 1.840ms 613.174us 0.000us 0.00% 429.567us 143.189us 3
_flash_attn3_48fe103_dirty::fwd 2.60% 53.754us 84.46% 1.745ms 581.725us 322.591us 100.00% 429.567us 143.189us 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 324.063us 100.46% 324.063us 324.063us 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 322.591us 100.00% 322.591us 107.530us 3
Activity Buffer Request 69.43% 1.434ms 69.43% 1.434ms 1.434ms 106.976us 33.16% 106.976us 106.976us 1
aten::empty 1.29% 26.591us 1.29% 26.591us 4.432us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.25% 5.220us 0.25% 5.220us 1.740us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 10.90% 225.141us 10.90% 225.141us 75.047us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 4.93% 101.805us 4.93% 101.805us 101.805us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.066ms
Self CUDA time total: 322.591us
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | flux_L448
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 5.77% 124.745us 87.87% 1.900ms 1.900ms 0.000us 0.00% 654.301us 654.301us 1
FlashAttnFunc 4.37% 94.576us 82.10% 1.775ms 591.589us 0.000us 0.00% 654.301us 218.100us 3
_flash_attn3_48fe103_dirty::fwd 2.37% 51.203us 77.72% 1.680ms 560.064us 488.670us 100.00% 654.301us 218.100us 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 490.142us 100.30% 490.142us 490.142us 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 488.670us 100.00% 488.670us 162.890us 3
Activity Buffer Request 66.37% 1.435ms 66.37% 1.435ms 1.435ms 165.631us 33.89% 165.631us 165.631us 1
aten::empty 1.25% 26.990us 1.25% 26.990us 4.498us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.24% 5.250us 0.24% 5.250us 1.750us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 7.49% 161.858us 7.49% 161.858us 53.953us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 12.13% 262.313us 12.13% 262.313us 262.313us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.162ms
Self CUDA time total: 488.670us
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | flux_L512
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 5.69% 119.216us 86.59% 1.815ms 1.815ms 0.000us 0.00% 666.625us 666.625us 1
FlashAttnFunc 4.40% 92.224us 80.91% 1.696ms 565.401us 0.000us 0.00% 666.625us 222.208us 3
_flash_attn3_48fe103_dirty::fwd 2.44% 51.234us 76.51% 1.604ms 534.659us 497.473us 100.00% 666.625us 222.208us 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 498.849us 100.28% 498.849us 498.849us 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 497.473us 100.00% 497.473us 165.824us 3
Activity Buffer Request 64.99% 1.363ms 64.99% 1.363ms 1.363ms 169.152us 34.00% 169.152us 169.152us 1
aten::empty 1.25% 26.300us 1.25% 26.300us 4.383us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.27% 5.600us 0.27% 5.600us 1.867us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 7.55% 158.288us 7.55% 158.288us 52.763us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 13.41% 281.113us 13.41% 281.113us 281.113us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.097ms
Self CUDA time total: 497.473us
impl wl p50(ms) ok
hf_kernels_flash_attn3 flux_L128 0.13 True
hf_kernels_flash_attn3 flux_L256 0.15 True
hf_kernels_flash_attn3 flux_L320 0.16 True
hf_kernels_flash_attn3 flux_L384 0.16 True
hf_kernels_flash_attn3 flux_L448 0.21 True
hf_kernels_flash_attn3 flux_L512 0.21 True
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.23it/s]
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.46it/s]