# /// script
# requires-python = ">=3.10"
# dependencies = [
# "numpy",
# "torch==2.8.0",
# "kernels-benchmark-tools",
# "kernels",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
from kernels import get_kernel
# Load the Flash Attention 3 kernel published as "kernels-community/flash-attn3"
# through `kernels.get_kernel`. This happens once, at module import time, and
# the resulting object is what `hf_flash_attention3` dispatches to.
# NOTE(review): presumably this downloads/caches the kernel from the HF Hub on
# first use -- confirm against the `kernels` library documentation.
hf_kernels_flash_attn3 = get_kernel("kernels-community/flash-attn3")
def hf_flash_attention3(query, key, value):
    """Run non-causal attention through the loaded flash-attn3 kernel.

    ``query``, ``key`` and ``value`` are forwarded unchanged to
    ``hf_kernels_flash_attn3.flash_attn_func`` with ``causal=False``; the
    first element of whatever that call yields is returned.

    NOTE(review): element 0 is presumably the attention output, with any
    auxiliary values following it -- confirm against the kernel's API.
    """
    result = hf_kernels_flash_attn3.flash_attn_func(
        query, key, value, causal=False
    )
    return result[0]
# Hand the wrapper to the shared benchmark harness. No workloads are passed
# here, so the set of cases that gets run is decided by the harness itself;
# this script only supplies the callable plus the metadata identifying it.
run_benchmark(
    impl_func=hf_flash_attention3,
    impl_name="hf_kernels_flash_attn3",
    impl_tags={
        "family": "hf-kernels",
        "backend": "flash-attn3",
        "compile": "none",
    },
    kernel_type=KernelTypeEnum.ATTENTION,
)
Running attention benchmark on cuda with 6 workloads.
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 3.59% 165.413us 48.47% 2.234ms 2.234ms 0.000us 0.00% 3.561ms 3.561ms 1
FlashAttnFunc 2.69% 124.054us 44.88% 2.069ms 689.509us 0.000us 0.00% 3.561ms 1.187ms 3
_flash_attn3_1d39a44::fwd 1.63% 74.991us 42.19% 1.944ms 648.158us 2.673ms 100.00% 3.561ms 1.187ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.674ms 100.05% 2.674ms 2.674ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.673ms 100.00% 2.673ms 890.896us 3
Activity Buffer Request 38.25% 1.763ms 38.25% 1.763ms 1.763ms 888.250us 33.23% 888.250us 888.250us 1
aten::empty 0.95% 43.951us 0.95% 43.951us 7.325us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.32% 14.620us 0.32% 14.620us 4.873us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 1.04% 47.991us 1.04% 47.991us 15.997us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 51.53% 2.375ms 51.53% 2.375ms 2.375ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.609ms
Self CUDA time total: 2.673ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 2.68% 124.013us 44.92% 2.080ms 2.080ms 0.000us 0.00% 3.716ms 3.716ms 1
FlashAttnFunc 1.96% 90.863us 42.24% 1.956ms 652.078us 0.000us 0.00% 3.716ms 1.239ms 3
_flash_attn3_1d39a44::fwd 1.06% 49.109us 40.28% 1.865ms 621.790us 2.770ms 100.00% 3.716ms 1.239ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.772ms 100.05% 2.772ms 2.772ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.770ms 100.00% 2.770ms 923.461us 3
Activity Buffer Request 37.83% 1.752ms 37.83% 1.752ms 1.752ms 945.210us 34.12% 945.210us 945.210us 1
aten::empty 0.60% 27.931us 0.60% 27.931us 4.655us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.12% 5.520us 0.12% 5.520us 1.840us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.67% 30.831us 0.67% 30.831us 10.277us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 55.08% 2.551ms 55.08% 2.551ms 2.551ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.631ms
Self CUDA time total: 2.770ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 2.68% 125.914us 44.02% 2.072ms 2.072ms 0.000us 0.00% 3.816ms 3.816ms 1
FlashAttnFunc 1.89% 89.112us 41.34% 1.946ms 648.608us 0.000us 0.00% 3.816ms 1.272ms 3
_flash_attn3_1d39a44::fwd 1.01% 47.500us 39.45% 1.857ms 618.904us 2.847ms 100.00% 3.816ms 1.272ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.849ms 100.05% 2.849ms 2.849ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.847ms 100.00% 2.847ms 949.087us 3
Activity Buffer Request 37.07% 1.745ms 37.07% 1.745ms 1.745ms 968.895us 34.03% 968.895us 968.895us 1
aten::empty 0.58% 27.171us 0.58% 27.171us 4.529us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.12% 5.621us 0.12% 5.621us 1.874us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 0.67% 31.690us 0.67% 31.690us 10.563us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 55.98% 2.635ms 55.98% 2.635ms 2.635ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.706ms
Self CUDA time total: 2.847ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 2.55% 127.134us 45.51% 2.268ms 2.268ms 0.000us 0.00% 3.920ms 3.920ms 1
FlashAttnFunc 1.80% 89.881us 42.96% 2.141ms 713.505us 0.000us 0.00% 3.920ms 1.307ms 3
_flash_attn3_1d39a44::fwd 0.97% 48.541us 41.15% 2.051ms 683.545us 2.930ms 100.00% 3.920ms 1.307ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.932ms 100.05% 2.932ms 2.932ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.930ms 100.00% 2.930ms 976.824us 3
Activity Buffer Request 35.08% 1.748ms 35.08% 1.748ms 1.748ms 989.112us 33.75% 989.112us 989.112us 1
aten::empty 0.54% 27.071us 0.54% 27.071us 4.512us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.11% 5.498us 0.11% 5.498us 1.833us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 4.45% 221.646us 4.45% 221.646us 73.882us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 54.49% 2.715ms 54.49% 2.715ms 2.715ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 4.983ms
Self CUDA time total: 2.930ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 2.34% 128.034us 40.76% 2.227ms 2.227ms 0.000us 0.00% 4.607ms 4.607ms 1
FlashAttnFunc 1.67% 91.131us 38.42% 2.098ms 699.492us 0.000us 0.00% 4.607ms 1.536ms 3
_flash_attn3_1d39a44::fwd 0.87% 47.661us 36.75% 2.007ms 669.115us 3.452ms 100.00% 4.607ms 1.536ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.453ms 100.05% 3.453ms 3.453ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.452ms 100.00% 3.452ms 1.151ms 3
Activity Buffer Request 31.93% 1.744ms 31.93% 1.744ms 1.744ms 1.156ms 33.48% 1.156ms 1.156ms 1
aten::empty 0.52% 28.231us 0.52% 28.231us 4.705us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.10% 5.270us 0.10% 5.270us 1.757us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.33% 181.994us 3.33% 181.994us 60.665us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 59.24% 3.235ms 59.24% 3.235ms 3.235ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.462ms
Self CUDA time total: 3.452ms
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
hf_kernels_flash_attn3 2.42% 135.303us 41.95% 2.345ms 2.345ms 0.000us 0.00% 4.617ms 4.617ms 1
FlashAttnFunc 1.78% 99.322us 39.53% 2.210ms 736.513us 0.000us 0.00% 4.617ms 1.539ms 3
_flash_attn3_1d39a44::fwd 0.92% 51.382us 37.75% 2.110ms 703.406us 3.463ms 100.00% 4.617ms 1.539ms 3
hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.464ms 100.05% 3.464ms 3.464ms 1
void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.463ms 100.00% 3.463ms 1.154ms 3
Activity Buffer Request 33.12% 1.851ms 33.12% 1.851ms 1.851ms 1.155ms 33.34% 1.155ms 1.155ms 1
aten::empty 0.54% 30.101us 0.54% 30.101us 5.017us 0.000us 0.00% 0.000us 0.000us 6
cudaFuncSetAttribute 0.10% 5.430us 0.10% 5.430us 1.810us 0.000us 0.00% 0.000us 0.000us 3
cudaLaunchKernel 3.08% 171.953us 3.08% 171.953us 57.318us 0.000us 0.00% 0.000us 0.000us 3
cudaDeviceSynchronize 58.05% 3.245ms 58.05% 3.245ms 3.245ms 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 5.590ms
Self CUDA time total: 3.463ms
impl wl p50(ms) ok
hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.91 True
hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.98 True
hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.01 True
hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.01 True
hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.18 True
hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.17 True
▶ UV Install Logs
Fetching 5 files: 0%| | 0/5 [00:00<?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
Fetching 5 files: 20%|██ | 1/5 [00:00<00:01, 3.45it/s]
Fetching 5 files: 40%|████ | 2/5 [00:01<00:02, 1.11it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:01<00:00, 3.08it/s]