HF Kernels - Flash Attention 3

Hugging Face Kernels Flash Attention 3 Benchmark

▼ code ▼ output ▶ uv-logs | Cell: benchmark | 5.65s | Raw GitHub 🤗 HF
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "numpy",
#     "torch==2.8.0",
#     "kernels-benchmark-tools",
#     "kernels",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
import os
import kernels_benchmark_tools as kbt
from kernels import get_kernel

# Resolve the Flash Attention 3 kernel package "kernels-community/flash-attn3"
# through kernels.get_kernel. This happens at import time, i.e. before the
# CUDA availability check in the __main__ guard further down.
# NOTE(review): get_kernel presumably downloads (or reads from a local cache)
# on first use — confirm network requirements for offline runs.
hf_kernels_flash_attn3 = get_kernel("kernels-community/flash-attn3")


def hf_flash_attention3(query, key, value):
    """Run non-causal attention through the HF Kernels Flash Attention 3 op.

    Forwards ``query``, ``key`` and ``value`` unchanged to the loaded
    kernel's ``flash_attn_func`` with ``causal=False`` and returns element
    ``[0]`` of whatever that call produces.

    NOTE(review): element 0 is presumably the attention output, with any
    remaining elements being auxiliary results — confirm against the
    kernel's API.
    """
    result = hf_kernels_flash_attn3.flash_attn_func(
        query,
        key,
        value,
        causal=False,
    )
    return result[0]


# Register the wrapper with the benchmark tools under the implementation name
# "hf_kernels_flash_attn3". The tags are passed through as-is.
# NOTE(review): "compile": "none" presumably marks that no torch.compile
# wrapping is applied — confirm how kernels_benchmark_tools consumes tags.
kbt.add(
    "hf_kernels_flash_attn3",
    hf_flash_attention3,
    tags={"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"},
)

if __name__ == "__main__":
    # This benchmark only runs on GPU: when no CUDA device is available,
    # report that and exit with status 0 so the run counts as a skip.
    if not torch.cuda.is_available():
        print("HF Kernels Flash Attention 3 requires CUDA - skipping benchmark")
        sys.exit(0)

    device = "cuda"
    dtype = "bfloat16"

    # Flux-like workloads: every workload uses a sequence length equal to a
    # fixed base plus one of the increments listed below.
    base_len = 1024
    increments = [128, 256, 320, 384, 448, 512]
    num_heads = 24
    dim_per_head = 128

    # One workload description per increment; key order is kept stable since
    # these dicts are handed to the benchmark runner as-is.
    workloads = [
        {
            "name": f"flux_L{L}",
            "batch": 1,
            "seq_len": base_len + L,
            "heads": num_heads,
            "head_dim": dim_per_head,
            "dtype": dtype,
            "device": device,
            "seed": 0,
        }
        for L in increments
    ]

    # Run every registered implementation over the workloads. The same
    # "attn.jsonl" path is given to run() and then to summarize().
    kbt.run(
        workloads,
        jsonl="attn.jsonl",
        reps=5,
        warmup=2,
        gen=kbt.attn.gen_qkv,
        ref=kbt.attn.ref_math,
        cmp=kbt.attn.cmp_allclose,
        profile_trace=True,
    )
    kbt.summarize(["attn.jsonl"])
======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | flux_L128
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                 hf_kernels_flash_attn3         9.00%     178.129us        99.63%       1.971ms       1.971ms       0.000us         0.00%     345.823us     345.823us             1  
                                          FlashAttnFunc         6.66%     131.797us        90.63%       1.793ms     597.659us       0.000us         0.00%     345.823us     115.274us             3  
                        _flash_attn3_48fe103_dirty::fwd         4.56%      90.256us        83.97%       1.661ms     553.727us     259.583us       100.00%     345.823us     115.274us             3  
                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us     292.158us       112.55%     292.158us     292.158us             1  
void cutlass::device_kernel<flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us     259.583us       100.00%     259.583us      86.528us             3  
                                Activity Buffer Request        73.82%       1.460ms        73.82%       1.460ms       1.460ms      86.240us        33.22%      86.240us      86.240us             1  
                                            aten::empty         2.53%      50.052us         2.53%      50.052us       8.342us       0.000us         0.00%       0.000us       0.000us             6  
                                   cudaFuncSetAttribute         0.86%      16.921us         0.86%      16.921us       5.640us       0.000us         0.00%       0.000us       0.000us             3  
                                       cudaLaunchKernel         2.20%      43.551us         2.20%      43.551us      14.517us       0.000us         0.00%       0.000us       0.000us             3  
                                  cudaDeviceSynchronize         0.37%       7.311us         0.37%       7.311us       7.311us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.978ms
Self CUDA time total: 259.583us



======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | flux_L256
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                 hf_kernels_flash_attn3         7.20%     133.787us        96.41%       1.793ms       1.793ms       0.000us         0.00%     393.753us     393.753us             1  
                                          FlashAttnFunc         5.05%      93.854us        89.22%       1.659ms     552.953us       0.000us         0.00%     393.753us     131.251us             3  
                        _flash_attn3_48fe103_dirty::fwd         2.68%      49.913us        84.17%       1.565ms     521.669us     293.595us       100.00%     393.753us     131.251us             3  
                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us     295.003us       100.48%     295.003us     295.003us             1  
void cutlass::device_kernel<flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us     293.595us       100.00%     293.595us      97.865us             3  
                                Activity Buffer Request        78.08%       1.452ms        78.08%       1.452ms       1.452ms     100.158us        34.11%     100.158us     100.158us             1  
                                            aten::empty         1.44%      26.770us         1.44%      26.770us       4.462us       0.000us         0.00%       0.000us       0.000us             6  
                                   cudaFuncSetAttribute         0.31%       5.680us         0.31%       5.680us       1.893us       0.000us         0.00%       0.000us       0.000us             3  
                                       cudaLaunchKernel         1.66%      30.852us         1.66%      30.852us      10.284us       0.000us         0.00%       0.000us       0.000us             3  
                                  cudaDeviceSynchronize         3.59%      66.713us         3.59%      66.713us      66.713us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.859ms
Self CUDA time total: 293.595us



======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | flux_L320
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                 hf_kernels_flash_attn3         6.76%     125.695us        94.13%       1.750ms       1.750ms       0.000us         0.00%     430.748us     430.748us             1  
                                          FlashAttnFunc         4.90%      91.016us        87.37%       1.624ms     541.277us       0.000us         0.00%     430.748us     143.583us             3  
                        _flash_attn3_48fe103_dirty::fwd         2.79%      51.770us        82.47%       1.533ms     510.938us     324.541us       100.00%     430.748us     143.583us             3  
                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us     325.948us       100.43%     325.948us     325.948us             1  
void cutlass::device_kernel<flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us     324.541us       100.00%     324.541us     108.180us             3  
                                Activity Buffer Request        76.46%       1.421ms        76.46%       1.421ms       1.421ms     106.207us        32.73%     106.207us     106.207us             1  
                                            aten::empty         1.41%      26.162us         1.41%      26.162us       4.360us       0.000us         0.00%       0.000us       0.000us             6  
                                   cudaFuncSetAttribute         0.27%       5.061us         0.27%       5.061us       1.687us       0.000us         0.00%       0.000us       0.000us             3  
                                       cudaLaunchKernel         1.55%      28.862us         1.55%      28.862us       9.621us       0.000us         0.00%       0.000us       0.000us             3  
                                  cudaDeviceSynchronize         5.87%     109.015us         5.87%     109.015us     109.015us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.859ms
Self CUDA time total: 324.541us



======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | flux_L384
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                 hf_kernels_flash_attn3         6.04%     124.874us        95.07%       1.964ms       1.964ms       0.000us         0.00%     429.567us     429.567us             1  
                                          FlashAttnFunc         4.57%      94.345us        89.03%       1.840ms     613.174us       0.000us         0.00%     429.567us     143.189us             3  
                        _flash_attn3_48fe103_dirty::fwd         2.60%      53.754us        84.46%       1.745ms     581.725us     322.591us       100.00%     429.567us     143.189us             3  
                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us     324.063us       100.46%     324.063us     324.063us             1  
void cutlass::device_kernel<flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us     322.591us       100.00%     322.591us     107.530us             3  
                                Activity Buffer Request        69.43%       1.434ms        69.43%       1.434ms       1.434ms     106.976us        33.16%     106.976us     106.976us             1  
                                            aten::empty         1.29%      26.591us         1.29%      26.591us       4.432us       0.000us         0.00%       0.000us       0.000us             6  
                                   cudaFuncSetAttribute         0.25%       5.220us         0.25%       5.220us       1.740us       0.000us         0.00%       0.000us       0.000us             3  
                                       cudaLaunchKernel        10.90%     225.141us        10.90%     225.141us      75.047us       0.000us         0.00%       0.000us       0.000us             3  
                                  cudaDeviceSynchronize         4.93%     101.805us         4.93%     101.805us     101.805us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.066ms
Self CUDA time total: 322.591us



======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | flux_L448
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                 hf_kernels_flash_attn3         5.77%     124.745us        87.87%       1.900ms       1.900ms       0.000us         0.00%     654.301us     654.301us             1  
                                          FlashAttnFunc         4.37%      94.576us        82.10%       1.775ms     591.589us       0.000us         0.00%     654.301us     218.100us             3  
                        _flash_attn3_48fe103_dirty::fwd         2.37%      51.203us        77.72%       1.680ms     560.064us     488.670us       100.00%     654.301us     218.100us             3  
                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us     490.142us       100.30%     490.142us     490.142us             1  
void cutlass::device_kernel<flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us     488.670us       100.00%     488.670us     162.890us             3  
                                Activity Buffer Request        66.37%       1.435ms        66.37%       1.435ms       1.435ms     165.631us        33.89%     165.631us     165.631us             1  
                                            aten::empty         1.25%      26.990us         1.25%      26.990us       4.498us       0.000us         0.00%       0.000us       0.000us             6  
                                   cudaFuncSetAttribute         0.24%       5.250us         0.24%       5.250us       1.750us       0.000us         0.00%       0.000us       0.000us             3  
                                       cudaLaunchKernel         7.49%     161.858us         7.49%     161.858us      53.953us       0.000us         0.00%       0.000us       0.000us             3  
                                  cudaDeviceSynchronize        12.13%     262.313us        12.13%     262.313us     262.313us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.162ms
Self CUDA time total: 488.670us



======================================================================
PROFILE TRACE: hf_kernels_flash_attn3 | flux_L512
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                 hf_kernels_flash_attn3         5.69%     119.216us        86.59%       1.815ms       1.815ms       0.000us         0.00%     666.625us     666.625us             1  
                                          FlashAttnFunc         4.40%      92.224us        80.91%       1.696ms     565.401us       0.000us         0.00%     666.625us     222.208us             3  
                        _flash_attn3_48fe103_dirty::fwd         2.44%      51.234us        76.51%       1.604ms     534.659us     497.473us       100.00%     666.625us     222.208us             3  
                                 hf_kernels_flash_attn3         0.00%       0.000us         0.00%       0.000us       0.000us     498.849us       100.28%     498.849us     498.849us             1  
void cutlass::device_kernel<flash::enable_sm80_to_sm...         0.00%       0.000us         0.00%       0.000us       0.000us     497.473us       100.00%     497.473us     165.824us             3  
                                Activity Buffer Request        64.99%       1.363ms        64.99%       1.363ms       1.363ms     169.152us        34.00%     169.152us     169.152us             1  
                                            aten::empty         1.25%      26.300us         1.25%      26.300us       4.383us       0.000us         0.00%       0.000us       0.000us             6  
                                   cudaFuncSetAttribute         0.27%       5.600us         0.27%       5.600us       1.867us       0.000us         0.00%       0.000us       0.000us             3  
                                       cudaLaunchKernel         7.55%     158.288us         7.55%     158.288us      52.763us       0.000us         0.00%       0.000us       0.000us             3  
                                  cudaDeviceSynchronize        13.41%     281.113us        13.41%     281.113us     281.113us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.097ms
Self CUDA time total: 497.473us


impl                     wl                  p50(ms)  ok
hf_kernels_flash_attn3   flux_L128              0.13  True
hf_kernels_flash_attn3   flux_L256              0.15  True
hf_kernels_flash_attn3   flux_L320              0.16  True
hf_kernels_flash_attn3   flux_L384              0.16  True
hf_kernels_flash_attn3   flux_L448              0.21  True
hf_kernels_flash_attn3   flux_L512              0.21  True
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s] Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.23it/s] Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.46it/s]

Artifacts:

attn.jsonl