Flash Attention Benchmarks - Aggregated Results

-

This document combines benchmark results from multiple attention implementations using cross-file dependencies.

-

Combined Summary and Visualization

@@ -4012,7 +3870,7 @@ using cross-file dependencies.

- 2025-10-14T20:47:15.973248 + 2025-10-23T17:22:40.731286 image/svg+xml @@ -4025,2774 +3883,524 @@ using cross-file dependencies.

- + - + - + - + - flux_L128 + flux_L128 - + - + - flux_L256 + flux_L256 - + - + - flux_L320 + flux_L320 - + - + - flux_L384 + flux_L384 - + - + - flux_L448 + flux_L448 - + - + - flux_L512 + flux_L512 - Workload + Workload - + - + - 0.3 + 0.15 - + - + - 0.4 + 0.20 - + - + - 0.5 + 0.25 - + - + - 0.6 + 0.30 - + - + - 0.7 - - - - - - - - - - - - - 0.8 - - - - - - - - - - - - - 0.9 - - - - - - - - - - - - - 1.0 + 0.35 - Latency P50 (ms) + Latency P50 (ms) - + - - - - - - - + + + + + + + - + - - - - - - - + + + + + + + - + - - - - - - - + + + + + + + - + - - - - - - - + + + + + - + - - - - - - - + + + + + - + - - - - - - - + + + + + + + - + - - - - - - - + + + + + + + - + - + - + - - Attention Implementation Latency + + Attention Implementation Latency - + - - + + - + - torch_flash_ma + torch_flash_ma - - + + - + - torch_mem_eff + torch_mem_eff - - + + - + - xformers_meff + xformers_meff - - + + - + - torch_flash_compiled_default + torch_flash_compiled_default - - + + - + - torch_flash_compiled_max_autotune + torch_flash_compiled_max_autotune - - + + - + - hf_kernels_flash_attn + hf_kernels_flash_attn - - + + - + - hf_kernels_flash_attn3 + hf_kernels_flash_attn3 - - + +
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Implementation | Impl ID | Workload | Batch | Seq Length | Heads | Head Dim | Dtype | Mean (ms) | P10 (ms) | P50 (ms) | P90 (ms) | Reps | Peak Mem (MB) | Backend | Family
Flash (PyTorch SDPA) | torch_flash_ma | flux_L128 | 1 | 1152 | 24 | 128 | bfloat16 | 0.407123202085495 | 0.40537598729133606 | 0.40755200386047363 | 0.407584011554718 | 5 | 83.38 | FLASH | torch-sdpa
Flash (PyTorch SDPA) | torch_flash_ma | flux_L256 | 1 | 1280 | 24 | 128 | bfloat16 | 0.5235007882118226 | 0.5212159752845764 | 0.5232639908790588 | 0.523360013961792 | 5 | 90.62 | FLASH | torch-sdpa
Flash (PyTorch SDPA) | torch_flash_ma | flux_L320 | 1 | 1344 | 24 | 128 | bfloat16 | 0.545849597454071 | 0.5418559908866882 | 0.5468159914016724 | 0.5469120144844055 | 5 | 95.06 | FLASH | torch-sdpa
Flash (PyTorch SDPA) | torch_flash_ma | flux_L384 | 1 | 1408 | 24 | 128 | bfloat16 | 0.5892416119575501 | 0.5867519974708557 | 0.5888000130653381 | 0.5888000130653381 | 5 | 99.88 | FLASH | torch-sdpa
Flash (PyTorch SDPA) | torch_flash_ma | flux_L448 | 1 | 1472 | 24 | 128 | bfloat16 | 0.6449280023574829 | 0.6430720090866089 | 0.6442239880561829 | 0.6450240015983582 | 5 | 103.81 | FLASH | torch-sdpa
Flash (PyTorch SDPA) | torch_flash_ma | flux_L512 | 1 | 1536 | 24 | 128 | bfloat16 | 0.6823423862457275 | 0.6777600049972534 | 0.6809599995613098 | 0.6818559765815735 | 5 | 109.12 | FLASH | torch-sdpa
MemEff (PyTorch SDPA) | torch_mem_eff | flux_L128 | 1 | 1152 | 24 | 128 | bfloat16 | 0.48371200561523436 | 0.4821760058403015 | 0.4833280146121979 | 0.4853760004043579 | 5 | 83.38 | EFFICIENT | torch-sdpa
MemEff (PyTorch SDPA) | torch_mem_eff | flux_L256 | 1 | 1280 | 24 | 128 | bfloat16 | 0.6268800020217895 | 0.6246399879455566 | 0.6266880035400391 | 0.6286720037460327 | 5 | 90.62 | EFFICIENT | torch-sdpa
MemEff (PyTorch SDPA) | torch_mem_eff | flux_L320 | 1 | 1344 | 24 | 128 | bfloat16 | 0.699776005744934 | 0.6973440051078796 | 0.7004160284996033 | 0.7004479765892029 | 5 | 95.94 | EFFICIENT | torch-sdpa
MemEff (PyTorch SDPA) | torch_mem_eff | flux_L384 | 1 | 1408 | 24 | 128 | bfloat16 | 0.8333312034606933 | 0.8284159898757935 | 0.8325120210647583 | 0.8376320004463196 | 5 | 100.0 | EFFICIENT | torch-sdpa
MemEff (PyTorch SDPA) | torch_mem_eff | flux_L448 | 1 | 1472 | 24 | 128 | bfloat16 | 0.9533439993858337 | 0.9502720236778259 | 0.9512959718704224 | 0.9572479724884033 | 5 | 103.81 | EFFICIENT | torch-sdpa
MemEff (PyTorch SDPA) | torch_mem_eff | flux_L512 | 1 | 1536 | 24 | 128 | bfloat16 | 1.0066367864608765 | 1.0024960041046143 | 1.0045440196990967 | 1.0097919702529907 | 5 | 109.12 | EFFICIENT | torch-sdpa
xFormers | xformers_meff | flux_L128 | 1 | 1152 | 24 | 128 | bfloat16 | 0.3452928066253662 | 0.3389439880847931 | 0.3461120128631592 | 0.3461120128631592 | 5 | 83.38 | memory_efficient | xformers
xFormers | xformers_meff | flux_L256 | 1 | 1280 | 24 | 128 | bfloat16 | 0.41234560012817384 | 0.40959998965263367 | 0.41280001401901245 | 0.41286399960517883 | 5 | 90.62 | memory_efficient | xformers
xFormers | xformers_meff | flux_L320 | 1 | 1344 | 24 | 128 | bfloat16 | 0.4366208016872406 | 0.4310399889945984 | 0.4331519901752472 | 0.4362240135669708 | 5 | 95.06 | memory_efficient | xformers
xFormers | xformers_meff | flux_L384 | 1 | 1408 | 24 | 128 | bfloat16 | 0.4450624048709869 | 0.4359680116176605 | 0.44361600279808044 | 0.447488009929657 | 5 | 99.88 | memory_efficient | xformers
xFormers | xformers_meff | flux_L448 | 1 | 1472 | 24 | 128 | bfloat16 | 0.4750400006771088 | 0.4711039960384369 | 0.47513601183891296 | 0.4763199985027313 | 5 | 103.81 | memory_efficient | xformers
xFormers | xformers_meff | flux_L512 | 1 | 1536 | 24 | 128 | bfloat16 | 0.5009407997131348 | 0.49663999676704407 | 0.4997119903564453 | 0.5038080215454102 | 5 | 109.12 | memory_efficient | xformers
Compiled (default) | torch_flash_compiled_default | flux_L128 | 1 | 1152 | 24 | 128 | bfloat16 | 0.3856383919715881 | 0.3563520014286041 | 0.35942399501800537 | 0.3624959886074066 | 5 | 83.38 | FLASH | torch-sdpa
Compiled (default) | torch_flash_compiled_default | flux_L256 | 1 | 1280 | 24 | 128 | bfloat16 | 0.4982912003993988 | 0.4926080107688904 | 0.49663999676704407 | 0.5017600059509277 | 5 | 90.62 | FLASH | torch-sdpa
Compiled (default) | torch_flash_compiled_default | flux_L320 | 1 | 1344 | 24 | 128 | bfloat16 | 0.5369919896125793 | 0.5335040092468262 | 0.5366079807281494 | 0.5386239886283875 | 5 | 95.25 | FLASH | torch-sdpa
Compiled (default) | torch_flash_compiled_default | flux_L384 | 1 | 1408 | 24 | 128 | bfloat16 | 0.5841408014297486 | 0.5775359869003296 | 0.5868800282478333 | 0.5877760052680969 | 5 | 99.88 | FLASH | torch-sdpa
Compiled (default) | torch_flash_compiled_default | flux_L448 | 1 | 1472 | 24 | 128 | bfloat16 | 0.6184704065322876 | 0.6072319746017456 | 0.6113280057907104 | 0.6144000291824341 | 5 | 103.81 | FLASH | torch-sdpa
Compiled (default) | torch_flash_compiled_default | flux_L512 | 1 | 1536 | 24 | 128 | bfloat16 | 0.6428672075271606 | 0.6399999856948853 | 0.6430720090866089 | 0.6430720090866089 | 5 | 109.12 | FLASH | torch-sdpa
Compiled (max-autotune) | torch_flash_compiled_max_autotune | flux_L128 | 1 | 1152 | 24 | 128 | bfloat16 | 0.40020479559898375 | 0.3665919899940491 | 0.3768320083618164 | 0.41171199083328247 | 5 | 81.75 | FLASH | torch-sdpa
Compiled (max-autotune) | torch_flash_compiled_max_autotune | flux_L256 | 1 | 1280 | 24 | 128 | bfloat16 | 0.5535807967185974 | 0.5160959959030151 | 0.5489599704742432 | 0.5631359815597534 | 5 | 92.88 | FLASH | torch-sdpa
Compiled (max-autotune) | torch_flash_compiled_max_autotune | flux_L320 | 1 | 1344 | 24 | 128 | bfloat16 | 0.6143999934196472 | 0.562175989151001 | 0.6144000291824341 | 0.6318079829216003 | 5 | 95.13 | FLASH | torch-sdpa
Compiled (max-autotune) | torch_flash_compiled_max_autotune | flux_L384 | 1 | 1408 | 24 | 128 | bfloat16 | 0.6754495978355408 | 0.6512640118598938 | 0.6584320068359375 | 0.6799359917640686 | 5 | 97.13 | FLASH | torch-sdpa
Compiled (max-autotune) | torch_flash_compiled_max_autotune | flux_L448 | 1 | 1472 | 24 | 128 | bfloat16 | 0.7210752129554748 | 0.6973119974136353 | 0.7014080286026001 | 0.7229440212249756 | 5 | 99.0 | FLASH | torch-sdpa
Compiled (max-autotune) | torch_flash_compiled_max_autotune | flux_L512 | 1 | 1536 | 24 | 128 | bfloat16 | 0.7735359907150269 | 0.7485439777374268 | 0.7557439804077148 | 0.7710719704627991 | 5 | 101.63 | FLASH | torch-sdpa
HF Kernels Flash Attn | hf_kernels_flash_attn | flux_L128 | 1 | 1152 | 24 | 128 | bfloat16 | 0.2456959992647171 | 0.24371199309825897 | 0.24566400051116943 | 0.2457599937915802 | 5 | 83.38 | flash-attn | hf-kernels
HF Kernels Flash Attn | hf_kernels_flash_attn | flux_L256 | 1 | 1280 | 24 | 128 | bfloat16 | 0.3215551972389221 | 0.3164159953594208 | 0.319487988948822 | 0.32051199674606323 | 5 | 90.62 | flash-attn | hf-kernels
HF Kernels Flash Attn | hf_kernels_flash_attn | flux_L320 | 1 | 1344 | 24 | 128 | bfloat16 | 0.3384703993797302 | 0.33670398592948914 | 0.33792001008987427 | 0.33983999490737915 | 5 | 95.06 | flash-attn | hf-kernels
HF Kernels Flash Attn | hf_kernels_flash_attn | flux_L384 | 1 | 1408 | 24 | 128 | bfloat16 | 0.3510208010673523 | 0.3481599986553192 | 0.3491840064525604 | 0.35225600004196167 | 5 | 99.88 | flash-attn | hf-kernels
HF Kernels Flash Attn | hf_kernels_flash_attn | flux_L448 | 1 | 1472 | 24 | 128 | bfloat16 | 0.3829823970794678 | 0.38095998764038086 | 0.3829759955406189 | 0.3840000033378601 | 5 | 103.81 | flash-attn | hf-kernels
HF Kernels Flash Attn | hf_kernels_flash_attn | flux_L512 | 1 | 1536 | 24 | 128 | bfloat16 | 0.4259391903877258 | 0.4227519929409027 | 0.4249599874019623 | 0.4259839951992035 | 5 | 109.12 | flash-attn | hf-kernels
HF Kernels Flash Attn3 | hf_kernels_flash_attn3 | flux_L128 | 1 | 1152 | 24 | 128 | bfloat16 | 0.2755008041858673 | 0.26736000180244446 | 0.27561599016189575 | 0.27955201268196106 | 5 | 83.38 | flash-attn3 | hf-kernels
HF Kernels Flash Attn3 | hf_kernels_flash_attn3 | flux_L256 | 1 | 1280 | 24 | 128 | bfloat16 | 0.3397440016269684 | 0.3368000090122223 | 0.3399679958820343 | 0.34191998839378357 | 5 | 90.62 | flash-attn3 | hf-kernels
HF Kernels Flash Attn3 | hf_kernels_flash_attn3 | flux_L320 | 1 | 1344 | 24 | 128 | bfloat16 | 0.36019839644432067 | 0.3563520014286041 | 0.3604480028152466 | 0.36137598752975464 | 5 | 95.06 | flash-attn3 | hf-kernels
HF Kernels Flash Attn3 | hf_kernels_flash_attn3 | flux_L384 | 1 | 1408 | 24 | 128 | bfloat16 | 0.37342079877853396 | 0.3718400001525879 | 0.37379199266433716 | 0.3746879994869232 | 5 | 99.88 | flash-attn3 | hf-kernels
HF Kernels Flash Attn3 | hf_kernels_flash_attn3 | flux_L448 | 1 | 1472 | 24 | 128 | bfloat16 | 0.4024448037147522 | 0.3993600010871887 | 0.4014720022678375 | 0.4034560024738312 | 5 | 103.81 | flash-attn3 | hf-kernels
HF Kernels Flash Attn3 | hf_kernels_flash_attn3 | flux_L512 | 1 | 1536 | 24 | 128 | bfloat16 | 0.4305088043212891 | 0.4270080029964447 | 0.4291520118713379 | 0.4331519901752472 | 5 | 109.12 | flash-attn3 | hf-kernels
-
-
-▶ code -▶ output +▼ code +▼ output ▶ uv-logs | -Cell: combine | 34.17s +Cell: combine | 4.10s | Raw
-