diff --git "a/flash_attn/results/combined_results.html" "b/flash_attn/results/combined_results.html" --- "a/flash_attn/results/combined_results.html" +++ "b/flash_attn/results/combined_results.html" @@ -719,6 +719,41 @@ .artifact-preview svg { background: transparent; } + /* CSV table styling */ + .artifact-csv { + margin-top: 1rem; + overflow-x: auto; + } + .csv-table { + width: 100%; + border-collapse: collapse; + font-size: 0.9rem; + background: var(--bg-secondary); + border: 1px solid var(--border-primary); + border-radius: 1px; + } + .csv-table th, + .csv-table td { + padding: 0.5rem 0.75rem; + text-align: left; + border: 1px solid var(--border-primary); + } + .csv-table th { + background: var(--bg-tertiary); + font-weight: 600; + color: var(--text-primary); + } + .csv-table tbody tr:hover { + background: var(--bg-artifact-hover); + } + .artifact-csv-error { + margin-top: 1rem; + padding: 1rem; + background: var(--bg-error); + color: var(--text-error); + border: 1px solid var(--border-error); + border-radius: 1px; + } .cell-failed { border-color: var(--border-cell-failed); } @@ -1073,6 +1108,159 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: overflow: scroll; max-height: 300px; } +svg { + max-width: 100%; + height: auto; + cursor: crosshair; +} + +/* Hover effects for series lines */ +.series path { + stroke-width: 6 !important; /* make lines easier to hover */ + transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1); + filter: drop-shadow(0 2px 4px rgba(0, 0, 0, 0.1)); +} + +/* remove border on focus */ +g:focus { + outline: none !important; +} + +.series:hover path { + stroke-width: 3; + filter: drop-shadow(0 4px 8px rgba(0, 0, 0, 0.2)); + /* transform: translateY(-1px); */ +} + +.series:hover circle { + r: 5; + filter: drop-shadow(0 2px 6px rgba(0, 0, 0, 0.3)); +} + +/* Individual series hover colors with glow */ +#series--torch-flash-ma:hover path { + stroke: #0066cc; + stroke-width: 4; + filter: drop-shadow(0 0 8px #1f77b4); +} + +#series--torch-mem-eff:hover path { + stroke: #ff6600; + stroke-width: 4; + filter: drop-shadow(0 0 8px #ff7f0e); +} + +#series--xformers-meff:hover path { + stroke: #228833; + stroke-width: 4; + filter: drop-shadow(0 0 8px #2ca02c); +} + +#series--torch-flash-compiled-default:hover path { + stroke: #cc0000; + stroke-width: 4; + filter: drop-shadow(0 0 8px #d62728); +} + +#series--torch-flash-compiled-max-autotune:hover path { + stroke: #7733aa; + stroke-width: 4; + filter: drop-shadow(0 0 8px #9467bd); +} + +#series--hf-kernels-flash-attn:hover path { + stroke: #664422; + stroke-width: 4; + filter: drop-shadow(0 0 8px #8c564b); +} + +#series--hf-kernels-flash-attn3:hover path { + stroke: #cc3399; + stroke-width: 4; + filter: drop-shadow(0 0 8px #e377c2); +} + +/* Cursor changes */ +.series { + cursor: pointer; +} + +.series:hover { + cursor: pointer; +} + +/* Tooltip styles */ +.tooltip { + position: absolute; + background: rgba(0, 0, 0, 0.9); + color: white; + padding: 12px 16px; + border-radius: 8px; + font-size: 14px; + font-weight: 500; + pointer-events: none; + opacity: 0; + transition: opacity 0.3s ease; + z-index: 1000; + backdrop-filter: blur(10px); + border: 1px solid rgba(255, 255, 255, 0.2); + box-shadow: 0 8px 16px rgba(0, 0, 0, 0.3); +} + +.tooltip.show { + opacity: 1; +} + +/* Legend hover effects */ +.legend g:hover text { + font-weight: bold; + fill: #333; +} + +.legend g { + cursor: pointer; + transition: all 0.2s ease; +} + +.legend g:hover { + transform: translateX(5px); +} + +/* Subtle animations */ +@keyframes pulse { + 0%, 100% { opacity: 1; } + 50% { opacity: 0.7; } +} + +.series:active path { + animation: pulse 0.3s ease; +} + +/* Responsive design */ +@media (max-width: 768px) { + .chart-container { + padding: 15px; + margin: 10px; + } +} + +/* Loading animation */ +.chart-container::before { + content: ''; + position: absolute; + top: 0; + left: 0; + right: 0; + bottom: 0; + background: linear-gradient(90deg, transparent, rgba(255,255,255,0.4), transparent); + transform: translateX(-100%); + animation: shimmer 2s ease-in-out; +} + +@keyframes shimmer { + 0% { transform: translateX(-100%); } + 100% { transform: translateX(100%); } +} @@ -3718,20 +3906,1192 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:

Flash Attention Benchmarks - Aggregated Results

This document combines benchmark results from multiple attention implementations using cross-file dependencies.

-

Combined Summary and Visualization

+

Combined Summary and Visualization>

+
+ + + + + + + 2025-10-02T18:12:57.416034 + image/svg+xml + + + Matplotlib v3.10.6, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + flux_L128 + + + + + + + + + + + + + flux_L256 + + + + + + + + + + + + + flux_L320 + + + + + + + + + + + + + flux_L384 + + + + + + + + + + + + + flux_L448 + + + + + + + + + + + + + flux_L512 + + + + Workload + + + + + + + + + + + + + + + + + 0.4 + + + + + + + + + + + + + 0.5 + + + + + + + + + + + + + 0.6 + + + + + + + + + + + + + 0.7 + + + + + + + + + + + + + 0.8 + + + + + + + + + + + + + 0.9 + + + + Latency P50 (ms) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Attention Implementation Latency + + + + + + + + + + + + + torch_flash_ma + + + + + + + + + torch_mem_eff + + + + + + + + + xformers_meff + + + + + + + + + torch_flash_compiled_default + + + + + + + + + torch_flash_compiled_max_autotune + + + + + + + + + hf_kernels_flash_attn + + + + + + + + + hf_kernels_flash_attn3 + + + + + + + + + + +
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ImplementationImpl IDWorkloadBatchSeq LengthHeadsHead DimDtypeMean (ms)P10 (ms)P50 (ms)P90 (ms)RepsPeak Mem (MB)BackendFamily
Flash (PyTorch SDPA)torch_flash_maflux_L1281115224128bfloat160.485772800445556650.478368014097213750.48035201430320740.4827199876308441583.38FLASHtorch-sdpa
Flash (PyTorch SDPA)torch_flash_maflux_L2561128024128bfloat160.52291840314865110.5217279791831970.52288001775741580.5234559774398804590.62FLASHtorch-sdpa
Flash (PyTorch SDPA)torch_flash_maflux_L3201134424128bfloat160.65159678459167480.65039998292922970.6508799791336060.6513599753379822595.06FLASHtorch-sdpa
Flash (PyTorch SDPA)torch_flash_maflux_L3841140824128bfloat160.68076159954071050.67974400520324710.68086397647857670.6815680265426636599.88FLASHtorch-sdpa
Flash (PyTorch SDPA)torch_flash_maflux_L4481147224128bfloat160.7110271930694580.70588797330856320.71219199895858760.71315199136734015103.81FLASHtorch-sdpa
Flash (PyTorch SDPA)torch_flash_maflux_L5121153624128bfloat160.73914239406585690.73692798614501950.73839998245239260.74089598655700685109.12FLASHtorch-sdpa
MemEff (PyTorch SDPA)torch_mem_effflux_L1281115224128bfloat160.58756479024887090.58636802434921260.58745598793029790.5876479744911194583.38EFFICIENTtorch-sdpa
MemEff (PyTorch SDPA)torch_mem_effflux_L2561128024128bfloat160.6536576032638550.64854401350021360.65376001596450810.656544029712677590.62EFFICIENTtorch-sdpa
MemEff (PyTorch SDPA)torch_mem_effflux_L3201134424128bfloat160.77845120429992680.7749440073966980.7786560058593750.7801600098609924595.94EFFICIENTtorch-sdpa
MemEff (PyTorch SDPA)torch_mem_effflux_L3841140824128bfloat160.79229439496994010.7912639975547790.79247999191284180.79270398616790775100.0EFFICIENTtorch-sdpa
MemEff (PyTorch SDPA)torch_mem_effflux_L4481147224128bfloat160.8480895876884460.84447997808456420.84700798988342290.84991997480392465103.81EFFICIENTtorch-sdpa
MemEff (PyTorch SDPA)torch_mem_effflux_L5121153624128bfloat160.95237760543823250.950048029422760.95190399885177610.95411199331283575109.12EFFICIENTtorch-sdpa
xFormersxformers_meffflux_L1281115224128bfloat160.450668799877166740.44742399454116820.449216008186340330.45241600275039673583.38memory_efficientxformers
xFormersxformers_meffflux_L2561128024128bfloat160.470041596889495870.46524798870086670.47059199213981630.4716799855232239590.62memory_efficientxformers
xFormersxformers_meffflux_L3201134424128bfloat160.60227839946746830.59878402948379520.60217601060867310.6045759916305542595.06memory_efficientxformers
xFormersxformers_meffflux_L3841140824128bfloat160.60130559206008920.60003197193145750.6003839969635010.6016640067100525599.88memory_efficientxformers
xFormersxformers_meffflux_L4481147224128bfloat160.64081920385360710.6391360163688660.64044797420501710.64163202047348025103.81memory_efficientxformers
xFormersxformers_meffflux_L5121153624128bfloat160.64661118984222410.64473599195480350.64623999595642090.64838397502899175109.12memory_efficientxformers
Compiled (default)torch_flash_compiled_defaultflux_L1281115224128bfloat160.5273472070693970.51945602893829350.52729600667953490.5312960147857666583.38FLASHtorch-sdpa
Compiled (default)torch_flash_compiled_defaultflux_L2561128024128bfloat160.55863679647445670.55606400966644290.55715197324752810.5611839890480042590.62FLASHtorch-sdpa
Compiled (default)torch_flash_compiled_defaultflux_L3201134424128bfloat160.68607360124588010.68419200181961060.68601602315902710.6869760155677795595.25FLASHtorch-sdpa
Compiled (default)torch_flash_compiled_defaultflux_L3841140824128bfloat160.71673600673675540.71520000696182250.71619200706481930.7164160013198853599.88FLASHtorch-sdpa
Compiled (default)torch_flash_compiled_defaultflux_L4481147224128bfloat160.74232958555221560.74009597301483150.7421439886093140.74310398101806645103.81FLASHtorch-sdpa
Compiled (default)torch_flash_compiled_defaultflux_L5121153624128bfloat160.77438719272613520.77180802822113040.77459198236465450.77481597661972055109.12FLASHtorch-sdpa
Compiled (max-autotune)torch_flash_compiled_max_autotuneflux_L1281115224128bfloat160.64890880584716790.61481600999832150.62969601154327390.6522240042686462567.5FLASHtorch-sdpa
Compiled (max-autotune)torch_flash_compiled_max_autotuneflux_L2561128024128bfloat160.7007615923881530.66153597831726070.68211197853088380.7128959894180298575.0FLASHtorch-sdpa
Compiled (max-autotune)torch_flash_compiled_max_autotuneflux_L3201134424128bfloat160.8344447970390320.79670399427413940.81647998094558720.8463680148124695580.38FLASHtorch-sdpa
Compiled (max-autotune)torch_flash_compiled_max_autotuneflux_L3841140824128bfloat160.87093759775161740.84323197603225710.84985601902008060.8750079870223999582.5FLASHtorch-sdpa
Compiled (max-autotune)torch_flash_compiled_max_autotuneflux_L4481147224128bfloat160.90696319341659550.87753599882125850.90307199954986570.903872013092041586.25FLASHtorch-sdpa
Compiled (max-autotune)torch_flash_compiled_max_autotuneflux_L5121153624128bfloat160.93715840578079220.91459202766418460.91641598939895630.9357439875602722590.0FLASHtorch-sdpa
HF Kernels Flash Attnhf_kernels_flash_attnflux_L1281115224128bfloat160.344467198848724340.34387201070785520.34451198577880860.34457600116729736583.38flash-attnhf-kernels
HF Kernels Flash Attnhf_kernels_flash_attnflux_L2561128024128bfloat160.375718390941619850.374047994613647460.37638399004936220.3766399919986725590.62flash-attnhf-kernels
HF Kernels Flash Attnhf_kernels_flash_attnflux_L3201134424128bfloat160.49459200501441960.49254399538040160.4931200146675110.4938240051269531595.06flash-attnhf-kernels
HF Kernels Flash Attnhf_kernels_flash_attnflux_L3841140824128bfloat160.51396478414535520.51231998205184940.51427197456359860.5147839784622192599.88flash-attnhf-kernels
HF Kernels Flash Attnhf_kernels_flash_attnflux_L4481147224128bfloat160.53538559675216680.53391999006271360.53500801324844360.53523200750350955103.81flash-attnhf-kernels
HF Kernels Flash Attnhf_kernels_flash_attnflux_L5121153624128bfloat160.55488001108169560.55385601520538330.55488002300262450.55532801151275635109.12flash-attnhf-kernels
HF Kernels Flash Attn3hf_kernels_flash_attn3flux_L1281115224128bfloat160.36173439621925350.361023992300033570.36169600486755370.36211198568344116583.38flash-attn3hf-kernels
HF Kernels Flash Attn3hf_kernels_flash_attn3flux_L2561128024128bfloat160.39079679846763610.38854399323463440.390560001134872440.3906239867210388590.62flash-attn3hf-kernels
HF Kernels Flash Attn3hf_kernels_flash_attn3flux_L3201134424128bfloat160.52289919853210450.5213440060615540.52307200431823730.5232319831848145595.06flash-attn3hf-kernels
HF Kernels Flash Attn3hf_kernels_flash_attn3flux_L3841140824128bfloat160.52546560764312740.5239040255546570.52499198913574220.526528000831604599.88flash-attn3hf-kernels
HF Kernels Flash Attn3hf_kernels_flash_attn3flux_L4481147224128bfloat160.56465920209884650.56278401613235470.5653439760208130.5653439760208135103.81flash-attn3hf-kernels
HF Kernels Flash Attn3hf_kernels_flash_attn3flux_L5121153624128bfloat160.56988799571990970.5673599839210510.56966400146484380.56985598802566535109.12flash-attn3hf-kernels
+
+
▶ code -▼ output +▶ output ▶ uv-logs | -Cell: combine | 36.17s +Cell: combine | 36.30s | Raw
-