Spaces:

kernels-community
/

kernels-benchmarks

Running

App Files Files Community

drbh HF Staff commited on Oct 2

Commit

9ad2ef6

verified ·

1 Parent(s): 0cce993

Upload folder using huggingface_hub

Browse files

Files changed (13) hide show

flash_attn/impls/artifacts/benchmark/attn.jsonl +6 -6
flash_attn/impls/artifacts/benchmark_default/attn_default.jsonl +6 -6
flash_attn/impls/artifacts/benchmark_max_autotune/attn_max_autotune.jsonl +6 -6
flash_attn/impls/compiled_variants.html +44 -44
flash_attn/impls/flash_attention.html +21 -21
flash_attn/impls/hf_kernels_flash_attn.html +27 -27
flash_attn/impls/hf_kernels_flash_attn3.html +24 -24
flash_attn/impls/mem_efficient_attention.html +17 -17
flash_attn/impls/sage_attention.html +27 -29
flash_attn/impls/xformers.html +22 -22
flash_attn/results/artifacts/combine/latency.png +3 -0
flash_attn/results/cells/combine.py +96 -0
flash_attn/results/combined_results.html +0 -0

flash_attn/impls/artifacts/benchmark/attn.jsonl CHANGED Viewed

@@ -1,6 +1,6 @@
-{"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.4453760087490082, "p50": 0.45241600275039673, "p90": 0.45257601141929626, "mean": 0.4501312017440796, "reps": 5, "warmup": 2}, "compile_ms": 1.8144960403442383, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.4647679924964905, "p50": 0.46665599942207336, "p90": 0.47142401337623596, "mean": 0.46863360404968263, "reps": 5, "warmup": 2}, "compile_ms": 0.3614720106124878, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.597823977470398, "p50": 0.6007360219955444, "p90": 0.6015999913215637, "mean": 0.6010496020317078, "reps": 5, "warmup": 2}, "compile_ms": 0.4886080026626587, "peak_bytes": 99680256, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.905726432800293e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6014080047607422, "p50": 0.6025919914245605, "p90": 0.6026239991188049, "mean": 0.6072191953659057, "reps": 5, "warmup": 2}, "compile_ms": 0.4956800043582916, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6389120221138, "p50": 0.6423360109329224, "p90": 0.6447039842605591, "mean": 0.6453696012496948, "reps": 5, "warmup": 2}, "compile_ms": 0.532256007194519, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003566741943359375, "mse": 2.86102294921875e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6434879899024963, "p50": 0.6450560092926025, "p90": 0.6518719792366028, "mean": 0.6475072026252746, "reps": 5, "warmup": 2}, "compile_ms": 0.535040020942688, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8759241104125977e-06, "ref": "sdpa_math_fp32"}, "err": null}

+{"ts": "2025-10-02T15:53:41Z", "run": "110abee5a11144f086ff362569489d61", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.4424000084400177, "p50": 0.4480000138282776, "p90": 0.45020800828933716, "mean": 0.448172801733017, "reps": 5, "warmup": 2}, "compile_ms": 1.8151999711990356, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T15:53:41Z", "run": "110abee5a11144f086ff362569489d61", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.46480000019073486, "p50": 0.4689280092716217, "p90": 0.47071999311447144, "mean": 0.46839680075645446, "reps": 5, "warmup": 2}, "compile_ms": 0.35923200845718384, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T15:53:41Z", "run": "110abee5a11144f086ff362569489d61", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5970879793167114, "p50": 0.5986559987068176, "p90": 0.6020799875259399, "mean": 0.6001919984817505, "reps": 5, "warmup": 2}, "compile_ms": 0.48611199855804443, "peak_bytes": 99680256, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.905726432800293e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T15:53:41Z", "run": "110abee5a11144f086ff362569489d61", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5994560122489929, "p50": 0.6028159856796265, "p90": 0.6028800010681152, "mean": 0.6018815994262695, "reps": 5, "warmup": 2}, "compile_ms": 0.49404799938201904, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T15:53:41Z", "run": "110abee5a11144f086ff362569489d61", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6379839777946472, "p50": 0.6402559876441956, "p90": 0.6423360109329224, "mean": 0.6404095888137817, "reps": 5, "warmup": 2}, "compile_ms": 0.531391978263855, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003566741943359375, "mse": 2.86102294921875e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T15:53:41Z", "run": "110abee5a11144f086ff362569489d61", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6431040167808533, "p50": 0.6442880034446716, "p90": 0.6445119976997375, "mean": 0.644704008102417, "reps": 5, "warmup": 2}, "compile_ms": 0.5358719825744629, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8759241104125977e-06, "ref": "sdpa_math_fp32"}, "err": null}

flash_attn/impls/artifacts/benchmark_default/attn_default.jsonl CHANGED Viewed

@@ -1,6 +1,6 @@
-{"ts": "2025-10-02T15:02:05Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5145599842071533, "p50": 0.5220479965209961, "p90": 0.5232319831848145, "mean": 0.5199103951454163, "reps": 5, "warmup": 2}, "compile_ms": 3343.085205078125, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000339508056640625, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-02T15:02:06Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5552319884300232, "p50": 0.5602560043334961, "p90": 0.5604159832000732, "mean": 0.5585088014602662, "reps": 5, "warmup": 2}, "compile_ms": 471.8746032714844, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-02T15:02:06Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6817600131034851, "p50": 0.6845120191574097, "p90": 0.6866880059242249, "mean": 0.6862144112586975, "reps": 5, "warmup": 2}, "compile_ms": 469.6441650390625, "peak_bytes": 99876864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-02T15:02:07Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7130560278892517, "p50": 0.7160000205039978, "p90": 0.7172480225563049, "mean": 0.7158400177955627, "reps": 5, "warmup": 2}, "compile_ms": 471.8545227050781, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-02T15:02:07Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7383360266685486, "p50": 0.746783971786499, "p90": 0.7520319819450378, "mean": 0.7461183905601502, "reps": 5, "warmup": 2}, "compile_ms": 473.72625732421875, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-02T15:02:08Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7711359858512878, "p50": 0.7734079957008362, "p90": 0.7748159766197205, "mean": 0.7733887910842896, "reps": 5, "warmup": 2}, "compile_ms": 476.75982666015625, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}

+{"ts": "2025-10-02T15:50:57Z", "run": "072dd2e8601f475db00e349e59df9f0c", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5143679976463318, "p50": 0.5232959985733032, "p90": 0.5257599949836731, "mean": 0.5211328029632568, "reps": 5, "warmup": 2}, "compile_ms": 3112.67236328125, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000339508056640625, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T15:50:57Z", "run": "072dd2e8601f475db00e349e59df9f0c", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5569279789924622, "p50": 0.558784008026123, "p90": 0.5599679946899414, "mean": 0.5588735938072205, "reps": 5, "warmup": 2}, "compile_ms": 272.2660217285156, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T15:50:58Z", "run": "072dd2e8601f475db00e349e59df9f0c", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.685375988483429, "p50": 0.6888960003852844, "p90": 0.6940159797668457, "mean": 0.6904960036277771, "reps": 5, "warmup": 2}, "compile_ms": 272.7831726074219, "peak_bytes": 99876864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T15:50:58Z", "run": "072dd2e8601f475db00e349e59df9f0c", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7146559953689575, "p50": 0.7190399765968323, "p90": 0.7200639843940735, "mean": 0.7184319853782654, "reps": 5, "warmup": 2}, "compile_ms": 270.6763916015625, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T15:50:58Z", "run": "072dd2e8601f475db00e349e59df9f0c", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.739359974861145, "p50": 0.7402240037918091, "p90": 0.7426239848136902, "mean": 0.741484797000885, "reps": 5, "warmup": 2}, "compile_ms": 270.3490295410156, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T15:50:58Z", "run": "072dd2e8601f475db00e349e59df9f0c", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7703679800033569, "p50": 0.7723519802093506, "p90": 0.7728000283241272, "mean": 0.7723968029022217, "reps": 5, "warmup": 2}, "compile_ms": 269.7756652832031, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}

flash_attn/impls/artifacts/benchmark_max_autotune/attn_max_autotune.jsonl CHANGED Viewed

@@ -1,6 +1,6 @@
-{"ts": "2025-10-02T15:02:53Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6211519837379456, "p50": 0.6424639821052551, "p90": 0.6726719737052917, "mean": 0.6559999942779541, "reps": 5, "warmup": 2}, "compile_ms": 4537.6962890625, "peak_bytes": 70779904, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000339508056640625, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-02T15:02:55Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.665503978729248, "p50": 0.6812480092048645, "p90": 0.7109439969062805, "mean": 0.7009024024009705, "reps": 5, "warmup": 2}, "compile_ms": 1491.3409423828125, "peak_bytes": 78644224, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-02T15:02:56Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8014079928398132, "p50": 0.8136320114135742, "p90": 0.8414080142974854, "mean": 0.8342463970184326, "reps": 5, "warmup": 2}, "compile_ms": 1269.2235107421875, "peak_bytes": 84280320, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-02T15:02:58Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8420799970626831, "p50": 0.8514879941940308, "p90": 0.8752319812774658, "mean": 0.8708159923553467, "reps": 5, "warmup": 2}, "compile_ms": 1631.2921142578125, "peak_bytes": 86508544, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-02T15:03:00Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8840640187263489, "p50": 0.8960639834403992, "p90": 0.9062719941139221, "mean": 0.9071423888206482, "reps": 5, "warmup": 2}, "compile_ms": 1919.3294677734375, "peak_bytes": 90440704, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
-{"ts": "2025-10-02T15:03:02Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.9141759872436523, "p50": 0.9165440201759338, "p90": 0.9380800127983093, "mean": 0.9373440027236939, "reps": 5, "warmup": 2}, "compile_ms": 1484.717529296875, "peak_bytes": 94372864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}

+{"ts": "2025-10-02T15:50:03Z", "run": "9ea21ad802cc490893a0c45ca82ce166", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.63155198097229, "p50": 0.6451839804649353, "p90": 0.665727972984314, "mean": 0.6618239879608154, "reps": 5, "warmup": 2}, "compile_ms": 4977.1767578125, "peak_bytes": 70779904, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000339508056640625, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T15:50:05Z", "run": "9ea21ad802cc490893a0c45ca82ce166", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6628159880638123, "p50": 0.6843519806861877, "p90": 0.7063680291175842, "mean": 0.7008576035499573, "reps": 5, "warmup": 2}, "compile_ms": 1701.4315185546875, "peak_bytes": 78644224, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T15:50:07Z", "run": "9ea21ad802cc490893a0c45ca82ce166", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8050559759140015, "p50": 0.8155199885368347, "p90": 0.8389120101928711, "mean": 0.833843195438385, "reps": 5, "warmup": 2}, "compile_ms": 1701.230712890625, "peak_bytes": 84280320, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T15:50:09Z", "run": "9ea21ad802cc490893a0c45ca82ce166", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8313599824905396, "p50": 0.849407970905304, "p90": 0.8810880184173584, "mean": 0.8694527983665467, "reps": 5, "warmup": 2}, "compile_ms": 2027.875, "peak_bytes": 86508544, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T15:50:11Z", "run": "9ea21ad802cc490893a0c45ca82ce166", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8768960237503052, "p50": 0.8824639916419983, "p90": 0.9011520147323608, "mean": 0.9017536044120789, "reps": 5, "warmup": 2}, "compile_ms": 2269.297607421875, "peak_bytes": 90440704, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
+{"ts": "2025-10-02T15:50:13Z", "run": "9ea21ad802cc490893a0c45ca82ce166", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.9179520010948181, "p50": 0.9188479781150818, "p90": 0.9378560185432434, "mean": 0.9400512099266052, "reps": 5, "warmup": 2}, "compile_ms": 1835.313720703125, "peak_bytes": 94372864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}

flash_attn/impls/compiled_variants.html CHANGED Viewed

@@ -3711,7 +3711,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('benchmark_default')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark_default" onclick="toggleUvLogsFromHeader('benchmark_default')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
-Cell: benchmark_default | 46.71s
  | <button class="run-btn" onclick="runCell('benchmark_default')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark_default')">Copy</button>
 <a href="cells/benchmark_default.py" target="_blank" class="raw-btn">Raw</a>
@@ -3797,9 +3797,9 @@ Cell: benchmark_default | 46.71s
 <div class="cell-stdout">impl                     wl                  p50(ms)  ok
 torch_flash_compiled_default flux_L128              0.52  True
 torch_flash_compiled_default flux_L256              0.56  True
-torch_flash_compiled_default flux_L320              0.68  True
 torch_flash_compiled_default flux_L384              0.72  True
-torch_flash_compiled_default flux_L448              0.75  True
 torch_flash_compiled_default flux_L512              0.77  True
 </div>
 <div class="uv-install-logs" id="uv-logs-benchmark_default">
@@ -3807,34 +3807,34 @@ torch_flash_compiled_default flux_L512              0.77  True
 <div class="uv-logs-content" style="display: none;">
    Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
 Downloading triton (148.3MiB)
-Downloading numpy (16.2MiB)
-Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
-Downloading sympy (6.0MiB)
 Downloading nvidia-cufile-cu12 (1.1MiB)
-Downloading nvidia-nvjitlink-cu12 (37.4MiB)
 Downloading nvidia-cufft-cu12 (184.2MiB)
-Downloading fonttools (4.7MiB)
-Downloading nvidia-cusparse-cu12 (274.9MiB)
 Downloading nvidia-cudnn-cu12 (674.0MiB)
-Downloading pillow (6.3MiB)
-Downloading nvidia-curand-cu12 (60.7MiB)
-Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
-Downloading nvidia-cublas-cu12 (566.8MiB)
 Downloading setuptools (1.1MiB)
-Downloading nvidia-nccl-cu12 (307.4MiB)
-Downloading nvidia-cusparselt-cu12 (273.9MiB)
 Downloading matplotlib (8.3MiB)
-Downloading networkx (1.9MiB)
 Downloading nvidia-cusolver-cu12 (255.1MiB)
-Downloading torch (846.9MiB)
-Downloading kiwisolver (1.4MiB)
     Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
    Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
  Downloading nvidia-cufile-cu12
  Downloading kiwisolver
  Downloading setuptools
- Downloading fonttools
  Downloading networkx
  Downloading pillow
       Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
  Downloading nvidia-cuda-cupti-cu12
@@ -3847,13 +3847,13 @@ Downloading kiwisolver (1.4MiB)
  Downloading triton
  Downloading nvidia-cufft-cu12
  Downloading nvidia-cusolver-cu12
- Downloading nvidia-cusparse-cu12
  Downloading nvidia-cusparselt-cu12
  Downloading nvidia-nccl-cu12
  Downloading nvidia-cublas-cu12
  Downloading nvidia-cudnn-cu12
  Downloading torch
-Installed 37 packages in 545ms
 </div>
 </div>
 <div class="cell-artifacts">
@@ -3871,7 +3871,7 @@ Installed 37 packages in 545ms
 <span onclick="toggleOutput('benchmark_max_autotune')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark_max_autotune" onclick="toggleUvLogsFromHeader('benchmark_max_autotune')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
-Cell: benchmark_max_autotune | 53.95s
  | <button class="run-btn" onclick="runCell('benchmark_max_autotune')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark_max_autotune')">Copy</button>
 <a href="cells/benchmark_max_autotune.py" target="_blank" class="raw-btn">Raw</a>
@@ -3955,52 +3955,52 @@ Cell: benchmark_max_autotune | 53.95s
 </div>
 <div id="output-benchmark_max_autotune" class="cell-output">
 <div class="cell-stdout">impl                     wl                  p50(ms)  ok
-torch_flash_compiled_max_autotune flux_L128              0.64  True
 torch_flash_compiled_max_autotune flux_L256              0.68  True
-torch_flash_compiled_max_autotune flux_L320              0.81  True
 torch_flash_compiled_max_autotune flux_L384              0.85  True
-torch_flash_compiled_max_autotune flux_L448              0.90  True
 torch_flash_compiled_max_autotune flux_L512              0.92  True
 </div>
 <div class="uv-install-logs" id="uv-logs-benchmark_max_autotune">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
    Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
-Downloading nvidia-cusparse-cu12 (274.9MiB)
-Downloading numpy (16.2MiB)
 Downloading matplotlib (8.3MiB)
-Downloading kiwisolver (1.4MiB)
-Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
-Downloading nvidia-nccl-cu12 (307.4MiB)
 Downloading setuptools (1.1MiB)
-Downloading pillow (6.3MiB)
-Downloading sympy (6.0MiB)
 Downloading fonttools (4.7MiB)
 Downloading nvidia-nvjitlink-cu12 (37.4MiB)
-Downloading nvidia-curand-cu12 (60.7MiB)
-Downloading nvidia-cublas-cu12 (566.8MiB)
 Downloading networkx (1.9MiB)
-Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
-Downloading nvidia-cufile-cu12 (1.1MiB)
-Downloading triton (148.3MiB)
 Downloading torch (846.9MiB)
 Downloading nvidia-cusolver-cu12 (255.1MiB)
-Downloading nvidia-cufft-cu12 (184.2MiB)
-Downloading nvidia-cudnn-cu12 (674.0MiB)
-Downloading nvidia-cusparselt-cu12 (273.9MiB)
     Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
    Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
  Downloading nvidia-cufile-cu12
  Downloading kiwisolver
  Downloading setuptools
- Downloading networkx
  Downloading fonttools
  Downloading pillow
       Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
- Downloading nvidia-cuda-cupti-cu12
  Downloading matplotlib
- Downloading numpy
  Downloading sympy
  Downloading nvidia-nvjitlink-cu12
  Downloading nvidia-curand-cu12
  Downloading nvidia-cuda-nvrtc-cu12
@@ -4013,7 +4013,7 @@ Downloading nvidia-cusparselt-cu12 (273.9MiB)
  Downloading nvidia-cublas-cu12
  Downloading nvidia-cudnn-cu12
  Downloading torch
-Installed 37 packages in 526ms
 </div>
 </div>
 <div class="cell-artifacts">

 <span onclick="toggleOutput('benchmark_default')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark_default" onclick="toggleUvLogsFromHeader('benchmark_default')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
+Cell: benchmark_default | 44.25s
  | <button class="run-btn" onclick="runCell('benchmark_default')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark_default')">Copy</button>
 <a href="cells/benchmark_default.py" target="_blank" class="raw-btn">Raw</a>
 <div class="cell-stdout">impl                     wl                  p50(ms)  ok
 torch_flash_compiled_default flux_L128              0.52  True
 torch_flash_compiled_default flux_L256              0.56  True
+torch_flash_compiled_default flux_L320              0.69  True
 torch_flash_compiled_default flux_L384              0.72  True
+torch_flash_compiled_default flux_L448              0.74  True
 torch_flash_compiled_default flux_L512              0.77  True
 </div>
 <div class="uv-install-logs" id="uv-logs-benchmark_default">
 <div class="uv-logs-content" style="display: none;">
    Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
 Downloading triton (148.3MiB)
+Downloading torch (846.9MiB)
+Downloading kiwisolver (1.4MiB)
+Downloading fonttools (4.7MiB)
+Downloading nvidia-cublas-cu12 (566.8MiB)
 Downloading nvidia-cufile-cu12 (1.1MiB)
+Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
 Downloading nvidia-cufft-cu12 (184.2MiB)
 Downloading nvidia-cudnn-cu12 (674.0MiB)
 Downloading setuptools (1.1MiB)
 Downloading matplotlib (8.3MiB)
+Downloading nvidia-cusparselt-cu12 (273.9MiB)
+Downloading nvidia-nccl-cu12 (307.4MiB)
 Downloading nvidia-cusolver-cu12 (255.1MiB)
+Downloading numpy (16.2MiB)
+Downloading sympy (6.0MiB)
+Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
+Downloading nvidia-cusparse-cu12 (274.9MiB)
+Downloading nvidia-curand-cu12 (60.7MiB)
+Downloading nvidia-nvjitlink-cu12 (37.4MiB)
+Downloading networkx (1.9MiB)
+Downloading pillow (6.3MiB)
     Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
    Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
  Downloading nvidia-cufile-cu12
  Downloading kiwisolver
  Downloading setuptools
  Downloading networkx
+ Downloading fonttools
  Downloading pillow
       Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
  Downloading nvidia-cuda-cupti-cu12
  Downloading triton
  Downloading nvidia-cufft-cu12
  Downloading nvidia-cusolver-cu12
  Downloading nvidia-cusparselt-cu12
+ Downloading nvidia-cusparse-cu12
  Downloading nvidia-nccl-cu12
  Downloading nvidia-cublas-cu12
  Downloading nvidia-cudnn-cu12
  Downloading torch
+Installed 37 packages in 516ms
 </div>
 </div>
 <div class="cell-artifacts">
 <span onclick="toggleOutput('benchmark_max_autotune')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark_max_autotune" onclick="toggleUvLogsFromHeader('benchmark_max_autotune')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
+Cell: benchmark_max_autotune | 56.94s
  | <button class="run-btn" onclick="runCell('benchmark_max_autotune')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark_max_autotune')">Copy</button>
 <a href="cells/benchmark_max_autotune.py" target="_blank" class="raw-btn">Raw</a>
 </div>
 <div id="output-benchmark_max_autotune" class="cell-output">
 <div class="cell-stdout">impl                     wl                  p50(ms)  ok
+torch_flash_compiled_max_autotune flux_L128              0.65  True
 torch_flash_compiled_max_autotune flux_L256              0.68  True
+torch_flash_compiled_max_autotune flux_L320              0.82  True
 torch_flash_compiled_max_autotune flux_L384              0.85  True
+torch_flash_compiled_max_autotune flux_L448              0.88  True
 torch_flash_compiled_max_autotune flux_L512              0.92  True
 </div>
 <div class="uv-install-logs" id="uv-logs-benchmark_max_autotune">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
    Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
 Downloading matplotlib (8.3MiB)
 Downloading setuptools (1.1MiB)
+Downloading nvidia-cublas-cu12 (566.8MiB)
+Downloading nvidia-cusparse-cu12 (274.9MiB)
+Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
 Downloading fonttools (4.7MiB)
+Downloading numpy (16.2MiB)
+Downloading pillow (6.3MiB)
+Downloading nvidia-cudnn-cu12 (674.0MiB)
+Downloading nvidia-nccl-cu12 (307.4MiB)
+Downloading nvidia-cusparselt-cu12 (273.9MiB)
+Downloading nvidia-cufft-cu12 (184.2MiB)
 Downloading nvidia-nvjitlink-cu12 (37.4MiB)
 Downloading networkx (1.9MiB)
 Downloading torch (846.9MiB)
+Downloading triton (148.3MiB)
+Downloading nvidia-cufile-cu12 (1.1MiB)
+Downloading kiwisolver (1.4MiB)
+Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
 Downloading nvidia-cusolver-cu12 (255.1MiB)
+Downloading nvidia-curand-cu12 (60.7MiB)
+Downloading sympy (6.0MiB)
     Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
    Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
  Downloading nvidia-cufile-cu12
  Downloading kiwisolver
  Downloading setuptools
  Downloading fonttools
+ Downloading networkx
  Downloading pillow
       Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
  Downloading matplotlib
+ Downloading nvidia-cuda-cupti-cu12
  Downloading sympy
+ Downloading numpy
  Downloading nvidia-nvjitlink-cu12
  Downloading nvidia-curand-cu12
  Downloading nvidia-cuda-nvrtc-cu12
  Downloading nvidia-cublas-cu12
  Downloading nvidia-cudnn-cu12
  Downloading torch
+Installed 37 packages in 547ms
 </div>
 </div>
 <div class="cell-artifacts">

flash_attn/impls/flash_attention.html CHANGED Viewed

@@ -3710,7 +3710,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
-Cell: nv | 0.68s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3726,7 +3726,7 @@ Cell: nv | 0.68s
 </div>
 </div>
 <div id="output-nv" class="cell-output">
-<div class="cell-stdout">Thu Oct  2 15:03:41 2025
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.172.08             Driver Version: 570.172.08     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3735,19 +3735,19 @@ Cell: nv | 0.68s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA A10G                    On  |   00000000:00:1B.0 Off |                    0 |
-|  0%   31C    P0             87W /  300W |       0MiB /  23028MiB |      0%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 |   1  NVIDIA A10G                    On  |   00000000:00:1C.0 Off |                    0 |
-|  0%   25C    P8             23W /  300W |       0MiB /  23028MiB |      0%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 |   2  NVIDIA A10G                    On  |   00000000:00:1D.0 Off |                    0 |
-|  0%   26C    P8             23W /  300W |       0MiB /  23028MiB |      0%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 |   3  NVIDIA A10G                    On  |   00000000:00:1E.0 Off |                    0 |
-|  0%   25C    P8             24W /  300W |       0MiB /  23028MiB |      0%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
@@ -3771,7 +3771,7 @@ Cell: nv | 0.68s
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
-Cell: benchmark | 35.67s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3862,28 +3862,28 @@ torch_flash_ma           flux_L512              0.74  True
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
    Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
-Downloading networkx (1.9MiB)
-Downloading nvidia-cufile-cu12 (1.1MiB)
-Downloading nvidia-cufft-cu12 (184.2MiB)
-Downloading nvidia-cublas-cu12 (566.8MiB)
-Downloading nvidia-nccl-cu12 (307.4MiB)
-Downloading nvidia-cudnn-cu12 (674.0MiB)
 Downloading nvidia-cusparse-cu12 (274.9MiB)
 Downloading nvidia-cusparselt-cu12 (273.9MiB)
 Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
 Downloading setuptools (1.1MiB)
-Downloading sympy (6.0MiB)
 Downloading nvidia-cusolver-cu12 (255.1MiB)
 Downloading nvidia-nvjitlink-cu12 (37.4MiB)
-Downloading pillow (6.3MiB)
 Downloading kiwisolver (1.4MiB)
-Downloading numpy (16.2MiB)
-Downloading matplotlib (8.3MiB)
-Downloading fonttools (4.7MiB)
-Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
 Downloading torch (846.9MiB)
 Downloading triton (148.3MiB)
-Downloading nvidia-curand-cu12 (60.7MiB)
     Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
    Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
  Downloading nvidia-cufile-cu12
@@ -3909,7 +3909,7 @@ Downloading nvidia-curand-cu12 (60.7MiB)
  Downloading nvidia-cublas-cu12
  Downloading nvidia-cudnn-cu12
  Downloading torch
-Installed 37 packages in 560ms
 </div>
 </div>
 <div class="cell-artifacts">

 <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
 </span> |
+Cell: nv | 0.66s
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
 <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 </div>
 </div>
 <div id="output-nv" class="cell-output">
+<div class="cell-stdout">Thu Oct  2 15:53:02 2025
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.172.08             Driver Version: 570.172.08     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA A10G                    On  |   00000000:00:1B.0 Off |                    0 |
+|  0%   29C    P0             87W /  300W |       0MiB /  23028MiB |      0%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 |   1  NVIDIA A10G                    On  |   00000000:00:1C.0 Off |                    0 |
+|  0%   25C    P8             24W /  300W |       0MiB /  23028MiB |      0%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 |   2  NVIDIA A10G                    On  |   00000000:00:1D.0 Off |                    0 |
+|  0%   25C    P8             23W /  300W |       0MiB /  23028MiB |      0%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 |   3  NVIDIA A10G                    On  |   00000000:00:1E.0 Off |                    0 |
+|  0%   25C    P8             23W /  300W |       0MiB /  23028MiB |      0%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
+Cell: benchmark | 37.94s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
    Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
 Downloading nvidia-cusparse-cu12 (274.9MiB)
+Downloading sympy (6.0MiB)
 Downloading nvidia-cusparselt-cu12 (273.9MiB)
 Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
+Downloading nvidia-nccl-cu12 (307.4MiB)
+Downloading networkx (1.9MiB)
+Downloading fonttools (4.7MiB)
+Downloading matplotlib (8.3MiB)
+Downloading nvidia-cufft-cu12 (184.2MiB)
 Downloading setuptools (1.1MiB)
+Downloading pillow (6.3MiB)
+Downloading nvidia-cublas-cu12 (566.8MiB)
+Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
+Downloading nvidia-cudnn-cu12 (674.0MiB)
 Downloading nvidia-cusolver-cu12 (255.1MiB)
+Downloading numpy (16.2MiB)
+Downloading nvidia-cufile-cu12 (1.1MiB)
 Downloading nvidia-nvjitlink-cu12 (37.4MiB)
+Downloading nvidia-curand-cu12 (60.7MiB)
 Downloading kiwisolver (1.4MiB)
 Downloading torch (846.9MiB)
 Downloading triton (148.3MiB)
     Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
    Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
  Downloading nvidia-cufile-cu12
  Downloading nvidia-cublas-cu12
  Downloading nvidia-cudnn-cu12
  Downloading torch
+Installed 37 packages in 567ms
 </div>
 </div>
 <div class="cell-artifacts">

flash_attn/impls/hf_kernels_flash_attn.html CHANGED Viewed

@@ -3710,7 +3710,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
-Cell: benchmark | 37.93s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3796,48 +3796,48 @@ Cell: benchmark | 37.93s
 </div>
 <div id="output-benchmark" class="cell-output">
 <div class="cell-stdout">impl                     wl                  p50(ms)  ok
-hf_kernels_flash_attn    flux_L128              0.35  True
-hf_kernels_flash_attn    flux_L256              0.38  True
-hf_kernels_flash_attn    flux_L320              0.50  True
-hf_kernels_flash_attn    flux_L384              0.52  True
-hf_kernels_flash_attn    flux_L448              0.54  True
 hf_kernels_flash_attn    flux_L512              0.56  True
 </div>
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
    Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
-Downloading sympy (6.0MiB)
-Downloading networkx (1.9MiB)
-Downloading nvidia-cufile-cu12 (1.1MiB)
-Downloading fonttools (4.7MiB)
-Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
-Downloading nvidia-curand-cu12 (60.7MiB)
-Downloading pillow (6.3MiB)
-Downloading nvidia-nvjitlink-cu12 (37.4MiB)
 Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
 Downloading nvidia-cusolver-cu12 (255.1MiB)
 Downloading triton (148.3MiB)
-Downloading setuptools (1.1MiB)
-Downloading kiwisolver (1.4MiB)
 Downloading nvidia-cudnn-cu12 (674.0MiB)
 Downloading nvidia-cusparse-cu12 (274.9MiB)
-Downloading nvidia-cufft-cu12 (184.2MiB)
-Downloading matplotlib (8.3MiB)
-Downloading torch (846.9MiB)
-Downloading hf-xet (3.0MiB)
 Downloading nvidia-cusparselt-cu12 (273.9MiB)
-Downloading numpy (16.2MiB)
-Downloading nvidia-nccl-cu12 (307.4MiB)
 Downloading nvidia-cublas-cu12 (566.8MiB)
     Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
    Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
  Downloading nvidia-cufile-cu12
  Downloading kiwisolver
  Downloading hf-xet
  Downloading setuptools
- Downloading fonttools
  Downloading networkx
  Downloading pillow
       Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
  Downloading nvidia-cuda-cupti-cu12
@@ -3856,13 +3856,13 @@ Downloading nvidia-cublas-cu12 (566.8MiB)
  Downloading nvidia-cublas-cu12
  Downloading nvidia-cudnn-cu12
  Downloading torch
-Installed 47 packages in 457ms
 </div>
 </div>
 <div class="cell-stderr">Fetching 20 files:   0%|          | 0/20 [00:00&lt;?, ?it/s]
-Fetching 20 files:   5%|▌         | 1/20 [00:00&lt;00:03,  6.33it/s]
-Fetching 20 files:  10%|█         | 2/20 [00:01&lt;00:10,  1.75it/s]
-Fetching 20 files: 100%|██████████| 20/20 [00:01&lt;00:00, 19.65it/s]</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>

 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
+Cell: benchmark | 38.08s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 </div>
 <div id="output-benchmark" class="cell-output">
 <div class="cell-stdout">impl                     wl                  p50(ms)  ok
+hf_kernels_flash_attn    flux_L128              0.34  True
+hf_kernels_flash_attn    flux_L256              0.37  True
+hf_kernels_flash_attn    flux_L320              0.49  True
+hf_kernels_flash_attn    flux_L384              0.51  True
+hf_kernels_flash_attn    flux_L448              0.53  True
 hf_kernels_flash_attn    flux_L512              0.56  True
 </div>
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
    Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
+Downloading nvidia-cufft-cu12 (184.2MiB)
+Downloading numpy (16.2MiB)
+Downloading setuptools (1.1MiB)
+Downloading hf-xet (3.0MiB)
 Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
 Downloading nvidia-cusolver-cu12 (255.1MiB)
+Downloading nvidia-curand-cu12 (60.7MiB)
+Downloading networkx (1.9MiB)
+Downloading nvidia-nccl-cu12 (307.4MiB)
+Downloading torch (846.9MiB)
+Downloading nvidia-cufile-cu12 (1.1MiB)
 Downloading triton (148.3MiB)
+Downloading nvidia-nvjitlink-cu12 (37.4MiB)
+Downloading sympy (6.0MiB)
 Downloading nvidia-cudnn-cu12 (674.0MiB)
+Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
 Downloading nvidia-cusparse-cu12 (274.9MiB)
 Downloading nvidia-cusparselt-cu12 (273.9MiB)
+Downloading kiwisolver (1.4MiB)
 Downloading nvidia-cublas-cu12 (566.8MiB)
+Downloading pillow (6.3MiB)
+Downloading fonttools (4.7MiB)
+Downloading matplotlib (8.3MiB)
     Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
    Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
  Downloading nvidia-cufile-cu12
  Downloading kiwisolver
  Downloading hf-xet
  Downloading setuptools
  Downloading networkx
+ Downloading fonttools
  Downloading pillow
       Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
  Downloading nvidia-cuda-cupti-cu12
  Downloading nvidia-cublas-cu12
  Downloading nvidia-cudnn-cu12
  Downloading torch
+Installed 47 packages in 519ms
 </div>
 </div>
 <div class="cell-stderr">Fetching 20 files:   0%|          | 0/20 [00:00&lt;?, ?it/s]
+Fetching 20 files:   5%|▌         | 1/20 [00:00&lt;00:06,  2.87it/s]
+Fetching 20 files:  10%|█         | 2/20 [00:01&lt;00:12,  1.49it/s]
+Fetching 20 files: 100%|██████████| 20/20 [00:01&lt;00:00, 16.01it/s]</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>

flash_attn/impls/hf_kernels_flash_attn3.html CHANGED Viewed

@@ -3710,7 +3710,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
-Cell: benchmark | 38.68s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3807,61 +3807,61 @@ hf_kernels_flash_attn3   flux_L512              0.57  True
 <div class="uv-logs-content" style="display: none;">
    Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
 Downloading sympy (6.0MiB)
 Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
-Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
-Downloading numpy (16.2MiB)
 Downloading nvidia-cufft-cu12 (184.2MiB)
-Downloading torch (846.9MiB)
-Downloading nvidia-cufile-cu12 (1.1MiB)
-Downloading nvidia-nvjitlink-cu12 (37.4MiB)
-Downloading kiwisolver (1.4MiB)
 Downloading nvidia-curand-cu12 (60.7MiB)
-Downloading nvidia-cublas-cu12 (566.8MiB)
-Downloading nvidia-cusparse-cu12 (274.9MiB)
-Downloading nvidia-cusparselt-cu12 (273.9MiB)
-Downloading nvidia-cudnn-cu12 (674.0MiB)
-Downloading matplotlib (8.3MiB)
-Downloading nvidia-nccl-cu12 (307.4MiB)
 Downloading hf-xet (3.0MiB)
-Downloading fonttools (4.7MiB)
 Downloading pillow (6.3MiB)
-Downloading triton (148.3MiB)
-Downloading networkx (1.9MiB)
 Downloading nvidia-cusolver-cu12 (255.1MiB)
-Downloading setuptools (1.1MiB)
     Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
    Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
  Downloading nvidia-cufile-cu12
  Downloading kiwisolver
  Downloading hf-xet
  Downloading setuptools
- Downloading fonttools
  Downloading networkx
  Downloading pillow
       Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
  Downloading nvidia-cuda-cupti-cu12
  Downloading matplotlib
- Downloading numpy
  Downloading sympy
  Downloading nvidia-nvjitlink-cu12
  Downloading nvidia-curand-cu12
  Downloading nvidia-cuda-nvrtc-cu12
  Downloading triton
  Downloading nvidia-cufft-cu12
  Downloading nvidia-cusolver-cu12
- Downloading nvidia-cusparse-cu12
  Downloading nvidia-cusparselt-cu12
  Downloading nvidia-nccl-cu12
  Downloading nvidia-cublas-cu12
  Downloading nvidia-cudnn-cu12
  Downloading torch
-Installed 47 packages in 453ms
 </div>
 </div>
 <div class="cell-stderr">Fetching 4 files:   0%|          | 0/4 [00:00&lt;?, ?it/s]
-Fetching 4 files:  25%|██▌       | 1/4 [00:00&lt;00:00,  5.61it/s]
-Fetching 4 files:  50%|█████     | 2/4 [00:01&lt;00:01,  1.13it/s]
-Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00,  2.56it/s]</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>

 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
+Cell: benchmark | 41.76s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 <div class="uv-logs-content" style="display: none;">
    Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
 Downloading sympy (6.0MiB)
+Downloading networkx (1.9MiB)
+Downloading nvidia-cufile-cu12 (1.1MiB)
+Downloading matplotlib (8.3MiB)
+Downloading setuptools (1.1MiB)
+Downloading fonttools (4.7MiB)
+Downloading nvidia-cudnn-cu12 (674.0MiB)
 Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
+Downloading nvidia-cusparselt-cu12 (273.9MiB)
+Downloading nvidia-cublas-cu12 (566.8MiB)
 Downloading nvidia-cufft-cu12 (184.2MiB)
+Downloading numpy (16.2MiB)
 Downloading nvidia-curand-cu12 (60.7MiB)
+Downloading nvidia-nvjitlink-cu12 (37.4MiB)
 Downloading hf-xet (3.0MiB)
 Downloading pillow (6.3MiB)
 Downloading nvidia-cusolver-cu12 (255.1MiB)
+Downloading kiwisolver (1.4MiB)
+Downloading torch (846.9MiB)
+Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
+Downloading nvidia-nccl-cu12 (307.4MiB)
+Downloading triton (148.3MiB)
+Downloading nvidia-cusparse-cu12 (274.9MiB)
     Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
    Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
  Downloading nvidia-cufile-cu12
  Downloading kiwisolver
  Downloading hf-xet
  Downloading setuptools
  Downloading networkx
+ Downloading fonttools
  Downloading pillow
       Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
  Downloading nvidia-cuda-cupti-cu12
  Downloading matplotlib
  Downloading sympy
+ Downloading numpy
  Downloading nvidia-nvjitlink-cu12
  Downloading nvidia-curand-cu12
  Downloading nvidia-cuda-nvrtc-cu12
  Downloading triton
  Downloading nvidia-cufft-cu12
  Downloading nvidia-cusolver-cu12
  Downloading nvidia-cusparselt-cu12
+ Downloading nvidia-cusparse-cu12
  Downloading nvidia-nccl-cu12
  Downloading nvidia-cublas-cu12
  Downloading nvidia-cudnn-cu12
  Downloading torch
+Installed 47 packages in 515ms
 </div>
 </div>
 <div class="cell-stderr">Fetching 4 files:   0%|          | 0/4 [00:00&lt;?, ?it/s]
+Fetching 4 files:  25%|██▌       | 1/4 [00:00&lt;00:00,  4.20it/s]
+Fetching 4 files:  50%|█████     | 2/4 [00:01&lt;00:01,  1.09it/s]
+Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00,  2.44it/s]</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>

flash_attn/impls/mem_efficient_attention.html CHANGED Viewed

@@ -3710,7 +3710,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
-Cell: benchmark | 35.60s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3794,44 +3794,44 @@ Cell: benchmark | 35.60s
 <div class="cell-stdout">impl                     wl                  p50(ms)  ok
 torch_mem_eff            flux_L128              0.59  True
 torch_mem_eff            flux_L256              0.65  True
-torch_mem_eff            flux_L320              0.77  True
 torch_mem_eff            flux_L384              0.79  True
-torch_mem_eff            flux_L448              0.84  True
 torch_mem_eff            flux_L512              0.95  True
 </div>
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
    Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
-Downloading matplotlib (8.3MiB)
-Downloading pillow (6.3MiB)
 Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
 Downloading nvidia-cufft-cu12 (184.2MiB)
 Downloading nvidia-cusparse-cu12 (274.9MiB)
-Downloading numpy (16.2MiB)
-Downloading setuptools (1.1MiB)
-Downloading kiwisolver (1.4MiB)
-Downloading nvidia-cusolver-cu12 (255.1MiB)
-Downloading torch (846.9MiB)
 Downloading nvidia-cudnn-cu12 (674.0MiB)
-Downloading nvidia-cufile-cu12 (1.1MiB)
-Downloading nvidia-nvjitlink-cu12 (37.4MiB)
 Downloading sympy (6.0MiB)
 Downloading fonttools (4.7MiB)
-Downloading networkx (1.9MiB)
-Downloading nvidia-curand-cu12 (60.7MiB)
 Downloading nvidia-cublas-cu12 (566.8MiB)
 Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
-Downloading triton (148.3MiB)
 Downloading nvidia-cusparselt-cu12 (273.9MiB)
 Downloading nvidia-nccl-cu12 (307.4MiB)
     Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
    Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
  Downloading nvidia-cufile-cu12
  Downloading kiwisolver
  Downloading setuptools
- Downloading networkx
  Downloading fonttools
  Downloading pillow
       Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
  Downloading nvidia-cuda-cupti-cu12
@@ -3850,7 +3850,7 @@ Downloading nvidia-nccl-cu12 (307.4MiB)
  Downloading nvidia-cublas-cu12
  Downloading nvidia-cudnn-cu12
  Downloading torch
-Installed 37 packages in 453ms
 </div>
 </div>
 <div class="cell-artifacts">

 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
+Cell: benchmark | 35.95s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 <div class="cell-stdout">impl                     wl                  p50(ms)  ok
 torch_mem_eff            flux_L128              0.59  True
 torch_mem_eff            flux_L256              0.65  True
+torch_mem_eff            flux_L320              0.78  True
 torch_mem_eff            flux_L384              0.79  True
+torch_mem_eff            flux_L448              0.85  True
 torch_mem_eff            flux_L512              0.95  True
 </div>
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
    Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
+Downloading nvidia-curand-cu12 (60.7MiB)
 Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
+Downloading networkx (1.9MiB)
+Downloading kiwisolver (1.4MiB)
 Downloading nvidia-cufft-cu12 (184.2MiB)
 Downloading nvidia-cusparse-cu12 (274.9MiB)
 Downloading nvidia-cudnn-cu12 (674.0MiB)
 Downloading sympy (6.0MiB)
+Downloading nvidia-cusolver-cu12 (255.1MiB)
 Downloading fonttools (4.7MiB)
 Downloading nvidia-cublas-cu12 (566.8MiB)
+Downloading nvidia-nvjitlink-cu12 (37.4MiB)
 Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
+Downloading setuptools (1.1MiB)
+Downloading matplotlib (8.3MiB)
+Downloading nvidia-cufile-cu12 (1.1MiB)
+Downloading numpy (16.2MiB)
 Downloading nvidia-cusparselt-cu12 (273.9MiB)
 Downloading nvidia-nccl-cu12 (307.4MiB)
+Downloading torch (846.9MiB)
+Downloading triton (148.3MiB)
+Downloading pillow (6.3MiB)
     Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
    Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
  Downloading nvidia-cufile-cu12
  Downloading kiwisolver
  Downloading setuptools
  Downloading fonttools
+ Downloading networkx
  Downloading pillow
       Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
  Downloading nvidia-cuda-cupti-cu12
  Downloading nvidia-cublas-cu12
  Downloading nvidia-cudnn-cu12
  Downloading torch
+Installed 37 packages in 556ms
 </div>
 </div>
 <div class="cell-artifacts">

flash_attn/impls/sage_attention.html CHANGED Viewed

@@ -3710,7 +3710,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
-Cell: benchmark | 40.11s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3804,53 +3804,53 @@ Cell: benchmark | 40.11s
 <div id="output-benchmark" class="cell-output">
 <div class="cell-stdout">impl                     wl                  p50(ms)  ok
 sage_int8_fp16           flux_L128             FAIL  False
-  Error: module &#x27;sage_attention_a39c012a73160148&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           flux_L256             FAIL  False
-  Error: module &#x27;sage_attention_a39c012a73160148&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           flux_L320             FAIL  False
-  Error: module &#x27;sage_attention_a39c012a73160148&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           flux_L384             FAIL  False
-  Error: module &#x27;sage_attention_a39c012a73160148&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           flux_L448             FAIL  False
-  Error: module &#x27;sage_attention_a39c012a73160148&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           flux_L512             FAIL  False
-  Error: module &#x27;sage_attention_a39c012a73160148&#x27; has no attribute &#x27;fwd&#x27;
 </div>
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
    Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
-Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
-Downloading nvidia-cudnn-cu12 (674.0MiB)
 Downloading setuptools (1.1MiB)
 Downloading nvidia-cufile-cu12 (1.1MiB)
-Downloading nvidia-cublas-cu12 (566.8MiB)
-Downloading triton (148.3MiB)
 Downloading nvidia-curand-cu12 (60.7MiB)
 Downloading torch (846.9MiB)
-Downloading nvidia-cusparse-cu12 (274.9MiB)
 Downloading nvidia-nccl-cu12 (307.4MiB)
-Downloading pillow (6.3MiB)
 Downloading kiwisolver (1.4MiB)
-Downloading hf-xet (3.0MiB)
-Downloading nvidia-cufft-cu12 (184.2MiB)
-Downloading networkx (1.9MiB)
-Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
-Downloading nvidia-cusolver-cu12 (255.1MiB)
 Downloading sympy (6.0MiB)
-Downloading numpy (16.2MiB)
-Downloading fonttools (4.7MiB)
-Downloading nvidia-nvjitlink-cu12 (37.4MiB)
-Downloading nvidia-cusparselt-cu12 (273.9MiB)
 Downloading matplotlib (8.3MiB)
     Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
    Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
  Downloading nvidia-cufile-cu12
  Downloading kiwisolver
  Downloading hf-xet
  Downloading setuptools
- Downloading fonttools
  Downloading networkx
  Downloading pillow
       Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
  Downloading nvidia-cuda-cupti-cu12
@@ -3869,15 +3869,13 @@ Downloading matplotlib (8.3MiB)
  Downloading nvidia-cublas-cu12
  Downloading nvidia-cudnn-cu12
  Downloading torch
-Installed 48 packages in 513ms
 </div>
 </div>
 <div class="cell-stderr">Fetching 11 files:   0%|          | 0/11 [00:00&lt;?, ?it/s]
-Fetching 11 files:   9%|▉         | 1/11 [00:00&lt;00:01,  5.70it/s]
-Fetching 11 files:  18%|█▊        | 2/11 [00:00&lt;00:01,  6.67it/s]
-Fetching 11 files:  27%|██▋       | 3/11 [00:00&lt;00:01,  6.46it/s]
-Fetching 11 files:  64%|██████▎   | 7/11 [00:00&lt;00:00, 11.66it/s]
-Fetching 11 files: 100%|██████████| 11/11 [00:00&lt;00:00, 15.59it/s]</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>

 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
+Cell: benchmark | 40.43s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 <div id="output-benchmark" class="cell-output">
 <div class="cell-stdout">impl                     wl                  p50(ms)  ok
 sage_int8_fp16           flux_L128             FAIL  False
+  Error: module &#x27;sage_attention_46758c422d547a47&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           flux_L256             FAIL  False
+  Error: module &#x27;sage_attention_46758c422d547a47&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           flux_L320             FAIL  False
+  Error: module &#x27;sage_attention_46758c422d547a47&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           flux_L384             FAIL  False
+  Error: module &#x27;sage_attention_46758c422d547a47&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           flux_L448             FAIL  False
+  Error: module &#x27;sage_attention_46758c422d547a47&#x27; has no attribute &#x27;fwd&#x27;
 sage_int8_fp16           flux_L512             FAIL  False
+  Error: module &#x27;sage_attention_46758c422d547a47&#x27; has no attribute &#x27;fwd&#x27;
 </div>
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
    Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
+Downloading nvidia-cusolver-cu12 (255.1MiB)
+Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
+Downloading networkx (1.9MiB)
 Downloading setuptools (1.1MiB)
+Downloading numpy (16.2MiB)
 Downloading nvidia-cufile-cu12 (1.1MiB)
+Downloading nvidia-cusparselt-cu12 (273.9MiB)
 Downloading nvidia-curand-cu12 (60.7MiB)
+Downloading nvidia-cudnn-cu12 (674.0MiB)
+Downloading hf-xet (3.0MiB)
 Downloading torch (846.9MiB)
+Downloading triton (148.3MiB)
 Downloading nvidia-nccl-cu12 (307.4MiB)
+Downloading nvidia-cublas-cu12 (566.8MiB)
 Downloading kiwisolver (1.4MiB)
+Downloading pillow (6.3MiB)
 Downloading sympy (6.0MiB)
+Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
+Downloading nvidia-cufft-cu12 (184.2MiB)
+Downloading nvidia-cusparse-cu12 (274.9MiB)
 Downloading matplotlib (8.3MiB)
+Downloading nvidia-nvjitlink-cu12 (37.4MiB)
+Downloading fonttools (4.7MiB)
     Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
    Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
  Downloading nvidia-cufile-cu12
  Downloading kiwisolver
  Downloading hf-xet
  Downloading setuptools
  Downloading networkx
+ Downloading fonttools
  Downloading pillow
       Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
  Downloading nvidia-cuda-cupti-cu12
  Downloading nvidia-cublas-cu12
  Downloading nvidia-cudnn-cu12
  Downloading torch
+Installed 48 packages in 525ms
 </div>
 </div>
 <div class="cell-stderr">Fetching 11 files:   0%|          | 0/11 [00:00&lt;?, ?it/s]
+Fetching 11 files:   9%|▉         | 1/11 [00:00&lt;00:01,  5.55it/s]
+Fetching 11 files:  73%|███████▎  | 8/11 [00:00&lt;00:00, 12.93it/s]
+Fetching 11 files: 100%|██████████| 11/11 [00:00&lt;00:00, 16.93it/s]</div>
 <div class="cell-artifacts">
 <h4>Artifacts:</h4>
 <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>

flash_attn/impls/xformers.html CHANGED Viewed

@@ -3710,7 +3710,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
-Cell: benchmark | 40.76s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3797,48 +3797,48 @@ xformers_meff            flux_L256              0.47  True
 xformers_meff            flux_L320              0.60  True
 xformers_meff            flux_L384              0.60  True
 xformers_meff            flux_L448              0.64  True
-xformers_meff            flux_L512              0.65  True
 </div>
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
    Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
-Downloading numpy (16.2MiB)
-Downloading xformers (111.8MiB)
-Downloading nvidia-cusolver-cu12 (255.1MiB)
 Downloading nvidia-nvjitlink-cu12 (37.4MiB)
-Downloading setuptools (1.1MiB)
 Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
 Downloading nvidia-cublas-cu12 (566.8MiB)
-Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
-Downloading pillow (6.3MiB)
-Downloading nvidia-curand-cu12 (60.7MiB)
-Downloading nvidia-cusparselt-cu12 (273.9MiB)
 Downloading sympy (6.0MiB)
-Downloading nvidia-cudnn-cu12 (674.0MiB)
 Downloading nvidia-cufile-cu12 (1.1MiB)
-Downloading nvidia-cufft-cu12 (184.2MiB)
-Downloading nvidia-cusparse-cu12 (274.9MiB)
-Downloading networkx (1.9MiB)
-Downloading nvidia-nccl-cu12 (307.4MiB)
 Downloading fonttools (4.7MiB)
-Downloading triton (148.3MiB)
 Downloading torch (846.9MiB)
-Downloading kiwisolver (1.4MiB)
-Downloading matplotlib (8.3MiB)
     Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
    Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
  Downloading nvidia-cufile-cu12
  Downloading kiwisolver
  Downloading setuptools
- Downloading networkx
  Downloading fonttools
  Downloading pillow
       Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
  Downloading nvidia-cuda-cupti-cu12
  Downloading matplotlib
- Downloading sympy
  Downloading numpy
  Downloading nvidia-nvjitlink-cu12
  Downloading nvidia-curand-cu12
  Downloading nvidia-cuda-nvrtc-cu12
@@ -3849,10 +3849,10 @@ Downloading matplotlib (8.3MiB)
  Downloading nvidia-cusparse-cu12
  Downloading nvidia-cusparselt-cu12
  Downloading nvidia-nccl-cu12
- Downloading nvidia-cudnn-cu12
  Downloading nvidia-cublas-cu12
  Downloading torch
-Installed 38 packages in 522ms
 </div>
 </div>
 <div class="cell-artifacts">

 <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
 </span> |
+Cell: benchmark | 40.64s
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
 <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
 <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 xformers_meff            flux_L320              0.60  True
 xformers_meff            flux_L384              0.60  True
 xformers_meff            flux_L448              0.64  True
+xformers_meff            flux_L512              0.64  True
 </div>
 <div class="uv-install-logs" id="uv-logs-benchmark">
 <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 <div class="uv-logs-content" style="display: none;">
    Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
+Downloading networkx (1.9MiB)
+Downloading nvidia-cusparse-cu12 (274.9MiB)
+Downloading nvidia-cusparselt-cu12 (273.9MiB)
+Downloading nvidia-cufft-cu12 (184.2MiB)
+Downloading nvidia-curand-cu12 (60.7MiB)
+Downloading triton (148.3MiB)
+Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
+Downloading pillow (6.3MiB)
 Downloading nvidia-nvjitlink-cu12 (37.4MiB)
 Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
+Downloading nvidia-cudnn-cu12 (674.0MiB)
 Downloading nvidia-cublas-cu12 (566.8MiB)
+Downloading numpy (16.2MiB)
+Downloading nvidia-nccl-cu12 (307.4MiB)
 Downloading sympy (6.0MiB)
+Downloading matplotlib (8.3MiB)
+Downloading nvidia-cusolver-cu12 (255.1MiB)
+Downloading xformers (111.8MiB)
+Downloading setuptools (1.1MiB)
 Downloading nvidia-cufile-cu12 (1.1MiB)
+Downloading kiwisolver (1.4MiB)
 Downloading fonttools (4.7MiB)
 Downloading torch (846.9MiB)
     Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
    Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
  Downloading nvidia-cufile-cu12
  Downloading kiwisolver
  Downloading setuptools
  Downloading fonttools
+ Downloading networkx
  Downloading pillow
       Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
  Downloading nvidia-cuda-cupti-cu12
  Downloading matplotlib
  Downloading numpy
+ Downloading sympy
  Downloading nvidia-nvjitlink-cu12
  Downloading nvidia-curand-cu12
  Downloading nvidia-cuda-nvrtc-cu12
  Downloading nvidia-cusparse-cu12
  Downloading nvidia-cusparselt-cu12
  Downloading nvidia-nccl-cu12
  Downloading nvidia-cublas-cu12
+ Downloading nvidia-cudnn-cu12
  Downloading torch
+Installed 38 packages in 562ms
 </div>
 </div>
 <div class="cell-artifacts">

flash_attn/results/artifacts/combine/latency.png ADDED Viewed

Git LFS Details

SHA256: 87dbea8f2773d7fcee9fd191cb6e67cd1e2ddd379cef90ee01bb4ac40a55b5f1
Pointer size: 131 Bytes
Size of remote file: 110 kB

flash_attn/results/cells/combine.py ADDED Viewed

	@@ -0,0 +1,96 @@

+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch",
+#     "kernels-benchmark-tools",
+#     "matplotlib",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
+# ///
+import torch
+import sys
+import os
+import kernels_benchmark_tools as kbt
+from pathlib import Path
+# Discover the upstream artifact directories from environment variables
+cache_dirs = {
+    "Flash (PyTorch SDPA)": os.environ.get('UVNOTE_FILE_FLASH_ATTENTION_BENCHMARK'),
+    "MemEff (PyTorch SDPA)": os.environ.get('UVNOTE_FILE_MEM_EFFICIENT_ATTENTION_BENCHMARK'),
+    "Flash Attn 2": os.environ.get('UVNOTE_FILE_FLASH_ATTN2_BENCHMARK'),
+    "xFormers": os.environ.get('UVNOTE_FILE_XFORMERS_BENCHMARK'),
+    "SageAttention": os.environ.get('UVNOTE_FILE_SAGE_ATTENTION_BENCHMARK'),
+    "Compiled (default)": os.environ.get('UVNOTE_FILE_COMPILED_VARIANTS_BENCHMARK_DEFAULT'),
+    "Compiled (max-autotune)": os.environ.get('UVNOTE_FILE_COMPILED_VARIANTS_BENCHMARK_MAX_AUTOTUNE'),
+    "HF Kernels Flash Attn": os.environ.get('UVNOTE_FILE_HF_KERNELS_FLASH_ATTN_BENCHMARK'),
+    "HF Kernels Flash Attn3": os.environ.get('UVNOTE_FILE_HF_KERNELS_FLASH_ATTN3_BENCHMARK'),
+}
+print("LOADING BENCHMARK DATA")
+for name, cache_dir in cache_dirs.items():
+    print(f"{name:30s}: {cache_dir}")
+print()
+# Collect all JSONL paths
+all_paths = []
+file_mapping = {
+    "Flash (PyTorch SDPA)": "attn.jsonl",
+    "MemEff (PyTorch SDPA)": "attn.jsonl",
+    "Flash Attn 2": "attn.jsonl",
+    "xFormers": "attn.jsonl",
+    "SageAttention": "attn.jsonl",
+    "Compiled (default)": "attn_default.jsonl",
+    "Compiled (max-autotune)": "attn_max_autotune.jsonl",
+    "HF Kernels Flash Attn": "attn.jsonl",
+    "HF Kernels Flash Attn3": "attn.jsonl",
+}
+for name, cache_dir in cache_dirs.items():
+    if cache_dir:
+        jsonl_file = file_mapping[name]
+        path = Path(cache_dir) / jsonl_file
+        if path.exists() and path.stat().st_size > 0:
+            all_paths.append(str(path))
+            print(f"✓ Found {name}: {path}")
+        else:
+            print(f"⊘ Empty/Missing {name}: {path}")
+    else:
+        print(f"✗ No cache dir for {name}")
+print()
+if not all_paths:
+    print("ERROR: No benchmark data files found!")
+    sys.exit(1)
+# Generate combined summary
+print("COMBINED BENCHMARK SUMMARY")
+print()
+kbt.summarize(all_paths)
+print()
+print("GENERATING COMBINED VISUALIZATION")
+print()
+try:
+    kbt.viz(all_paths)
+    print("✓ Combined visualization saved as latency.png")
+except ImportError as e:
+    print(f"✗ Visualization requires matplotlib: {e}")
+except Exception as e:
+    print(f"✗ Visualization failed: {e}")
+print()
+print("ANALYSIS COMPLETE")
+print(f"Total implementations analyzed: {len(all_paths)}")
+print(f"\nImplementations included:")
+for name, cache_dir in cache_dirs.items():
+    if cache_dir:
+        jsonl_file = file_mapping[name]
+        path = Path(cache_dir) / jsonl_file
+        if path.exists() and path.stat().st_size > 0:
+            print(f"  ✓ {name}")

flash_attn/results/combined_results.html ADDED Viewed

The diff for this file is too large to render. See raw diff