drbh HF Staff commited on
Commit
9ad2ef6
·
verified ·
1 Parent(s): 0cce993

Upload folder using huggingface_hub

Browse files
flash_attn/impls/artifacts/benchmark/attn.jsonl CHANGED
@@ -1,6 +1,6 @@
1
- {"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.4453760087490082, "p50": 0.45241600275039673, "p90": 0.45257601141929626, "mean": 0.4501312017440796, "reps": 5, "warmup": 2}, "compile_ms": 1.8144960403442383, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
- {"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.4647679924964905, "p50": 0.46665599942207336, "p90": 0.47142401337623596, "mean": 0.46863360404968263, "reps": 5, "warmup": 2}, "compile_ms": 0.3614720106124878, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
- {"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.597823977470398, "p50": 0.6007360219955444, "p90": 0.6015999913215637, "mean": 0.6010496020317078, "reps": 5, "warmup": 2}, "compile_ms": 0.4886080026626587, "peak_bytes": 99680256, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.905726432800293e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
- {"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6014080047607422, "p50": 0.6025919914245605, "p90": 0.6026239991188049, "mean": 0.6072191953659057, "reps": 5, "warmup": 2}, "compile_ms": 0.4956800043582916, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
- {"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6389120221138, "p50": 0.6423360109329224, "p90": 0.6447039842605591, "mean": 0.6453696012496948, "reps": 5, "warmup": 2}, "compile_ms": 0.532256007194519, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003566741943359375, "mse": 2.86102294921875e-06, "ref": "sdpa_math_fp32"}, "err": null}
6
- {"ts": "2025-10-02T15:06:53Z", "run": "51b5e112d90d4002b860aa287067aed6", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6434879899024963, "p50": 0.6450560092926025, "p90": 0.6518719792366028, "mean": 0.6475072026252746, "reps": 5, "warmup": 2}, "compile_ms": 0.535040020942688, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8759241104125977e-06, "ref": "sdpa_math_fp32"}, "err": null}
 
1
+ {"ts": "2025-10-02T15:53:41Z", "run": "110abee5a11144f086ff362569489d61", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.4424000084400177, "p50": 0.4480000138282776, "p90": 0.45020800828933716, "mean": 0.448172801733017, "reps": 5, "warmup": 2}, "compile_ms": 1.8151999711990356, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
+ {"ts": "2025-10-02T15:53:41Z", "run": "110abee5a11144f086ff362569489d61", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.46480000019073486, "p50": 0.4689280092716217, "p90": 0.47071999311447144, "mean": 0.46839680075645446, "reps": 5, "warmup": 2}, "compile_ms": 0.35923200845718384, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
+ {"ts": "2025-10-02T15:53:41Z", "run": "110abee5a11144f086ff362569489d61", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5970879793167114, "p50": 0.5986559987068176, "p90": 0.6020799875259399, "mean": 0.6001919984817505, "reps": 5, "warmup": 2}, "compile_ms": 0.48611199855804443, "peak_bytes": 99680256, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.905726432800293e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
+ {"ts": "2025-10-02T15:53:41Z", "run": "110abee5a11144f086ff362569489d61", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5994560122489929, "p50": 0.6028159856796265, "p90": 0.6028800010681152, "mean": 0.6018815994262695, "reps": 5, "warmup": 2}, "compile_ms": 0.49404799938201904, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
+ {"ts": "2025-10-02T15:53:41Z", "run": "110abee5a11144f086ff362569489d61", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6379839777946472, "p50": 0.6402559876441956, "p90": 0.6423360109329224, "mean": 0.6404095888137817, "reps": 5, "warmup": 2}, "compile_ms": 0.531391978263855, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003566741943359375, "mse": 2.86102294921875e-06, "ref": "sdpa_math_fp32"}, "err": null}
6
+ {"ts": "2025-10-02T15:53:41Z", "run": "110abee5a11144f086ff362569489d61", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6431040167808533, "p50": 0.6442880034446716, "p90": 0.6445119976997375, "mean": 0.644704008102417, "reps": 5, "warmup": 2}, "compile_ms": 0.5358719825744629, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8759241104125977e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/artifacts/benchmark_default/attn_default.jsonl CHANGED
@@ -1,6 +1,6 @@
1
- {"ts": "2025-10-02T15:02:05Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5145599842071533, "p50": 0.5220479965209961, "p90": 0.5232319831848145, "mean": 0.5199103951454163, "reps": 5, "warmup": 2}, "compile_ms": 3343.085205078125, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000339508056640625, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
- {"ts": "2025-10-02T15:02:06Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5552319884300232, "p50": 0.5602560043334961, "p90": 0.5604159832000732, "mean": 0.5585088014602662, "reps": 5, "warmup": 2}, "compile_ms": 471.8746032714844, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
- {"ts": "2025-10-02T15:02:06Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6817600131034851, "p50": 0.6845120191574097, "p90": 0.6866880059242249, "mean": 0.6862144112586975, "reps": 5, "warmup": 2}, "compile_ms": 469.6441650390625, "peak_bytes": 99876864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
- {"ts": "2025-10-02T15:02:07Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7130560278892517, "p50": 0.7160000205039978, "p90": 0.7172480225563049, "mean": 0.7158400177955627, "reps": 5, "warmup": 2}, "compile_ms": 471.8545227050781, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
- {"ts": "2025-10-02T15:02:07Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7383360266685486, "p50": 0.746783971786499, "p90": 0.7520319819450378, "mean": 0.7461183905601502, "reps": 5, "warmup": 2}, "compile_ms": 473.72625732421875, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
6
- {"ts": "2025-10-02T15:02:08Z", "run": "286ea637ebf04d38bebc7563bb8ba8de", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7711359858512878, "p50": 0.7734079957008362, "p90": 0.7748159766197205, "mean": 0.7733887910842896, "reps": 5, "warmup": 2}, "compile_ms": 476.75982666015625, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
 
1
+ {"ts": "2025-10-02T15:50:57Z", "run": "072dd2e8601f475db00e349e59df9f0c", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5143679976463318, "p50": 0.5232959985733032, "p90": 0.5257599949836731, "mean": 0.5211328029632568, "reps": 5, "warmup": 2}, "compile_ms": 3112.67236328125, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000339508056640625, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
+ {"ts": "2025-10-02T15:50:57Z", "run": "072dd2e8601f475db00e349e59df9f0c", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5569279789924622, "p50": 0.558784008026123, "p90": 0.5599679946899414, "mean": 0.5588735938072205, "reps": 5, "warmup": 2}, "compile_ms": 272.2660217285156, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
+ {"ts": "2025-10-02T15:50:58Z", "run": "072dd2e8601f475db00e349e59df9f0c", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.685375988483429, "p50": 0.6888960003852844, "p90": 0.6940159797668457, "mean": 0.6904960036277771, "reps": 5, "warmup": 2}, "compile_ms": 272.7831726074219, "peak_bytes": 99876864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
+ {"ts": "2025-10-02T15:50:58Z", "run": "072dd2e8601f475db00e349e59df9f0c", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7146559953689575, "p50": 0.7190399765968323, "p90": 0.7200639843940735, "mean": 0.7184319853782654, "reps": 5, "warmup": 2}, "compile_ms": 270.6763916015625, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
+ {"ts": "2025-10-02T15:50:58Z", "run": "072dd2e8601f475db00e349e59df9f0c", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.739359974861145, "p50": 0.7402240037918091, "p90": 0.7426239848136902, "mean": 0.741484797000885, "reps": 5, "warmup": 2}, "compile_ms": 270.3490295410156, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
6
+ {"ts": "2025-10-02T15:50:58Z", "run": "072dd2e8601f475db00e349e59df9f0c", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7703679800033569, "p50": 0.7723519802093506, "p90": 0.7728000283241272, "mean": 0.7723968029022217, "reps": 5, "warmup": 2}, "compile_ms": 269.7756652832031, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/artifacts/benchmark_max_autotune/attn_max_autotune.jsonl CHANGED
@@ -1,6 +1,6 @@
1
- {"ts": "2025-10-02T15:02:53Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6211519837379456, "p50": 0.6424639821052551, "p90": 0.6726719737052917, "mean": 0.6559999942779541, "reps": 5, "warmup": 2}, "compile_ms": 4537.6962890625, "peak_bytes": 70779904, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000339508056640625, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
- {"ts": "2025-10-02T15:02:55Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.665503978729248, "p50": 0.6812480092048645, "p90": 0.7109439969062805, "mean": 0.7009024024009705, "reps": 5, "warmup": 2}, "compile_ms": 1491.3409423828125, "peak_bytes": 78644224, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
- {"ts": "2025-10-02T15:02:56Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8014079928398132, "p50": 0.8136320114135742, "p90": 0.8414080142974854, "mean": 0.8342463970184326, "reps": 5, "warmup": 2}, "compile_ms": 1269.2235107421875, "peak_bytes": 84280320, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
- {"ts": "2025-10-02T15:02:58Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8420799970626831, "p50": 0.8514879941940308, "p90": 0.8752319812774658, "mean": 0.8708159923553467, "reps": 5, "warmup": 2}, "compile_ms": 1631.2921142578125, "peak_bytes": 86508544, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
- {"ts": "2025-10-02T15:03:00Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8840640187263489, "p50": 0.8960639834403992, "p90": 0.9062719941139221, "mean": 0.9071423888206482, "reps": 5, "warmup": 2}, "compile_ms": 1919.3294677734375, "peak_bytes": 90440704, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
6
- {"ts": "2025-10-02T15:03:02Z", "run": "e3e874218b9e459a95cc24c1c341cb43", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.9141759872436523, "p50": 0.9165440201759338, "p90": 0.9380800127983093, "mean": 0.9373440027236939, "reps": 5, "warmup": 2}, "compile_ms": 1484.717529296875, "peak_bytes": 94372864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
 
1
+ {"ts": "2025-10-02T15:50:03Z", "run": "9ea21ad802cc490893a0c45ca82ce166", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.63155198097229, "p50": 0.6451839804649353, "p90": 0.665727972984314, "mean": 0.6618239879608154, "reps": 5, "warmup": 2}, "compile_ms": 4977.1767578125, "peak_bytes": 70779904, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000339508056640625, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
+ {"ts": "2025-10-02T15:50:05Z", "run": "9ea21ad802cc490893a0c45ca82ce166", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6628159880638123, "p50": 0.6843519806861877, "p90": 0.7063680291175842, "mean": 0.7008576035499573, "reps": 5, "warmup": 2}, "compile_ms": 1701.4315185546875, "peak_bytes": 78644224, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
+ {"ts": "2025-10-02T15:50:07Z", "run": "9ea21ad802cc490893a0c45ca82ce166", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8050559759140015, "p50": 0.8155199885368347, "p90": 0.8389120101928711, "mean": 0.833843195438385, "reps": 5, "warmup": 2}, "compile_ms": 1701.230712890625, "peak_bytes": 84280320, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
+ {"ts": "2025-10-02T15:50:09Z", "run": "9ea21ad802cc490893a0c45ca82ce166", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8313599824905396, "p50": 0.849407970905304, "p90": 0.8810880184173584, "mean": 0.8694527983665467, "reps": 5, "warmup": 2}, "compile_ms": 2027.875, "peak_bytes": 86508544, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
+ {"ts": "2025-10-02T15:50:11Z", "run": "9ea21ad802cc490893a0c45ca82ce166", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8768960237503052, "p50": 0.8824639916419983, "p90": 0.9011520147323608, "mean": 0.9017536044120789, "reps": 5, "warmup": 2}, "compile_ms": 2269.297607421875, "peak_bytes": 90440704, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
6
+ {"ts": "2025-10-02T15:50:13Z", "run": "9ea21ad802cc490893a0c45ca82ce166", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.9179520010948181, "p50": 0.9188479781150818, "p90": 0.9378560185432434, "mean": 0.9400512099266052, "reps": 5, "warmup": 2}, "compile_ms": 1835.313720703125, "peak_bytes": 94372864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/compiled_variants.html CHANGED
@@ -3711,7 +3711,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3711
  <span onclick="toggleOutput('benchmark_default')" style="cursor: pointer;">▼ output</span>
3712
  <span id="uv-indicator-benchmark_default" onclick="toggleUvLogsFromHeader('benchmark_default')" style="cursor: pointer;">▶ uv-logs</span>
3713
  </span> |
3714
- Cell: benchmark_default | 46.71s
3715
  | <button class="run-btn" onclick="runCell('benchmark_default')">▶ run</button>
3716
  <button class="copy-btn" onclick="copyCell('benchmark_default')">Copy</button>
3717
  <a href="cells/benchmark_default.py" target="_blank" class="raw-btn">Raw</a>
@@ -3797,9 +3797,9 @@ Cell: benchmark_default | 46.71s
3797
  <div class="cell-stdout">impl wl p50(ms) ok
3798
  torch_flash_compiled_default flux_L128 0.52 True
3799
  torch_flash_compiled_default flux_L256 0.56 True
3800
- torch_flash_compiled_default flux_L320 0.68 True
3801
  torch_flash_compiled_default flux_L384 0.72 True
3802
- torch_flash_compiled_default flux_L448 0.75 True
3803
  torch_flash_compiled_default flux_L512 0.77 True
3804
  </div>
3805
  <div class="uv-install-logs" id="uv-logs-benchmark_default">
@@ -3807,34 +3807,34 @@ torch_flash_compiled_default flux_L512 0.77 True
3807
  <div class="uv-logs-content" style="display: none;">
3808
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3809
  Downloading triton (148.3MiB)
3810
- Downloading numpy (16.2MiB)
3811
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3812
- Downloading sympy (6.0MiB)
 
3813
  Downloading nvidia-cufile-cu12 (1.1MiB)
3814
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3815
  Downloading nvidia-cufft-cu12 (184.2MiB)
3816
- Downloading fonttools (4.7MiB)
3817
- Downloading nvidia-cusparse-cu12 (274.9MiB)
3818
  Downloading nvidia-cudnn-cu12 (674.0MiB)
3819
- Downloading pillow (6.3MiB)
3820
- Downloading nvidia-curand-cu12 (60.7MiB)
3821
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3822
- Downloading nvidia-cublas-cu12 (566.8MiB)
3823
  Downloading setuptools (1.1MiB)
3824
- Downloading nvidia-nccl-cu12 (307.4MiB)
3825
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
3826
  Downloading matplotlib (8.3MiB)
3827
- Downloading networkx (1.9MiB)
 
3828
  Downloading nvidia-cusolver-cu12 (255.1MiB)
3829
- Downloading torch (846.9MiB)
3830
- Downloading kiwisolver (1.4MiB)
 
 
 
 
 
 
3831
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3832
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3833
  Downloading nvidia-cufile-cu12
3834
  Downloading kiwisolver
3835
  Downloading setuptools
3836
- Downloading fonttools
3837
  Downloading networkx
 
3838
  Downloading pillow
3839
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3840
  Downloading nvidia-cuda-cupti-cu12
@@ -3847,13 +3847,13 @@ Downloading kiwisolver (1.4MiB)
3847
  Downloading triton
3848
  Downloading nvidia-cufft-cu12
3849
  Downloading nvidia-cusolver-cu12
3850
- Downloading nvidia-cusparse-cu12
3851
  Downloading nvidia-cusparselt-cu12
 
3852
  Downloading nvidia-nccl-cu12
3853
  Downloading nvidia-cublas-cu12
3854
  Downloading nvidia-cudnn-cu12
3855
  Downloading torch
3856
- Installed 37 packages in 545ms
3857
  </div>
3858
  </div>
3859
  <div class="cell-artifacts">
@@ -3871,7 +3871,7 @@ Installed 37 packages in 545ms
3871
  <span onclick="toggleOutput('benchmark_max_autotune')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-benchmark_max_autotune" onclick="toggleUvLogsFromHeader('benchmark_max_autotune')" style="cursor: pointer;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: benchmark_max_autotune | 53.95s
3875
  | <button class="run-btn" onclick="runCell('benchmark_max_autotune')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark_max_autotune')">Copy</button>
3877
  <a href="cells/benchmark_max_autotune.py" target="_blank" class="raw-btn">Raw</a>
@@ -3955,52 +3955,52 @@ Cell: benchmark_max_autotune | 53.95s
3955
  </div>
3956
  <div id="output-benchmark_max_autotune" class="cell-output">
3957
  <div class="cell-stdout">impl wl p50(ms) ok
3958
- torch_flash_compiled_max_autotune flux_L128 0.64 True
3959
  torch_flash_compiled_max_autotune flux_L256 0.68 True
3960
- torch_flash_compiled_max_autotune flux_L320 0.81 True
3961
  torch_flash_compiled_max_autotune flux_L384 0.85 True
3962
- torch_flash_compiled_max_autotune flux_L448 0.90 True
3963
  torch_flash_compiled_max_autotune flux_L512 0.92 True
3964
  </div>
3965
  <div class="uv-install-logs" id="uv-logs-benchmark_max_autotune">
3966
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3967
  <div class="uv-logs-content" style="display: none;">
3968
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3969
- Downloading nvidia-cusparse-cu12 (274.9MiB)
3970
- Downloading numpy (16.2MiB)
3971
  Downloading matplotlib (8.3MiB)
3972
- Downloading kiwisolver (1.4MiB)
3973
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3974
- Downloading nvidia-nccl-cu12 (307.4MiB)
3975
  Downloading setuptools (1.1MiB)
3976
- Downloading pillow (6.3MiB)
3977
- Downloading sympy (6.0MiB)
 
3978
  Downloading fonttools (4.7MiB)
 
 
 
 
 
 
3979
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3980
- Downloading nvidia-curand-cu12 (60.7MiB)
3981
- Downloading nvidia-cublas-cu12 (566.8MiB)
3982
  Downloading networkx (1.9MiB)
3983
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3984
- Downloading nvidia-cufile-cu12 (1.1MiB)
3985
- Downloading triton (148.3MiB)
3986
  Downloading torch (846.9MiB)
 
 
 
 
3987
  Downloading nvidia-cusolver-cu12 (255.1MiB)
3988
- Downloading nvidia-cufft-cu12 (184.2MiB)
3989
- Downloading nvidia-cudnn-cu12 (674.0MiB)
3990
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
3991
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3992
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3993
  Downloading nvidia-cufile-cu12
3994
  Downloading kiwisolver
3995
  Downloading setuptools
3996
- Downloading networkx
3997
  Downloading fonttools
 
3998
  Downloading pillow
3999
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
4000
- Downloading nvidia-cuda-cupti-cu12
4001
  Downloading matplotlib
4002
- Downloading numpy
4003
  Downloading sympy
 
4004
  Downloading nvidia-nvjitlink-cu12
4005
  Downloading nvidia-curand-cu12
4006
  Downloading nvidia-cuda-nvrtc-cu12
@@ -4013,7 +4013,7 @@ Downloading nvidia-cusparselt-cu12 (273.9MiB)
4013
  Downloading nvidia-cublas-cu12
4014
  Downloading nvidia-cudnn-cu12
4015
  Downloading torch
4016
- Installed 37 packages in 526ms
4017
  </div>
4018
  </div>
4019
  <div class="cell-artifacts">
 
3711
  <span onclick="toggleOutput('benchmark_default')" style="cursor: pointer;">▼ output</span>
3712
  <span id="uv-indicator-benchmark_default" onclick="toggleUvLogsFromHeader('benchmark_default')" style="cursor: pointer;">▶ uv-logs</span>
3713
  </span> |
3714
+ Cell: benchmark_default | 44.25s
3715
  | <button class="run-btn" onclick="runCell('benchmark_default')">▶ run</button>
3716
  <button class="copy-btn" onclick="copyCell('benchmark_default')">Copy</button>
3717
  <a href="cells/benchmark_default.py" target="_blank" class="raw-btn">Raw</a>
 
3797
  <div class="cell-stdout">impl wl p50(ms) ok
3798
  torch_flash_compiled_default flux_L128 0.52 True
3799
  torch_flash_compiled_default flux_L256 0.56 True
3800
+ torch_flash_compiled_default flux_L320 0.69 True
3801
  torch_flash_compiled_default flux_L384 0.72 True
3802
+ torch_flash_compiled_default flux_L448 0.74 True
3803
  torch_flash_compiled_default flux_L512 0.77 True
3804
  </div>
3805
  <div class="uv-install-logs" id="uv-logs-benchmark_default">
 
3807
  <div class="uv-logs-content" style="display: none;">
3808
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3809
  Downloading triton (148.3MiB)
3810
+ Downloading torch (846.9MiB)
3811
+ Downloading kiwisolver (1.4MiB)
3812
+ Downloading fonttools (4.7MiB)
3813
+ Downloading nvidia-cublas-cu12 (566.8MiB)
3814
  Downloading nvidia-cufile-cu12 (1.1MiB)
3815
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3816
  Downloading nvidia-cufft-cu12 (184.2MiB)
 
 
3817
  Downloading nvidia-cudnn-cu12 (674.0MiB)
 
 
 
 
3818
  Downloading setuptools (1.1MiB)
 
 
3819
  Downloading matplotlib (8.3MiB)
3820
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
3821
+ Downloading nvidia-nccl-cu12 (307.4MiB)
3822
  Downloading nvidia-cusolver-cu12 (255.1MiB)
3823
+ Downloading numpy (16.2MiB)
3824
+ Downloading sympy (6.0MiB)
3825
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3826
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
3827
+ Downloading nvidia-curand-cu12 (60.7MiB)
3828
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3829
+ Downloading networkx (1.9MiB)
3830
+ Downloading pillow (6.3MiB)
3831
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3832
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3833
  Downloading nvidia-cufile-cu12
3834
  Downloading kiwisolver
3835
  Downloading setuptools
 
3836
  Downloading networkx
3837
+ Downloading fonttools
3838
  Downloading pillow
3839
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3840
  Downloading nvidia-cuda-cupti-cu12
 
3847
  Downloading triton
3848
  Downloading nvidia-cufft-cu12
3849
  Downloading nvidia-cusolver-cu12
 
3850
  Downloading nvidia-cusparselt-cu12
3851
+ Downloading nvidia-cusparse-cu12
3852
  Downloading nvidia-nccl-cu12
3853
  Downloading nvidia-cublas-cu12
3854
  Downloading nvidia-cudnn-cu12
3855
  Downloading torch
3856
+ Installed 37 packages in 516ms
3857
  </div>
3858
  </div>
3859
  <div class="cell-artifacts">
 
3871
  <span onclick="toggleOutput('benchmark_max_autotune')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-benchmark_max_autotune" onclick="toggleUvLogsFromHeader('benchmark_max_autotune')" style="cursor: pointer;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: benchmark_max_autotune | 56.94s
3875
  | <button class="run-btn" onclick="runCell('benchmark_max_autotune')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark_max_autotune')">Copy</button>
3877
  <a href="cells/benchmark_max_autotune.py" target="_blank" class="raw-btn">Raw</a>
 
3955
  </div>
3956
  <div id="output-benchmark_max_autotune" class="cell-output">
3957
  <div class="cell-stdout">impl wl p50(ms) ok
3958
+ torch_flash_compiled_max_autotune flux_L128 0.65 True
3959
  torch_flash_compiled_max_autotune flux_L256 0.68 True
3960
+ torch_flash_compiled_max_autotune flux_L320 0.82 True
3961
  torch_flash_compiled_max_autotune flux_L384 0.85 True
3962
+ torch_flash_compiled_max_autotune flux_L448 0.88 True
3963
  torch_flash_compiled_max_autotune flux_L512 0.92 True
3964
  </div>
3965
  <div class="uv-install-logs" id="uv-logs-benchmark_max_autotune">
3966
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3967
  <div class="uv-logs-content" style="display: none;">
3968
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
 
 
3969
  Downloading matplotlib (8.3MiB)
 
 
 
3970
  Downloading setuptools (1.1MiB)
3971
+ Downloading nvidia-cublas-cu12 (566.8MiB)
3972
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
3973
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3974
  Downloading fonttools (4.7MiB)
3975
+ Downloading numpy (16.2MiB)
3976
+ Downloading pillow (6.3MiB)
3977
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
3978
+ Downloading nvidia-nccl-cu12 (307.4MiB)
3979
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
3980
+ Downloading nvidia-cufft-cu12 (184.2MiB)
3981
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
 
 
3982
  Downloading networkx (1.9MiB)
 
 
 
3983
  Downloading torch (846.9MiB)
3984
+ Downloading triton (148.3MiB)
3985
+ Downloading nvidia-cufile-cu12 (1.1MiB)
3986
+ Downloading kiwisolver (1.4MiB)
3987
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3988
  Downloading nvidia-cusolver-cu12 (255.1MiB)
3989
+ Downloading nvidia-curand-cu12 (60.7MiB)
3990
+ Downloading sympy (6.0MiB)
 
3991
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3992
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3993
  Downloading nvidia-cufile-cu12
3994
  Downloading kiwisolver
3995
  Downloading setuptools
 
3996
  Downloading fonttools
3997
+ Downloading networkx
3998
  Downloading pillow
3999
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
 
4000
  Downloading matplotlib
4001
+ Downloading nvidia-cuda-cupti-cu12
4002
  Downloading sympy
4003
+ Downloading numpy
4004
  Downloading nvidia-nvjitlink-cu12
4005
  Downloading nvidia-curand-cu12
4006
  Downloading nvidia-cuda-nvrtc-cu12
 
4013
  Downloading nvidia-cublas-cu12
4014
  Downloading nvidia-cudnn-cu12
4015
  Downloading torch
4016
+ Installed 37 packages in 547ms
4017
  </div>
4018
  </div>
4019
  <div class="cell-artifacts">
flash_attn/impls/flash_attention.html CHANGED
@@ -3710,7 +3710,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3710
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3711
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3712
  </span> |
3713
- Cell: nv | 0.68s
3714
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3715
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3716
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3726,7 +3726,7 @@ Cell: nv | 0.68s
3726
  </div>
3727
  </div>
3728
  <div id="output-nv" class="cell-output">
3729
- <div class="cell-stdout">Thu Oct 2 15:03:41 2025
3730
  +-----------------------------------------------------------------------------------------+
3731
  | NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 |
3732
  |-----------------------------------------+------------------------+----------------------+
@@ -3735,19 +3735,19 @@ Cell: nv | 0.68s
3735
  | | | MIG M. |
3736
  |=========================================+========================+======================|
3737
  | 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 |
3738
- | 0% 31C P0 87W / 300W | 0MiB / 23028MiB | 0% Default |
3739
  | | | N/A |
3740
  +-----------------------------------------+------------------------+----------------------+
3741
  | 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 |
3742
- | 0% 25C P8 23W / 300W | 0MiB / 23028MiB | 0% Default |
3743
  | | | N/A |
3744
  +-----------------------------------------+------------------------+----------------------+
3745
  | 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 |
3746
- | 0% 26C P8 23W / 300W | 0MiB / 23028MiB | 0% Default |
3747
  | | | N/A |
3748
  +-----------------------------------------+------------------------+----------------------+
3749
  | 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 |
3750
- | 0% 25C P8 24W / 300W | 0MiB / 23028MiB | 0% Default |
3751
  | | | N/A |
3752
  +-----------------------------------------+------------------------+----------------------+
3753
 
@@ -3771,7 +3771,7 @@ Cell: nv | 0.68s
3771
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3772
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3773
  </span> |
3774
- Cell: benchmark | 35.67s
3775
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3776
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3777
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3862,28 +3862,28 @@ torch_flash_ma flux_L512 0.74 True
3862
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3863
  <div class="uv-logs-content" style="display: none;">
3864
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3865
- Downloading networkx (1.9MiB)
3866
- Downloading nvidia-cufile-cu12 (1.1MiB)
3867
- Downloading nvidia-cufft-cu12 (184.2MiB)
3868
- Downloading nvidia-cublas-cu12 (566.8MiB)
3869
- Downloading nvidia-nccl-cu12 (307.4MiB)
3870
- Downloading nvidia-cudnn-cu12 (674.0MiB)
3871
  Downloading nvidia-cusparse-cu12 (274.9MiB)
 
3872
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
3873
  Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
 
 
 
 
 
3874
  Downloading setuptools (1.1MiB)
3875
- Downloading sympy (6.0MiB)
 
 
 
3876
  Downloading nvidia-cusolver-cu12 (255.1MiB)
 
 
3877
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3878
- Downloading pillow (6.3MiB)
3879
  Downloading kiwisolver (1.4MiB)
3880
- Downloading numpy (16.2MiB)
3881
- Downloading matplotlib (8.3MiB)
3882
- Downloading fonttools (4.7MiB)
3883
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3884
  Downloading torch (846.9MiB)
3885
  Downloading triton (148.3MiB)
3886
- Downloading nvidia-curand-cu12 (60.7MiB)
3887
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3888
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3889
  Downloading nvidia-cufile-cu12
@@ -3909,7 +3909,7 @@ Downloading nvidia-curand-cu12 (60.7MiB)
3909
  Downloading nvidia-cublas-cu12
3910
  Downloading nvidia-cudnn-cu12
3911
  Downloading torch
3912
- Installed 37 packages in 560ms
3913
  </div>
3914
  </div>
3915
  <div class="cell-artifacts">
 
3710
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3711
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3712
  </span> |
3713
+ Cell: nv | 0.66s
3714
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3715
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3716
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3726
  </div>
3727
  </div>
3728
  <div id="output-nv" class="cell-output">
3729
+ <div class="cell-stdout">Thu Oct 2 15:53:02 2025
3730
  +-----------------------------------------------------------------------------------------+
3731
  | NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 |
3732
  |-----------------------------------------+------------------------+----------------------+
 
3735
  | | | MIG M. |
3736
  |=========================================+========================+======================|
3737
  | 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 |
3738
+ | 0% 29C P0 87W / 300W | 0MiB / 23028MiB | 0% Default |
3739
  | | | N/A |
3740
  +-----------------------------------------+------------------------+----------------------+
3741
  | 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 |
3742
+ | 0% 25C P8 24W / 300W | 0MiB / 23028MiB | 0% Default |
3743
  | | | N/A |
3744
  +-----------------------------------------+------------------------+----------------------+
3745
  | 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 |
3746
+ | 0% 25C P8 23W / 300W | 0MiB / 23028MiB | 0% Default |
3747
  | | | N/A |
3748
  +-----------------------------------------+------------------------+----------------------+
3749
  | 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 |
3750
+ | 0% 25C P8 23W / 300W | 0MiB / 23028MiB | 0% Default |
3751
  | | | N/A |
3752
  +-----------------------------------------+------------------------+----------------------+
3753
 
 
3771
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3772
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3773
  </span> |
3774
+ Cell: benchmark | 37.94s
3775
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3776
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3777
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3862
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3863
  <div class="uv-logs-content" style="display: none;">
3864
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
 
 
 
 
 
 
3865
  Downloading nvidia-cusparse-cu12 (274.9MiB)
3866
+ Downloading sympy (6.0MiB)
3867
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
3868
  Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3869
+ Downloading nvidia-nccl-cu12 (307.4MiB)
3870
+ Downloading networkx (1.9MiB)
3871
+ Downloading fonttools (4.7MiB)
3872
+ Downloading matplotlib (8.3MiB)
3873
+ Downloading nvidia-cufft-cu12 (184.2MiB)
3874
  Downloading setuptools (1.1MiB)
3875
+ Downloading pillow (6.3MiB)
3876
+ Downloading nvidia-cublas-cu12 (566.8MiB)
3877
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3878
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
3879
  Downloading nvidia-cusolver-cu12 (255.1MiB)
3880
+ Downloading numpy (16.2MiB)
3881
+ Downloading nvidia-cufile-cu12 (1.1MiB)
3882
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3883
+ Downloading nvidia-curand-cu12 (60.7MiB)
3884
  Downloading kiwisolver (1.4MiB)
 
 
 
 
3885
  Downloading torch (846.9MiB)
3886
  Downloading triton (148.3MiB)
 
3887
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3888
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3889
  Downloading nvidia-cufile-cu12
 
3909
  Downloading nvidia-cublas-cu12
3910
  Downloading nvidia-cudnn-cu12
3911
  Downloading torch
3912
+ Installed 37 packages in 567ms
3913
  </div>
3914
  </div>
3915
  <div class="cell-artifacts">
flash_attn/impls/hf_kernels_flash_attn.html CHANGED
@@ -3710,7 +3710,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3710
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3711
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3712
  </span> |
3713
- Cell: benchmark | 37.93s
3714
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3715
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3716
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3796,48 +3796,48 @@ Cell: benchmark | 37.93s
3796
  </div>
3797
  <div id="output-benchmark" class="cell-output">
3798
  <div class="cell-stdout">impl wl p50(ms) ok
3799
- hf_kernels_flash_attn flux_L128 0.35 True
3800
- hf_kernels_flash_attn flux_L256 0.38 True
3801
- hf_kernels_flash_attn flux_L320 0.50 True
3802
- hf_kernels_flash_attn flux_L384 0.52 True
3803
- hf_kernels_flash_attn flux_L448 0.54 True
3804
  hf_kernels_flash_attn flux_L512 0.56 True
3805
  </div>
3806
  <div class="uv-install-logs" id="uv-logs-benchmark">
3807
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3808
  <div class="uv-logs-content" style="display: none;">
3809
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3810
- Downloading sympy (6.0MiB)
3811
- Downloading networkx (1.9MiB)
3812
- Downloading nvidia-cufile-cu12 (1.1MiB)
3813
- Downloading fonttools (4.7MiB)
3814
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3815
- Downloading nvidia-curand-cu12 (60.7MiB)
3816
- Downloading pillow (6.3MiB)
3817
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3818
  Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3819
  Downloading nvidia-cusolver-cu12 (255.1MiB)
 
 
 
 
 
3820
  Downloading triton (148.3MiB)
3821
- Downloading setuptools (1.1MiB)
3822
- Downloading kiwisolver (1.4MiB)
3823
  Downloading nvidia-cudnn-cu12 (674.0MiB)
 
3824
  Downloading nvidia-cusparse-cu12 (274.9MiB)
3825
- Downloading nvidia-cufft-cu12 (184.2MiB)
3826
- Downloading matplotlib (8.3MiB)
3827
- Downloading torch (846.9MiB)
3828
- Downloading hf-xet (3.0MiB)
3829
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
3830
- Downloading numpy (16.2MiB)
3831
- Downloading nvidia-nccl-cu12 (307.4MiB)
3832
  Downloading nvidia-cublas-cu12 (566.8MiB)
 
 
 
3833
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3834
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3835
  Downloading nvidia-cufile-cu12
3836
  Downloading kiwisolver
3837
  Downloading hf-xet
3838
  Downloading setuptools
3839
- Downloading fonttools
3840
  Downloading networkx
 
3841
  Downloading pillow
3842
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3843
  Downloading nvidia-cuda-cupti-cu12
@@ -3856,13 +3856,13 @@ Downloading nvidia-cublas-cu12 (566.8MiB)
3856
  Downloading nvidia-cublas-cu12
3857
  Downloading nvidia-cudnn-cu12
3858
  Downloading torch
3859
- Installed 47 packages in 457ms
3860
  </div>
3861
  </div>
3862
  <div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00&lt;?, ?it/s]
3863
- Fetching 20 files: 5%|▌ | 1/20 [00:00&lt;00:03, 6.33it/s]
3864
- Fetching 20 files: 10%|█ | 2/20 [00:01&lt;00:10, 1.75it/s]
3865
- Fetching 20 files: 100%|██████████| 20/20 [00:01&lt;00:00, 19.65it/s]</div>
3866
  <div class="cell-artifacts">
3867
  <h4>Artifacts:</h4>
3868
  <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
 
3710
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3711
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3712
  </span> |
3713
+ Cell: benchmark | 38.08s
3714
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3715
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3716
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3796
  </div>
3797
  <div id="output-benchmark" class="cell-output">
3798
  <div class="cell-stdout">impl wl p50(ms) ok
3799
+ hf_kernels_flash_attn flux_L128 0.34 True
3800
+ hf_kernels_flash_attn flux_L256 0.37 True
3801
+ hf_kernels_flash_attn flux_L320 0.49 True
3802
+ hf_kernels_flash_attn flux_L384 0.51 True
3803
+ hf_kernels_flash_attn flux_L448 0.53 True
3804
  hf_kernels_flash_attn flux_L512 0.56 True
3805
  </div>
3806
  <div class="uv-install-logs" id="uv-logs-benchmark">
3807
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3808
  <div class="uv-logs-content" style="display: none;">
3809
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3810
+ Downloading nvidia-cufft-cu12 (184.2MiB)
3811
+ Downloading numpy (16.2MiB)
3812
+ Downloading setuptools (1.1MiB)
3813
+ Downloading hf-xet (3.0MiB)
 
 
 
 
3814
  Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3815
  Downloading nvidia-cusolver-cu12 (255.1MiB)
3816
+ Downloading nvidia-curand-cu12 (60.7MiB)
3817
+ Downloading networkx (1.9MiB)
3818
+ Downloading nvidia-nccl-cu12 (307.4MiB)
3819
+ Downloading torch (846.9MiB)
3820
+ Downloading nvidia-cufile-cu12 (1.1MiB)
3821
  Downloading triton (148.3MiB)
3822
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3823
+ Downloading sympy (6.0MiB)
3824
  Downloading nvidia-cudnn-cu12 (674.0MiB)
3825
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3826
  Downloading nvidia-cusparse-cu12 (274.9MiB)
 
 
 
 
3827
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
3828
+ Downloading kiwisolver (1.4MiB)
 
3829
  Downloading nvidia-cublas-cu12 (566.8MiB)
3830
+ Downloading pillow (6.3MiB)
3831
+ Downloading fonttools (4.7MiB)
3832
+ Downloading matplotlib (8.3MiB)
3833
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3834
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3835
  Downloading nvidia-cufile-cu12
3836
  Downloading kiwisolver
3837
  Downloading hf-xet
3838
  Downloading setuptools
 
3839
  Downloading networkx
3840
+ Downloading fonttools
3841
  Downloading pillow
3842
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3843
  Downloading nvidia-cuda-cupti-cu12
 
3856
  Downloading nvidia-cublas-cu12
3857
  Downloading nvidia-cudnn-cu12
3858
  Downloading torch
3859
+ Installed 47 packages in 519ms
3860
  </div>
3861
  </div>
3862
  <div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00&lt;?, ?it/s]
3863
+ Fetching 20 files: 5%|▌ | 1/20 [00:00&lt;00:06, 2.87it/s]
3864
+ Fetching 20 files: 10%|█ | 2/20 [00:01&lt;00:12, 1.49it/s]
3865
+ Fetching 20 files: 100%|██████████| 20/20 [00:01&lt;00:00, 16.01it/s]</div>
3866
  <div class="cell-artifacts">
3867
  <h4>Artifacts:</h4>
3868
  <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
flash_attn/impls/hf_kernels_flash_attn3.html CHANGED
@@ -3710,7 +3710,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3710
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3711
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3712
  </span> |
3713
- Cell: benchmark | 38.68s
3714
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3715
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3716
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3807,61 +3807,61 @@ hf_kernels_flash_attn3 flux_L512 0.57 True
3807
  <div class="uv-logs-content" style="display: none;">
3808
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3809
  Downloading sympy (6.0MiB)
 
 
 
 
 
 
3810
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3811
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3812
- Downloading numpy (16.2MiB)
3813
  Downloading nvidia-cufft-cu12 (184.2MiB)
3814
- Downloading torch (846.9MiB)
3815
- Downloading nvidia-cufile-cu12 (1.1MiB)
3816
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3817
- Downloading kiwisolver (1.4MiB)
3818
  Downloading nvidia-curand-cu12 (60.7MiB)
3819
- Downloading nvidia-cublas-cu12 (566.8MiB)
3820
- Downloading nvidia-cusparse-cu12 (274.9MiB)
3821
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
3822
- Downloading nvidia-cudnn-cu12 (674.0MiB)
3823
- Downloading matplotlib (8.3MiB)
3824
- Downloading nvidia-nccl-cu12 (307.4MiB)
3825
  Downloading hf-xet (3.0MiB)
3826
- Downloading fonttools (4.7MiB)
3827
  Downloading pillow (6.3MiB)
3828
- Downloading triton (148.3MiB)
3829
- Downloading networkx (1.9MiB)
3830
  Downloading nvidia-cusolver-cu12 (255.1MiB)
3831
- Downloading setuptools (1.1MiB)
 
 
 
 
 
3832
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3833
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3834
  Downloading nvidia-cufile-cu12
3835
  Downloading kiwisolver
3836
  Downloading hf-xet
3837
  Downloading setuptools
3838
- Downloading fonttools
3839
  Downloading networkx
 
3840
  Downloading pillow
3841
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3842
  Downloading nvidia-cuda-cupti-cu12
3843
  Downloading matplotlib
3844
- Downloading numpy
3845
  Downloading sympy
 
3846
  Downloading nvidia-nvjitlink-cu12
3847
  Downloading nvidia-curand-cu12
3848
  Downloading nvidia-cuda-nvrtc-cu12
3849
  Downloading triton
3850
  Downloading nvidia-cufft-cu12
3851
  Downloading nvidia-cusolver-cu12
3852
- Downloading nvidia-cusparse-cu12
3853
  Downloading nvidia-cusparselt-cu12
 
3854
  Downloading nvidia-nccl-cu12
3855
  Downloading nvidia-cublas-cu12
3856
  Downloading nvidia-cudnn-cu12
3857
  Downloading torch
3858
- Installed 47 packages in 453ms
3859
  </div>
3860
  </div>
3861
  <div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]
3862
- Fetching 4 files: 25%|██▌ | 1/4 [00:00&lt;00:00, 5.61it/s]
3863
- Fetching 4 files: 50%|█████ | 2/4 [00:01&lt;00:01, 1.13it/s]
3864
- Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00, 2.56it/s]</div>
3865
  <div class="cell-artifacts">
3866
  <h4>Artifacts:</h4>
3867
  <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
 
3710
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3711
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3712
  </span> |
3713
+ Cell: benchmark | 41.76s
3714
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3715
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3716
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3807
  <div class="uv-logs-content" style="display: none;">
3808
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3809
  Downloading sympy (6.0MiB)
3810
+ Downloading networkx (1.9MiB)
3811
+ Downloading nvidia-cufile-cu12 (1.1MiB)
3812
+ Downloading matplotlib (8.3MiB)
3813
+ Downloading setuptools (1.1MiB)
3814
+ Downloading fonttools (4.7MiB)
3815
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
3816
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3817
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
3818
+ Downloading nvidia-cublas-cu12 (566.8MiB)
3819
  Downloading nvidia-cufft-cu12 (184.2MiB)
3820
+ Downloading numpy (16.2MiB)
 
 
 
3821
  Downloading nvidia-curand-cu12 (60.7MiB)
3822
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
 
 
 
 
 
3823
  Downloading hf-xet (3.0MiB)
 
3824
  Downloading pillow (6.3MiB)
 
 
3825
  Downloading nvidia-cusolver-cu12 (255.1MiB)
3826
+ Downloading kiwisolver (1.4MiB)
3827
+ Downloading torch (846.9MiB)
3828
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3829
+ Downloading nvidia-nccl-cu12 (307.4MiB)
3830
+ Downloading triton (148.3MiB)
3831
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
3832
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3833
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3834
  Downloading nvidia-cufile-cu12
3835
  Downloading kiwisolver
3836
  Downloading hf-xet
3837
  Downloading setuptools
 
3838
  Downloading networkx
3839
+ Downloading fonttools
3840
  Downloading pillow
3841
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3842
  Downloading nvidia-cuda-cupti-cu12
3843
  Downloading matplotlib
 
3844
  Downloading sympy
3845
+ Downloading numpy
3846
  Downloading nvidia-nvjitlink-cu12
3847
  Downloading nvidia-curand-cu12
3848
  Downloading nvidia-cuda-nvrtc-cu12
3849
  Downloading triton
3850
  Downloading nvidia-cufft-cu12
3851
  Downloading nvidia-cusolver-cu12
 
3852
  Downloading nvidia-cusparselt-cu12
3853
+ Downloading nvidia-cusparse-cu12
3854
  Downloading nvidia-nccl-cu12
3855
  Downloading nvidia-cublas-cu12
3856
  Downloading nvidia-cudnn-cu12
3857
  Downloading torch
3858
+ Installed 47 packages in 515ms
3859
  </div>
3860
  </div>
3861
  <div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]
3862
+ Fetching 4 files: 25%|██▌ | 1/4 [00:00&lt;00:00, 4.20it/s]
3863
+ Fetching 4 files: 50%|█████ | 2/4 [00:01&lt;00:01, 1.09it/s]
3864
+ Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00, 2.44it/s]</div>
3865
  <div class="cell-artifacts">
3866
  <h4>Artifacts:</h4>
3867
  <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
flash_attn/impls/mem_efficient_attention.html CHANGED
@@ -3710,7 +3710,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3710
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3711
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3712
  </span> |
3713
- Cell: benchmark | 35.60s
3714
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3715
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3716
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3794,44 +3794,44 @@ Cell: benchmark | 35.60s
3794
  <div class="cell-stdout">impl wl p50(ms) ok
3795
  torch_mem_eff flux_L128 0.59 True
3796
  torch_mem_eff flux_L256 0.65 True
3797
- torch_mem_eff flux_L320 0.77 True
3798
  torch_mem_eff flux_L384 0.79 True
3799
- torch_mem_eff flux_L448 0.84 True
3800
  torch_mem_eff flux_L512 0.95 True
3801
  </div>
3802
  <div class="uv-install-logs" id="uv-logs-benchmark">
3803
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3804
  <div class="uv-logs-content" style="display: none;">
3805
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3806
- Downloading matplotlib (8.3MiB)
3807
- Downloading pillow (6.3MiB)
3808
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
 
 
3809
  Downloading nvidia-cufft-cu12 (184.2MiB)
3810
  Downloading nvidia-cusparse-cu12 (274.9MiB)
3811
- Downloading numpy (16.2MiB)
3812
- Downloading setuptools (1.1MiB)
3813
- Downloading kiwisolver (1.4MiB)
3814
- Downloading nvidia-cusolver-cu12 (255.1MiB)
3815
- Downloading torch (846.9MiB)
3816
  Downloading nvidia-cudnn-cu12 (674.0MiB)
3817
- Downloading nvidia-cufile-cu12 (1.1MiB)
3818
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3819
  Downloading sympy (6.0MiB)
 
3820
  Downloading fonttools (4.7MiB)
3821
- Downloading networkx (1.9MiB)
3822
- Downloading nvidia-curand-cu12 (60.7MiB)
3823
  Downloading nvidia-cublas-cu12 (566.8MiB)
 
3824
  Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3825
- Downloading triton (148.3MiB)
 
 
 
3826
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
3827
  Downloading nvidia-nccl-cu12 (307.4MiB)
 
 
 
3828
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3829
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3830
  Downloading nvidia-cufile-cu12
3831
  Downloading kiwisolver
3832
  Downloading setuptools
3833
- Downloading networkx
3834
  Downloading fonttools
 
3835
  Downloading pillow
3836
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3837
  Downloading nvidia-cuda-cupti-cu12
@@ -3850,7 +3850,7 @@ Downloading nvidia-nccl-cu12 (307.4MiB)
3850
  Downloading nvidia-cublas-cu12
3851
  Downloading nvidia-cudnn-cu12
3852
  Downloading torch
3853
- Installed 37 packages in 453ms
3854
  </div>
3855
  </div>
3856
  <div class="cell-artifacts">
 
3710
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3711
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3712
  </span> |
3713
+ Cell: benchmark | 35.95s
3714
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3715
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3716
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3794
  <div class="cell-stdout">impl wl p50(ms) ok
3795
  torch_mem_eff flux_L128 0.59 True
3796
  torch_mem_eff flux_L256 0.65 True
3797
+ torch_mem_eff flux_L320 0.78 True
3798
  torch_mem_eff flux_L384 0.79 True
3799
+ torch_mem_eff flux_L448 0.85 True
3800
  torch_mem_eff flux_L512 0.95 True
3801
  </div>
3802
  <div class="uv-install-logs" id="uv-logs-benchmark">
3803
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3804
  <div class="uv-logs-content" style="display: none;">
3805
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3806
+ Downloading nvidia-curand-cu12 (60.7MiB)
 
3807
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3808
+ Downloading networkx (1.9MiB)
3809
+ Downloading kiwisolver (1.4MiB)
3810
  Downloading nvidia-cufft-cu12 (184.2MiB)
3811
  Downloading nvidia-cusparse-cu12 (274.9MiB)
 
 
 
 
 
3812
  Downloading nvidia-cudnn-cu12 (674.0MiB)
 
 
3813
  Downloading sympy (6.0MiB)
3814
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
3815
  Downloading fonttools (4.7MiB)
 
 
3816
  Downloading nvidia-cublas-cu12 (566.8MiB)
3817
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3818
  Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3819
+ Downloading setuptools (1.1MiB)
3820
+ Downloading matplotlib (8.3MiB)
3821
+ Downloading nvidia-cufile-cu12 (1.1MiB)
3822
+ Downloading numpy (16.2MiB)
3823
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
3824
  Downloading nvidia-nccl-cu12 (307.4MiB)
3825
+ Downloading torch (846.9MiB)
3826
+ Downloading triton (148.3MiB)
3827
+ Downloading pillow (6.3MiB)
3828
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3829
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3830
  Downloading nvidia-cufile-cu12
3831
  Downloading kiwisolver
3832
  Downloading setuptools
 
3833
  Downloading fonttools
3834
+ Downloading networkx
3835
  Downloading pillow
3836
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3837
  Downloading nvidia-cuda-cupti-cu12
 
3850
  Downloading nvidia-cublas-cu12
3851
  Downloading nvidia-cudnn-cu12
3852
  Downloading torch
3853
+ Installed 37 packages in 556ms
3854
  </div>
3855
  </div>
3856
  <div class="cell-artifacts">
flash_attn/impls/sage_attention.html CHANGED
@@ -3710,7 +3710,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3710
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3711
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3712
  </span> |
3713
- Cell: benchmark | 40.11s
3714
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3715
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3716
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3804,53 +3804,53 @@ Cell: benchmark | 40.11s
3804
  <div id="output-benchmark" class="cell-output">
3805
  <div class="cell-stdout">impl wl p50(ms) ok
3806
  sage_int8_fp16 flux_L128 FAIL False
3807
- Error: module &#x27;sage_attention_a39c012a73160148&#x27; has no attribute &#x27;fwd&#x27;
3808
  sage_int8_fp16 flux_L256 FAIL False
3809
- Error: module &#x27;sage_attention_a39c012a73160148&#x27; has no attribute &#x27;fwd&#x27;
3810
  sage_int8_fp16 flux_L320 FAIL False
3811
- Error: module &#x27;sage_attention_a39c012a73160148&#x27; has no attribute &#x27;fwd&#x27;
3812
  sage_int8_fp16 flux_L384 FAIL False
3813
- Error: module &#x27;sage_attention_a39c012a73160148&#x27; has no attribute &#x27;fwd&#x27;
3814
  sage_int8_fp16 flux_L448 FAIL False
3815
- Error: module &#x27;sage_attention_a39c012a73160148&#x27; has no attribute &#x27;fwd&#x27;
3816
  sage_int8_fp16 flux_L512 FAIL False
3817
- Error: module &#x27;sage_attention_a39c012a73160148&#x27; has no attribute &#x27;fwd&#x27;
3818
  </div>
3819
  <div class="uv-install-logs" id="uv-logs-benchmark">
3820
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3821
  <div class="uv-logs-content" style="display: none;">
3822
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3823
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3824
- Downloading nvidia-cudnn-cu12 (674.0MiB)
 
3825
  Downloading setuptools (1.1MiB)
 
3826
  Downloading nvidia-cufile-cu12 (1.1MiB)
3827
- Downloading nvidia-cublas-cu12 (566.8MiB)
3828
- Downloading triton (148.3MiB)
3829
  Downloading nvidia-curand-cu12 (60.7MiB)
 
 
3830
  Downloading torch (846.9MiB)
3831
- Downloading nvidia-cusparse-cu12 (274.9MiB)
3832
  Downloading nvidia-nccl-cu12 (307.4MiB)
3833
- Downloading pillow (6.3MiB)
3834
  Downloading kiwisolver (1.4MiB)
3835
- Downloading hf-xet (3.0MiB)
3836
- Downloading nvidia-cufft-cu12 (184.2MiB)
3837
- Downloading networkx (1.9MiB)
3838
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3839
- Downloading nvidia-cusolver-cu12 (255.1MiB)
3840
  Downloading sympy (6.0MiB)
3841
- Downloading numpy (16.2MiB)
3842
- Downloading fonttools (4.7MiB)
3843
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3844
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
3845
  Downloading matplotlib (8.3MiB)
 
 
3846
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3847
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3848
  Downloading nvidia-cufile-cu12
3849
  Downloading kiwisolver
3850
  Downloading hf-xet
3851
  Downloading setuptools
3852
- Downloading fonttools
3853
  Downloading networkx
 
3854
  Downloading pillow
3855
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3856
  Downloading nvidia-cuda-cupti-cu12
@@ -3869,15 +3869,13 @@ Downloading matplotlib (8.3MiB)
3869
  Downloading nvidia-cublas-cu12
3870
  Downloading nvidia-cudnn-cu12
3871
  Downloading torch
3872
- Installed 48 packages in 513ms
3873
  </div>
3874
  </div>
3875
  <div class="cell-stderr">Fetching 11 files: 0%| | 0/11 [00:00&lt;?, ?it/s]
3876
- Fetching 11 files: 9%|▉ | 1/11 [00:00&lt;00:01, 5.70it/s]
3877
- Fetching 11 files: 18%|█▊ | 2/11 [00:00&lt;00:01, 6.67it/s]
3878
- Fetching 11 files: 27%|██▋ | 3/11 [00:00&lt;00:01, 6.46it/s]
3879
- Fetching 11 files: 64%|██████▎ | 7/11 [00:00&lt;00:00, 11.66it/s]
3880
- Fetching 11 files: 100%|██████████| 11/11 [00:00&lt;00:00, 15.59it/s]</div>
3881
  <div class="cell-artifacts">
3882
  <h4>Artifacts:</h4>
3883
  <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
 
3710
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3711
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3712
  </span> |
3713
+ Cell: benchmark | 40.43s
3714
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3715
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3716
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3804
  <div id="output-benchmark" class="cell-output">
3805
  <div class="cell-stdout">impl wl p50(ms) ok
3806
  sage_int8_fp16 flux_L128 FAIL False
3807
+ Error: module &#x27;sage_attention_46758c422d547a47&#x27; has no attribute &#x27;fwd&#x27;
3808
  sage_int8_fp16 flux_L256 FAIL False
3809
+ Error: module &#x27;sage_attention_46758c422d547a47&#x27; has no attribute &#x27;fwd&#x27;
3810
  sage_int8_fp16 flux_L320 FAIL False
3811
+ Error: module &#x27;sage_attention_46758c422d547a47&#x27; has no attribute &#x27;fwd&#x27;
3812
  sage_int8_fp16 flux_L384 FAIL False
3813
+ Error: module &#x27;sage_attention_46758c422d547a47&#x27; has no attribute &#x27;fwd&#x27;
3814
  sage_int8_fp16 flux_L448 FAIL False
3815
+ Error: module &#x27;sage_attention_46758c422d547a47&#x27; has no attribute &#x27;fwd&#x27;
3816
  sage_int8_fp16 flux_L512 FAIL False
3817
+ Error: module &#x27;sage_attention_46758c422d547a47&#x27; has no attribute &#x27;fwd&#x27;
3818
  </div>
3819
  <div class="uv-install-logs" id="uv-logs-benchmark">
3820
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3821
  <div class="uv-logs-content" style="display: none;">
3822
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3823
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
3824
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3825
+ Downloading networkx (1.9MiB)
3826
  Downloading setuptools (1.1MiB)
3827
+ Downloading numpy (16.2MiB)
3828
  Downloading nvidia-cufile-cu12 (1.1MiB)
3829
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
 
3830
  Downloading nvidia-curand-cu12 (60.7MiB)
3831
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
3832
+ Downloading hf-xet (3.0MiB)
3833
  Downloading torch (846.9MiB)
3834
+ Downloading triton (148.3MiB)
3835
  Downloading nvidia-nccl-cu12 (307.4MiB)
3836
+ Downloading nvidia-cublas-cu12 (566.8MiB)
3837
  Downloading kiwisolver (1.4MiB)
3838
+ Downloading pillow (6.3MiB)
 
 
 
 
3839
  Downloading sympy (6.0MiB)
3840
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3841
+ Downloading nvidia-cufft-cu12 (184.2MiB)
3842
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
 
3843
  Downloading matplotlib (8.3MiB)
3844
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3845
+ Downloading fonttools (4.7MiB)
3846
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3847
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3848
  Downloading nvidia-cufile-cu12
3849
  Downloading kiwisolver
3850
  Downloading hf-xet
3851
  Downloading setuptools
 
3852
  Downloading networkx
3853
+ Downloading fonttools
3854
  Downloading pillow
3855
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3856
  Downloading nvidia-cuda-cupti-cu12
 
3869
  Downloading nvidia-cublas-cu12
3870
  Downloading nvidia-cudnn-cu12
3871
  Downloading torch
3872
+ Installed 48 packages in 525ms
3873
  </div>
3874
  </div>
3875
  <div class="cell-stderr">Fetching 11 files: 0%| | 0/11 [00:00&lt;?, ?it/s]
3876
+ Fetching 11 files: 9%|▉ | 1/11 [00:00&lt;00:01, 5.55it/s]
3877
+ Fetching 11 files: 73%|███████▎ | 8/11 [00:00&lt;00:00, 12.93it/s]
3878
+ Fetching 11 files: 100%|██████████| 11/11 [00:00&lt;00:00, 16.93it/s]</div>
 
 
3879
  <div class="cell-artifacts">
3880
  <h4>Artifacts:</h4>
3881
  <a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
flash_attn/impls/xformers.html CHANGED
@@ -3710,7 +3710,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3710
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3711
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3712
  </span> |
3713
- Cell: benchmark | 40.76s
3714
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3715
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3716
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3797,48 +3797,48 @@ xformers_meff flux_L256 0.47 True
3797
  xformers_meff flux_L320 0.60 True
3798
  xformers_meff flux_L384 0.60 True
3799
  xformers_meff flux_L448 0.64 True
3800
- xformers_meff flux_L512 0.65 True
3801
  </div>
3802
  <div class="uv-install-logs" id="uv-logs-benchmark">
3803
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3804
  <div class="uv-logs-content" style="display: none;">
3805
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3806
- Downloading numpy (16.2MiB)
3807
- Downloading xformers (111.8MiB)
3808
- Downloading nvidia-cusolver-cu12 (255.1MiB)
 
 
 
 
 
3809
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3810
- Downloading setuptools (1.1MiB)
3811
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
 
3812
  Downloading nvidia-cublas-cu12 (566.8MiB)
3813
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3814
- Downloading pillow (6.3MiB)
3815
- Downloading nvidia-curand-cu12 (60.7MiB)
3816
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
3817
  Downloading sympy (6.0MiB)
3818
- Downloading nvidia-cudnn-cu12 (674.0MiB)
 
 
 
3819
  Downloading nvidia-cufile-cu12 (1.1MiB)
3820
- Downloading nvidia-cufft-cu12 (184.2MiB)
3821
- Downloading nvidia-cusparse-cu12 (274.9MiB)
3822
- Downloading networkx (1.9MiB)
3823
- Downloading nvidia-nccl-cu12 (307.4MiB)
3824
  Downloading fonttools (4.7MiB)
3825
- Downloading triton (148.3MiB)
3826
  Downloading torch (846.9MiB)
3827
- Downloading kiwisolver (1.4MiB)
3828
- Downloading matplotlib (8.3MiB)
3829
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3830
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3831
  Downloading nvidia-cufile-cu12
3832
  Downloading kiwisolver
3833
  Downloading setuptools
3834
- Downloading networkx
3835
  Downloading fonttools
 
3836
  Downloading pillow
3837
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3838
  Downloading nvidia-cuda-cupti-cu12
3839
  Downloading matplotlib
3840
- Downloading sympy
3841
  Downloading numpy
 
3842
  Downloading nvidia-nvjitlink-cu12
3843
  Downloading nvidia-curand-cu12
3844
  Downloading nvidia-cuda-nvrtc-cu12
@@ -3849,10 +3849,10 @@ Downloading matplotlib (8.3MiB)
3849
  Downloading nvidia-cusparse-cu12
3850
  Downloading nvidia-cusparselt-cu12
3851
  Downloading nvidia-nccl-cu12
3852
- Downloading nvidia-cudnn-cu12
3853
  Downloading nvidia-cublas-cu12
 
3854
  Downloading torch
3855
- Installed 38 packages in 522ms
3856
  </div>
3857
  </div>
3858
  <div class="cell-artifacts">
 
3710
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3711
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3712
  </span> |
3713
+ Cell: benchmark | 40.64s
3714
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3715
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3716
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3797
  xformers_meff flux_L320 0.60 True
3798
  xformers_meff flux_L384 0.60 True
3799
  xformers_meff flux_L448 0.64 True
3800
+ xformers_meff flux_L512 0.64 True
3801
  </div>
3802
  <div class="uv-install-logs" id="uv-logs-benchmark">
3803
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3804
  <div class="uv-logs-content" style="display: none;">
3805
  Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
3806
+ Downloading networkx (1.9MiB)
3807
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
3808
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
3809
+ Downloading nvidia-cufft-cu12 (184.2MiB)
3810
+ Downloading nvidia-curand-cu12 (60.7MiB)
3811
+ Downloading triton (148.3MiB)
3812
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3813
+ Downloading pillow (6.3MiB)
3814
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
 
3815
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3816
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
3817
  Downloading nvidia-cublas-cu12 (566.8MiB)
3818
+ Downloading numpy (16.2MiB)
3819
+ Downloading nvidia-nccl-cu12 (307.4MiB)
 
 
3820
  Downloading sympy (6.0MiB)
3821
+ Downloading matplotlib (8.3MiB)
3822
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
3823
+ Downloading xformers (111.8MiB)
3824
+ Downloading setuptools (1.1MiB)
3825
  Downloading nvidia-cufile-cu12 (1.1MiB)
3826
+ Downloading kiwisolver (1.4MiB)
 
 
 
3827
  Downloading fonttools (4.7MiB)
 
3828
  Downloading torch (846.9MiB)
 
 
3829
  Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
3830
  Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3831
  Downloading nvidia-cufile-cu12
3832
  Downloading kiwisolver
3833
  Downloading setuptools
 
3834
  Downloading fonttools
3835
+ Downloading networkx
3836
  Downloading pillow
3837
  Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
3838
  Downloading nvidia-cuda-cupti-cu12
3839
  Downloading matplotlib
 
3840
  Downloading numpy
3841
+ Downloading sympy
3842
  Downloading nvidia-nvjitlink-cu12
3843
  Downloading nvidia-curand-cu12
3844
  Downloading nvidia-cuda-nvrtc-cu12
 
3849
  Downloading nvidia-cusparse-cu12
3850
  Downloading nvidia-cusparselt-cu12
3851
  Downloading nvidia-nccl-cu12
 
3852
  Downloading nvidia-cublas-cu12
3853
+ Downloading nvidia-cudnn-cu12
3854
  Downloading torch
3855
+ Installed 38 packages in 562ms
3856
  </div>
3857
  </div>
3858
  <div class="cell-artifacts">
flash_attn/results/artifacts/combine/latency.png ADDED

Git LFS Details

  • SHA256: 87dbea8f2773d7fcee9fd191cb6e67cd1e2ddd379cef90ee01bb4ac40a55b5f1
  • Pointer size: 131 Bytes
  • Size of remote file: 110 kB
flash_attn/results/cells/combine.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "numpy",
#     "torch",
#     "kernels-benchmark-tools",
#     "matplotlib",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
# ///
"""Combine per-implementation attention benchmark results into one report.

Each upstream benchmark is expected to expose the directory holding its JSONL
results through a ``UVNOTE_FILE_*`` environment variable. This script resolves
those directories, keeps the result files that exist and are non-empty, and
passes them to ``kernels_benchmark_tools`` for a combined summary and a
visualization. It exits with status 1 when no usable result file is found.
"""
# NOTE(review): torch is not referenced below; kept because it may be needed
# for its import side effects by kernels_benchmark_tools -- confirm.
import torch
import sys
import os
import kernels_benchmark_tools as kbt
from pathlib import Path

# Display label -> artifact directory of the upstream benchmark, read from the
# environment. The value is None when the variable is not set.
cache_dirs = {
    "Flash (PyTorch SDPA)": os.environ.get('UVNOTE_FILE_FLASH_ATTENTION_BENCHMARK'),
    "MemEff (PyTorch SDPA)": os.environ.get('UVNOTE_FILE_MEM_EFFICIENT_ATTENTION_BENCHMARK'),
    "Flash Attn 2": os.environ.get('UVNOTE_FILE_FLASH_ATTN2_BENCHMARK'),
    "xFormers": os.environ.get('UVNOTE_FILE_XFORMERS_BENCHMARK'),
    "SageAttention": os.environ.get('UVNOTE_FILE_SAGE_ATTENTION_BENCHMARK'),
    "Compiled (default)": os.environ.get('UVNOTE_FILE_COMPILED_VARIANTS_BENCHMARK_DEFAULT'),
    "Compiled (max-autotune)": os.environ.get('UVNOTE_FILE_COMPILED_VARIANTS_BENCHMARK_MAX_AUTOTUNE'),
    "HF Kernels Flash Attn": os.environ.get('UVNOTE_FILE_HF_KERNELS_FLASH_ATTN_BENCHMARK'),
    "HF Kernels Flash Attn3": os.environ.get('UVNOTE_FILE_HF_KERNELS_FLASH_ATTN3_BENCHMARK'),
}

print("LOADING BENCHMARK DATA")
for name, cache_dir in cache_dirs.items():
    print(f"{name:30s}: {cache_dir}")
print()

# Display label -> name of the JSONL file expected inside that benchmark's
# artifact directory. Must have the same keys as cache_dirs.
file_mapping = {
    "Flash (PyTorch SDPA)": "attn.jsonl",
    "MemEff (PyTorch SDPA)": "attn.jsonl",
    "Flash Attn 2": "attn.jsonl",
    "xFormers": "attn.jsonl",
    "SageAttention": "attn.jsonl",
    "Compiled (default)": "attn_default.jsonl",
    "Compiled (max-autotune)": "attn_max_autotune.jsonl",
    "HF Kernels Flash Attn": "attn.jsonl",
    "HF Kernels Flash Attn3": "attn.jsonl",
}

# Collect all JSONL paths. The accepted labels are recorded alongside the
# paths so the closing report lists exactly what was analyzed instead of
# probing the filesystem a second time.
all_paths = []
included_names = []
for name, cache_dir in cache_dirs.items():
    if not cache_dir:
        print(f"✗ No cache dir for {name}")
        continue

    path = Path(cache_dir) / file_mapping[name]
    if path.exists() and path.stat().st_size > 0:
        all_paths.append(str(path))
        included_names.append(name)
        print(f"✓ Found {name}: {path}")
    else:
        print(f"⊘ Empty/Missing {name}: {path}")

print()

if not all_paths:
    print("ERROR: No benchmark data files found!")
    sys.exit(1)

# Generate combined summary
print("COMBINED BENCHMARK SUMMARY")
print()

kbt.summarize(all_paths)

print()
print("GENERATING COMBINED VISUALIZATION")
print()

# Visualization is best-effort: a failure is reported but does not abort the
# run, so the summary above is still usable.
try:
    kbt.viz(all_paths)
    print("✓ Combined visualization saved as latency.png")
except ImportError as e:
    print(f"✗ Visualization requires matplotlib: {e}")
except Exception as e:
    print(f"✗ Visualization failed: {e}")

print()
print("ANALYSIS COMPLETE")
print(f"Total implementations analyzed: {len(all_paths)}")
print("\nImplementations included:")
for name in included_names:
    print(f" ✓ {name}")
flash_attn/results/combined_results.html ADDED
The diff for this file is too large to render. See raw diff