Upload folder using huggingface_hub
Browse files- flash_attn/impls/artifacts/benchmark/attn.jsonl +6 -6
- flash_attn/impls/artifacts/benchmark_default/attn_default.jsonl +6 -6
- flash_attn/impls/artifacts/benchmark_max_autotune/attn_max_autotune.jsonl +6 -6
- flash_attn/impls/compiled_variants.html +44 -44
- flash_attn/impls/flash_attention.html +21 -21
- flash_attn/impls/hf_kernels_flash_attn.html +27 -27
- flash_attn/impls/hf_kernels_flash_attn3.html +24 -24
- flash_attn/impls/mem_efficient_attention.html +17 -17
- flash_attn/impls/sage_attention.html +27 -29
- flash_attn/impls/xformers.html +22 -22
- flash_attn/results/artifacts/combine/latency.png +3 -0
- flash_attn/results/cells/combine.py +96 -0
- flash_attn/results/combined_results.html +0 -0
flash_attn/impls/artifacts/benchmark/attn.jsonl
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
-
{"ts": "2025-10-02T15:
|
| 2 |
-
{"ts": "2025-10-02T15:
|
| 3 |
-
{"ts": "2025-10-02T15:
|
| 4 |
-
{"ts": "2025-10-02T15:
|
| 5 |
-
{"ts": "2025-10-02T15:
|
| 6 |
-
{"ts": "2025-10-02T15:
|
|
|
|
| 1 |
+
{"ts": "2025-10-02T15:53:41Z", "run": "110abee5a11144f086ff362569489d61", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.4424000084400177, "p50": 0.4480000138282776, "p90": 0.45020800828933716, "mean": 0.448172801733017, "reps": 5, "warmup": 2}, "compile_ms": 1.8151999711990356, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 2 |
+
{"ts": "2025-10-02T15:53:41Z", "run": "110abee5a11144f086ff362569489d61", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.46480000019073486, "p50": 0.4689280092716217, "p90": 0.47071999311447144, "mean": 0.46839680075645446, "reps": 5, "warmup": 2}, "compile_ms": 0.35923200845718384, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 3 |
+
{"ts": "2025-10-02T15:53:41Z", "run": "110abee5a11144f086ff362569489d61", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5970879793167114, "p50": 0.5986559987068176, "p90": 0.6020799875259399, "mean": 0.6001919984817505, "reps": 5, "warmup": 2}, "compile_ms": 0.48611199855804443, "peak_bytes": 99680256, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.905726432800293e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 4 |
+
{"ts": "2025-10-02T15:53:41Z", "run": "110abee5a11144f086ff362569489d61", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5994560122489929, "p50": 0.6028159856796265, "p90": 0.6028800010681152, "mean": 0.6018815994262695, "reps": 5, "warmup": 2}, "compile_ms": 0.49404799938201904, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8908252716064453e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 5 |
+
{"ts": "2025-10-02T15:53:41Z", "run": "110abee5a11144f086ff362569489d61", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6379839777946472, "p50": 0.6402559876441956, "p90": 0.6423360109329224, "mean": 0.6404095888137817, "reps": 5, "warmup": 2}, "compile_ms": 0.531391978263855, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003566741943359375, "mse": 2.86102294921875e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 6 |
+
{"ts": "2025-10-02T15:53:41Z", "run": "110abee5a11144f086ff362569489d61", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6431040167808533, "p50": 0.6442880034446716, "p90": 0.6445119976997375, "mean": 0.644704008102417, "reps": 5, "warmup": 2}, "compile_ms": 0.5358719825744629, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.8759241104125977e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
flash_attn/impls/artifacts/benchmark_default/attn_default.jsonl
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
-
{"ts": "2025-10-02T15:
|
| 2 |
-
{"ts": "2025-10-02T15:
|
| 3 |
-
{"ts": "2025-10-02T15:
|
| 4 |
-
{"ts": "2025-10-02T15:
|
| 5 |
-
{"ts": "2025-10-02T15:
|
| 6 |
-
{"ts": "2025-10-02T15:
|
|
|
|
| 1 |
+
{"ts": "2025-10-02T15:50:57Z", "run": "072dd2e8601f475db00e349e59df9f0c", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5143679976463318, "p50": 0.5232959985733032, "p90": 0.5257599949836731, "mean": 0.5211328029632568, "reps": 5, "warmup": 2}, "compile_ms": 3112.67236328125, "peak_bytes": 87425024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000339508056640625, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 2 |
+
{"ts": "2025-10-02T15:50:57Z", "run": "072dd2e8601f475db00e349e59df9f0c", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.5569279789924622, "p50": 0.558784008026123, "p90": 0.5599679946899414, "mean": 0.5588735938072205, "reps": 5, "warmup": 2}, "compile_ms": 272.2660217285156, "peak_bytes": 95027200, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 3 |
+
{"ts": "2025-10-02T15:50:58Z", "run": "072dd2e8601f475db00e349e59df9f0c", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.685375988483429, "p50": 0.6888960003852844, "p90": 0.6940159797668457, "mean": 0.6904960036277771, "reps": 5, "warmup": 2}, "compile_ms": 272.7831726074219, "peak_bytes": 99876864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 4 |
+
{"ts": "2025-10-02T15:50:58Z", "run": "072dd2e8601f475db00e349e59df9f0c", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7146559953689575, "p50": 0.7190399765968323, "p90": 0.7200639843940735, "mean": 0.7184319853782654, "reps": 5, "warmup": 2}, "compile_ms": 270.6763916015625, "peak_bytes": 104726528, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 5 |
+
{"ts": "2025-10-02T15:50:58Z", "run": "072dd2e8601f475db00e349e59df9f0c", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.739359974861145, "p50": 0.7402240037918091, "p90": 0.7426239848136902, "mean": 0.741484797000885, "reps": 5, "warmup": 2}, "compile_ms": 270.3490295410156, "peak_bytes": 108855296, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 6 |
+
{"ts": "2025-10-02T15:50:58Z", "run": "072dd2e8601f475db00e349e59df9f0c", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.7703679800033569, "p50": 0.7723519802093506, "p90": 0.7728000283241272, "mean": 0.7723968029022217, "reps": 5, "warmup": 2}, "compile_ms": 269.7756652832031, "peak_bytes": 114425856, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
flash_attn/impls/artifacts/benchmark_max_autotune/attn_max_autotune.jsonl
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
-
{"ts": "2025-10-02T15:
|
| 2 |
-
{"ts": "2025-10-02T15:
|
| 3 |
-
{"ts": "2025-10-02T15:
|
| 4 |
-
{"ts": "2025-10-02T15:
|
| 5 |
-
{"ts": "2025-10-02T15:
|
| 6 |
-
{"ts": "2025-10-02T15:
|
|
|
|
| 1 |
+
{"ts": "2025-10-02T15:50:03Z", "run": "9ea21ad802cc490893a0c45ca82ce166", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.63155198097229, "p50": 0.6451839804649353, "p90": 0.665727972984314, "mean": 0.6618239879608154, "reps": 5, "warmup": 2}, "compile_ms": 4977.1767578125, "peak_bytes": 70779904, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000339508056640625, "mse": 2.726912498474121e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 2 |
+
{"ts": "2025-10-02T15:50:05Z", "run": "9ea21ad802cc490893a0c45ca82ce166", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.6628159880638123, "p50": 0.6843519806861877, "p90": 0.7063680291175842, "mean": 0.7008576035499573, "reps": 5, "warmup": 2}, "compile_ms": 1701.4315185546875, "peak_bytes": 78644224, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003414154052734375, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 3 |
+
{"ts": "2025-10-02T15:50:07Z", "run": "9ea21ad802cc490893a0c45ca82ce166", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8050559759140015, "p50": 0.8155199885368347, "p90": 0.8389120101928711, "mean": 0.833843195438385, "reps": 5, "warmup": 2}, "compile_ms": 1701.230712890625, "peak_bytes": 84280320, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 4 |
+
{"ts": "2025-10-02T15:50:09Z", "run": "9ea21ad802cc490893a0c45ca82ce166", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8313599824905396, "p50": 0.849407970905304, "p90": 0.8810880184173584, "mean": 0.8694527983665467, "reps": 5, "warmup": 2}, "compile_ms": 2027.875, "peak_bytes": 86508544, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 5 |
+
{"ts": "2025-10-02T15:50:11Z", "run": "9ea21ad802cc490893a0c45ca82ce166", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.8768960237503052, "p50": 0.8824639916419983, "p90": 0.9011520147323608, "mean": 0.9017536044120789, "reps": 5, "warmup": 2}, "compile_ms": 2269.297607421875, "peak_bytes": 90440704, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00034332275390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
| 6 |
+
{"ts": "2025-10-02T15:50:13Z", "run": "9ea21ad802cc490893a0c45ca82ce166", "impl": "torch_flash_compiled_max_autotune", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA A10G", "sm": "8.6", "py": "3.11.13", "plat": "Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36"}, "lat_ms": {"p10": 0.9179520010948181, "p50": 0.9188479781150818, "p90": 0.9378560185432434, "mean": 0.9400512099266052, "reps": 5, "warmup": 2}, "compile_ms": 1835.313720703125, "peak_bytes": 94372864, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.771615982055664e-06, "ref": "sdpa_math_fp32"}, "err": null}
|
flash_attn/impls/compiled_variants.html
CHANGED
|
@@ -3711,7 +3711,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3711 |
<span onclick="toggleOutput('benchmark_default')" style="cursor: pointer;">▼ output</span>
|
| 3712 |
<span id="uv-indicator-benchmark_default" onclick="toggleUvLogsFromHeader('benchmark_default')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3713 |
</span> |
|
| 3714 |
-
Cell: benchmark_default |
|
| 3715 |
| <button class="run-btn" onclick="runCell('benchmark_default')">▶ run</button>
|
| 3716 |
<button class="copy-btn" onclick="copyCell('benchmark_default')">Copy</button>
|
| 3717 |
<a href="cells/benchmark_default.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3797,9 +3797,9 @@ Cell: benchmark_default | 46.71s
|
|
| 3797 |
<div class="cell-stdout">impl wl p50(ms) ok
|
| 3798 |
torch_flash_compiled_default flux_L128 0.52 True
|
| 3799 |
torch_flash_compiled_default flux_L256 0.56 True
|
| 3800 |
-
torch_flash_compiled_default flux_L320 0.
|
| 3801 |
torch_flash_compiled_default flux_L384 0.72 True
|
| 3802 |
-
torch_flash_compiled_default flux_L448 0.
|
| 3803 |
torch_flash_compiled_default flux_L512 0.77 True
|
| 3804 |
</div>
|
| 3805 |
<div class="uv-install-logs" id="uv-logs-benchmark_default">
|
|
@@ -3807,34 +3807,34 @@ torch_flash_compiled_default flux_L512 0.77 True
|
|
| 3807 |
<div class="uv-logs-content" style="display: none;">
|
| 3808 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3809 |
Downloading triton (148.3MiB)
|
| 3810 |
-
Downloading
|
| 3811 |
-
Downloading
|
| 3812 |
-
Downloading
|
|
|
|
| 3813 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3814 |
-
Downloading nvidia-
|
| 3815 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3816 |
-
Downloading fonttools (4.7MiB)
|
| 3817 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3818 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3819 |
-
Downloading pillow (6.3MiB)
|
| 3820 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3821 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3822 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3823 |
Downloading setuptools (1.1MiB)
|
| 3824 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3825 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3826 |
Downloading matplotlib (8.3MiB)
|
| 3827 |
-
Downloading
|
|
|
|
| 3828 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3829 |
-
Downloading
|
| 3830 |
-
Downloading
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3831 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3832 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3833 |
Downloading nvidia-cufile-cu12
|
| 3834 |
Downloading kiwisolver
|
| 3835 |
Downloading setuptools
|
| 3836 |
-
Downloading fonttools
|
| 3837 |
Downloading networkx
|
|
|
|
| 3838 |
Downloading pillow
|
| 3839 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3840 |
Downloading nvidia-cuda-cupti-cu12
|
|
@@ -3847,13 +3847,13 @@ Downloading kiwisolver (1.4MiB)
|
|
| 3847 |
Downloading triton
|
| 3848 |
Downloading nvidia-cufft-cu12
|
| 3849 |
Downloading nvidia-cusolver-cu12
|
| 3850 |
-
Downloading nvidia-cusparse-cu12
|
| 3851 |
Downloading nvidia-cusparselt-cu12
|
|
|
|
| 3852 |
Downloading nvidia-nccl-cu12
|
| 3853 |
Downloading nvidia-cublas-cu12
|
| 3854 |
Downloading nvidia-cudnn-cu12
|
| 3855 |
Downloading torch
|
| 3856 |
-
Installed 37 packages in
|
| 3857 |
</div>
|
| 3858 |
</div>
|
| 3859 |
<div class="cell-artifacts">
|
|
@@ -3871,7 +3871,7 @@ Installed 37 packages in 545ms
|
|
| 3871 |
<span onclick="toggleOutput('benchmark_max_autotune')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
<span id="uv-indicator-benchmark_max_autotune" onclick="toggleUvLogsFromHeader('benchmark_max_autotune')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3873 |
</span> |
|
| 3874 |
-
Cell: benchmark_max_autotune |
|
| 3875 |
| <button class="run-btn" onclick="runCell('benchmark_max_autotune')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('benchmark_max_autotune')">Copy</button>
|
| 3877 |
<a href="cells/benchmark_max_autotune.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3955,52 +3955,52 @@ Cell: benchmark_max_autotune | 53.95s
|
|
| 3955 |
</div>
|
| 3956 |
<div id="output-benchmark_max_autotune" class="cell-output">
|
| 3957 |
<div class="cell-stdout">impl wl p50(ms) ok
|
| 3958 |
-
torch_flash_compiled_max_autotune flux_L128 0.
|
| 3959 |
torch_flash_compiled_max_autotune flux_L256 0.68 True
|
| 3960 |
-
torch_flash_compiled_max_autotune flux_L320 0.
|
| 3961 |
torch_flash_compiled_max_autotune flux_L384 0.85 True
|
| 3962 |
-
torch_flash_compiled_max_autotune flux_L448 0.
|
| 3963 |
torch_flash_compiled_max_autotune flux_L512 0.92 True
|
| 3964 |
</div>
|
| 3965 |
<div class="uv-install-logs" id="uv-logs-benchmark_max_autotune">
|
| 3966 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3967 |
<div class="uv-logs-content" style="display: none;">
|
| 3968 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3969 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3970 |
-
Downloading numpy (16.2MiB)
|
| 3971 |
Downloading matplotlib (8.3MiB)
|
| 3972 |
-
Downloading kiwisolver (1.4MiB)
|
| 3973 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3974 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3975 |
Downloading setuptools (1.1MiB)
|
| 3976 |
-
Downloading
|
| 3977 |
-
Downloading
|
|
|
|
| 3978 |
Downloading fonttools (4.7MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3979 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3980 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3981 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3982 |
Downloading networkx (1.9MiB)
|
| 3983 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3984 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3985 |
-
Downloading triton (148.3MiB)
|
| 3986 |
Downloading torch (846.9MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3987 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3988 |
-
Downloading nvidia-
|
| 3989 |
-
Downloading
|
| 3990 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3991 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3992 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3993 |
Downloading nvidia-cufile-cu12
|
| 3994 |
Downloading kiwisolver
|
| 3995 |
Downloading setuptools
|
| 3996 |
-
Downloading networkx
|
| 3997 |
Downloading fonttools
|
|
|
|
| 3998 |
Downloading pillow
|
| 3999 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 4000 |
-
Downloading nvidia-cuda-cupti-cu12
|
| 4001 |
Downloading matplotlib
|
| 4002 |
-
Downloading
|
| 4003 |
Downloading sympy
|
|
|
|
| 4004 |
Downloading nvidia-nvjitlink-cu12
|
| 4005 |
Downloading nvidia-curand-cu12
|
| 4006 |
Downloading nvidia-cuda-nvrtc-cu12
|
|
@@ -4013,7 +4013,7 @@ Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
|
| 4013 |
Downloading nvidia-cublas-cu12
|
| 4014 |
Downloading nvidia-cudnn-cu12
|
| 4015 |
Downloading torch
|
| 4016 |
-
Installed 37 packages in
|
| 4017 |
</div>
|
| 4018 |
</div>
|
| 4019 |
<div class="cell-artifacts">
|
|
|
|
| 3711 |
<span onclick="toggleOutput('benchmark_default')" style="cursor: pointer;">▼ output</span>
|
| 3712 |
<span id="uv-indicator-benchmark_default" onclick="toggleUvLogsFromHeader('benchmark_default')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3713 |
</span> |
|
| 3714 |
+
Cell: benchmark_default | 44.25s
|
| 3715 |
| <button class="run-btn" onclick="runCell('benchmark_default')">▶ run</button>
|
| 3716 |
<button class="copy-btn" onclick="copyCell('benchmark_default')">Copy</button>
|
| 3717 |
<a href="cells/benchmark_default.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3797 |
<div class="cell-stdout">impl wl p50(ms) ok
|
| 3798 |
torch_flash_compiled_default flux_L128 0.52 True
|
| 3799 |
torch_flash_compiled_default flux_L256 0.56 True
|
| 3800 |
+
torch_flash_compiled_default flux_L320 0.69 True
|
| 3801 |
torch_flash_compiled_default flux_L384 0.72 True
|
| 3802 |
+
torch_flash_compiled_default flux_L448 0.74 True
|
| 3803 |
torch_flash_compiled_default flux_L512 0.77 True
|
| 3804 |
</div>
|
| 3805 |
<div class="uv-install-logs" id="uv-logs-benchmark_default">
|
|
|
|
| 3807 |
<div class="uv-logs-content" style="display: none;">
|
| 3808 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3809 |
Downloading triton (148.3MiB)
|
| 3810 |
+
Downloading torch (846.9MiB)
|
| 3811 |
+
Downloading kiwisolver (1.4MiB)
|
| 3812 |
+
Downloading fonttools (4.7MiB)
|
| 3813 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3814 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3815 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3816 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
|
|
|
|
|
|
| 3817 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3818 |
Downloading setuptools (1.1MiB)
|
|
|
|
|
|
|
| 3819 |
Downloading matplotlib (8.3MiB)
|
| 3820 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3821 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3822 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3823 |
+
Downloading numpy (16.2MiB)
|
| 3824 |
+
Downloading sympy (6.0MiB)
|
| 3825 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3826 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3827 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3828 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3829 |
+
Downloading networkx (1.9MiB)
|
| 3830 |
+
Downloading pillow (6.3MiB)
|
| 3831 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3832 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3833 |
Downloading nvidia-cufile-cu12
|
| 3834 |
Downloading kiwisolver
|
| 3835 |
Downloading setuptools
|
|
|
|
| 3836 |
Downloading networkx
|
| 3837 |
+
Downloading fonttools
|
| 3838 |
Downloading pillow
|
| 3839 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3840 |
Downloading nvidia-cuda-cupti-cu12
|
|
|
|
| 3847 |
Downloading triton
|
| 3848 |
Downloading nvidia-cufft-cu12
|
| 3849 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 3850 |
Downloading nvidia-cusparselt-cu12
|
| 3851 |
+
Downloading nvidia-cusparse-cu12
|
| 3852 |
Downloading nvidia-nccl-cu12
|
| 3853 |
Downloading nvidia-cublas-cu12
|
| 3854 |
Downloading nvidia-cudnn-cu12
|
| 3855 |
Downloading torch
|
| 3856 |
+
Installed 37 packages in 516ms
|
| 3857 |
</div>
|
| 3858 |
</div>
|
| 3859 |
<div class="cell-artifacts">
|
|
|
|
| 3871 |
<span onclick="toggleOutput('benchmark_max_autotune')" style="cursor: pointer;">▼ output</span>
|
| 3872 |
<span id="uv-indicator-benchmark_max_autotune" onclick="toggleUvLogsFromHeader('benchmark_max_autotune')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3873 |
</span> |
|
| 3874 |
+
Cell: benchmark_max_autotune | 56.94s
|
| 3875 |
| <button class="run-btn" onclick="runCell('benchmark_max_autotune')">▶ run</button>
|
| 3876 |
<button class="copy-btn" onclick="copyCell('benchmark_max_autotune')">Copy</button>
|
| 3877 |
<a href="cells/benchmark_max_autotune.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3955 |
</div>
|
| 3956 |
<div id="output-benchmark_max_autotune" class="cell-output">
|
| 3957 |
<div class="cell-stdout">impl wl p50(ms) ok
|
| 3958 |
+
torch_flash_compiled_max_autotune flux_L128 0.65 True
|
| 3959 |
torch_flash_compiled_max_autotune flux_L256 0.68 True
|
| 3960 |
+
torch_flash_compiled_max_autotune flux_L320 0.82 True
|
| 3961 |
torch_flash_compiled_max_autotune flux_L384 0.85 True
|
| 3962 |
+
torch_flash_compiled_max_autotune flux_L448 0.88 True
|
| 3963 |
torch_flash_compiled_max_autotune flux_L512 0.92 True
|
| 3964 |
</div>
|
| 3965 |
<div class="uv-install-logs" id="uv-logs-benchmark_max_autotune">
|
| 3966 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3967 |
<div class="uv-logs-content" style="display: none;">
|
| 3968 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
|
|
|
|
|
|
| 3969 |
Downloading matplotlib (8.3MiB)
|
|
|
|
|
|
|
|
|
|
| 3970 |
Downloading setuptools (1.1MiB)
|
| 3971 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3972 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3973 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3974 |
Downloading fonttools (4.7MiB)
|
| 3975 |
+
Downloading numpy (16.2MiB)
|
| 3976 |
+
Downloading pillow (6.3MiB)
|
| 3977 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3978 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3979 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3980 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3981 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
|
|
|
|
|
|
| 3982 |
Downloading networkx (1.9MiB)
|
|
|
|
|
|
|
|
|
|
| 3983 |
Downloading torch (846.9MiB)
|
| 3984 |
+
Downloading triton (148.3MiB)
|
| 3985 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3986 |
+
Downloading kiwisolver (1.4MiB)
|
| 3987 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3988 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3989 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3990 |
+
Downloading sympy (6.0MiB)
|
|
|
|
| 3991 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3992 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3993 |
Downloading nvidia-cufile-cu12
|
| 3994 |
Downloading kiwisolver
|
| 3995 |
Downloading setuptools
|
|
|
|
| 3996 |
Downloading fonttools
|
| 3997 |
+
Downloading networkx
|
| 3998 |
Downloading pillow
|
| 3999 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
|
|
|
| 4000 |
Downloading matplotlib
|
| 4001 |
+
Downloading nvidia-cuda-cupti-cu12
|
| 4002 |
Downloading sympy
|
| 4003 |
+
Downloading numpy
|
| 4004 |
Downloading nvidia-nvjitlink-cu12
|
| 4005 |
Downloading nvidia-curand-cu12
|
| 4006 |
Downloading nvidia-cuda-nvrtc-cu12
|
|
|
|
| 4013 |
Downloading nvidia-cublas-cu12
|
| 4014 |
Downloading nvidia-cudnn-cu12
|
| 4015 |
Downloading torch
|
| 4016 |
+
Installed 37 packages in 547ms
|
| 4017 |
</div>
|
| 4018 |
</div>
|
| 4019 |
<div class="cell-artifacts">
|
flash_attn/impls/flash_attention.html
CHANGED
|
@@ -3710,7 +3710,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3710 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3711 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3712 |
</span> |
|
| 3713 |
-
Cell: nv | 0.
|
| 3714 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3715 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3716 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3726,7 +3726,7 @@ Cell: nv | 0.68s
|
|
| 3726 |
</div>
|
| 3727 |
</div>
|
| 3728 |
<div id="output-nv" class="cell-output">
|
| 3729 |
-
<div class="cell-stdout">Thu Oct 2 15:
|
| 3730 |
+-----------------------------------------------------------------------------------------+
|
| 3731 |
| NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 |
|
| 3732 |
|-----------------------------------------+------------------------+----------------------+
|
|
@@ -3735,19 +3735,19 @@ Cell: nv | 0.68s
|
|
| 3735 |
| | | MIG M. |
|
| 3736 |
|=========================================+========================+======================|
|
| 3737 |
| 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 |
|
| 3738 |
-
| 0%
|
| 3739 |
| | | N/A |
|
| 3740 |
+-----------------------------------------+------------------------+----------------------+
|
| 3741 |
| 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 |
|
| 3742 |
-
| 0% 25C P8
|
| 3743 |
| | | N/A |
|
| 3744 |
+-----------------------------------------+------------------------+----------------------+
|
| 3745 |
| 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 |
|
| 3746 |
-
| 0%
|
| 3747 |
| | | N/A |
|
| 3748 |
+-----------------------------------------+------------------------+----------------------+
|
| 3749 |
| 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 |
|
| 3750 |
-
| 0% 25C P8
|
| 3751 |
| | | N/A |
|
| 3752 |
+-----------------------------------------+------------------------+----------------------+
|
| 3753 |
|
|
@@ -3771,7 +3771,7 @@ Cell: nv | 0.68s
|
|
| 3771 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3772 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3773 |
</span> |
|
| 3774 |
-
Cell: benchmark |
|
| 3775 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3776 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3777 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3862,28 +3862,28 @@ torch_flash_ma flux_L512 0.74 True
|
|
| 3862 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3863 |
<div class="uv-logs-content" style="display: none;">
|
| 3864 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3865 |
-
Downloading networkx (1.9MiB)
|
| 3866 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3867 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3868 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3869 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3870 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3871 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
|
|
|
| 3872 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3873 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3874 |
Downloading setuptools (1.1MiB)
|
| 3875 |
-
Downloading
|
|
|
|
|
|
|
|
|
|
| 3876 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
|
|
|
|
|
|
| 3877 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3878 |
-
Downloading
|
| 3879 |
Downloading kiwisolver (1.4MiB)
|
| 3880 |
-
Downloading numpy (16.2MiB)
|
| 3881 |
-
Downloading matplotlib (8.3MiB)
|
| 3882 |
-
Downloading fonttools (4.7MiB)
|
| 3883 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3884 |
Downloading torch (846.9MiB)
|
| 3885 |
Downloading triton (148.3MiB)
|
| 3886 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3887 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3888 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3889 |
Downloading nvidia-cufile-cu12
|
|
@@ -3909,7 +3909,7 @@ Downloading nvidia-curand-cu12 (60.7MiB)
|
|
| 3909 |
Downloading nvidia-cublas-cu12
|
| 3910 |
Downloading nvidia-cudnn-cu12
|
| 3911 |
Downloading torch
|
| 3912 |
-
Installed 37 packages in
|
| 3913 |
</div>
|
| 3914 |
</div>
|
| 3915 |
<div class="cell-artifacts">
|
|
|
|
| 3710 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3711 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3712 |
</span> |
|
| 3713 |
+
Cell: nv | 0.66s
|
| 3714 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3715 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3716 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3726 |
</div>
|
| 3727 |
</div>
|
| 3728 |
<div id="output-nv" class="cell-output">
|
| 3729 |
+
<div class="cell-stdout">Thu Oct 2 15:53:02 2025
|
| 3730 |
+-----------------------------------------------------------------------------------------+
|
| 3731 |
| NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 |
|
| 3732 |
|-----------------------------------------+------------------------+----------------------+
|
|
|
|
| 3735 |
| | | MIG M. |
|
| 3736 |
|=========================================+========================+======================|
|
| 3737 |
| 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 |
|
| 3738 |
+
| 0% 29C P0 87W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3739 |
| | | N/A |
|
| 3740 |
+-----------------------------------------+------------------------+----------------------+
|
| 3741 |
| 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 |
|
| 3742 |
+
| 0% 25C P8 24W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3743 |
| | | N/A |
|
| 3744 |
+-----------------------------------------+------------------------+----------------------+
|
| 3745 |
| 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 |
|
| 3746 |
+
| 0% 25C P8 23W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3747 |
| | | N/A |
|
| 3748 |
+-----------------------------------------+------------------------+----------------------+
|
| 3749 |
| 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 |
|
| 3750 |
+
| 0% 25C P8 23W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3751 |
| | | N/A |
|
| 3752 |
+-----------------------------------------+------------------------+----------------------+
|
| 3753 |
|
|
|
|
| 3771 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3772 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3773 |
</span> |
|
| 3774 |
+
Cell: benchmark | 37.94s
|
| 3775 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3776 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3777 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3862 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3863 |
<div class="uv-logs-content" style="display: none;">
|
| 3864 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3865 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3866 |
+
Downloading sympy (6.0MiB)
|
| 3867 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3868 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3869 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3870 |
+
Downloading networkx (1.9MiB)
|
| 3871 |
+
Downloading fonttools (4.7MiB)
|
| 3872 |
+
Downloading matplotlib (8.3MiB)
|
| 3873 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3874 |
Downloading setuptools (1.1MiB)
|
| 3875 |
+
Downloading pillow (6.3MiB)
|
| 3876 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3877 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3878 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3879 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3880 |
+
Downloading numpy (16.2MiB)
|
| 3881 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3882 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3883 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3884 |
Downloading kiwisolver (1.4MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3885 |
Downloading torch (846.9MiB)
|
| 3886 |
Downloading triton (148.3MiB)
|
|
|
|
| 3887 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3888 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3889 |
Downloading nvidia-cufile-cu12
|
|
|
|
| 3909 |
Downloading nvidia-cublas-cu12
|
| 3910 |
Downloading nvidia-cudnn-cu12
|
| 3911 |
Downloading torch
|
| 3912 |
+
Installed 37 packages in 567ms
|
| 3913 |
</div>
|
| 3914 |
</div>
|
| 3915 |
<div class="cell-artifacts">
|
flash_attn/impls/hf_kernels_flash_attn.html
CHANGED
|
@@ -3710,7 +3710,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3710 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3711 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3712 |
</span> |
|
| 3713 |
-
Cell: benchmark |
|
| 3714 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3715 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3716 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3796,48 +3796,48 @@ Cell: benchmark | 37.93s
|
|
| 3796 |
</div>
|
| 3797 |
<div id="output-benchmark" class="cell-output">
|
| 3798 |
<div class="cell-stdout">impl wl p50(ms) ok
|
| 3799 |
-
hf_kernels_flash_attn flux_L128 0.
|
| 3800 |
-
hf_kernels_flash_attn flux_L256 0.
|
| 3801 |
-
hf_kernels_flash_attn flux_L320 0.
|
| 3802 |
-
hf_kernels_flash_attn flux_L384 0.
|
| 3803 |
-
hf_kernels_flash_attn flux_L448 0.
|
| 3804 |
hf_kernels_flash_attn flux_L512 0.56 True
|
| 3805 |
</div>
|
| 3806 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 3807 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3808 |
<div class="uv-logs-content" style="display: none;">
|
| 3809 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3810 |
-
Downloading
|
| 3811 |
-
Downloading
|
| 3812 |
-
Downloading
|
| 3813 |
-
Downloading
|
| 3814 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3815 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3816 |
-
Downloading pillow (6.3MiB)
|
| 3817 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3818 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3819 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3820 |
Downloading triton (148.3MiB)
|
| 3821 |
-
Downloading
|
| 3822 |
-
Downloading
|
| 3823 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
|
|
|
| 3824 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3825 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3826 |
-
Downloading matplotlib (8.3MiB)
|
| 3827 |
-
Downloading torch (846.9MiB)
|
| 3828 |
-
Downloading hf-xet (3.0MiB)
|
| 3829 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3830 |
-
Downloading
|
| 3831 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3832 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
|
|
|
|
|
|
|
|
|
| 3833 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3834 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3835 |
Downloading nvidia-cufile-cu12
|
| 3836 |
Downloading kiwisolver
|
| 3837 |
Downloading hf-xet
|
| 3838 |
Downloading setuptools
|
| 3839 |
-
Downloading fonttools
|
| 3840 |
Downloading networkx
|
|
|
|
| 3841 |
Downloading pillow
|
| 3842 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3843 |
Downloading nvidia-cuda-cupti-cu12
|
|
@@ -3856,13 +3856,13 @@ Downloading nvidia-cublas-cu12 (566.8MiB)
|
|
| 3856 |
Downloading nvidia-cublas-cu12
|
| 3857 |
Downloading nvidia-cudnn-cu12
|
| 3858 |
Downloading torch
|
| 3859 |
-
Installed 47 packages in
|
| 3860 |
</div>
|
| 3861 |
</div>
|
| 3862 |
<div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
|
| 3863 |
-
Fetching 20 files: 5%|▌ | 1/20 [00:00<00:
|
| 3864 |
-
Fetching 20 files: 10%|█ | 2/20 [00:01<00:
|
| 3865 |
-
Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00,
|
| 3866 |
<div class="cell-artifacts">
|
| 3867 |
<h4>Artifacts:</h4>
|
| 3868 |
<a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
|
|
|
|
| 3710 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3711 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3712 |
</span> |
|
| 3713 |
+
Cell: benchmark | 38.08s
|
| 3714 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3715 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3716 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3796 |
</div>
|
| 3797 |
<div id="output-benchmark" class="cell-output">
|
| 3798 |
<div class="cell-stdout">impl wl p50(ms) ok
|
| 3799 |
+
hf_kernels_flash_attn flux_L128 0.34 True
|
| 3800 |
+
hf_kernels_flash_attn flux_L256 0.37 True
|
| 3801 |
+
hf_kernels_flash_attn flux_L320 0.49 True
|
| 3802 |
+
hf_kernels_flash_attn flux_L384 0.51 True
|
| 3803 |
+
hf_kernels_flash_attn flux_L448 0.53 True
|
| 3804 |
hf_kernels_flash_attn flux_L512 0.56 True
|
| 3805 |
</div>
|
| 3806 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 3807 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3808 |
<div class="uv-logs-content" style="display: none;">
|
| 3809 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3810 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3811 |
+
Downloading numpy (16.2MiB)
|
| 3812 |
+
Downloading setuptools (1.1MiB)
|
| 3813 |
+
Downloading hf-xet (3.0MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3814 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3815 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3816 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3817 |
+
Downloading networkx (1.9MiB)
|
| 3818 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3819 |
+
Downloading torch (846.9MiB)
|
| 3820 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3821 |
Downloading triton (148.3MiB)
|
| 3822 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3823 |
+
Downloading sympy (6.0MiB)
|
| 3824 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3825 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3826 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3827 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3828 |
+
Downloading kiwisolver (1.4MiB)
|
|
|
|
| 3829 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3830 |
+
Downloading pillow (6.3MiB)
|
| 3831 |
+
Downloading fonttools (4.7MiB)
|
| 3832 |
+
Downloading matplotlib (8.3MiB)
|
| 3833 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3834 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3835 |
Downloading nvidia-cufile-cu12
|
| 3836 |
Downloading kiwisolver
|
| 3837 |
Downloading hf-xet
|
| 3838 |
Downloading setuptools
|
|
|
|
| 3839 |
Downloading networkx
|
| 3840 |
+
Downloading fonttools
|
| 3841 |
Downloading pillow
|
| 3842 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3843 |
Downloading nvidia-cuda-cupti-cu12
|
|
|
|
| 3856 |
Downloading nvidia-cublas-cu12
|
| 3857 |
Downloading nvidia-cudnn-cu12
|
| 3858 |
Downloading torch
|
| 3859 |
+
Installed 47 packages in 519ms
|
| 3860 |
</div>
|
| 3861 |
</div>
|
| 3862 |
<div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
|
| 3863 |
+
Fetching 20 files: 5%|▌ | 1/20 [00:00<00:06, 2.87it/s]
|
| 3864 |
+
Fetching 20 files: 10%|█ | 2/20 [00:01<00:12, 1.49it/s]
|
| 3865 |
+
Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 16.01it/s]</div>
|
| 3866 |
<div class="cell-artifacts">
|
| 3867 |
<h4>Artifacts:</h4>
|
| 3868 |
<a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
|
flash_attn/impls/hf_kernels_flash_attn3.html
CHANGED
|
@@ -3710,7 +3710,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3710 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3711 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3712 |
</span> |
|
| 3713 |
-
Cell: benchmark |
|
| 3714 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3715 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3716 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3807,61 +3807,61 @@ hf_kernels_flash_attn3 flux_L512 0.57 True
|
|
| 3807 |
<div class="uv-logs-content" style="display: none;">
|
| 3808 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3809 |
Downloading sympy (6.0MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3810 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3811 |
-
Downloading nvidia-
|
| 3812 |
-
Downloading
|
| 3813 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3814 |
-
Downloading
|
| 3815 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3816 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3817 |
-
Downloading kiwisolver (1.4MiB)
|
| 3818 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3819 |
-
Downloading nvidia-
|
| 3820 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3821 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3822 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3823 |
-
Downloading matplotlib (8.3MiB)
|
| 3824 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3825 |
Downloading hf-xet (3.0MiB)
|
| 3826 |
-
Downloading fonttools (4.7MiB)
|
| 3827 |
Downloading pillow (6.3MiB)
|
| 3828 |
-
Downloading triton (148.3MiB)
|
| 3829 |
-
Downloading networkx (1.9MiB)
|
| 3830 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3831 |
-
Downloading
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3832 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3833 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3834 |
Downloading nvidia-cufile-cu12
|
| 3835 |
Downloading kiwisolver
|
| 3836 |
Downloading hf-xet
|
| 3837 |
Downloading setuptools
|
| 3838 |
-
Downloading fonttools
|
| 3839 |
Downloading networkx
|
|
|
|
| 3840 |
Downloading pillow
|
| 3841 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3842 |
Downloading nvidia-cuda-cupti-cu12
|
| 3843 |
Downloading matplotlib
|
| 3844 |
-
Downloading numpy
|
| 3845 |
Downloading sympy
|
|
|
|
| 3846 |
Downloading nvidia-nvjitlink-cu12
|
| 3847 |
Downloading nvidia-curand-cu12
|
| 3848 |
Downloading nvidia-cuda-nvrtc-cu12
|
| 3849 |
Downloading triton
|
| 3850 |
Downloading nvidia-cufft-cu12
|
| 3851 |
Downloading nvidia-cusolver-cu12
|
| 3852 |
-
Downloading nvidia-cusparse-cu12
|
| 3853 |
Downloading nvidia-cusparselt-cu12
|
|
|
|
| 3854 |
Downloading nvidia-nccl-cu12
|
| 3855 |
Downloading nvidia-cublas-cu12
|
| 3856 |
Downloading nvidia-cudnn-cu12
|
| 3857 |
Downloading torch
|
| 3858 |
-
Installed 47 packages in
|
| 3859 |
</div>
|
| 3860 |
</div>
|
| 3861 |
<div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
|
| 3862 |
-
Fetching 4 files: 25%|██▌ | 1/4 [00:00<00:00,
|
| 3863 |
-
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.
|
| 3864 |
-
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.
|
| 3865 |
<div class="cell-artifacts">
|
| 3866 |
<h4>Artifacts:</h4>
|
| 3867 |
<a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
|
|
|
|
| 3710 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3711 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3712 |
</span> |
|
| 3713 |
+
Cell: benchmark | 41.76s
|
| 3714 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3715 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3716 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3807 |
<div class="uv-logs-content" style="display: none;">
|
| 3808 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3809 |
Downloading sympy (6.0MiB)
|
| 3810 |
+
Downloading networkx (1.9MiB)
|
| 3811 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3812 |
+
Downloading matplotlib (8.3MiB)
|
| 3813 |
+
Downloading setuptools (1.1MiB)
|
| 3814 |
+
Downloading fonttools (4.7MiB)
|
| 3815 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3816 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3817 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3818 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3819 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3820 |
+
Downloading numpy (16.2MiB)
|
|
|
|
|
|
|
|
|
|
| 3821 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3822 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3823 |
Downloading hf-xet (3.0MiB)
|
|
|
|
| 3824 |
Downloading pillow (6.3MiB)
|
|
|
|
|
|
|
| 3825 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3826 |
+
Downloading kiwisolver (1.4MiB)
|
| 3827 |
+
Downloading torch (846.9MiB)
|
| 3828 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3829 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3830 |
+
Downloading triton (148.3MiB)
|
| 3831 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3832 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3833 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3834 |
Downloading nvidia-cufile-cu12
|
| 3835 |
Downloading kiwisolver
|
| 3836 |
Downloading hf-xet
|
| 3837 |
Downloading setuptools
|
|
|
|
| 3838 |
Downloading networkx
|
| 3839 |
+
Downloading fonttools
|
| 3840 |
Downloading pillow
|
| 3841 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3842 |
Downloading nvidia-cuda-cupti-cu12
|
| 3843 |
Downloading matplotlib
|
|
|
|
| 3844 |
Downloading sympy
|
| 3845 |
+
Downloading numpy
|
| 3846 |
Downloading nvidia-nvjitlink-cu12
|
| 3847 |
Downloading nvidia-curand-cu12
|
| 3848 |
Downloading nvidia-cuda-nvrtc-cu12
|
| 3849 |
Downloading triton
|
| 3850 |
Downloading nvidia-cufft-cu12
|
| 3851 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 3852 |
Downloading nvidia-cusparselt-cu12
|
| 3853 |
+
Downloading nvidia-cusparse-cu12
|
| 3854 |
Downloading nvidia-nccl-cu12
|
| 3855 |
Downloading nvidia-cublas-cu12
|
| 3856 |
Downloading nvidia-cudnn-cu12
|
| 3857 |
Downloading torch
|
| 3858 |
+
Installed 47 packages in 515ms
|
| 3859 |
</div>
|
| 3860 |
</div>
|
| 3861 |
<div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
|
| 3862 |
+
Fetching 4 files: 25%|██▌ | 1/4 [00:00<00:00, 4.20it/s]
|
| 3863 |
+
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.09it/s]
|
| 3864 |
+
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.44it/s]</div>
|
| 3865 |
<div class="cell-artifacts">
|
| 3866 |
<h4>Artifacts:</h4>
|
| 3867 |
<a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
|
flash_attn/impls/mem_efficient_attention.html
CHANGED
|
@@ -3710,7 +3710,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3710 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3711 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3712 |
</span> |
|
| 3713 |
-
Cell: benchmark | 35.
|
| 3714 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3715 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3716 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3794,44 +3794,44 @@ Cell: benchmark | 35.60s
|
|
| 3794 |
<div class="cell-stdout">impl wl p50(ms) ok
|
| 3795 |
torch_mem_eff flux_L128 0.59 True
|
| 3796 |
torch_mem_eff flux_L256 0.65 True
|
| 3797 |
-
torch_mem_eff flux_L320 0.
|
| 3798 |
torch_mem_eff flux_L384 0.79 True
|
| 3799 |
-
torch_mem_eff flux_L448 0.
|
| 3800 |
torch_mem_eff flux_L512 0.95 True
|
| 3801 |
</div>
|
| 3802 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 3803 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3804 |
<div class="uv-logs-content" style="display: none;">
|
| 3805 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3806 |
-
Downloading
|
| 3807 |
-
Downloading pillow (6.3MiB)
|
| 3808 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
|
|
|
|
|
|
| 3809 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3810 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3811 |
-
Downloading numpy (16.2MiB)
|
| 3812 |
-
Downloading setuptools (1.1MiB)
|
| 3813 |
-
Downloading kiwisolver (1.4MiB)
|
| 3814 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3815 |
-
Downloading torch (846.9MiB)
|
| 3816 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3817 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3818 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3819 |
Downloading sympy (6.0MiB)
|
|
|
|
| 3820 |
Downloading fonttools (4.7MiB)
|
| 3821 |
-
Downloading networkx (1.9MiB)
|
| 3822 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3823 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
|
|
|
| 3824 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3825 |
-
Downloading
|
|
|
|
|
|
|
|
|
|
| 3826 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3827 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
|
|
|
|
|
|
|
|
|
| 3828 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3829 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3830 |
Downloading nvidia-cufile-cu12
|
| 3831 |
Downloading kiwisolver
|
| 3832 |
Downloading setuptools
|
| 3833 |
-
Downloading networkx
|
| 3834 |
Downloading fonttools
|
|
|
|
| 3835 |
Downloading pillow
|
| 3836 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3837 |
Downloading nvidia-cuda-cupti-cu12
|
|
@@ -3850,7 +3850,7 @@ Downloading nvidia-nccl-cu12 (307.4MiB)
|
|
| 3850 |
Downloading nvidia-cublas-cu12
|
| 3851 |
Downloading nvidia-cudnn-cu12
|
| 3852 |
Downloading torch
|
| 3853 |
-
Installed 37 packages in
|
| 3854 |
</div>
|
| 3855 |
</div>
|
| 3856 |
<div class="cell-artifacts">
|
|
|
|
| 3710 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3711 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3712 |
</span> |
|
| 3713 |
+
Cell: benchmark | 35.95s
|
| 3714 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3715 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3716 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3794 |
<div class="cell-stdout">impl wl p50(ms) ok
|
| 3795 |
torch_mem_eff flux_L128 0.59 True
|
| 3796 |
torch_mem_eff flux_L256 0.65 True
|
| 3797 |
+
torch_mem_eff flux_L320 0.78 True
|
| 3798 |
torch_mem_eff flux_L384 0.79 True
|
| 3799 |
+
torch_mem_eff flux_L448 0.85 True
|
| 3800 |
torch_mem_eff flux_L512 0.95 True
|
| 3801 |
</div>
|
| 3802 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 3803 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3804 |
<div class="uv-logs-content" style="display: none;">
|
| 3805 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3806 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
|
|
|
| 3807 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3808 |
+
Downloading networkx (1.9MiB)
|
| 3809 |
+
Downloading kiwisolver (1.4MiB)
|
| 3810 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3811 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3812 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
|
|
|
|
|
|
| 3813 |
Downloading sympy (6.0MiB)
|
| 3814 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3815 |
Downloading fonttools (4.7MiB)
|
|
|
|
|
|
|
| 3816 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3817 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3818 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3819 |
+
Downloading setuptools (1.1MiB)
|
| 3820 |
+
Downloading matplotlib (8.3MiB)
|
| 3821 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3822 |
+
Downloading numpy (16.2MiB)
|
| 3823 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3824 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3825 |
+
Downloading torch (846.9MiB)
|
| 3826 |
+
Downloading triton (148.3MiB)
|
| 3827 |
+
Downloading pillow (6.3MiB)
|
| 3828 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3829 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3830 |
Downloading nvidia-cufile-cu12
|
| 3831 |
Downloading kiwisolver
|
| 3832 |
Downloading setuptools
|
|
|
|
| 3833 |
Downloading fonttools
|
| 3834 |
+
Downloading networkx
|
| 3835 |
Downloading pillow
|
| 3836 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3837 |
Downloading nvidia-cuda-cupti-cu12
|
|
|
|
| 3850 |
Downloading nvidia-cublas-cu12
|
| 3851 |
Downloading nvidia-cudnn-cu12
|
| 3852 |
Downloading torch
|
| 3853 |
+
Installed 37 packages in 556ms
|
| 3854 |
</div>
|
| 3855 |
</div>
|
| 3856 |
<div class="cell-artifacts">
|
flash_attn/impls/sage_attention.html
CHANGED
|
@@ -3710,7 +3710,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3710 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3711 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3712 |
</span> |
|
| 3713 |
-
Cell: benchmark | 40.
|
| 3714 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3715 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3716 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3804,53 +3804,53 @@ Cell: benchmark | 40.11s
|
|
| 3804 |
<div id="output-benchmark" class="cell-output">
|
| 3805 |
<div class="cell-stdout">impl wl p50(ms) ok
|
| 3806 |
sage_int8_fp16 flux_L128 FAIL False
|
| 3807 |
-
Error: module '
|
| 3808 |
sage_int8_fp16 flux_L256 FAIL False
|
| 3809 |
-
Error: module '
|
| 3810 |
sage_int8_fp16 flux_L320 FAIL False
|
| 3811 |
-
Error: module '
|
| 3812 |
sage_int8_fp16 flux_L384 FAIL False
|
| 3813 |
-
Error: module '
|
| 3814 |
sage_int8_fp16 flux_L448 FAIL False
|
| 3815 |
-
Error: module '
|
| 3816 |
sage_int8_fp16 flux_L512 FAIL False
|
| 3817 |
-
Error: module '
|
| 3818 |
</div>
|
| 3819 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 3820 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3821 |
<div class="uv-logs-content" style="display: none;">
|
| 3822 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3823 |
-
Downloading nvidia-
|
| 3824 |
-
Downloading nvidia-
|
|
|
|
| 3825 |
Downloading setuptools (1.1MiB)
|
|
|
|
| 3826 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3827 |
-
Downloading nvidia-
|
| 3828 |
-
Downloading triton (148.3MiB)
|
| 3829 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
|
|
|
|
|
|
| 3830 |
Downloading torch (846.9MiB)
|
| 3831 |
-
Downloading
|
| 3832 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3833 |
-
Downloading
|
| 3834 |
Downloading kiwisolver (1.4MiB)
|
| 3835 |
-
Downloading
|
| 3836 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3837 |
-
Downloading networkx (1.9MiB)
|
| 3838 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3839 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3840 |
Downloading sympy (6.0MiB)
|
| 3841 |
-
Downloading
|
| 3842 |
-
Downloading
|
| 3843 |
-
Downloading nvidia-
|
| 3844 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3845 |
Downloading matplotlib (8.3MiB)
|
|
|
|
|
|
|
| 3846 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3847 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3848 |
Downloading nvidia-cufile-cu12
|
| 3849 |
Downloading kiwisolver
|
| 3850 |
Downloading hf-xet
|
| 3851 |
Downloading setuptools
|
| 3852 |
-
Downloading fonttools
|
| 3853 |
Downloading networkx
|
|
|
|
| 3854 |
Downloading pillow
|
| 3855 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3856 |
Downloading nvidia-cuda-cupti-cu12
|
|
@@ -3869,15 +3869,13 @@ Downloading matplotlib (8.3MiB)
|
|
| 3869 |
Downloading nvidia-cublas-cu12
|
| 3870 |
Downloading nvidia-cudnn-cu12
|
| 3871 |
Downloading torch
|
| 3872 |
-
Installed 48 packages in
|
| 3873 |
</div>
|
| 3874 |
</div>
|
| 3875 |
<div class="cell-stderr">Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s]
|
| 3876 |
-
Fetching 11 files: 9%|▉ | 1/11 [00:00<00:01, 5.
|
| 3877 |
-
Fetching 11 files:
|
| 3878 |
-
Fetching 11 files:
|
| 3879 |
-
Fetching 11 files: 64%|██████▎ | 7/11 [00:00<00:00, 11.66it/s]
|
| 3880 |
-
Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 15.59it/s]</div>
|
| 3881 |
<div class="cell-artifacts">
|
| 3882 |
<h4>Artifacts:</h4>
|
| 3883 |
<a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
|
|
|
|
| 3710 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3711 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3712 |
</span> |
|
| 3713 |
+
Cell: benchmark | 40.43s
|
| 3714 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3715 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3716 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3804 |
<div id="output-benchmark" class="cell-output">
|
| 3805 |
<div class="cell-stdout">impl wl p50(ms) ok
|
| 3806 |
sage_int8_fp16 flux_L128 FAIL False
|
| 3807 |
+
Error: module 'sage_attention_46758c422d547a47' has no attribute 'fwd'
|
| 3808 |
sage_int8_fp16 flux_L256 FAIL False
|
| 3809 |
+
Error: module 'sage_attention_46758c422d547a47' has no attribute 'fwd'
|
| 3810 |
sage_int8_fp16 flux_L320 FAIL False
|
| 3811 |
+
Error: module 'sage_attention_46758c422d547a47' has no attribute 'fwd'
|
| 3812 |
sage_int8_fp16 flux_L384 FAIL False
|
| 3813 |
+
Error: module 'sage_attention_46758c422d547a47' has no attribute 'fwd'
|
| 3814 |
sage_int8_fp16 flux_L448 FAIL False
|
| 3815 |
+
Error: module 'sage_attention_46758c422d547a47' has no attribute 'fwd'
|
| 3816 |
sage_int8_fp16 flux_L512 FAIL False
|
| 3817 |
+
Error: module 'sage_attention_46758c422d547a47' has no attribute 'fwd'
|
| 3818 |
</div>
|
| 3819 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 3820 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3821 |
<div class="uv-logs-content" style="display: none;">
|
| 3822 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3823 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3824 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3825 |
+
Downloading networkx (1.9MiB)
|
| 3826 |
Downloading setuptools (1.1MiB)
|
| 3827 |
+
Downloading numpy (16.2MiB)
|
| 3828 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3829 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
|
|
|
| 3830 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3831 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3832 |
+
Downloading hf-xet (3.0MiB)
|
| 3833 |
Downloading torch (846.9MiB)
|
| 3834 |
+
Downloading triton (148.3MiB)
|
| 3835 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3836 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3837 |
Downloading kiwisolver (1.4MiB)
|
| 3838 |
+
Downloading pillow (6.3MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3839 |
Downloading sympy (6.0MiB)
|
| 3840 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3841 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3842 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
|
|
|
| 3843 |
Downloading matplotlib (8.3MiB)
|
| 3844 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3845 |
+
Downloading fonttools (4.7MiB)
|
| 3846 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3847 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3848 |
Downloading nvidia-cufile-cu12
|
| 3849 |
Downloading kiwisolver
|
| 3850 |
Downloading hf-xet
|
| 3851 |
Downloading setuptools
|
|
|
|
| 3852 |
Downloading networkx
|
| 3853 |
+
Downloading fonttools
|
| 3854 |
Downloading pillow
|
| 3855 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3856 |
Downloading nvidia-cuda-cupti-cu12
|
|
|
|
| 3869 |
Downloading nvidia-cublas-cu12
|
| 3870 |
Downloading nvidia-cudnn-cu12
|
| 3871 |
Downloading torch
|
| 3872 |
+
Installed 48 packages in 525ms
|
| 3873 |
</div>
|
| 3874 |
</div>
|
| 3875 |
<div class="cell-stderr">Fetching 11 files: 0%| | 0/11 [00:00<?, ?it/s]
|
| 3876 |
+
Fetching 11 files: 9%|▉ | 1/11 [00:00<00:01, 5.55it/s]
|
| 3877 |
+
Fetching 11 files: 73%|███████▎ | 8/11 [00:00<00:00, 12.93it/s]
|
| 3878 |
+
Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 16.93it/s]</div>
|
|
|
|
|
|
|
| 3879 |
<div class="cell-artifacts">
|
| 3880 |
<h4>Artifacts:</h4>
|
| 3881 |
<a href="artifacts/benchmark/attn.jsonl" class="artifact" target="_blank">attn.jsonl</a>
|
flash_attn/impls/xformers.html
CHANGED
|
@@ -3710,7 +3710,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3710 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3711 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3712 |
</span> |
|
| 3713 |
-
Cell: benchmark | 40.
|
| 3714 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3715 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3716 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3797,48 +3797,48 @@ xformers_meff flux_L256 0.47 True
|
|
| 3797 |
xformers_meff flux_L320 0.60 True
|
| 3798 |
xformers_meff flux_L384 0.60 True
|
| 3799 |
xformers_meff flux_L448 0.64 True
|
| 3800 |
-
xformers_meff flux_L512 0.
|
| 3801 |
</div>
|
| 3802 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 3803 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3804 |
<div class="uv-logs-content" style="display: none;">
|
| 3805 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3806 |
-
Downloading
|
| 3807 |
-
Downloading
|
| 3808 |
-
Downloading nvidia-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3809 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3810 |
-
Downloading setuptools (1.1MiB)
|
| 3811 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
|
|
|
| 3812 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3813 |
-
Downloading
|
| 3814 |
-
Downloading
|
| 3815 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3816 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3817 |
Downloading sympy (6.0MiB)
|
| 3818 |
-
Downloading
|
|
|
|
|
|
|
|
|
|
| 3819 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3820 |
-
Downloading
|
| 3821 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3822 |
-
Downloading networkx (1.9MiB)
|
| 3823 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3824 |
Downloading fonttools (4.7MiB)
|
| 3825 |
-
Downloading triton (148.3MiB)
|
| 3826 |
Downloading torch (846.9MiB)
|
| 3827 |
-
Downloading kiwisolver (1.4MiB)
|
| 3828 |
-
Downloading matplotlib (8.3MiB)
|
| 3829 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3830 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3831 |
Downloading nvidia-cufile-cu12
|
| 3832 |
Downloading kiwisolver
|
| 3833 |
Downloading setuptools
|
| 3834 |
-
Downloading networkx
|
| 3835 |
Downloading fonttools
|
|
|
|
| 3836 |
Downloading pillow
|
| 3837 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3838 |
Downloading nvidia-cuda-cupti-cu12
|
| 3839 |
Downloading matplotlib
|
| 3840 |
-
Downloading sympy
|
| 3841 |
Downloading numpy
|
|
|
|
| 3842 |
Downloading nvidia-nvjitlink-cu12
|
| 3843 |
Downloading nvidia-curand-cu12
|
| 3844 |
Downloading nvidia-cuda-nvrtc-cu12
|
|
@@ -3849,10 +3849,10 @@ Downloading matplotlib (8.3MiB)
|
|
| 3849 |
Downloading nvidia-cusparse-cu12
|
| 3850 |
Downloading nvidia-cusparselt-cu12
|
| 3851 |
Downloading nvidia-nccl-cu12
|
| 3852 |
-
Downloading nvidia-cudnn-cu12
|
| 3853 |
Downloading nvidia-cublas-cu12
|
|
|
|
| 3854 |
Downloading torch
|
| 3855 |
-
Installed 38 packages in
|
| 3856 |
</div>
|
| 3857 |
</div>
|
| 3858 |
<div class="cell-artifacts">
|
|
|
|
| 3710 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3711 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3712 |
</span> |
|
| 3713 |
+
Cell: benchmark | 40.64s
|
| 3714 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3715 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3716 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3797 |
xformers_meff flux_L320 0.60 True
|
| 3798 |
xformers_meff flux_L384 0.60 True
|
| 3799 |
xformers_meff flux_L448 0.64 True
|
| 3800 |
+
xformers_meff flux_L512 0.64 True
|
| 3801 |
</div>
|
| 3802 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 3803 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3804 |
<div class="uv-logs-content" style="display: none;">
|
| 3805 |
Updating https://github.com/drbh/kernels-benchmark-tools.git (main)
|
| 3806 |
+
Downloading networkx (1.9MiB)
|
| 3807 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3808 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3809 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3810 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3811 |
+
Downloading triton (148.3MiB)
|
| 3812 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3813 |
+
Downloading pillow (6.3MiB)
|
| 3814 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
|
|
|
| 3815 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3816 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3817 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3818 |
+
Downloading numpy (16.2MiB)
|
| 3819 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
|
|
|
|
|
|
| 3820 |
Downloading sympy (6.0MiB)
|
| 3821 |
+
Downloading matplotlib (8.3MiB)
|
| 3822 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3823 |
+
Downloading xformers (111.8MiB)
|
| 3824 |
+
Downloading setuptools (1.1MiB)
|
| 3825 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3826 |
+
Downloading kiwisolver (1.4MiB)
|
|
|
|
|
|
|
|
|
|
| 3827 |
Downloading fonttools (4.7MiB)
|
|
|
|
| 3828 |
Downloading torch (846.9MiB)
|
|
|
|
|
|
|
| 3829 |
Updated https://github.com/drbh/kernels-benchmark-tools.git (f457279bca6573cd2fa54a74e67118f5e6b7a31c)
|
| 3830 |
Building kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3831 |
Downloading nvidia-cufile-cu12
|
| 3832 |
Downloading kiwisolver
|
| 3833 |
Downloading setuptools
|
|
|
|
| 3834 |
Downloading fonttools
|
| 3835 |
+
Downloading networkx
|
| 3836 |
Downloading pillow
|
| 3837 |
Built kernels-benchmark-tools @ git+https://github.com/drbh/kernels-benchmark-tools.git@f457279bca6573cd2fa54a74e67118f5e6b7a31c
|
| 3838 |
Downloading nvidia-cuda-cupti-cu12
|
| 3839 |
Downloading matplotlib
|
|
|
|
| 3840 |
Downloading numpy
|
| 3841 |
+
Downloading sympy
|
| 3842 |
Downloading nvidia-nvjitlink-cu12
|
| 3843 |
Downloading nvidia-curand-cu12
|
| 3844 |
Downloading nvidia-cuda-nvrtc-cu12
|
|
|
|
| 3849 |
Downloading nvidia-cusparse-cu12
|
| 3850 |
Downloading nvidia-cusparselt-cu12
|
| 3851 |
Downloading nvidia-nccl-cu12
|
|
|
|
| 3852 |
Downloading nvidia-cublas-cu12
|
| 3853 |
+
Downloading nvidia-cudnn-cu12
|
| 3854 |
Downloading torch
|
| 3855 |
+
Installed 38 packages in 562ms
|
| 3856 |
</div>
|
| 3857 |
</div>
|
| 3858 |
<div class="cell-artifacts">
|
flash_attn/results/artifacts/combine/latency.png
ADDED
|
Git LFS Details
|
flash_attn/results/cells/combine.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "numpy",
#     "torch",
#     "kernels-benchmark-tools",
#     "matplotlib",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { git = "https://github.com/drbh/kernels-benchmark-tools.git", branch = "main" }
# ///
"""Combine the per-implementation attention benchmark results into one report.

Each upstream benchmark cell exposes its artifact directory through a
``UVNOTE_FILE_*`` environment variable.  This script looks for the expected
JSONL file in every such directory, passes the non-empty ones to
``kernels_benchmark_tools`` for a combined summary and a combined
visualization, and finally lists which implementations were included.

Exits with status 1 when no benchmark data file is found at all.
"""
import torch
import sys
import os
import kernels_benchmark_tools as kbt
from pathlib import Path

# Single source of truth for every implementation:
#   (display name, env var holding the artifact directory, JSONL file name).
# Keeping the env var and the file name on the same row means a new
# implementation cannot be added to one lookup and forgotten in the other.
BENCHMARK_SOURCES = [
    ("Flash (PyTorch SDPA)", "UVNOTE_FILE_FLASH_ATTENTION_BENCHMARK", "attn.jsonl"),
    ("MemEff (PyTorch SDPA)", "UVNOTE_FILE_MEM_EFFICIENT_ATTENTION_BENCHMARK", "attn.jsonl"),
    ("Flash Attn 2", "UVNOTE_FILE_FLASH_ATTN2_BENCHMARK", "attn.jsonl"),
    ("xFormers", "UVNOTE_FILE_XFORMERS_BENCHMARK", "attn.jsonl"),
    ("SageAttention", "UVNOTE_FILE_SAGE_ATTENTION_BENCHMARK", "attn.jsonl"),
    ("Compiled (default)", "UVNOTE_FILE_COMPILED_VARIANTS_BENCHMARK_DEFAULT", "attn_default.jsonl"),
    ("Compiled (max-autotune)", "UVNOTE_FILE_COMPILED_VARIANTS_BENCHMARK_MAX_AUTOTUNE", "attn_max_autotune.jsonl"),
    ("HF Kernels Flash Attn", "UVNOTE_FILE_HF_KERNELS_FLASH_ATTN_BENCHMARK", "attn.jsonl"),
    ("HF Kernels Flash Attn3", "UVNOTE_FILE_HF_KERNELS_FLASH_ATTN3_BENCHMARK", "attn.jsonl"),
]

# Discover the upstream artifact directories from environment variables.
# A value of None means the corresponding variable is not set.
cache_dirs = {name: os.environ.get(env_var) for name, env_var, _ in BENCHMARK_SOURCES}

# Name of the JSONL results file expected inside each artifact directory.
file_mapping = {name: jsonl_file for name, _, jsonl_file in BENCHMARK_SOURCES}

print("LOADING BENCHMARK DATA")
for name, cache_dir in cache_dirs.items():
    print(f"{name:30s}: {cache_dir}")
print()

# Collect all JSONL paths, remembering which implementations contributed one
# so the final report does not have to touch the filesystem a second time.
all_paths = []
included_names = []

for name, cache_dir in cache_dirs.items():
    if not cache_dir:
        print(f"✗ No cache dir for {name}")
        continue

    path = Path(cache_dir) / file_mapping[name]
    # An empty file carries no benchmark records, so it is treated as missing.
    if path.exists() and path.stat().st_size > 0:
        all_paths.append(str(path))
        included_names.append(name)
        print(f"✓ Found {name}: {path}")
    else:
        print(f"⊘ Empty/Missing {name}: {path}")

print()

if not all_paths:
    print("ERROR: No benchmark data files found!")
    sys.exit(1)

# Generate combined summary
print("COMBINED BENCHMARK SUMMARY")
print()

kbt.summarize(all_paths)

print()
print("GENERATING COMBINED VISUALIZATION")
print()

# Visualization is best-effort: a plotting failure is reported but must not
# abort the cell, because the textual summary above has already been produced.
try:
    kbt.viz(all_paths)
    print("✓ Combined visualization saved as latency.png")
except ImportError as e:
    print(f"✗ Visualization requires matplotlib: {e}")
except Exception as e:
    print(f"✗ Visualization failed: {e}")

print()
print("ANALYSIS COMPLETE")
print(f"Total implementations analyzed: {len(all_paths)}")
print("\nImplementations included:")
for name in included_names:
    print(f" ✓ {name}")
|
flash_attn/results/combined_results.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|