| {"ts": "2025-11-10T22:11:46Z", "run": "8d69ef94c7594eb581f8f1e4fd6b3eef", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.9336540001640969, "p50": 0.938484000016615, "p90": 0.9400730000379554, "mean": 0.9383200000229408, "iqr": 0.00204800016945228, "raw_times": [0.9413640000275336, 0.938484000016615, 0.9380249998685031, 0.9400730000379554, 0.9336540001640969], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.9436739999273414, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null} | |
| {"ts": "2025-11-10T22:11:46Z", "run": "8d69ef94c7594eb581f8f1e4fd6b3eef", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.9720749999360123, "p50": 0.9796540000479581, "p90": 0.9886739999274141, "mean": 0.9813904000111506, "iqr": 0.011588999768719077, "raw_times": [0.9894639999856736, 0.9720749999360123, 0.9886739999274141, 0.977085000158695, 0.9796540000479581], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.9749249998094456, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null} | |
| {"ts": "2025-11-10T22:11:46Z", "run": "8d69ef94c7594eb581f8f1e4fd6b3eef", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.026366000132839, "p50": 1.0466650001035305, "p90": 1.048316000151317, "mean": 1.0439156000302319, "iqr": 0.012310000329307513, "raw_times": [1.0622249999414635, 1.0466650001035305, 1.026366000132839, 1.048316000151317, 1.0360059998220095], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0340549999909854, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null} | |
| {"ts": "2025-11-10T22:11:46Z", "run": "8d69ef94c7594eb581f8f1e4fd6b3eef", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.031215000011798, "p50": 1.0415550000288931, "p90": 1.0416950001399528, "mean": 1.0391672000423569, "iqr": 0.009410000075149583, "raw_times": [1.031215000011798, 1.0322850000648032, 1.0416950001399528, 1.0415550000288931, 1.0490859999663371], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0389750000285858, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null} | |
| {"ts": "2025-11-10T22:11:46Z", "run": "8d69ef94c7594eb581f8f1e4fd6b3eef", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2092779998056358, "p50": 1.2178079998648172, "p90": 1.2180080000234739, "mean": 1.21775799993884, "iqr": 0.002989999984492897, "raw_times": [1.2286779999612918, 1.2180080000234739, 1.2178079998648172, 1.215018000038981, 1.2092779998056358], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2385289999201632, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null} | |
| {"ts": "2025-11-10T22:11:47Z", "run": "8d69ef94c7594eb581f8f1e4fd6b3eef", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.203338000095755, "p50": 1.2106680001124914, "p90": 1.218707999896651, "mean": 1.2165860000095563, "iqr": 0.014340000006995979, "raw_times": [1.204367999889655, 1.218707999896651, 1.203338000095755, 1.2106680001124914, 1.245848000053229], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2345879999884346, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.000362396240234375, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null} | |