{"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.9106109999947876, "p50": 0.9171110000352201, "p90": 0.9204320000435473, "mean": 0.9179216000347878, "iqr": 0.005419999979494605, "raw_times": [0.9171110000352201, 0.9150120000640527, 0.9106109999947876, 0.9204320000435473, 0.9264420000363316], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.9176309999929799, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null} {"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.9576329999845257, "p50": 0.960063999968952, "p90": 0.9623629999850891, "mean": 0.9611931999643275, "iqr": 0.0033900000744324643, "raw_times": [0.9589729999106567, 0.9576329999845257, 0.960063999968952, 0.9669329999724141, 0.9623629999850891], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.9673530000782193, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null} {"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0019650000003821, "p50": 1.0193159999971613, "p90": 1.0211459999709405, "mean": 1.015251600006195, "iqr": 0.01198099994326185, "raw_times": [1.0019650000003821, 1.0091650000276786, 1.024666000034813, 1.0193159999971613, 1.0211459999709405], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.009233999980097, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null} {"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0176959999625979, "p50": 1.0199849999708022, "p90": 1.025695000066662, "mean": 1.0218714000075124, "iqr": 0.006820000066909415, "raw_times": [1.0271060000377474, 1.0176959999625979, 1.0188749999997526, 1.0199849999708022, 1.025695000066662], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.027405000058934, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null} {"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.1665810000067722, "p50": 1.1845809999613266, "p90": 1.185440999961429, "mean": 1.1787729999923613, "iqr": 0.01419000000169035, "raw_times": [1.1712509999597387, 1.1665810000067722, 1.18601100007254, 1.1845809999613266, 1.185440999961429], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1787800000320203, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null} {"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.1722899999995207, "p50": 1.1832310000272628, "p90": 1.1854509999693619, "mean": 1.181276799979969, "iqr": 0.008630000024822948, "raw_times": [1.1885909999591604, 1.1854509999693619, 1.176820999944539, 1.1832310000272628, 1.1722899999995207], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1782799999764393, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.000362396240234375, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}