{"ts": "2025-10-23T17:21:26Z", "run": "ab04a13d0bc147c5aeb6a1c30c52fb98", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L128", "batch": 1, "seq_len": 1152, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.19892000000254484, "p50": 0.20128900018789864, "p90": 0.20218000008753734, "mean": 0.20126180006627692, "iqr": 0.0013400001535046613, "raw_times": [0.19892000000254484, 0.20083999993403268, 0.20128900018789864, 0.2030800001193711, 0.20218000008753734], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2210709999417304, "peak_bytes": 152174592, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-23T17:21:26Z", "run": "ab04a13d0bc147c5aeb6a1c30c52fb98", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L256", "batch": 1, "seq_len": 1280, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.229150999984995, "p50": 0.22967100017012854, "p90": 0.23078200001691584, "mean": 0.23312540001825255, "iqr": 0.0012210000477352878, "raw_times": [0.23078200001691584, 0.22956099996918056, 0.22967100017012854, 0.2464619999500428, 0.229150999984995], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2712529999371327, "peak_bytes": 163971072, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000347137451171875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-23T17:21:27Z", "run": "ab04a13d0bc147c5aeb6a1c30c52fb98", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L320", "batch": 1, "seq_len": 1344, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2344019999327429, "p50": 0.23504099999627215, "p90": 0.23719199998595286, "mean": 0.23960979997355025, "iqr": 0.0026000000161729986, "raw_times": [0.2568219999830035, 0.23719199998595286, 0.2344019999327429, 0.23459199996977986, 0.23504099999627215], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.24443200004498067, "peak_bytes": 167116800, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000347137451171875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-23T17:21:27Z", "run": "ab04a13d0bc147c5aeb6a1c30c52fb98", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L384", "batch": 1, "seq_len": 1408, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.23659099997530575, "p50": 0.23880100002315885, "p90": 0.23884200004431477, "mean": 0.23843920002946106, "iqr": 0.0014209999790182337, "raw_times": [0.23880100002315885, 0.2405410000392294, 0.23742100006529654, 0.23884200004431477, 0.23659099997530575], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.25097200000345765, "peak_bytes": 169345024, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003452301025390625, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
{"ts": "2025-10-23T17:21:27Z", "run": "ab04a13d0bc147c5aeb6a1c30c52fb98", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L448", "batch": 1, "seq_len": 1472, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "FailOnRecompileLimitHit", "msg": "recompile_limit reached with one_graph=True. Excessive recompilations can degrade performance due to the compilation overhead of each recompilation. To monitor recompilations, enable TORCH_LOGS=recompiles. If recompilations are expected, consider increasing torch._dynamo.config.cache_size_limit to an appropriate value."}}
{"ts": "2025-10-23T17:21:27Z", "run": "ab04a13d0bc147c5aeb6a1c30c52fb98", "impl": "torch_flash_compiled_default", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "default"}, "wl": {"name": "flux_L512", "batch": 1, "seq_len": 1536, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "FailOnRecompileLimitHit", "msg": "recompile_limit reached with one_graph=True. Excessive recompilations can degrade performance due to the compilation overhead of each recompilation. To monitor recompilations, enable TORCH_LOGS=recompiles. If recompilations are expected, consider increasing torch._dynamo.config.cache_size_limit to an appropriate value."}}