drbh HF Staff commited on
Commit
e8e4be6
·
verified ·
1 Parent(s): f2afc26

Upload folder using huggingface_hub

Browse files
Files changed (43) hide show
  1. activation/impls/artifacts/benchmark/activation.jsonl +9 -9
  2. activation/impls/hf_kernels_swiglu.html +95 -94
  3. activation/impls/torch_swiglu.html +121 -121
  4. activation/results/artifacts/combine/latency.svg +2 -2
  5. activation/results/combined_results.html +93 -93
  6. causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl +24 -0
  7. causal_conv1d/impls/cells/benchmark.py +40 -0
  8. causal_conv1d/impls/cells/nv.py +2 -0
  9. causal_conv1d/impls/hf_kernels_causal_conv1d.html +0 -0
  10. causal_conv1d/impls/index.html +89 -0
  11. causal_conv1d/impls/torch_causal_conv1d.html +0 -0
  12. causal_conv1d/index.html +89 -0
  13. causal_conv1d/results/artifacts/combine/latency.svg +3 -0
  14. causal_conv1d/results/cells/combine.py +26 -0
  15. causal_conv1d/results/combined_results.html +0 -0
  16. causal_conv1d/results/index.html +88 -0
  17. flash_attn/impls/artifacts/benchmark/attention.jsonl +6 -6
  18. flash_attn/impls/cells/benchmark.py +8 -10
  19. flash_attn/impls/flash_attention.html +139 -139
  20. flash_attn/impls/hf_kernels_flash_attn.html +95 -142
  21. flash_attn/impls/hf_kernels_flash_attn3.html +80 -80
  22. flash_attn/impls/mem_efficient_attention.html +186 -134
  23. flash_attn/impls/sage_attention.html +13 -19
  24. flash_attn/impls/xformers.html +91 -91
  25. flash_attn/results/artifacts/combine/latency.svg +2 -2
  26. flash_attn/results/combined_results.html +147 -147
  27. index.html +2 -0
  28. layer_norm/impls/artifacts/benchmark/layer_norm.jsonl +4 -48
  29. layer_norm/impls/hf_kernels_layer_norm.html +0 -0
  30. layer_norm/impls/torch_layer_norm.html +0 -0
  31. layer_norm/results/artifacts/combine/latency.svg +2 -2
  32. layer_norm/results/combined_results.html +429 -106
  33. rotary/impls/artifacts/benchmark/rotary.jsonl +24 -0
  34. rotary/impls/cells/benchmark.py +57 -0
  35. rotary/impls/cells/nv.py +2 -0
  36. rotary/impls/hf_kernels_rotary.html +0 -0
  37. rotary/impls/index.html +89 -0
  38. rotary/impls/torch_rotary.html +0 -0
  39. rotary/index.html +89 -0
  40. rotary/results/artifacts/combine/latency.svg +3 -0
  41. rotary/results/cells/combine.py +26 -0
  42. rotary/results/combined_results.html +0 -0
  43. rotary/results/index.html +88 -0
activation/impls/artifacts/benchmark/activation.jsonl CHANGED
@@ -1,9 +1,9 @@
1
- {"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.024320999955307343, "p50": 0.025090999997701147, "p90": 0.02569000002949906, "mean": 0.026606800020090304, "iqr": 0.0010690000635804608, "raw_times": [0.03331100015202537, 0.025090999997701147, 0.024320999955307343, 0.02569000002949906, 0.0246209999659186], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03336100007800269, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
2
- {"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.028640999971685233, "p50": 0.02958999994007172, "p90": 0.030561000130546745, "mean": 0.02986059994327661, "iqr": 0.0012610003068402875, "raw_times": [0.029299999823706457, 0.028640999971685233, 0.02958999994007172, 0.0312109998503729, 0.030561000130546745], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03354099999341997, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
3
- {"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02880000010918593, "p50": 0.030331000061778468, "p90": 0.030401000003621448, "mean": 0.030208600037440192, "iqr": 0.0004209998678561533, "raw_times": [0.02880000010918593, 0.03153099987684982, 0.029980000135765295, 0.030331000061778468, 0.030401000003621448], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03317000005154114, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
4
- {"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02921000009337149, "p50": 0.0294310000299447, "p90": 0.029789999871354667, "mean": 0.029938399984530406, "iqr": 0.0004489997991186101, "raw_times": [0.0294310000299447, 0.02921000009337149, 0.03191999985574512, 0.029341000072236056, 0.029789999871354667], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03343000003042107, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
5
- {"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.029799999992974335, "p50": 0.031021000040709623, "p90": 0.031239999998433632, "mean": 0.03210639997632825, "iqr": 0.0009289999525208259, "raw_times": [0.038159999803610845, 0.031021000040709623, 0.029799999992974335, 0.030311000045912806, 0.031239999998433632], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03207100007784902, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
6
- {"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0278800000614865, "p50": 0.028550999786602915, "p90": 0.029250000125102815, "mean": 0.02903839999817137, "iqr": 0.0010100000054080738, "raw_times": [0.0278800000614865, 0.02824000011969474, 0.028550999786602915, 0.029250000125102815, 0.03127099989796989], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03262000018366962, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
7
- {"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02945000005638576, "p50": 0.029881000045861583, "p90": 0.03017099993485317, "mean": 0.03019639998456114, "iqr": 0.0005509998572961194, "raw_times": [0.029881000045861583, 0.03185999980814813, 0.03017099993485317, 0.02945000005638576, 0.02962000007755705], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031610000178261544, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
8
- {"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02807000009852345, "p50": 0.028989999918849207, "p90": 0.02929000015683414, "mean": 0.028920200020365883, "iqr": 0.0003590002961573191, "raw_times": [0.029320000066945795, 0.02929000015683414, 0.02807000009852345, 0.028989999918849207, 0.02893099986067682], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.033219999977518455, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
9
- {"ts": "2025-10-27T14:46:29Z", "run": "cb61ba9d82ad40cba986f04f71dd51b6", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.029301000040504732, "p50": 0.03090099994551565, "p90": 0.03149000008306757, "mean": 0.03127060003862425, "iqr": 0.0014889999420120148, "raw_times": [0.029301000040504732, 0.030001000141055556, 0.03149000008306757, 0.03465999998297775, 0.03090099994551565], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.03197000000909611, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
 
1
+ {"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02110099990204617, "p50": 0.022570000055566197, "p90": 0.02266100000269944, "mean": 0.022242599993660406, "iqr": 0.0007410000080199097, "raw_times": [0.022570000055566197, 0.022961000013310695, 0.02191999999467953, 0.02266100000269944, 0.02110099990204617], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.02889100005631917, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
2
+ {"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02585100003216212, "p50": 0.02831100005096232, "p90": 0.02854100000604376, "mean": 0.02791500000967062, "iqr": 0.0013400000398178236, "raw_times": [0.02585100003216212, 0.02854100000604376, 0.02967099999295897, 0.02831100005096232, 0.027200999966225936], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031750999937685265, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
3
+ {"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02804099995046272, "p50": 0.028271000019230996, "p90": 0.02853099999811093, "mean": 0.032097199982672464, "iqr": 0.0004900000476482091, "raw_times": [0.04760199999509496, 0.028271000019230996, 0.02853099999811093, 0.02804099995046272, 0.02804099995046272], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031132000003708526, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
4
+ {"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02513000004000787, "p50": 0.027131000024382956, "p90": 0.027909999971598154, "mean": 0.027204600019103964, "iqr": 0.0014589999182135216, "raw_times": [0.02513000004000787, 0.027131000024382956, 0.027909999971598154, 0.029401000006146205, 0.026451000053384632], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030690999892613036, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
5
+ {"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02570000003743189, "p50": 0.026741000056063058, "p90": 0.02731099993980024, "mean": 0.02703079999264446, "iqr": 0.0012099999366910197, "raw_times": [0.02570000003743189, 0.02731099993980024, 0.029300999926817894, 0.02610100000310922, 0.026741000056063058], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.030331000061778468, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
6
+ {"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.025050999965969822, "p50": 0.026220999984616356, "p90": 0.028031000056216726, "mean": 0.026778999995258346, "iqr": 0.0018400000953988638, "raw_times": [0.025050999965969822, 0.026190999960817862, 0.026220999984616356, 0.028031000056216726, 0.028401000008670962], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.031100999990485434, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
7
+ {"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.02494000000297092, "p50": 0.026971000011144497, "p90": 0.02789099994515709, "mean": 0.027030599972022173, "iqr": 0.0009699999736767495, "raw_times": [0.02494000000297092, 0.026971000011144497, 0.02789099994515709, 0.02842999992935802, 0.02692099997148034], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.029161000043131935, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
8
+ {"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.024340999971173005, "p50": 0.02594099998987076, "p90": 0.027440999929240206, "mean": 0.026286999968760938, "iqr": 0.0016499999446750735, "raw_times": [0.024340999971173005, 0.027920999968955584, 0.027440999929240206, 0.02594099998987076, 0.025790999984565133], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.02797100000861974, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
9
+ {"ts": "2025-10-28T14:08:47Z", "run": "c851494cf96a4119be3e14911bb5592b", "impl": "hf_kernels_swiglu", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.025551000021550863, "p50": 0.026880999939749017, "p90": 0.028271000019230996, "mean": 0.027656800011754967, "iqr": 0.002240999947389355, "raw_times": [0.025551000021550863, 0.026880999939749017, 0.02603000007184164, 0.03155100000640232, 0.028271000019230996], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.02960100005111599, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
activation/impls/hf_kernels_swiglu.html CHANGED
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3871
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: nv | 0.26s
3875
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3877
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3887,7 +3887,7 @@ Cell: nv | 0.26s
3887
  </div>
3888
  </div>
3889
  <div id="output-nv" class="cell-output">
3890
- <div class="cell-stdout"><pre class="stdout-text">Mon Oct 27 14:46:00 2025
3891
  +-----------------------------------------------------------------------------------------+
3892
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3893
  |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3896,7 @@ Cell: nv | 0.26s
3896
  | | | MIG M. |
3897
  |=========================================+========================+======================|
3898
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3899
- | N/A 32C P0 153W / 350W | 0MiB / 46068MiB | 75% Default |
3900
  | | | N/A |
3901
  +-----------------------------------------+------------------------+----------------------+
3902
 
@@ -3920,7 +3920,7 @@ Cell: nv | 0.26s
3920
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3921
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3922
  </span> |
3923
- Cell: benchmark | 4.32s
3924
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3925
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3926
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3976,17 +3976,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D768
3976
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3977
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3978
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3979
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 80.128us 1940.62% 80.128us 80.128us 1
3980
- hf_kernels_swiglu 11.19% 199.383us 99.56% 1.774ms 1.774ms 0.000us 0.00% 5.634us 5.634us 1
3981
- _activation_beeaae6::silu_and_mul 1.10% 19.601us 85.64% 1.526ms 508.618us 4.129us 100.00% 5.634us 1.878us 3
3982
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.129us 100.00% 4.129us 1.376us 3
3983
- Activity Buffer Request 82.30% 1.466ms 82.30% 1.466ms 1.466ms 1.505us 36.45% 1.505us 1.505us 1
3984
- aten::empty 2.73% 48.641us 2.73% 48.641us 16.214us 0.000us 0.00% 0.000us 0.000us 3
3985
- cudaLaunchKernel 2.24% 39.931us 2.24% 39.931us 13.310us 0.000us 0.00% 0.000us 0.000us 3
3986
- cudaDeviceSynchronize 0.44% 7.891us 0.44% 7.891us 7.891us 0.000us 0.00% 0.000us 0.000us 1
3987
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3988
- Self CPU time total: 1.782ms
3989
- Self CUDA time total: 4.129us
3990
 
3991
 
3992
 
@@ -3996,17 +3996,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D1024
3996
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3997
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3998
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3999
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 77.823us 1961.76% 77.823us 77.823us 1
4000
- hf_kernels_swiglu 7.28% 119.722us 99.70% 1.640ms 1.640ms 0.000us 0.00% 5.311us 5.311us 1
4001
- _activation_beeaae6::silu_and_mul 1.57% 25.841us 91.18% 1.500ms 499.858us 3.967us 100.00% 5.311us 1.770us 3
4002
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.967us 100.00% 3.967us 1.322us 3
4003
- Activity Buffer Request 87.74% 1.443ms 87.74% 1.443ms 1.443ms 1.344us 33.88% 1.344us 1.344us 1
4004
- aten::empty 1.24% 20.410us 1.24% 20.410us 6.803us 0.000us 0.00% 0.000us 0.000us 3
4005
- cudaLaunchKernel 1.86% 30.650us 1.86% 30.650us 10.217us 0.000us 0.00% 0.000us 0.000us 3
4006
- cudaDeviceSynchronize 0.30% 4.930us 0.30% 4.930us 4.930us 0.000us 0.00% 0.000us 0.000us 1
4007
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4008
- Self CPU time total: 1.645ms
4009
- Self CUDA time total: 3.967us
4010
 
4011
 
4012
 
@@ -4016,17 +4016,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D2048
4016
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4017
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4018
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4019
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 67.487us 1369.46% 67.487us 67.487us 1
4020
- hf_kernels_swiglu 6.70% 107.400us 99.69% 1.598ms 1.598ms 0.000us 0.00% 6.592us 6.592us 1
4021
- _activation_beeaae6::silu_and_mul 1.32% 21.191us 91.79% 1.471ms 490.438us 4.928us 100.00% 6.592us 2.197us 3
4022
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.928us 100.00% 4.928us 1.643us 3
4023
- Activity Buffer Request 88.89% 1.425ms 88.89% 1.425ms 1.425ms 1.664us 33.77% 1.664us 1.664us 1
4024
- aten::empty 1.20% 19.281us 1.20% 19.281us 6.427us 0.000us 0.00% 0.000us 0.000us 3
4025
- cudaLaunchKernel 1.57% 25.210us 1.57% 25.210us 8.403us 0.000us 0.00% 0.000us 0.000us 3
4026
- cudaDeviceSynchronize 0.31% 4.970us 0.31% 4.970us 4.970us 0.000us 0.00% 0.000us 0.000us 1
4027
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4028
- Self CPU time total: 1.603ms
4029
- Self CUDA time total: 4.928us
4030
 
4031
 
4032
 
@@ -4036,17 +4036,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D768
4036
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4037
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4038
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4039
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 75.265us 1768.03% 75.265us 75.265us 1
4040
- hf_kernels_swiglu 6.51% 118.032us 99.70% 1.807ms 1.807ms 0.000us 0.00% 5.697us 5.697us 1
4041
- _activation_beeaae6::silu_and_mul 1.22% 22.071us 92.05% 1.668ms 556.119us 4.257us 100.00% 5.697us 1.899us 3
4042
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.257us 100.00% 4.257us 1.419us 3
4043
- Activity Buffer Request 79.39% 1.439ms 79.39% 1.439ms 1.439ms 1.440us 33.83% 1.440us 1.440us 1
4044
- aten::empty 1.14% 20.640us 1.14% 20.640us 6.880us 0.000us 0.00% 0.000us 0.000us 3
4045
- cudaLaunchKernel 11.45% 207.513us 11.45% 207.513us 69.171us 0.000us 0.00% 0.000us 0.000us 3
4046
- cudaDeviceSynchronize 0.30% 5.350us 0.30% 5.350us 5.350us 0.000us 0.00% 0.000us 0.000us 1
4047
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4048
- Self CPU time total: 1.812ms
4049
- Self CUDA time total: 4.257us
4050
 
4051
 
4052
 
@@ -4056,16 +4056,16 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D1024
4056
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4057
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4058
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4059
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 65.471us 1111.94% 65.471us 65.471us 1
4060
- hf_kernels_swiglu 19.52% 89.390us 98.84% 452.537us 452.537us 0.000us 0.00% 7.872us 7.872us 1
4061
- _activation_beeaae6::silu_and_mul 5.02% 23.003us 75.04% 343.547us 114.516us 5.888us 100.00% 7.872us 2.624us 3
4062
  void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.888us 100.00% 5.888us 1.963us 3
4063
- Activity Buffer Request 33.89% 155.152us 33.89% 155.152us 155.152us 1.984us 33.70% 1.984us 1.984us 1
4064
- aten::empty 4.28% 19.600us 4.28% 19.600us 6.533us 0.000us 0.00% 0.000us 0.000us 3
4065
- cudaLaunchKernel 36.13% 165.392us 36.13% 165.392us 55.131us 0.000us 0.00% 0.000us 0.000us 3
4066
- cudaDeviceSynchronize 1.16% 5.290us 1.16% 5.290us 5.290us 0.000us 0.00% 0.000us 0.000us 1
4067
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4068
- Self CPU time total: 457.827us
4069
  Self CUDA time total: 5.888us
4070
 
4071
 
@@ -4076,17 +4076,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D2048
4076
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4077
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4078
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4079
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 68.383us 879.52% 68.383us 68.383us 1
4080
- hf_kernels_swiglu 6.83% 118.711us 99.72% 1.734ms 1.734ms 0.000us 0.00% 10.367us 10.367us 1
4081
- _activation_beeaae6::silu_and_mul 1.25% 21.741us 91.78% 1.596ms 531.855us 7.775us 100.00% 10.367us 3.456us 3
4082
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.775us 100.00% 7.775us 2.592us 3
4083
- Activity Buffer Request 81.74% 1.421ms 81.74% 1.421ms 1.421ms 2.592us 33.34% 2.592us 2.592us 1
4084
- aten::empty 1.11% 19.311us 1.11% 19.311us 6.437us 0.000us 0.00% 0.000us 0.000us 3
4085
- cudaLaunchKernel 8.79% 152.752us 8.79% 152.752us 50.917us 0.000us 0.00% 0.000us 0.000us 3
4086
- cudaDeviceSynchronize 0.28% 4.930us 0.28% 4.930us 4.930us 0.000us 0.00% 0.000us 0.000us 1
4087
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4088
- Self CPU time total: 1.739ms
4089
- Self CUDA time total: 7.775us
4090
 
4091
 
4092
 
@@ -4096,16 +4096,16 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D768
4096
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4097
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4098
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4099
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 70.527us 1069.89% 70.527us 70.527us 1
4100
- hf_kernels_swiglu 6.20% 108.691us 99.73% 1.749ms 1.749ms 0.000us 0.00% 8.800us 8.800us 1
4101
- _activation_beeaae6::silu_and_mul 1.29% 22.622us 92.35% 1.619ms 539.785us 6.592us 100.00% 8.800us 2.933us 3
4102
  void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.592us 100.00% 6.592us 2.197us 3
4103
- Activity Buffer Request 82.48% 1.446ms 82.48% 1.446ms 1.446ms 2.208us 33.50% 2.208us 2.208us 1
4104
- aten::empty 1.18% 20.650us 1.18% 20.650us 6.883us 0.000us 0.00% 0.000us 0.000us 3
4105
- cudaLaunchKernel 8.58% 150.492us 8.58% 150.492us 50.164us 0.000us 0.00% 0.000us 0.000us 3
4106
- cudaDeviceSynchronize 0.27% 4.780us 0.27% 4.780us 4.780us 0.000us 0.00% 0.000us 0.000us 1
4107
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4108
- Self CPU time total: 1.753ms
4109
  Self CUDA time total: 6.592us
4110
 
4111
 
@@ -4116,17 +4116,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D1024
4116
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4117
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4118
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4119
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 66.591us 703.03% 66.591us 66.591us 1
4120
- hf_kernels_swiglu 22.91% 88.512us 98.75% 381.506us 381.506us 0.000us 0.00% 12.640us 12.640us 1
4121
- _activation_beeaae6::silu_and_mul 5.22% 20.151us 70.42% 272.064us 90.688us 9.472us 100.00% 12.640us 4.213us 3
4122
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.472us 100.00% 9.472us 3.157us 3
4123
- Activity Buffer Request 26.21% 101.241us 26.21% 101.241us 101.241us 3.168us 33.45% 3.168us 3.168us 1
4124
- aten::empty 5.42% 20.930us 5.42% 20.930us 6.977us 0.000us 0.00% 0.000us 0.000us 3
4125
- cudaLaunchKernel 39.00% 150.672us 39.00% 150.672us 50.224us 0.000us 0.00% 0.000us 0.000us 3
4126
- cudaDeviceSynchronize 1.25% 4.820us 1.25% 4.820us 4.820us 0.000us 0.00% 0.000us 0.000us 1
4127
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4128
- Self CPU time total: 386.326us
4129
- Self CUDA time total: 9.472us
4130
 
4131
 
4132
 
@@ -4136,23 +4136,23 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D2048
4136
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4137
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4138
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4139
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 67.295us 514.21% 67.295us 67.295us 1
4140
- hf_kernels_swiglu 24.05% 101.492us 98.90% 417.266us 417.266us 0.000us 0.00% 17.503us 17.503us 1
4141
- _activation_beeaae6::silu_and_mul 5.33% 22.480us 70.08% 295.684us 98.561us 13.087us 100.00% 17.503us 5.834us 3
4142
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 13.087us 100.00% 13.087us 4.362us 3
4143
- Activity Buffer Request 28.92% 122.012us 28.92% 122.012us 122.012us 4.416us 33.74% 4.416us 4.416us 1
4144
- aten::empty 4.76% 20.090us 4.76% 20.090us 6.697us 0.000us 0.00% 0.000us 0.000us 3
4145
- cudaLaunchKernel 35.83% 151.192us 35.83% 151.192us 50.397us 0.000us 0.00% 0.000us 0.000us 3
4146
- cudaDeviceSynchronize 1.10% 4.660us 1.10% 4.660us 4.660us 0.000us 0.00% 0.000us 0.000us 1
4147
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4148
- Self CPU time total: 421.926us
4149
- Self CUDA time total: 13.087us
4150
 
4151
 
4152
  impl wl p50(ms) ok
4153
  hf_kernels_swiglu cuda_T128_D1024 0.03 True
4154
  hf_kernels_swiglu cuda_T128_D2048 0.03 True
4155
- hf_kernels_swiglu cuda_T128_D768 0.03 True
4156
  hf_kernels_swiglu cuda_T256_D1024 0.03 True
4157
  hf_kernels_swiglu cuda_T256_D2048 0.03 True
4158
  hf_kernels_swiglu cuda_T256_D768 0.03 True
@@ -4163,12 +4163,13 @@ hf_kernels_swiglu cuda_T512_D768 0.03 True
4163
  <div class="uv-install-logs" id="uv-logs-benchmark">
4164
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4165
  <div class="uv-logs-content" style="display: none;">
4166
- Installed 15 packages in 15ms
4167
  </div>
4168
  </div>
4169
  <div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00&lt;?, ?it/s]
4170
- Fetching 7 files: 71%|███████▏ | 5/7 [00:00&lt;00:00, 13.68it/s]
4171
- Fetching 7 files: 100%|██████████| 7/7 [00:00&lt;00:00, 19.14it/s]</div>
 
4172
  <div class="cell-artifacts">
4173
  <h4>Artifacts:</h4>
4174
  <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
 
3871
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: nv | 0.21s
3875
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3877
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3887
  </div>
3888
  </div>
3889
  <div id="output-nv" class="cell-output">
3890
+ <div class="cell-stdout"><pre class="stdout-text">Tue Oct 28 14:07:54 2025
3891
  +-----------------------------------------------------------------------------------------+
3892
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3893
  |-----------------------------------------+------------------------+----------------------+
 
3896
  | | | MIG M. |
3897
  |=========================================+========================+======================|
3898
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3899
+ | N/A 27C P0 80W / 350W | 0MiB / 46068MiB | 1% Default |
3900
  | | | N/A |
3901
  +-----------------------------------------+------------------------+----------------------+
3902
 
 
3920
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3921
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3922
  </span> |
3923
+ Cell: benchmark | 4.26s
3924
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3925
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3926
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3976
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3977
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3978
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3979
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 70.944us 1745.67% 70.944us 70.944us 1
3980
+ hf_kernels_swiglu 10.31% 179.916us 99.57% 1.738ms 1.738ms 0.000us 0.00% 5.472us 5.472us 1
3981
+ _activation_beeaae6::silu_and_mul 1.09% 18.951us 86.60% 1.512ms 503.911us 4.064us 100.00% 5.472us 1.824us 3
3982
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.064us 100.00% 4.064us 1.355us 3
3983
+ Activity Buffer Request 83.12% 1.451ms 83.12% 1.451ms 1.451ms 1.408us 34.65% 1.408us 1.408us 1
3984
+ aten::empty 2.66% 46.432us 2.66% 46.432us 15.477us 0.000us 0.00% 0.000us 0.000us 3
3985
+ cudaLaunchKernel 2.39% 41.801us 2.39% 41.801us 13.934us 0.000us 0.00% 0.000us 0.000us 3
3986
+ cudaDeviceSynchronize 0.43% 7.500us 0.43% 7.500us 7.500us 0.000us 0.00% 0.000us 0.000us 1
3987
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3988
+ Self CPU time total: 1.746ms
3989
+ Self CUDA time total: 4.064us
3990
 
3991
 
3992
 
 
3996
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3997
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3998
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3999
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 68.703us 1759.36% 68.703us 68.703us 1
4000
+ hf_kernels_swiglu 6.60% 109.215us 99.70% 1.650ms 1.650ms 0.000us 0.00% 5.217us 5.217us 1
4001
+ _activation_beeaae6::silu_and_mul 1.44% 23.760us 91.91% 1.521ms 506.927us 3.905us 100.00% 5.217us 1.739us 3
4002
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.905us 100.00% 3.905us 1.302us 3
4003
+ Activity Buffer Request 88.83% 1.470ms 88.83% 1.470ms 1.470ms 1.312us 33.60% 1.312us 1.312us 1
4004
+ aten::empty 1.19% 19.640us 1.19% 19.640us 6.547us 0.000us 0.00% 0.000us 0.000us 3
4005
+ cudaLaunchKernel 1.65% 27.251us 1.65% 27.251us 9.084us 0.000us 0.00% 0.000us 0.000us 3
4006
+ cudaDeviceSynchronize 0.30% 4.941us 0.30% 4.941us 4.941us 0.000us 0.00% 0.000us 0.000us 1
4007
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4008
+ Self CPU time total: 1.655ms
4009
+ Self CUDA time total: 3.905us
4010
 
4011
 
4012
 
 
4016
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4017
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4018
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4019
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 67.999us 1388.58% 67.999us 67.999us 1
4020
+ hf_kernels_swiglu 6.71% 113.524us 99.73% 1.687ms 1.687ms 0.000us 0.00% 6.529us 6.529us 1
4021
+ _activation_beeaae6::silu_and_mul 1.26% 21.380us 91.91% 1.555ms 518.231us 4.897us 100.00% 6.529us 2.176us 3
4022
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.897us 100.00% 4.897us 1.632us 3
4023
+ Activity Buffer Request 89.08% 1.507ms 89.08% 1.507ms 1.507ms 1.632us 33.33% 1.632us 1.632us 1
4024
+ aten::empty 1.11% 18.802us 1.11% 18.802us 6.267us 0.000us 0.00% 0.000us 0.000us 3
4025
+ cudaLaunchKernel 1.56% 26.371us 1.56% 26.371us 8.790us 0.000us 0.00% 0.000us 0.000us 3
4026
+ cudaDeviceSynchronize 0.27% 4.571us 0.27% 4.571us 4.571us 0.000us 0.00% 0.000us 0.000us 1
4027
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4028
+ Self CPU time total: 1.692ms
4029
+ Self CUDA time total: 4.897us
4030
 
4031
 
4032
 
 
4036
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4037
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4038
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4039
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.776us 1498.50% 63.776us 63.776us 1
4040
+ hf_kernels_swiglu 5.54% 99.283us 99.75% 1.788ms 1.788ms 0.000us 0.00% 5.696us 5.696us 1
4041
+ _activation_beeaae6::silu_and_mul 1.20% 21.550us 93.21% 1.671ms 556.862us 4.256us 100.00% 5.696us 1.899us 3
4042
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.256us 100.00% 4.256us 1.419us 3
4043
+ Activity Buffer Request 79.15% 1.419ms 79.15% 1.419ms 1.419ms 1.440us 33.83% 1.440us 1.440us 1
4044
+ aten::empty 1.00% 17.972us 1.00% 17.972us 5.991us 0.000us 0.00% 0.000us 0.000us 3
4045
+ cudaLaunchKernel 12.85% 230.398us 12.85% 230.398us 76.799us 0.000us 0.00% 0.000us 0.000us 3
4046
+ cudaDeviceSynchronize 0.25% 4.510us 0.25% 4.510us 4.510us 0.000us 0.00% 0.000us 0.000us 1
4047
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4048
+ Self CPU time total: 1.792ms
4049
+ Self CUDA time total: 4.256us
4050
 
4051
 
4052
 
 
4056
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4057
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4058
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4059
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 62.431us 1060.31% 62.431us 62.431us 1
4060
+ hf_kernels_swiglu 20.17% 83.914us 98.89% 411.305us 411.305us 0.000us 0.00% 7.872us 7.872us 1
4061
+ _activation_beeaae6::silu_and_mul 5.09% 21.171us 74.40% 309.470us 103.157us 5.888us 100.00% 7.872us 2.624us 3
4062
  void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.888us 100.00% 5.888us 1.963us 3
4063
+ Activity Buffer Request 32.60% 135.614us 32.60% 135.614us 135.614us 1.984us 33.70% 1.984us 1.984us 1
4064
+ aten::empty 4.31% 17.921us 4.31% 17.921us 5.974us 0.000us 0.00% 0.000us 0.000us 3
4065
+ cudaLaunchKernel 36.71% 152.685us 36.71% 152.685us 50.895us 0.000us 0.00% 0.000us 0.000us 3
4066
+ cudaDeviceSynchronize 1.11% 4.631us 1.11% 4.631us 4.631us 0.000us 0.00% 0.000us 0.000us 1
4067
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4068
+ Self CPU time total: 415.936us
4069
  Self CUDA time total: 5.888us
4070
 
4071
 
 
4076
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4077
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4078
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4079
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 67.615us 880.40% 67.615us 67.615us 1
4080
+ hf_kernels_swiglu 5.97% 103.444us 99.74% 1.727ms 1.727ms 0.000us 0.00% 10.240us 10.240us 1
4081
+ _activation_beeaae6::silu_and_mul 1.23% 21.310us 92.70% 1.605ms 535.135us 7.680us 100.00% 10.240us 3.413us 3
4082
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.680us 100.00% 7.680us 2.560us 3
4083
+ Activity Buffer Request 82.79% 1.434ms 82.79% 1.434ms 1.434ms 2.560us 33.33% 2.560us 2.560us 1
4084
+ aten::empty 1.07% 18.611us 1.07% 18.611us 6.204us 0.000us 0.00% 0.000us 0.000us 3
4085
+ cudaLaunchKernel 8.68% 150.305us 8.68% 150.305us 50.102us 0.000us 0.00% 0.000us 0.000us 3
4086
+ cudaDeviceSynchronize 0.26% 4.450us 0.26% 4.450us 4.450us 0.000us 0.00% 0.000us 0.000us 1
4087
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4088
+ Self CPU time total: 1.732ms
4089
+ Self CUDA time total: 7.680us
4090
 
4091
 
4092
 
 
4096
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4097
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4098
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4099
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.423us 962.12% 63.423us 63.423us 1
4100
+ hf_kernels_swiglu 5.71% 97.705us 99.74% 1.706ms 1.706ms 0.000us 0.00% 8.800us 8.800us 1
4101
+ _activation_beeaae6::silu_and_mul 1.25% 21.440us 92.96% 1.590ms 530.071us 6.592us 100.00% 8.800us 2.933us 3
4102
  void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.592us 100.00% 6.592us 2.197us 3
4103
+ Activity Buffer Request 82.94% 1.419ms 82.94% 1.419ms 1.419ms 2.208us 33.50% 2.208us 2.208us 1
4104
+ aten::empty 1.07% 18.230us 1.07% 18.230us 6.077us 0.000us 0.00% 0.000us 0.000us 3
4105
+ cudaLaunchKernel 8.77% 149.945us 8.77% 149.945us 49.982us 0.000us 0.00% 0.000us 0.000us 3
4106
+ cudaDeviceSynchronize 0.26% 4.450us 0.26% 4.450us 4.450us 0.000us 0.00% 0.000us 0.000us 1
4107
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4108
+ Self CPU time total: 1.711ms
4109
  Self CUDA time total: 6.592us
4110
 
4111
 
 
4116
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4117
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4118
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4119
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 61.982us 658.89% 61.982us 61.982us 1
4120
+ hf_kernels_swiglu 22.04% 82.603us 98.77% 370.213us 370.213us 0.000us 0.00% 12.543us 12.543us 1
4121
+ _activation_beeaae6::silu_and_mul 5.90% 22.112us 71.72% 268.830us 89.610us 9.407us 100.00% 12.543us 4.181us 3
4122
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.407us 100.00% 9.407us 3.136us 3
4123
+ Activity Buffer Request 26.16% 98.063us 26.16% 98.063us 98.063us 3.136us 33.34% 3.136us 3.136us 1
4124
+ aten::empty 5.01% 18.780us 5.01% 18.780us 6.260us 0.000us 0.00% 0.000us 0.000us 3
4125
+ cudaLaunchKernel 39.66% 148.655us 39.66% 148.655us 49.552us 0.000us 0.00% 0.000us 0.000us 3
4126
+ cudaDeviceSynchronize 1.23% 4.600us 1.23% 4.600us 4.600us 0.000us 0.00% 0.000us 0.000us 1
4127
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4128
+ Self CPU time total: 374.813us
4129
+ Self CUDA time total: 9.407us
4130
 
4131
 
4132
 
 
4136
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4137
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4138
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4139
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 63.776us 490.85% 63.776us 63.776us 1
4140
+ hf_kernels_swiglu 24.11% 99.284us 98.97% 407.515us 407.515us 0.000us 0.00% 17.346us 17.346us 1
4141
+ _activation_beeaae6::silu_and_mul 5.19% 21.351us 70.31% 289.510us 96.503us 12.993us 100.00% 17.346us 5.782us 3
4142
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 12.993us 100.00% 12.993us 4.331us 3
4143
+ Activity Buffer Request 28.96% 119.264us 28.96% 119.264us 119.264us 4.353us 33.50% 4.353us 4.353us 1
4144
+ aten::empty 4.55% 18.721us 4.55% 18.721us 6.240us 0.000us 0.00% 0.000us 0.000us 3
4145
+ cudaLaunchKernel 36.16% 148.895us 36.16% 148.895us 49.632us 0.000us 0.00% 0.000us 0.000us 3
4146
+ cudaDeviceSynchronize 1.03% 4.240us 1.03% 4.240us 4.240us 0.000us 0.00% 0.000us 0.000us 1
4147
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4148
+ Self CPU time total: 411.755us
4149
+ Self CUDA time total: 12.993us
4150
 
4151
 
4152
  impl wl p50(ms) ok
4153
  hf_kernels_swiglu cuda_T128_D1024 0.03 True
4154
  hf_kernels_swiglu cuda_T128_D2048 0.03 True
4155
+ hf_kernels_swiglu cuda_T128_D768 0.02 True
4156
  hf_kernels_swiglu cuda_T256_D1024 0.03 True
4157
  hf_kernels_swiglu cuda_T256_D2048 0.03 True
4158
  hf_kernels_swiglu cuda_T256_D768 0.03 True
 
4163
  <div class="uv-install-logs" id="uv-logs-benchmark">
4164
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4165
  <div class="uv-logs-content" style="display: none;">
4166
+ Installed 15 packages in 14ms
4167
  </div>
4168
  </div>
4169
  <div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00&lt;?, ?it/s]
4170
+ Fetching 7 files: 14%|█▍ | 1/7 [00:00&lt;00:00, 7.79it/s]
4171
+ Fetching 7 files: 71%|███████▏ | 5/7 [00:00&lt;00:00, 11.48it/s]
4172
+ Fetching 7 files: 100%|██████████| 7/7 [00:00&lt;00:00, 15.62it/s]</div>
4173
  <div class="cell-artifacts">
4174
  <h4>Artifacts:</h4>
4175
  <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
activation/impls/torch_swiglu.html CHANGED
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3871
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: nv | 0.26s
3875
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3877
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3887,7 +3887,7 @@ Cell: nv | 0.26s
3887
  </div>
3888
  </div>
3889
  <div id="output-nv" class="cell-output">
3890
- <div class="cell-stdout"><pre class="stdout-text">Mon Oct 27 14:46:00 2025
3891
  +-----------------------------------------------------------------------------------------+
3892
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3893
  |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3896,7 @@ Cell: nv | 0.26s
3896
  | | | MIG M. |
3897
  |=========================================+========================+======================|
3898
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3899
- | N/A 32C P0 153W / 350W | 0MiB / 46068MiB | 75% Default |
3900
  | | | N/A |
3901
  +-----------------------------------------+------------------------+----------------------+
3902
 
@@ -3920,7 +3920,7 @@ Cell: nv | 0.26s
3920
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3921
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3922
  </span> |
3923
- Cell: benchmark | 6.99s
3924
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3925
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3926
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3970,20 +3970,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D768
3970
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3971
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3972
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3973
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 183.359us 1436.08% 183.359us 183.359us 1
3974
- torch_eager 11.24% 212.694us 99.53% 1.883ms 1.883ms 0.000us 0.00% 15.072us 15.072us 1
3975
- aten::silu 3.31% 62.660us 82.30% 1.557ms 519.134us 6.527us 51.12% 8.831us 2.944us 3
3976
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.527us 51.12% 6.527us 2.176us 3
3977
- aten::mul 1.85% 35.100us 2.98% 56.340us 18.780us 6.241us 48.88% 6.241us 2.080us 3
3978
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.241us 48.88% 6.241us 2.080us 3
3979
- Activity Buffer Request 76.74% 1.452ms 76.74% 1.452ms 1.452ms 2.304us 18.05% 2.304us 2.304us 1
3980
- aten::slice 2.41% 45.561us 3.01% 56.902us 9.484us 0.000us 0.00% 0.000us 0.000us 6
3981
- aten::as_strided 0.60% 11.341us 0.60% 11.341us 1.890us 0.000us 0.00% 0.000us 0.000us 6
3982
- cudaLaunchKernel 3.37% 63.741us 3.37% 63.741us 10.623us 0.000us 0.00% 0.000us 0.000us 6
3983
- cudaDeviceSynchronize 0.47% 8.969us 0.47% 8.969us 8.969us 0.000us 0.00% 0.000us 0.000us 1
3984
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3985
- Self CPU time total: 1.892ms
3986
- Self CUDA time total: 12.768us
3987
 
3988
 
3989
 
@@ -3993,20 +3993,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D1024
3993
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3994
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3995
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3996
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 158.431us 1279.63% 158.431us 158.431us 1
3997
- torch_eager 6.85% 117.301us 99.69% 1.707ms 1.707ms 0.000us 0.00% 14.557us 14.557us 1
3998
- aten::silu 2.45% 41.990us 88.25% 1.511ms 503.680us 6.398us 51.68% 8.574us 2.858us 3
3999
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.398us 51.68% 6.398us 2.133us 3
4000
- aten::mul 1.63% 27.830us 2.78% 47.630us 15.877us 5.983us 48.32% 5.983us 1.994us 3
4001
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.983us 48.32% 5.983us 1.994us 3
4002
- Activity Buffer Request 84.28% 1.443ms 84.28% 1.443ms 1.443ms 2.176us 17.58% 2.176us 2.176us 1
4003
- aten::slice 1.45% 24.820us 1.81% 30.931us 5.155us 0.000us 0.00% 0.000us 0.000us 6
4004
- aten::as_strided 0.36% 6.111us 0.36% 6.111us 1.019us 0.000us 0.00% 0.000us 0.000us 6
4005
- cudaLaunchKernel 2.67% 45.711us 2.67% 45.711us 7.618us 0.000us 0.00% 0.000us 0.000us 6
4006
- cudaDeviceSynchronize 0.31% 5.320us 0.31% 5.320us 5.320us 0.000us 0.00% 0.000us 0.000us 1
4007
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4008
- Self CPU time total: 1.712ms
4009
- Self CUDA time total: 12.381us
4010
 
4011
 
4012
 
@@ -4016,20 +4016,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D2048
4016
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4017
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4018
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4019
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 145.182us 1095.88% 145.182us 145.182us 1
4020
- torch_eager 6.28% 105.841us 99.65% 1.680ms 1.680ms 0.000us 0.00% 15.552us 15.552us 1
4021
- aten::silu 2.40% 40.400us 89.03% 1.501ms 500.258us 6.816us 51.45% 9.120us 3.040us 3
4022
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.816us 51.45% 6.816us 2.272us 3
4023
- aten::mul 1.52% 25.690us 2.64% 44.480us 14.827us 6.432us 48.55% 6.432us 2.144us 3
4024
  void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.432us 48.55% 6.432us 2.144us 3
4025
- Activity Buffer Request 85.10% 1.434ms 85.10% 1.434ms 1.434ms 2.304us 17.39% 2.304us 2.304us 1
4026
- aten::slice 1.37% 23.030us 1.70% 28.690us 4.782us 0.000us 0.00% 0.000us 0.000us 6
4027
- aten::as_strided 0.34% 5.660us 0.34% 5.660us 0.943us 0.000us 0.00% 0.000us 0.000us 6
4028
- cudaLaunchKernel 2.66% 44.762us 2.66% 44.762us 7.460us 0.000us 0.00% 0.000us 0.000us 6
4029
- cudaDeviceSynchronize 0.35% 5.820us 0.35% 5.820us 5.820us 0.000us 0.00% 0.000us 0.000us 1
4030
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4031
- Self CPU time total: 1.686ms
4032
- Self CUDA time total: 13.248us
4033
 
4034
 
4035
 
@@ -4039,20 +4039,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D768
4039
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4040
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4041
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4042
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 145.025us 1135.85% 145.025us 145.025us 1
4043
- torch_eager 7.55% 116.292us 99.65% 1.535ms 1.535ms 0.000us 0.00% 14.976us 14.976us 1
4044
- aten::silu 2.67% 41.061us 87.34% 1.345ms 448.460us 6.592us 51.63% 8.800us 2.933us 3
4045
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.592us 51.63% 6.592us 2.197us 3
4046
- aten::mul 1.71% 26.359us 2.88% 44.330us 14.777us 6.176us 48.37% 6.176us 2.059us 3
4047
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 48.37% 6.176us 2.059us 3
4048
- Activity Buffer Request 69.61% 1.072ms 69.61% 1.072ms 1.072ms 2.208us 17.29% 2.208us 2.208us 1
4049
- aten::slice 1.52% 23.350us 1.89% 29.050us 4.842us 0.000us 0.00% 0.000us 0.000us 6
4050
- aten::as_strided 0.37% 5.700us 0.37% 5.700us 0.950us 0.000us 0.00% 0.000us 0.000us 6
4051
- cudaLaunchKernel 16.23% 250.045us 16.23% 250.045us 41.674us 0.000us 0.00% 0.000us 0.000us 6
4052
- cudaDeviceSynchronize 0.35% 5.360us 0.35% 5.360us 5.360us 0.000us 0.00% 0.000us 0.000us 1
4053
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4054
- Self CPU time total: 1.540ms
4055
- Self CUDA time total: 12.768us
4056
 
4057
 
4058
 
@@ -4062,20 +4062,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D1024
4062
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4063
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4064
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4065
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 144.030us 1089.82% 144.030us 144.030us 1
4066
- torch_eager 5.82% 104.551us 99.68% 1.792ms 1.792ms 0.000us 0.00% 15.488us 15.488us 1
4067
- aten::silu 2.32% 41.682us 89.81% 1.614ms 538.151us 6.752us 51.09% 9.024us 3.008us 3
4068
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.752us 51.09% 6.752us 2.251us 3
4069
- aten::mul 1.41% 25.409us 2.48% 44.550us 14.850us 6.464us 48.91% 6.464us 2.155us 3
4070
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.464us 48.91% 6.464us 2.155us 3
4071
- Activity Buffer Request 78.50% 1.411ms 78.50% 1.411ms 1.411ms 2.272us 17.19% 2.272us 2.272us 1
4072
- aten::slice 1.27% 22.830us 1.58% 28.320us 4.720us 0.000us 0.00% 0.000us 0.000us 6
4073
- aten::as_strided 0.31% 5.490us 0.31% 5.490us 0.915us 0.000us 0.00% 0.000us 0.000us 6
4074
- cudaLaunchKernel 10.06% 180.853us 10.06% 180.853us 30.142us 0.000us 0.00% 0.000us 0.000us 6
4075
- cudaDeviceSynchronize 0.32% 5.710us 0.32% 5.710us 5.710us 0.000us 0.00% 0.000us 0.000us 1
4076
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4077
- Self CPU time total: 1.798ms
4078
- Self CUDA time total: 13.216us
4079
 
4080
 
4081
 
@@ -4085,20 +4085,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D2048
4085
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4086
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4087
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4088
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 140.382us 902.66% 140.382us 140.382us 1
4089
- torch_eager 21.39% 103.633us 98.99% 479.697us 479.697us 0.000us 0.00% 18.240us 18.240us 1
4090
- aten::silu 8.56% 41.460us 63.18% 306.154us 102.051us 7.936us 51.03% 10.624us 3.541us 3
4091
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 51.03% 7.936us 2.645us 3
4092
- aten::mul 4.90% 23.759us 8.63% 41.840us 13.947us 7.616us 48.97% 7.616us 2.539us 3
4093
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.616us 48.97% 7.616us 2.539us 3
4094
- Activity Buffer Request 23.12% 112.032us 23.12% 112.032us 112.032us 2.688us 17.28% 2.688us 2.688us 1
4095
- aten::slice 4.68% 22.671us 5.79% 28.070us 4.678us 0.000us 0.00% 0.000us 0.000us 6
4096
- aten::as_strided 1.11% 5.399us 1.11% 5.399us 0.900us 0.000us 0.00% 0.000us 0.000us 6
4097
- cudaLaunchKernel 35.23% 170.743us 35.23% 170.743us 28.457us 0.000us 0.00% 0.000us 0.000us 6
4098
- cudaDeviceSynchronize 1.01% 4.900us 1.01% 4.900us 4.900us 0.000us 0.00% 0.000us 0.000us 1
4099
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4100
- Self CPU time total: 484.597us
4101
- Self CUDA time total: 15.552us
4102
 
4103
 
4104
 
@@ -4108,20 +4108,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D768
4108
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4109
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4110
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4111
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 145.662us 1011.54% 145.662us 145.662us 1
4112
- torch_eager 5.99% 108.381us 99.73% 1.804ms 1.804ms 0.000us 0.00% 16.896us 16.896us 1
4113
- aten::silu 2.28% 41.342us 89.69% 1.623ms 540.945us 7.392us 51.33% 9.888us 3.296us 3
4114
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.392us 51.33% 7.392us 2.464us 3
4115
- aten::mul 1.44% 26.049us 2.45% 44.420us 14.807us 7.008us 48.67% 7.008us 2.336us 3
4116
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.008us 48.67% 7.008us 2.336us 3
4117
- Activity Buffer Request 78.99% 1.429ms 78.99% 1.429ms 1.429ms 2.496us 17.33% 2.496us 2.496us 1
4118
- aten::slice 1.28% 23.160us 1.59% 28.810us 4.802us 0.000us 0.00% 0.000us 0.000us 6
4119
- aten::as_strided 0.31% 5.650us 0.31% 5.650us 0.942us 0.000us 0.00% 0.000us 0.000us 6
4120
- cudaLaunchKernel 9.43% 170.603us 9.43% 170.603us 28.434us 0.000us 0.00% 0.000us 0.000us 6
4121
- cudaDeviceSynchronize 0.27% 4.930us 0.27% 4.930us 4.930us 0.000us 0.00% 0.000us 0.000us 1
4122
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4123
- Self CPU time total: 1.809ms
4124
- Self CUDA time total: 14.400us
4125
 
4126
 
4127
 
@@ -4131,20 +4131,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D1024
4131
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4132
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4133
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4134
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 142.206us 914.45% 142.206us 142.206us 1
4135
- torch_eager 21.70% 105.494us 98.87% 480.727us 480.727us 0.000us 0.00% 18.239us 18.239us 1
4136
- aten::silu 8.21% 39.900us 62.39% 303.354us 101.118us 7.966us 51.23% 10.654us 3.551us 3
4137
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.966us 51.23% 7.966us 2.655us 3
4138
- aten::mul 5.16% 25.070us 8.84% 42.990us 14.330us 7.585us 48.77% 7.585us 2.528us 3
4139
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.585us 48.77% 7.585us 2.528us 3
4140
- Activity Buffer Request 23.29% 113.242us 23.29% 113.242us 113.242us 2.688us 17.29% 2.688us 2.688us 1
4141
- aten::slice 4.75% 23.080us 5.94% 28.889us 4.815us 0.000us 0.00% 0.000us 0.000us 6
4142
- aten::as_strided 1.19% 5.809us 1.19% 5.809us 0.968us 0.000us 0.00% 0.000us 0.000us 6
4143
- cudaLaunchKernel 34.58% 168.132us 34.58% 168.132us 28.022us 0.000us 0.00% 0.000us 0.000us 6
4144
- cudaDeviceSynchronize 1.13% 5.500us 1.13% 5.500us 5.500us 0.000us 0.00% 0.000us 0.000us 1
4145
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4146
- Self CPU time total: 486.227us
4147
- Self CUDA time total: 15.551us
4148
 
4149
 
4150
 
@@ -4154,20 +4154,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D2048
4154
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4155
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4156
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4157
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 149.022us 661.50% 149.022us 149.022us 1
4158
- torch_eager 5.72% 105.900us 99.72% 1.847ms 1.847ms 0.000us 0.00% 26.431us 26.431us 1
4159
- aten::silu 2.24% 41.461us 90.05% 1.668ms 555.875us 11.552us 51.28% 15.455us 5.152us 3
4160
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.552us 51.28% 11.552us 3.851us 3
4161
- aten::mul 1.41% 26.021us 2.40% 44.421us 14.807us 10.976us 48.72% 10.976us 3.659us 3
4162
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.976us 48.72% 10.976us 3.659us 3
4163
- Activity Buffer Request 79.50% 1.472ms 79.50% 1.472ms 1.472ms 3.903us 17.33% 3.903us 3.903us 1
4164
- aten::slice 1.25% 23.131us 1.56% 28.831us 4.805us 0.000us 0.00% 0.000us 0.000us 6
4165
- aten::as_strided 0.31% 5.700us 0.31% 5.700us 0.950us 0.000us 0.00% 0.000us 0.000us 6
4166
- cudaLaunchKernel 9.31% 172.382us 9.31% 172.382us 28.730us 0.000us 0.00% 0.000us 0.000us 6
4167
- cudaDeviceSynchronize 0.28% 5.130us 0.28% 5.130us 5.130us 0.000us 0.00% 0.000us 0.000us 1
4168
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4169
- Self CPU time total: 1.852ms
4170
- Self CUDA time total: 22.528us
4171
 
4172
 
4173
  impl wl p50(ms) ok
@@ -4184,7 +4184,7 @@ torch_eager cuda_T512_D768 0.05 True
4184
  <div class="uv-install-logs" id="uv-logs-benchmark">
4185
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4186
  <div class="uv-logs-content" style="display: none;">
4187
- Installed 37 packages in 246ms
4188
  </div>
4189
  </div>
4190
  <div class="cell-artifacts">
 
3871
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: nv | 0.21s
3875
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3877
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3887
  </div>
3888
  </div>
3889
  <div id="output-nv" class="cell-output">
3890
+ <div class="cell-stdout"><pre class="stdout-text">Tue Oct 28 14:07:54 2025
3891
  +-----------------------------------------------------------------------------------------+
3892
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3893
  |-----------------------------------------+------------------------+----------------------+
 
3896
  | | | MIG M. |
3897
  |=========================================+========================+======================|
3898
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3899
+ | N/A 27C P0 80W / 350W | 0MiB / 46068MiB | 1% Default |
3900
  | | | N/A |
3901
  +-----------------------------------------+------------------------+----------------------+
3902
 
 
3920
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3921
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3922
  </span> |
3923
+ Cell: benchmark | 6.88s
3924
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3925
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3926
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3970
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3971
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3972
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3973
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 206.526us 1621.34% 206.526us 206.526us 1
3974
+ torch_eager 11.16% 213.167us 99.55% 1.902ms 1.902ms 0.000us 0.00% 15.042us 15.042us 1
3975
+ aten::silu 3.29% 62.892us 81.79% 1.563ms 520.961us 6.529us 51.26% 8.833us 2.944us 3
3976
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.529us 51.26% 6.529us 2.176us 3
3977
+ aten::mul 2.06% 39.382us 3.23% 61.724us 20.575us 6.209us 48.74% 6.209us 2.070us 3
3978
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.209us 48.74% 6.209us 2.070us 3
3979
+ Activity Buffer Request 76.05% 1.453ms 76.05% 1.453ms 1.453ms 2.304us 18.09% 2.304us 2.304us 1
3980
+ aten::slice 2.72% 51.931us 3.38% 64.581us 10.764us 0.000us 0.00% 0.000us 0.000us 6
3981
+ aten::as_strided 0.66% 12.650us 0.66% 12.650us 2.108us 0.000us 0.00% 0.000us 0.000us 6
3982
+ cudaLaunchKernel 3.62% 69.144us 3.62% 69.144us 11.524us 0.000us 0.00% 0.000us 0.000us 6
3983
+ cudaDeviceSynchronize 0.45% 8.521us 0.45% 8.521us 8.521us 0.000us 0.00% 0.000us 0.000us 1
3984
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3985
+ Self CPU time total: 1.911ms
3986
+ Self CUDA time total: 12.738us
3987
 
3988
 
3989
 
 
3993
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3994
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3995
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3996
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 151.965us 1227.21% 151.965us 151.965us 1
3997
+ torch_eager 7.02% 119.974us 99.63% 1.704ms 1.704ms 0.000us 0.00% 14.558us 14.558us 1
3998
+ aten::silu 2.35% 40.140us 88.12% 1.507ms 502.320us 6.399us 51.68% 8.574us 2.858us 3
3999
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.399us 51.68% 6.399us 2.133us 3
4000
+ aten::mul 1.61% 27.481us 2.72% 46.541us 15.514us 5.984us 48.32% 5.984us 1.995us 3
4001
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.984us 48.32% 5.984us 1.995us 3
4002
+ Activity Buffer Request 84.14% 1.439ms 84.14% 1.439ms 1.439ms 2.175us 17.56% 2.175us 2.175us 1
4003
+ aten::slice 1.43% 24.471us 1.78% 30.412us 5.069us 0.000us 0.00% 0.000us 0.000us 6
4004
+ aten::as_strided 0.35% 5.941us 0.35% 5.941us 0.990us 0.000us 0.00% 0.000us 0.000us 6
4005
+ cudaLaunchKernel 2.74% 46.851us 2.74% 46.851us 7.809us 0.000us 0.00% 0.000us 0.000us 6
4006
+ cudaDeviceSynchronize 0.37% 6.320us 0.37% 6.320us 6.320us 0.000us 0.00% 0.000us 0.000us 1
4007
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4008
+ Self CPU time total: 1.710ms
4009
+ Self CUDA time total: 12.383us
4010
 
4011
 
4012
 
 
4016
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4017
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4018
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4019
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 151.008us 1139.77% 151.008us 151.008us 1
4020
+ torch_eager 6.34% 107.173us 99.70% 1.687ms 1.687ms 0.000us 0.00% 15.522us 15.522us 1
4021
+ aten::silu 2.38% 40.332us 88.83% 1.503ms 500.911us 6.817us 51.45% 9.090us 3.030us 3
4022
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.817us 51.45% 6.817us 2.272us 3
4023
+ aten::mul 1.57% 26.503us 2.73% 46.253us 15.418us 6.432us 48.55% 6.432us 2.144us 3
4024
  void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.432us 48.55% 6.432us 2.144us 3
4025
+ Activity Buffer Request 84.91% 1.436ms 84.91% 1.436ms 1.436ms 2.273us 17.16% 2.273us 2.273us 1
4026
+ aten::slice 1.43% 24.250us 1.81% 30.550us 5.092us 0.000us 0.00% 0.000us 0.000us 6
4027
+ aten::as_strided 0.37% 6.300us 0.37% 6.300us 1.050us 0.000us 0.00% 0.000us 0.000us 6
4028
+ cudaLaunchKernel 2.70% 45.731us 2.70% 45.731us 7.622us 0.000us 0.00% 0.000us 0.000us 6
4029
+ cudaDeviceSynchronize 0.30% 5.000us 0.30% 5.000us 5.000us 0.000us 0.00% 0.000us 0.000us 1
4030
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4031
+ Self CPU time total: 1.692ms
4032
+ Self CUDA time total: 13.249us
4033
 
4034
 
4035
 
 
4039
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4040
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4041
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4042
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 153.149us 1202.68% 153.149us 153.149us 1
4043
+ torch_eager 6.34% 109.104us 99.71% 1.717ms 1.717ms 0.000us 0.00% 14.941us 14.941us 1
4044
+ aten::silu 2.38% 40.982us 88.93% 1.531ms 510.411us 6.558us 51.50% 8.765us 2.922us 3
4045
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.558us 51.50% 6.558us 2.186us 3
4046
+ aten::mul 1.52% 26.241us 2.68% 46.222us 15.407us 6.176us 48.50% 6.176us 2.059us 3
4047
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 48.50% 6.176us 2.059us 3
4048
+ Activity Buffer Request 73.41% 1.264ms 73.41% 1.264ms 1.264ms 2.207us 17.33% 2.207us 2.207us 1
4049
+ aten::slice 1.43% 24.560us 1.77% 30.400us 5.067us 0.000us 0.00% 0.000us 0.000us 6
4050
+ aten::as_strided 0.34% 5.840us 0.34% 5.840us 0.973us 0.000us 0.00% 0.000us 0.000us 6
4051
+ cudaLaunchKernel 14.29% 246.139us 14.29% 246.139us 41.023us 0.000us 0.00% 0.000us 0.000us 6
4052
+ cudaDeviceSynchronize 0.29% 4.920us 0.29% 4.920us 4.920us 0.000us 0.00% 0.000us 0.000us 1
4053
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4054
+ Self CPU time total: 1.722ms
4055
+ Self CUDA time total: 12.734us
4056
 
4057
 
4058
 
 
4062
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4063
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4064
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4065
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 149.310us 1126.87% 149.310us 149.310us 1
4066
+ torch_eager 5.88% 107.113us 99.73% 1.817ms 1.817ms 0.000us 0.00% 15.555us 15.555us 1
4067
+ aten::silu 2.34% 42.602us 89.83% 1.636ms 545.432us 6.785us 51.21% 9.090us 3.030us 3
4068
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.785us 51.21% 6.785us 2.262us 3
4069
+ aten::mul 1.33% 24.312us 2.33% 42.512us 14.171us 6.465us 48.79% 6.465us 2.155us 3
4070
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.465us 48.79% 6.465us 2.155us 3
4071
+ Activity Buffer Request 78.20% 1.424ms 78.20% 1.424ms 1.424ms 2.305us 17.40% 2.305us 2.305us 1
4072
+ aten::slice 1.35% 24.650us 1.68% 30.660us 5.110us 0.000us 0.00% 0.000us 0.000us 6
4073
+ aten::as_strided 0.33% 6.010us 0.33% 6.010us 1.002us 0.000us 0.00% 0.000us 0.000us 6
4074
+ cudaLaunchKernel 10.29% 187.406us 10.29% 187.406us 31.234us 0.000us 0.00% 0.000us 0.000us 6
4075
+ cudaDeviceSynchronize 0.27% 4.950us 0.27% 4.950us 4.950us 0.000us 0.00% 0.000us 0.000us 1
4076
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4077
+ Self CPU time total: 1.822ms
4078
+ Self CUDA time total: 13.250us
4079
 
4080
 
4081
 
 
4085
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4086
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4087
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4088
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 143.804us 924.73% 143.804us 143.804us 1
4089
+ torch_eager 21.50% 103.524us 99.01% 476.736us 476.736us 0.000us 0.00% 18.271us 18.271us 1
4090
+ aten::silu 8.70% 41.893us 62.70% 301.891us 100.630us 7.999us 51.44% 10.719us 3.573us 3
4091
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.999us 51.44% 7.999us 2.666us 3
4092
+ aten::mul 5.07% 24.390us 8.83% 42.521us 14.174us 7.552us 48.56% 7.552us 2.517us 3
4093
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.552us 48.56% 7.552us 2.517us 3
4094
+ Activity Buffer Request 22.22% 106.973us 22.22% 106.973us 106.973us 2.720us 17.49% 2.720us 2.720us 1
4095
+ aten::slice 4.80% 23.090us 5.98% 28.800us 4.800us 0.000us 0.00% 0.000us 0.000us 6
4096
+ aten::as_strided 1.19% 5.710us 1.19% 5.710us 0.952us 0.000us 0.00% 0.000us 0.000us 6
4097
+ cudaLaunchKernel 35.55% 171.156us 35.55% 171.156us 28.526us 0.000us 0.00% 0.000us 0.000us 6
4098
+ cudaDeviceSynchronize 0.99% 4.760us 0.99% 4.760us 4.760us 0.000us 0.00% 0.000us 0.000us 1
4099
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4100
+ Self CPU time total: 481.496us
4101
+ Self CUDA time total: 15.551us
4102
 
4103
 
4104
 
 
4108
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4109
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4110
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4111
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 153.372us 1067.46% 153.372us 153.372us 1
4112
+ torch_eager 5.96% 108.164us 99.73% 1.810ms 1.810ms 0.000us 0.00% 16.832us 16.832us 1
4113
+ aten::silu 2.30% 41.731us 89.59% 1.626ms 541.925us 7.360us 51.22% 9.824us 3.275us 3
4114
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.360us 51.22% 7.360us 2.453us 3
4115
+ aten::mul 1.41% 25.542us 2.47% 44.792us 14.931us 7.008us 48.78% 7.008us 2.336us 3
4116
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.008us 48.78% 7.008us 2.336us 3
4117
+ Activity Buffer Request 78.82% 1.430ms 78.82% 1.430ms 1.430ms 2.464us 17.15% 2.464us 2.464us 1
4118
+ aten::slice 1.37% 24.840us 1.70% 30.900us 5.150us 0.000us 0.00% 0.000us 0.000us 6
4119
+ aten::as_strided 0.33% 6.060us 0.33% 6.060us 1.010us 0.000us 0.00% 0.000us 0.000us 6
4120
+ cudaLaunchKernel 9.53% 172.976us 9.53% 172.976us 28.829us 0.000us 0.00% 0.000us 0.000us 6
4121
+ cudaDeviceSynchronize 0.27% 4.960us 0.27% 4.960us 4.960us 0.000us 0.00% 0.000us 0.000us 1
4122
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4123
+ Self CPU time total: 1.815ms
4124
+ Self CUDA time total: 14.368us
4125
 
4126
 
4127
 
 
4131
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4132
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4133
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4134
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 146.240us 942.27% 146.240us 146.240us 1
4135
+ torch_eager 22.59% 104.486us 98.96% 457.726us 457.726us 0.000us 0.00% 18.208us 18.208us 1
4136
+ aten::silu 8.78% 40.590us 60.43% 279.519us 93.173us 7.936us 51.13% 10.624us 3.541us 3
4137
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 51.13% 7.936us 2.645us 3
4138
+ aten::mul 5.53% 25.579us 9.45% 43.730us 14.577us 7.584us 48.87% 7.584us 2.528us 3
4139
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.584us 48.87% 7.584us 2.528us 3
4140
+ Activity Buffer Request 18.85% 87.193us 18.85% 87.193us 87.193us 2.688us 17.32% 2.688us 2.688us 1
4141
+ aten::slice 5.23% 24.201us 6.48% 29.991us 4.999us 0.000us 0.00% 0.000us 0.000us 6
4142
+ aten::as_strided 1.25% 5.790us 1.25% 5.790us 0.965us 0.000us 0.00% 0.000us 0.000us 6
4143
+ cudaLaunchKernel 36.73% 169.887us 36.73% 169.887us 28.314us 0.000us 0.00% 0.000us 0.000us 6
4144
+ cudaDeviceSynchronize 1.04% 4.800us 1.04% 4.800us 4.800us 0.000us 0.00% 0.000us 0.000us 1
4145
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4146
+ Self CPU time total: 462.526us
4147
+ Self CUDA time total: 15.520us
4148
 
4149
 
4150
 
 
4154
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4155
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4156
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4157
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 181.470us 803.28% 181.470us 181.470us 1
4158
+ torch_eager 5.97% 109.125us 99.74% 1.823ms 1.823ms 0.000us 0.00% 26.526us 26.526us 1
4159
+ aten::silu 2.38% 43.492us 88.50% 1.617ms 539.072us 11.647us 51.56% 15.582us 5.194us 3
4160
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.647us 51.56% 11.647us 3.882us 3
4161
+ aten::mul 1.42% 25.882us 3.51% 64.123us 21.374us 10.944us 48.44% 10.944us 3.648us 3
4162
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.944us 48.44% 10.944us 3.648us 3
4163
+ Activity Buffer Request 77.67% 1.419ms 77.67% 1.419ms 1.419ms 3.935us 17.42% 3.935us 3.935us 1
4164
+ aten::slice 1.42% 25.910us 1.76% 32.089us 5.348us 0.000us 0.00% 0.000us 0.000us 6
4165
+ aten::as_strided 0.34% 6.179us 0.34% 6.179us 1.030us 0.000us 0.00% 0.000us 0.000us 6
4166
+ cudaLaunchKernel 10.54% 192.606us 10.54% 192.606us 32.101us 0.000us 0.00% 0.000us 0.000us 6
4167
+ cudaDeviceSynchronize 0.26% 4.790us 0.26% 4.790us 4.790us 0.000us 0.00% 0.000us 0.000us 1
4168
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4169
+ Self CPU time total: 1.827ms
4170
+ Self CUDA time total: 22.591us
4171
 
4172
 
4173
  impl wl p50(ms) ok
 
4184
  <div class="uv-install-logs" id="uv-logs-benchmark">
4185
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4186
  <div class="uv-logs-content" style="display: none;">
4187
+ Installed 37 packages in 192ms
4188
  </div>
4189
  </div>
4190
  <div class="cell-artifacts">
activation/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: 9254fad09b1905d500f91c98ba5debdf4f6497c196acc2cdc499c0572bc73647
  • Pointer size: 130 Bytes
  • Size of remote file: 20.6 kB

Git LFS Details

  • SHA256: 431dea6a591fc822f7d0d0d6f793e8c11170edb647c627b5a44ad9883df2c3fc
  • Pointer size: 130 Bytes
  • Size of remote file: 20.7 kB
activation/results/combined_results.html CHANGED
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3872
  <rdf:RDF>
3873
  <ns2:Work>
3874
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3875
- <dc:date>2025-10-27T14:46:43.482898</dc:date>
3876
  <dc:format>image/svg+xml</dc:format>
3877
  <dc:creator>
3878
  <ns2:Agent>
@@ -4021,83 +4021,83 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4021
  <g id="matplotlib.axis_2">
4022
  <g id="ytick_1">
4023
  <g id="grid-y--2" class="grid grid-y">
4024
- <path d="M 60.23 452.615548 L 847.294169 452.615548 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4025
  </g>
4026
  <g id="line2d_10">
4027
  <defs>
4028
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4029
  </defs>
4030
  <g>
4031
- <use ns4:href="#m0fca2865ba" x="60.23" y="452.615548" style="stroke: #000000; stroke-width: 0.8" />
4032
  </g>
4033
  </g>
4034
  <g id="text_10">
4035
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="456.414767" transform="rotate(-0 53.23 456.414767)">0.025</text>
4036
  </g>
4037
  </g>
4038
  <g id="ytick_2">
4039
  <g id="grid-y--3" class="grid grid-y">
4040
- <path d="M 60.23 373.068398 L 847.294169 373.068398 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4041
  </g>
4042
  <g id="line2d_11">
4043
  <g>
4044
- <use ns4:href="#m0fca2865ba" x="60.23" y="373.068398" style="stroke: #000000; stroke-width: 0.8" />
4045
  </g>
4046
  </g>
4047
  <g id="text_11">
4048
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="376.867617" transform="rotate(-0 53.23 376.867617)">0.030</text>
4049
  </g>
4050
  </g>
4051
  <g id="ytick_3">
4052
  <g id="grid-y--4" class="grid grid-y">
4053
- <path d="M 60.23 293.521249 L 847.294169 293.521249 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4054
  </g>
4055
  <g id="line2d_12">
4056
  <g>
4057
- <use ns4:href="#m0fca2865ba" x="60.23" y="293.521249" style="stroke: #000000; stroke-width: 0.8" />
4058
  </g>
4059
  </g>
4060
  <g id="text_12">
4061
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="297.320468" transform="rotate(-0 53.23 297.320468)">0.035</text>
4062
  </g>
4063
  </g>
4064
  <g id="ytick_4">
4065
  <g id="grid-y--5" class="grid grid-y">
4066
- <path d="M 60.23 213.974099 L 847.294169 213.974099 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4067
  </g>
4068
  <g id="line2d_13">
4069
  <g>
4070
- <use ns4:href="#m0fca2865ba" x="60.23" y="213.974099" style="stroke: #000000; stroke-width: 0.8" />
4071
  </g>
4072
  </g>
4073
  <g id="text_13">
4074
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="217.773318" transform="rotate(-0 53.23 217.773318)">0.040</text>
4075
  </g>
4076
  </g>
4077
  <g id="ytick_5">
4078
  <g id="grid-y--6" class="grid grid-y">
4079
- <path d="M 60.23 134.42695 L 847.294169 134.42695 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4080
  </g>
4081
  <g id="line2d_14">
4082
  <g>
4083
- <use ns4:href="#m0fca2865ba" x="60.23" y="134.42695" style="stroke: #000000; stroke-width: 0.8" />
4084
  </g>
4085
  </g>
4086
  <g id="text_14">
4087
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="138.226168" transform="rotate(-0 53.23 138.226168)">0.045</text>
4088
  </g>
4089
  </g>
4090
  <g id="ytick_6">
4091
  <g id="grid-y--7" class="grid grid-y">
4092
- <path d="M 60.23 54.8798 L 847.294169 54.8798 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4093
  </g>
4094
  <g id="line2d_15">
4095
  <g>
4096
- <use ns4:href="#m0fca2865ba" x="60.23" y="54.8798" style="stroke: #000000; stroke-width: 0.8" />
4097
  </g>
4098
  </g>
4099
  <g id="text_15">
4100
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="58.679019" transform="rotate(-0 53.23 58.679019)">0.050</text>
4101
  </g>
4102
  </g>
4103
  <g id="label--y" class="ylabel">
@@ -4105,37 +4105,37 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4105
  </g>
4106
  </g>
4107
  <g id="series--hf-kernels-swiglu" class="series">
4108
- <path d="M 96.005644 451.16779 L 185.444754 379.591266 L 274.883864 367.802376 L 364.322974 382.120864 L 453.762084 356.82487 L 543.201194 396.121166 L 632.640304 374.96162 L 722.079415 389.136924 L 811.518525 358.734003 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4109
  <defs>
4110
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4111
  </defs>
4112
  <g clip-path="url(#p620c7d392f)">
4113
  <use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
4114
- <use ns4:href="#md7efaf3aec" x="185.444754" y="379.591266" style="fill: #1f77b4; stroke: #1f77b4" />
4115
- <use ns4:href="#md7efaf3aec" x="274.883864" y="367.802376" style="fill: #1f77b4; stroke: #1f77b4" />
4116
- <use ns4:href="#md7efaf3aec" x="364.322974" y="382.120864" style="fill: #1f77b4; stroke: #1f77b4" />
4117
- <use ns4:href="#md7efaf3aec" x="453.762084" y="356.82487" style="fill: #1f77b4; stroke: #1f77b4" />
4118
- <use ns4:href="#md7efaf3aec" x="543.201194" y="396.121166" style="fill: #1f77b4; stroke: #1f77b4" />
4119
- <use ns4:href="#md7efaf3aec" x="632.640304" y="374.96162" style="fill: #1f77b4; stroke: #1f77b4" />
4120
- <use ns4:href="#md7efaf3aec" x="722.079415" y="389.136924" style="fill: #1f77b4; stroke: #1f77b4" />
4121
- <use ns4:href="#md7efaf3aec" x="811.518525" y="358.734003" style="fill: #1f77b4; stroke: #1f77b4" />
4122
  </g>
4123
  </g>
4124
  <g id="series--torch-eager" class="series">
4125
- <path d="M 96.005644 189.63267 L 185.444754 53.272948 L 274.883864 47.08418 L 364.322974 66.175497 L 453.762084 61.545851 L 543.201194 66.795966 L 632.640304 59.954911 L 722.079415 85.26681 L 811.518525 95.751126 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4126
  <defs>
4127
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4128
  </defs>
4129
  <g clip-path="url(#p620c7d392f)">
4130
- <use ns4:href="#m9b8c54d372" x="96.005644" y="189.63267" style="fill: #ff7f0e; stroke: #ff7f0e" />
4131
- <use ns4:href="#m9b8c54d372" x="185.444754" y="53.272948" style="fill: #ff7f0e; stroke: #ff7f0e" />
4132
- <use ns4:href="#m9b8c54d372" x="274.883864" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
4133
- <use ns4:href="#m9b8c54d372" x="364.322974" y="66.175497" style="fill: #ff7f0e; stroke: #ff7f0e" />
4134
- <use ns4:href="#m9b8c54d372" x="453.762084" y="61.545851" style="fill: #ff7f0e; stroke: #ff7f0e" />
4135
- <use ns4:href="#m9b8c54d372" x="543.201194" y="66.795966" style="fill: #ff7f0e; stroke: #ff7f0e" />
4136
- <use ns4:href="#m9b8c54d372" x="632.640304" y="59.954911" style="fill: #ff7f0e; stroke: #ff7f0e" />
4137
- <use ns4:href="#m9b8c54d372" x="722.079415" y="85.26681" style="fill: #ff7f0e; stroke: #ff7f0e" />
4138
- <use ns4:href="#m9b8c54d372" x="811.518525" y="95.751126" style="fill: #ff7f0e; stroke: #ff7f0e" />
4139
  </g>
4140
  </g>
4141
  <g id="patch_3">
@@ -4155,25 +4155,25 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4155
  </g>
4156
  <g id="legend" class="legend">
4157
  <g id="patch_7">
4158
- <path d="M 720.811356 64.7925 L 840.294169 64.7925 Q 842.294169 64.7925 842.294169 62.7925 L 842.294169 33.88 Q 842.294169 31.88 840.294169 31.88 L 720.811356 31.88 Q 718.811356 31.88 718.811356 33.88 L 718.811356 62.7925 Q 718.811356 64.7925 720.811356 64.7925 L 720.811356 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4159
  </g>
4160
  <g id="line2d_16">
4161
- <path d="M 722.811356 39.978438 L 732.811356 39.978438 L 742.811356 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4162
  <g>
4163
- <use ns4:href="#md7efaf3aec" x="732.811356" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
4164
  </g>
4165
  </g>
4166
  <g id="legend-label--hf-kernels-swiglu" class="legend">
4167
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="43.478438" transform="rotate(-0 750.811356 43.478438)">hf_kernels_swiglu</text>
4168
  </g>
4169
  <g id="line2d_17">
4170
- <path d="M 722.811356 54.934687 L 732.811356 54.934687 L 742.811356 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4171
  <g>
4172
- <use ns4:href="#m9b8c54d372" x="732.811356" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
4173
  </g>
4174
  </g>
4175
  <g id="legend-label--torch-eager" class="legend">
4176
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="58.434687" transform="rotate(-0 750.811356 58.434687)">torch_eager</text>
4177
  </g>
4178
  </g>
4179
  </g>
@@ -4193,7 +4193,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4193
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4194
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4195
  </span> |
4196
- Cell: combine | 4.45s
4197
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4198
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4199
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4284,7 +4284,7 @@ COMBINED BENCHMARK SUMMARY
4284
  impl wl p50(ms) ok
4285
  hf_kernels_swiglu cuda_T128_D1024 0.03 True
4286
  hf_kernels_swiglu cuda_T128_D2048 0.03 True
4287
- hf_kernels_swiglu cuda_T128_D768 0.03 True
4288
  hf_kernels_swiglu cuda_T256_D1024 0.03 True
4289
  hf_kernels_swiglu cuda_T256_D2048 0.03 True
4290
  hf_kernels_swiglu cuda_T256_D768 0.03 True
@@ -4319,7 +4319,7 @@ Implementations included:
4319
  <div class="uv-install-logs" id="uv-logs-combine">
4320
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4321
  <div class="uv-logs-content" style="display: none;">
4322
- Installed 37 packages in 250ms
4323
  </div>
4324
  </div>
4325
  <div class="cell-artifacts">
@@ -4332,7 +4332,7 @@ Installed 37 packages in 250ms
4332
  <rdf:RDF>
4333
  <ns2:Work>
4334
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4335
- <dc:date>2025-10-27T14:46:43.482898</dc:date>
4336
  <dc:format>image/svg+xml</dc:format>
4337
  <dc:creator>
4338
  <ns2:Agent>
@@ -4481,83 +4481,83 @@ Installed 37 packages in 250ms
4481
  <g id="matplotlib.axis_2">
4482
  <g id="ytick_1">
4483
  <g id="grid-y--2" class="grid grid-y">
4484
- <path d="M 60.23 452.615548 L 847.294169 452.615548 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4485
  </g>
4486
  <g id="line2d_10">
4487
  <defs>
4488
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4489
  </defs>
4490
  <g>
4491
- <use ns4:href="#m0fca2865ba" x="60.23" y="452.615548" style="stroke: #000000; stroke-width: 0.8" />
4492
  </g>
4493
  </g>
4494
  <g id="text_10">
4495
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="456.414767" transform="rotate(-0 53.23 456.414767)">0.025</text>
4496
  </g>
4497
  </g>
4498
  <g id="ytick_2">
4499
  <g id="grid-y--3" class="grid grid-y">
4500
- <path d="M 60.23 373.068398 L 847.294169 373.068398 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4501
  </g>
4502
  <g id="line2d_11">
4503
  <g>
4504
- <use ns4:href="#m0fca2865ba" x="60.23" y="373.068398" style="stroke: #000000; stroke-width: 0.8" />
4505
  </g>
4506
  </g>
4507
  <g id="text_11">
4508
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="376.867617" transform="rotate(-0 53.23 376.867617)">0.030</text>
4509
  </g>
4510
  </g>
4511
  <g id="ytick_3">
4512
  <g id="grid-y--4" class="grid grid-y">
4513
- <path d="M 60.23 293.521249 L 847.294169 293.521249 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4514
  </g>
4515
  <g id="line2d_12">
4516
  <g>
4517
- <use ns4:href="#m0fca2865ba" x="60.23" y="293.521249" style="stroke: #000000; stroke-width: 0.8" />
4518
  </g>
4519
  </g>
4520
  <g id="text_12">
4521
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="297.320468" transform="rotate(-0 53.23 297.320468)">0.035</text>
4522
  </g>
4523
  </g>
4524
  <g id="ytick_4">
4525
  <g id="grid-y--5" class="grid grid-y">
4526
- <path d="M 60.23 213.974099 L 847.294169 213.974099 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4527
  </g>
4528
  <g id="line2d_13">
4529
  <g>
4530
- <use ns4:href="#m0fca2865ba" x="60.23" y="213.974099" style="stroke: #000000; stroke-width: 0.8" />
4531
  </g>
4532
  </g>
4533
  <g id="text_13">
4534
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="217.773318" transform="rotate(-0 53.23 217.773318)">0.040</text>
4535
  </g>
4536
  </g>
4537
  <g id="ytick_5">
4538
  <g id="grid-y--6" class="grid grid-y">
4539
- <path d="M 60.23 134.42695 L 847.294169 134.42695 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4540
  </g>
4541
  <g id="line2d_14">
4542
  <g>
4543
- <use ns4:href="#m0fca2865ba" x="60.23" y="134.42695" style="stroke: #000000; stroke-width: 0.8" />
4544
  </g>
4545
  </g>
4546
  <g id="text_14">
4547
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="138.226168" transform="rotate(-0 53.23 138.226168)">0.045</text>
4548
  </g>
4549
  </g>
4550
  <g id="ytick_6">
4551
  <g id="grid-y--7" class="grid grid-y">
4552
- <path d="M 60.23 54.8798 L 847.294169 54.8798 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4553
  </g>
4554
  <g id="line2d_15">
4555
  <g>
4556
- <use ns4:href="#m0fca2865ba" x="60.23" y="54.8798" style="stroke: #000000; stroke-width: 0.8" />
4557
  </g>
4558
  </g>
4559
  <g id="text_15">
4560
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="58.679019" transform="rotate(-0 53.23 58.679019)">0.050</text>
4561
  </g>
4562
  </g>
4563
  <g id="label--y" class="ylabel">
@@ -4565,37 +4565,37 @@ Installed 37 packages in 250ms
4565
  </g>
4566
  </g>
4567
  <g id="series--hf-kernels-swiglu" class="series">
4568
- <path d="M 96.005644 451.16779 L 185.444754 379.591266 L 274.883864 367.802376 L 364.322974 382.120864 L 453.762084 356.82487 L 543.201194 396.121166 L 632.640304 374.96162 L 722.079415 389.136924 L 811.518525 358.734003 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4569
  <defs>
4570
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4571
  </defs>
4572
  <g clip-path="url(#p620c7d392f)">
4573
  <use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
4574
- <use ns4:href="#md7efaf3aec" x="185.444754" y="379.591266" style="fill: #1f77b4; stroke: #1f77b4" />
4575
- <use ns4:href="#md7efaf3aec" x="274.883864" y="367.802376" style="fill: #1f77b4; stroke: #1f77b4" />
4576
- <use ns4:href="#md7efaf3aec" x="364.322974" y="382.120864" style="fill: #1f77b4; stroke: #1f77b4" />
4577
- <use ns4:href="#md7efaf3aec" x="453.762084" y="356.82487" style="fill: #1f77b4; stroke: #1f77b4" />
4578
- <use ns4:href="#md7efaf3aec" x="543.201194" y="396.121166" style="fill: #1f77b4; stroke: #1f77b4" />
4579
- <use ns4:href="#md7efaf3aec" x="632.640304" y="374.96162" style="fill: #1f77b4; stroke: #1f77b4" />
4580
- <use ns4:href="#md7efaf3aec" x="722.079415" y="389.136924" style="fill: #1f77b4; stroke: #1f77b4" />
4581
- <use ns4:href="#md7efaf3aec" x="811.518525" y="358.734003" style="fill: #1f77b4; stroke: #1f77b4" />
4582
  </g>
4583
  </g>
4584
  <g id="series--torch-eager" class="series">
4585
- <path d="M 96.005644 189.63267 L 185.444754 53.272948 L 274.883864 47.08418 L 364.322974 66.175497 L 453.762084 61.545851 L 543.201194 66.795966 L 632.640304 59.954911 L 722.079415 85.26681 L 811.518525 95.751126 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4586
  <defs>
4587
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4588
  </defs>
4589
  <g clip-path="url(#p620c7d392f)">
4590
- <use ns4:href="#m9b8c54d372" x="96.005644" y="189.63267" style="fill: #ff7f0e; stroke: #ff7f0e" />
4591
- <use ns4:href="#m9b8c54d372" x="185.444754" y="53.272948" style="fill: #ff7f0e; stroke: #ff7f0e" />
4592
- <use ns4:href="#m9b8c54d372" x="274.883864" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
4593
- <use ns4:href="#m9b8c54d372" x="364.322974" y="66.175497" style="fill: #ff7f0e; stroke: #ff7f0e" />
4594
- <use ns4:href="#m9b8c54d372" x="453.762084" y="61.545851" style="fill: #ff7f0e; stroke: #ff7f0e" />
4595
- <use ns4:href="#m9b8c54d372" x="543.201194" y="66.795966" style="fill: #ff7f0e; stroke: #ff7f0e" />
4596
- <use ns4:href="#m9b8c54d372" x="632.640304" y="59.954911" style="fill: #ff7f0e; stroke: #ff7f0e" />
4597
- <use ns4:href="#m9b8c54d372" x="722.079415" y="85.26681" style="fill: #ff7f0e; stroke: #ff7f0e" />
4598
- <use ns4:href="#m9b8c54d372" x="811.518525" y="95.751126" style="fill: #ff7f0e; stroke: #ff7f0e" />
4599
  </g>
4600
  </g>
4601
  <g id="patch_3">
@@ -4615,25 +4615,25 @@ Installed 37 packages in 250ms
4615
  </g>
4616
  <g id="legend" class="legend">
4617
  <g id="patch_7">
4618
- <path d="M 720.811356 64.7925 L 840.294169 64.7925 Q 842.294169 64.7925 842.294169 62.7925 L 842.294169 33.88 Q 842.294169 31.88 840.294169 31.88 L 720.811356 31.88 Q 718.811356 31.88 718.811356 33.88 L 718.811356 62.7925 Q 718.811356 64.7925 720.811356 64.7925 L 720.811356 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4619
  </g>
4620
  <g id="line2d_16">
4621
- <path d="M 722.811356 39.978438 L 732.811356 39.978438 L 742.811356 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4622
  <g>
4623
- <use ns4:href="#md7efaf3aec" x="732.811356" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
4624
  </g>
4625
  </g>
4626
  <g id="legend-label--hf-kernels-swiglu" class="legend">
4627
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="43.478438" transform="rotate(-0 750.811356 43.478438)">hf_kernels_swiglu</text>
4628
  </g>
4629
  <g id="line2d_17">
4630
- <path d="M 722.811356 54.934687 L 732.811356 54.934687 L 742.811356 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4631
  <g>
4632
- <use ns4:href="#m9b8c54d372" x="732.811356" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
4633
  </g>
4634
  </g>
4635
  <g id="legend-label--torch-eager" class="legend">
4636
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="58.434687" transform="rotate(-0 750.811356 58.434687)">torch_eager</text>
4637
  </g>
4638
  </g>
4639
  </g>
 
3872
  <rdf:RDF>
3873
  <ns2:Work>
3874
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3875
+ <dc:date>2025-10-28T14:09:13.211569</dc:date>
3876
  <dc:format>image/svg+xml</dc:format>
3877
  <dc:creator>
3878
  <ns2:Agent>
 
4021
  <g id="matplotlib.axis_2">
4022
  <g id="ytick_1">
4023
  <g id="grid-y--2" class="grid grid-y">
4024
+ <path d="M 60.23 416.825206 L 847.294169 416.825206 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4025
  </g>
4026
  <g id="line2d_10">
4027
  <defs>
4028
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4029
  </defs>
4030
  <g>
4031
+ <use ns4:href="#m0fca2865ba" x="60.23" y="416.825206" style="stroke: #000000; stroke-width: 0.8" />
4032
  </g>
4033
  </g>
4034
  <g id="text_10">
4035
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="420.624425" transform="rotate(-0 53.23 420.624425)">0.025</text>
4036
  </g>
4037
  </g>
4038
  <g id="ytick_2">
4039
  <g id="grid-y--3" class="grid grid-y">
4040
+ <path d="M 60.23 346.161452 L 847.294169 346.161452 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4041
  </g>
4042
  <g id="line2d_11">
4043
  <g>
4044
+ <use ns4:href="#m0fca2865ba" x="60.23" y="346.161452" style="stroke: #000000; stroke-width: 0.8" />
4045
  </g>
4046
  </g>
4047
  <g id="text_11">
4048
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="349.960671" transform="rotate(-0 53.23 349.960671)">0.030</text>
4049
  </g>
4050
  </g>
4051
  <g id="ytick_3">
4052
  <g id="grid-y--4" class="grid grid-y">
4053
+ <path d="M 60.23 275.497698 L 847.294169 275.497698 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4054
  </g>
4055
  <g id="line2d_12">
4056
  <g>
4057
+ <use ns4:href="#m0fca2865ba" x="60.23" y="275.497698" style="stroke: #000000; stroke-width: 0.8" />
4058
  </g>
4059
  </g>
4060
  <g id="text_12">
4061
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="279.296917" transform="rotate(-0 53.23 279.296917)">0.035</text>
4062
  </g>
4063
  </g>
4064
  <g id="ytick_4">
4065
  <g id="grid-y--5" class="grid grid-y">
4066
+ <path d="M 60.23 204.833944 L 847.294169 204.833944 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4067
  </g>
4068
  <g id="line2d_13">
4069
  <g>
4070
+ <use ns4:href="#m0fca2865ba" x="60.23" y="204.833944" style="stroke: #000000; stroke-width: 0.8" />
4071
  </g>
4072
  </g>
4073
  <g id="text_13">
4074
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="208.633163" transform="rotate(-0 53.23 208.633163)">0.040</text>
4075
  </g>
4076
  </g>
4077
  <g id="ytick_5">
4078
  <g id="grid-y--6" class="grid grid-y">
4079
+ <path d="M 60.23 134.170191 L 847.294169 134.170191 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4080
  </g>
4081
  <g id="line2d_14">
4082
  <g>
4083
+ <use ns4:href="#m0fca2865ba" x="60.23" y="134.170191" style="stroke: #000000; stroke-width: 0.8" />
4084
  </g>
4085
  </g>
4086
  <g id="text_14">
4087
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="137.969409" transform="rotate(-0 53.23 137.969409)">0.045</text>
4088
  </g>
4089
  </g>
4090
  <g id="ytick_6">
4091
  <g id="grid-y--7" class="grid grid-y">
4092
+ <path d="M 60.23 63.506437 L 847.294169 63.506437 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4093
  </g>
4094
  <g id="line2d_15">
4095
  <g>
4096
+ <use ns4:href="#m0fca2865ba" x="60.23" y="63.506437" style="stroke: #000000; stroke-width: 0.8" />
4097
  </g>
4098
  </g>
4099
  <g id="text_15">
4100
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="67.305655" transform="rotate(-0 53.23 67.305655)">0.050</text>
4101
  </g>
4102
  </g>
4103
  <g id="label--y" class="ylabel">
 
4105
  </g>
4106
  </g>
4107
  <g id="series--hf-kernels-swiglu" class="series">
4108
+ <path d="M 96.005644 451.16779 L 185.444754 370.031668 L 274.883864 370.596978 L 364.322974 386.708314 L 453.762084 392.220086 L 543.201194 399.569118 L 632.640304 388.969554 L 722.079415 403.526288 L 811.518525 390.241503 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4109
  <defs>
4110
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4111
  </defs>
4112
  <g clip-path="url(#p620c7d392f)">
4113
  <use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
4114
+ <use ns4:href="#md7efaf3aec" x="185.444754" y="370.031668" style="fill: #1f77b4; stroke: #1f77b4" />
4115
+ <use ns4:href="#md7efaf3aec" x="274.883864" y="370.596978" style="fill: #1f77b4; stroke: #1f77b4" />
4116
+ <use ns4:href="#md7efaf3aec" x="364.322974" y="386.708314" style="fill: #1f77b4; stroke: #1f77b4" />
4117
+ <use ns4:href="#md7efaf3aec" x="453.762084" y="392.220086" style="fill: #1f77b4; stroke: #1f77b4" />
4118
+ <use ns4:href="#md7efaf3aec" x="543.201194" y="399.569118" style="fill: #1f77b4; stroke: #1f77b4" />
4119
+ <use ns4:href="#md7efaf3aec" x="632.640304" y="388.969554" style="fill: #1f77b4; stroke: #1f77b4" />
4120
+ <use ns4:href="#md7efaf3aec" x="722.079415" y="403.526288" style="fill: #1f77b4; stroke: #1f77b4" />
4121
+ <use ns4:href="#md7efaf3aec" x="811.518525" y="390.241503" style="fill: #1f77b4; stroke: #1f77b4" />
4122
  </g>
4123
  </g>
4124
  <g id="series--torch-eager" class="series">
4125
+ <path d="M 96.005644 166.37873 L 185.444754 47.08418 L 274.883864 54.857193 L 364.322974 60.807081 L 453.762084 69.569387 L 543.201194 78.176231 L 632.640304 66.44605 L 722.079415 63.902153 L 811.518525 71.109857 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4126
  <defs>
4127
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4128
  </defs>
4129
  <g clip-path="url(#p620c7d392f)">
4130
+ <use ns4:href="#m9b8c54d372" x="96.005644" y="166.37873" style="fill: #ff7f0e; stroke: #ff7f0e" />
4131
+ <use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
4132
+ <use ns4:href="#m9b8c54d372" x="274.883864" y="54.857193" style="fill: #ff7f0e; stroke: #ff7f0e" />
4133
+ <use ns4:href="#m9b8c54d372" x="364.322974" y="60.807081" style="fill: #ff7f0e; stroke: #ff7f0e" />
4134
+ <use ns4:href="#m9b8c54d372" x="453.762084" y="69.569387" style="fill: #ff7f0e; stroke: #ff7f0e" />
4135
+ <use ns4:href="#m9b8c54d372" x="543.201194" y="78.176231" style="fill: #ff7f0e; stroke: #ff7f0e" />
4136
+ <use ns4:href="#m9b8c54d372" x="632.640304" y="66.44605" style="fill: #ff7f0e; stroke: #ff7f0e" />
4137
+ <use ns4:href="#m9b8c54d372" x="722.079415" y="63.902153" style="fill: #ff7f0e; stroke: #ff7f0e" />
4138
+ <use ns4:href="#m9b8c54d372" x="811.518525" y="71.109857" style="fill: #ff7f0e; stroke: #ff7f0e" />
4139
  </g>
4140
  </g>
4141
  <g id="patch_3">
 
4155
  </g>
4156
  <g id="legend" class="legend">
4157
  <g id="patch_7">
4158
+ <path d="M 720.811356 466.37197 L 840.294169 466.37197 Q 842.294169 466.37197 842.294169 464.37197 L 842.294169 435.45947 Q 842.294169 433.45947 840.294169 433.45947 L 720.811356 433.45947 Q 718.811356 433.45947 718.811356 435.45947 L 718.811356 464.37197 Q 718.811356 466.37197 720.811356 466.37197 L 720.811356 466.37197 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4159
  </g>
4160
  <g id="line2d_16">
4161
+ <path d="M 722.811356 441.557908 L 732.811356 441.557908 L 742.811356 441.557908 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4162
  <g>
4163
+ <use ns4:href="#md7efaf3aec" x="732.811356" y="441.557908" style="fill: #1f77b4; stroke: #1f77b4" />
4164
  </g>
4165
  </g>
4166
  <g id="legend-label--hf-kernels-swiglu" class="legend">
4167
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="445.057908" transform="rotate(-0 750.811356 445.057908)">hf_kernels_swiglu</text>
4168
  </g>
4169
  <g id="line2d_17">
4170
+ <path d="M 722.811356 456.514158 L 732.811356 456.514158 L 742.811356 456.514158 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4171
  <g>
4172
+ <use ns4:href="#m9b8c54d372" x="732.811356" y="456.514158" style="fill: #ff7f0e; stroke: #ff7f0e" />
4173
  </g>
4174
  </g>
4175
  <g id="legend-label--torch-eager" class="legend">
4176
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="460.014158" transform="rotate(-0 750.811356 460.014158)">torch_eager</text>
4177
  </g>
4178
  </g>
4179
  </g>
 
4193
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4194
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4195
  </span> |
4196
+ Cell: combine | 4.28s
4197
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4198
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4199
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
4284
  impl wl p50(ms) ok
4285
  hf_kernels_swiglu cuda_T128_D1024 0.03 True
4286
  hf_kernels_swiglu cuda_T128_D2048 0.03 True
4287
+ hf_kernels_swiglu cuda_T128_D768 0.02 True
4288
  hf_kernels_swiglu cuda_T256_D1024 0.03 True
4289
  hf_kernels_swiglu cuda_T256_D2048 0.03 True
4290
  hf_kernels_swiglu cuda_T256_D768 0.03 True
 
4319
  <div class="uv-install-logs" id="uv-logs-combine">
4320
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4321
  <div class="uv-logs-content" style="display: none;">
4322
+ Installed 37 packages in 195ms
4323
  </div>
4324
  </div>
4325
  <div class="cell-artifacts">
 
4332
  <rdf:RDF>
4333
  <ns2:Work>
4334
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4335
+ <dc:date>2025-10-28T14:09:13.211569</dc:date>
4336
  <dc:format>image/svg+xml</dc:format>
4337
  <dc:creator>
4338
  <ns2:Agent>
 
4481
  <g id="matplotlib.axis_2">
4482
  <g id="ytick_1">
4483
  <g id="grid-y--2" class="grid grid-y">
4484
+ <path d="M 60.23 416.825206 L 847.294169 416.825206 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4485
  </g>
4486
  <g id="line2d_10">
4487
  <defs>
4488
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4489
  </defs>
4490
  <g>
4491
+ <use ns4:href="#m0fca2865ba" x="60.23" y="416.825206" style="stroke: #000000; stroke-width: 0.8" />
4492
  </g>
4493
  </g>
4494
  <g id="text_10">
4495
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="420.624425" transform="rotate(-0 53.23 420.624425)">0.025</text>
4496
  </g>
4497
  </g>
4498
  <g id="ytick_2">
4499
  <g id="grid-y--3" class="grid grid-y">
4500
+ <path d="M 60.23 346.161452 L 847.294169 346.161452 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4501
  </g>
4502
  <g id="line2d_11">
4503
  <g>
4504
+ <use ns4:href="#m0fca2865ba" x="60.23" y="346.161452" style="stroke: #000000; stroke-width: 0.8" />
4505
  </g>
4506
  </g>
4507
  <g id="text_11">
4508
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="349.960671" transform="rotate(-0 53.23 349.960671)">0.030</text>
4509
  </g>
4510
  </g>
4511
  <g id="ytick_3">
4512
  <g id="grid-y--4" class="grid grid-y">
4513
+ <path d="M 60.23 275.497698 L 847.294169 275.497698 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4514
  </g>
4515
  <g id="line2d_12">
4516
  <g>
4517
+ <use ns4:href="#m0fca2865ba" x="60.23" y="275.497698" style="stroke: #000000; stroke-width: 0.8" />
4518
  </g>
4519
  </g>
4520
  <g id="text_12">
4521
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="279.296917" transform="rotate(-0 53.23 279.296917)">0.035</text>
4522
  </g>
4523
  </g>
4524
  <g id="ytick_4">
4525
  <g id="grid-y--5" class="grid grid-y">
4526
+ <path d="M 60.23 204.833944 L 847.294169 204.833944 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4527
  </g>
4528
  <g id="line2d_13">
4529
  <g>
4530
+ <use ns4:href="#m0fca2865ba" x="60.23" y="204.833944" style="stroke: #000000; stroke-width: 0.8" />
4531
  </g>
4532
  </g>
4533
  <g id="text_13">
4534
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="208.633163" transform="rotate(-0 53.23 208.633163)">0.040</text>
4535
  </g>
4536
  </g>
4537
  <g id="ytick_5">
4538
  <g id="grid-y--6" class="grid grid-y">
4539
+ <path d="M 60.23 134.170191 L 847.294169 134.170191 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4540
  </g>
4541
  <g id="line2d_14">
4542
  <g>
4543
+ <use ns4:href="#m0fca2865ba" x="60.23" y="134.170191" style="stroke: #000000; stroke-width: 0.8" />
4544
  </g>
4545
  </g>
4546
  <g id="text_14">
4547
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="137.969409" transform="rotate(-0 53.23 137.969409)">0.045</text>
4548
  </g>
4549
  </g>
4550
  <g id="ytick_6">
4551
  <g id="grid-y--7" class="grid grid-y">
4552
+ <path d="M 60.23 63.506437 L 847.294169 63.506437 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4553
  </g>
4554
  <g id="line2d_15">
4555
  <g>
4556
+ <use ns4:href="#m0fca2865ba" x="60.23" y="63.506437" style="stroke: #000000; stroke-width: 0.8" />
4557
  </g>
4558
  </g>
4559
  <g id="text_15">
4560
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="67.305655" transform="rotate(-0 53.23 67.305655)">0.050</text>
4561
  </g>
4562
  </g>
4563
  <g id="label--y" class="ylabel">
 
4565
  </g>
4566
  </g>
4567
  <g id="series--hf-kernels-swiglu" class="series">
4568
+ <path d="M 96.005644 451.16779 L 185.444754 370.031668 L 274.883864 370.596978 L 364.322974 386.708314 L 453.762084 392.220086 L 543.201194 399.569118 L 632.640304 388.969554 L 722.079415 403.526288 L 811.518525 390.241503 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4569
  <defs>
4570
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4571
  </defs>
4572
  <g clip-path="url(#p620c7d392f)">
4573
  <use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
4574
+ <use ns4:href="#md7efaf3aec" x="185.444754" y="370.031668" style="fill: #1f77b4; stroke: #1f77b4" />
4575
+ <use ns4:href="#md7efaf3aec" x="274.883864" y="370.596978" style="fill: #1f77b4; stroke: #1f77b4" />
4576
+ <use ns4:href="#md7efaf3aec" x="364.322974" y="386.708314" style="fill: #1f77b4; stroke: #1f77b4" />
4577
+ <use ns4:href="#md7efaf3aec" x="453.762084" y="392.220086" style="fill: #1f77b4; stroke: #1f77b4" />
4578
+ <use ns4:href="#md7efaf3aec" x="543.201194" y="399.569118" style="fill: #1f77b4; stroke: #1f77b4" />
4579
+ <use ns4:href="#md7efaf3aec" x="632.640304" y="388.969554" style="fill: #1f77b4; stroke: #1f77b4" />
4580
+ <use ns4:href="#md7efaf3aec" x="722.079415" y="403.526288" style="fill: #1f77b4; stroke: #1f77b4" />
4581
+ <use ns4:href="#md7efaf3aec" x="811.518525" y="390.241503" style="fill: #1f77b4; stroke: #1f77b4" />
4582
  </g>
4583
  </g>
4584
  <g id="series--torch-eager" class="series">
4585
+ <path d="M 96.005644 166.37873 L 185.444754 47.08418 L 274.883864 54.857193 L 364.322974 60.807081 L 453.762084 69.569387 L 543.201194 78.176231 L 632.640304 66.44605 L 722.079415 63.902153 L 811.518525 71.109857 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4586
  <defs>
4587
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4588
  </defs>
4589
  <g clip-path="url(#p620c7d392f)">
4590
+ <use ns4:href="#m9b8c54d372" x="96.005644" y="166.37873" style="fill: #ff7f0e; stroke: #ff7f0e" />
4591
+ <use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
4592
+ <use ns4:href="#m9b8c54d372" x="274.883864" y="54.857193" style="fill: #ff7f0e; stroke: #ff7f0e" />
4593
+ <use ns4:href="#m9b8c54d372" x="364.322974" y="60.807081" style="fill: #ff7f0e; stroke: #ff7f0e" />
4594
+ <use ns4:href="#m9b8c54d372" x="453.762084" y="69.569387" style="fill: #ff7f0e; stroke: #ff7f0e" />
4595
+ <use ns4:href="#m9b8c54d372" x="543.201194" y="78.176231" style="fill: #ff7f0e; stroke: #ff7f0e" />
4596
+ <use ns4:href="#m9b8c54d372" x="632.640304" y="66.44605" style="fill: #ff7f0e; stroke: #ff7f0e" />
4597
+ <use ns4:href="#m9b8c54d372" x="722.079415" y="63.902153" style="fill: #ff7f0e; stroke: #ff7f0e" />
4598
+ <use ns4:href="#m9b8c54d372" x="811.518525" y="71.109857" style="fill: #ff7f0e; stroke: #ff7f0e" />
4599
  </g>
4600
  </g>
4601
  <g id="patch_3">
 
4615
  </g>
4616
  <g id="legend" class="legend">
4617
  <g id="patch_7">
4618
+ <path d="M 720.811356 466.37197 L 840.294169 466.37197 Q 842.294169 466.37197 842.294169 464.37197 L 842.294169 435.45947 Q 842.294169 433.45947 840.294169 433.45947 L 720.811356 433.45947 Q 718.811356 433.45947 718.811356 435.45947 L 718.811356 464.37197 Q 718.811356 466.37197 720.811356 466.37197 L 720.811356 466.37197 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4619
  </g>
4620
  <g id="line2d_16">
4621
+ <path d="M 722.811356 441.557908 L 732.811356 441.557908 L 742.811356 441.557908 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4622
  <g>
4623
+ <use ns4:href="#md7efaf3aec" x="732.811356" y="441.557908" style="fill: #1f77b4; stroke: #1f77b4" />
4624
  </g>
4625
  </g>
4626
  <g id="legend-label--hf-kernels-swiglu" class="legend">
4627
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="445.057908" transform="rotate(-0 750.811356 445.057908)">hf_kernels_swiglu</text>
4628
  </g>
4629
  <g id="line2d_17">
4630
+ <path d="M 722.811356 456.514158 L 732.811356 456.514158 L 742.811356 456.514158 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4631
  <g>
4632
+ <use ns4:href="#m9b8c54d372" x="732.811356" y="456.514158" style="fill: #ff7f0e; stroke: #ff7f0e" />
4633
  </g>
4634
  </g>
4635
  <g id="legend-label--torch-eager" class="legend">
4636
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="460.014158" transform="rotate(-0 750.811356 460.014158)">torch_eager</text>
4637
  </g>
4638
  </g>
4639
  </g>
causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.06712200001857127, "p50": 0.06883200001084333, "p90": 0.06976199995278876, "mean": 0.06901199997173535, "iqr": 0.0014600000213249587, "raw_times": [0.06976199995278876, 0.07104199994500959, 0.06712200001857127, 0.0683019999314638, 0.06883200001084333], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0738530000035098, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
2
+ {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08455299996512622, "p50": 0.08599400007369695, "p90": 0.0868530000843748, "mean": 0.08612520005044644, "iqr": 0.0014299999975264654, "raw_times": [0.08780300004218589, 0.08455299996512622, 0.0868530000843748, 0.08542300008684833, 0.08599400007369695], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08941300006881647, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
3
+ {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08494299993344612, "p50": 0.08714299997336639, "p90": 0.08724299993900786, "mean": 0.086546999955317, "iqr": 0.0020200000108161476, "raw_times": [0.08522299992819171, 0.08714299997336639, 0.08818300000257295, 0.08724299993900786, 0.08494299993344612], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09105300000555872, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
4
+ {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08327299997290538, "p50": 0.084122999965075, "p90": 0.08580299993354856, "mean": 0.08452299998680246, "iqr": 0.0023699999474047218, "raw_times": [0.08327299997290538, 0.084122999965075, 0.08598300007633952, 0.08580299993354856, 0.08343299998614384], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08891300001323543, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
5
+ {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08298299997022696, "p50": 0.08508299993081891, "p90": 0.08600299997851835, "mean": 0.0849267999683434, "iqr": 0.0016210000239880173, "raw_times": [0.08298299997022696, 0.08508299993081891, 0.08600299997851835, 0.08438199995453033, 0.08618300000762247], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08780300004218589, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
6
+ {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08270299997548136, "p50": 0.08315299999139825, "p90": 0.0846430000365217, "mean": 0.08407499999520951, "iqr": 0.0019010000187336118, "raw_times": [0.08315299999139825, 0.08713399995485815, 0.08270299997548136, 0.08274200001778809, 0.0846430000365217], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08981299993138236, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
7
+ {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08372299998882227, "p50": 0.08510199995725998, "p90": 0.08608299992829416, "mean": 0.08701479998762807, "iqr": 0.0011499998890940333, "raw_times": [0.08493300003920012, 0.09523300002456381, 0.08510199995725998, 0.08372299998882227, 0.08608299992829416], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08923300003971235, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
8
+ {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08285199999136239, "p50": 0.08483300007355865, "p90": 0.08511300006830425, "mean": 0.08449480001218035, "iqr": 0.0016500000583619112, "raw_times": [0.08285199999136239, 0.08346300000994233, 0.08483300007355865, 0.08621299991773412, 0.08511300006830425], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08870299996033282, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
9
+ {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08278300003894401, "p50": 0.08427300008406746, "p90": 0.08444299999155191, "mean": 0.08422300002166594, "iqr": 0.0002599999788799323, "raw_times": [0.08444299999155191, 0.08418300001267198, 0.08278300003894401, 0.08543299998109433, 0.08427300008406746], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08903299999474257, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
10
+ {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08352199995442788, "p50": 0.0842329999386493, "p90": 0.08553300006042264, "mean": 0.08496079999531503, "iqr": 0.0014400000054592965, "raw_times": [0.08409300005496334, 0.08742299996811198, 0.08553300006042264, 0.08352199995442788, 0.0842329999386493], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08985400006622513, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
11
+ {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.14414499992199126, "p50": 0.14512600000671227, "p90": 0.14515400005166157, "mean": 0.1465472000063528, "iqr": 0.0008580000212532468, "raw_times": [0.14512600000671227, 0.14414499992199126, 0.14429600003040832, 0.15401500002099056, 0.14515400005166157], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.14571500003057736, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
12
+ {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.16020600003230356, "p50": 0.16135600003508443, "p90": 0.16139600006681576, "mean": 0.16140360005465482, "iqr": 0.00029099999210302485, "raw_times": [0.16139600006681576, 0.1629550000643576, 0.16110500007471273, 0.16020600003230356, 0.16135600003508443], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.1623660000404925, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
13
+ {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07979300005445111, "p50": 0.08039299996198679, "p90": 0.08136300004935038, "mean": 0.08070500002759218, "iqr": 0.001150000002780871, "raw_times": [0.0802130000465695, 0.0817630000256031, 0.07979300005445111, 0.08039299996198679, 0.08136300004935038], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0855329999467358, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
14
+ {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0800829999434427, "p50": 0.08147299990923784, "p90": 0.08197300007850572, "mean": 0.08146099996793055, "iqr": 0.00109000018255756, "raw_times": [0.0800829999434427, 0.08197300007850572, 0.08147299990923784, 0.08289300001251831, 0.08088299989594816], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08291199992527254, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
15
+ {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0799729999698684, "p50": 0.08137199995417177, "p90": 0.081513000054656, "mean": 0.08127659998535819, "iqr": 0.0006500000608866685, "raw_times": [0.0799729999698684, 0.08266199995432544, 0.081513000054656, 0.08086299999376934, 0.08137199995417177], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08939400004237541, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
16
+ {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08067300007041922, "p50": 0.08162300002823031, "p90": 0.08189199991193163, "mean": 0.08365860001049441, "iqr": 0.0008099999604382901, "raw_times": [0.08067300007041922, 0.08108199995149334, 0.08189199991193163, 0.08162300002823031, 0.09302300009039755], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08415299998887349, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
17
+ {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0806030000148894, "p50": 0.08186299999124458, "p90": 0.08199299998068454, "mean": 0.08162900001025264, "iqr": 0.001009999891721236, "raw_times": [0.08270299997548136, 0.08186299999124458, 0.0806030000148894, 0.08199299998068454, 0.08098300008896331], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10199300004387624, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
18
+ {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08040199998049502, "p50": 0.08168299996214046, "p90": 0.08185199999388715, "mean": 0.08171659999334224, "iqr": 0.0013889999763705418, "raw_times": [0.0804630000175166, 0.08418300001267198, 0.08168299996214046, 0.08040199998049502, 0.08185199999388715], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08522300004187855, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
19
+ {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08097300008103048, "p50": 0.08150300004672317, "p90": 0.08173299988811777, "mean": 0.08153900000706926, "iqr": 0.0005599998758043512, "raw_times": [0.08117300001231342, 0.08231300000716146, 0.08150300004672317, 0.08173299988811777, 0.08097300008103048], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08440300007350743, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
20
+ {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0802130000465695, "p50": 0.08124300006784324, "p90": 0.08242299998073577, "mean": 0.08162480000919459, "iqr": 0.0012000000424450263, "raw_times": [0.0802130000465695, 0.08302200001253368, 0.08242299998073577, 0.08124300006784324, 0.08122299993829074], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08460300000479037, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
21
+ {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09226300005593657, "p50": 0.09320300000581483, "p90": 0.0934630000983816, "mean": 0.09316100004070904, "iqr": 0.0007800000503266347, "raw_times": [0.09419299999535724, 0.09320300000581483, 0.0934630000983816, 0.09226300005593657, 0.09268300004805496], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0951240000404141, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
22
+ {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.09887299995625654, "p50": 0.09917300008055463, "p90": 0.09990300009121711, "mean": 0.09939520000443736, "iqr": 0.0009100001534534385, "raw_times": [0.09887299995625654, 0.09917300008055463, 0.09990300009121711, 0.10003399995639484, 0.09899299993776367], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.1023739999936879, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
23
+ {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.4842959999677987, "p50": 0.4860569999891595, "p90": 0.4878769999550059, "mean": 0.48646659997757524, "iqr": 0.002959999960694404, "raw_times": [0.4849169999943115, 0.4860569999891595, 0.4878769999550059, 0.4842959999677987, 0.4891859999816006], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.4877669999814316, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
24
+ {"ts": "2025-10-28T14:09:04Z", "run": "f19e00ba30d74acf9dc0d60a3bc32059", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.4968179999877975, "p50": 0.49805800006197387, "p90": 0.4990780000753148, "mean": 0.4983496000022569, "iqr": 0.001141000097959477, "raw_times": [0.4979369999773553, 0.49985699990884314, 0.4990780000753148, 0.49805800006197387, 0.4968179999877975], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.49727700002222264, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
causal_conv1d/impls/cells/benchmark.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # /// script
2
+ # requires-python = ">=3.10"
3
+ # dependencies = [
4
+ # "numpy",
5
+ # "torch==2.8.0",
6
+ # "kernels-benchmark-tools",
7
+ # ]
8
+ #
9
+ # [tool.uv.sources]
10
+ # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
11
+ # ///
12
+ import torch
13
+ import torch.nn.functional as F
14
+ import sys
15
+ from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
16
+
17
+
18
+ def torch_causal_conv1d(input_tensor, weight, bias):
19
+ # Convert to weight dtype for computation
20
+ x = input_tensor.to(weight.dtype)
21
+ dim = weight.shape[0]
22
+ width = weight.shape[1]
23
+ seqlen = input_tensor.shape[-1]
24
+
25
+ # Depthwise causal conv1d using PyTorch
26
+ out = F.conv1d(x, weight.unsqueeze(1), bias, padding=width - 1, groups=dim)
27
+
28
+ # Truncate to original sequence length
29
+ out = out[..., :seqlen]
30
+
31
+ # Convert back to original dtype
32
+ return out.to(input_tensor.dtype)
33
+
34
+
35
+ run_benchmark(
36
+ kernel_type=KernelTypeEnum.CAUSAL_CONV1D,
37
+ impl_name="torch_eager",
38
+ impl_tags={"family": "pytorch", "backend": "eager"},
39
+ impl_func=torch_causal_conv1d,
40
+ )
causal_conv1d/impls/cells/nv.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ import subprocess
2
+ print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
causal_conv1d/impls/hf_kernels_causal_conv1d.html ADDED
The diff for this file is too large to render. See raw diff
 
causal_conv1d/impls/index.html ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <meta charset='UTF-8'>
5
+ <meta name='viewport' content='width=device-width, initial-scale=1.0'>
6
+ <title>Index of /causal_conv1d/impls</title>
7
+ <style>
8
+ :root {
9
+ --bg-primary: #0a0a0a;
10
+ --bg-secondary: #121212;
11
+ --bg-tertiary: #181818;
12
+ --text-primary: #e0e0e0;
13
+ --text-secondary: #888888;
14
+ --text-link: #64b5f6;
15
+ --border-primary: #2a2a2a;
16
+ }
17
+ body {
18
+ font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
19
+ background: var(--bg-primary);
20
+ color: var(--text-primary);
21
+ margin: 0;
22
+ padding: 16px;
23
+ max-width: 900px;
24
+ margin: 0 auto;
25
+ }
26
+ .controls {
27
+ display: flex;
28
+ justify-content: flex-end;
29
+ margin-bottom: 1rem;
30
+ }
31
+ .back-button {
32
+ background: var(--bg-secondary);
33
+ border: 1px solid var(--border-primary);
34
+ padding: 8px 12px;
35
+ border-radius: 4px;
36
+ color: var(--text-secondary);
37
+ cursor: pointer;
38
+ font-size: 0.9rem;
39
+ text-decoration: none;
40
+ display: inline-block;
41
+ }
42
+ .back-button:hover {
43
+ color: var(--text-primary);
44
+ background: var(--bg-tertiary);
45
+ }
46
+ h1 {
47
+ font-size: 1.5em;
48
+ margin: 1rem 0;
49
+ color: var(--text-primary);
50
+ border-bottom: 1px solid var(--border-primary);
51
+ padding-bottom: 0.5rem;
52
+ }
53
+ ul {
54
+ list-style-type: none;
55
+ padding: 0;
56
+ }
57
+ li {
58
+ margin: 0;
59
+ border-bottom: 1px solid var(--border-primary);
60
+ }
61
+ li:last-child {
62
+ border-bottom: none;
63
+ }
64
+ a {
65
+ display: block;
66
+ padding: 0.75rem 0.5rem;
67
+ text-decoration: none;
68
+ color: var(--text-link);
69
+ transition: background 0.2s ease;
70
+ }
71
+ a:hover {
72
+ background: var(--bg-secondary);
73
+ }
74
+ .dir {
75
+ font-weight: 500;
76
+ }
77
+ </style>
78
+ </head>
79
+ <body>
80
+ <div class='controls'>
81
+ <a href='../index.html' class='back-button'>← back</a>
82
+ </div>
83
+ <h1>Index of /causal_conv1d/impls</h1>
84
+ <ul>
85
+ <li><a href='hf_kernels_causal_conv1d.html' class='file'>hf_kernels_causal_conv1d.html</a></li>
86
+ <li><a href='torch_causal_conv1d.html' class='file'>torch_causal_conv1d.html</a></li>
87
+ </ul>
88
+ </body>
89
+ </html>
causal_conv1d/impls/torch_causal_conv1d.html ADDED
The diff for this file is too large to render. See raw diff
 
causal_conv1d/index.html ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <meta charset='UTF-8'>
5
+ <meta name='viewport' content='width=device-width, initial-scale=1.0'>
6
+ <title>Index of /causal_conv1d</title>
7
+ <style>
8
+ :root {
9
+ --bg-primary: #0a0a0a;
10
+ --bg-secondary: #121212;
11
+ --bg-tertiary: #181818;
12
+ --text-primary: #e0e0e0;
13
+ --text-secondary: #888888;
14
+ --text-link: #64b5f6;
15
+ --border-primary: #2a2a2a;
16
+ }
17
+ body {
18
+ font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
19
+ background: var(--bg-primary);
20
+ color: var(--text-primary);
21
+ margin: 0;
22
+ padding: 16px;
23
+ max-width: 900px;
24
+ margin: 0 auto;
25
+ }
26
+ .controls {
27
+ display: flex;
28
+ justify-content: flex-end;
29
+ margin-bottom: 1rem;
30
+ }
31
+ .back-button {
32
+ background: var(--bg-secondary);
33
+ border: 1px solid var(--border-primary);
34
+ padding: 8px 12px;
35
+ border-radius: 4px;
36
+ color: var(--text-secondary);
37
+ cursor: pointer;
38
+ font-size: 0.9rem;
39
+ text-decoration: none;
40
+ display: inline-block;
41
+ }
42
+ .back-button:hover {
43
+ color: var(--text-primary);
44
+ background: var(--bg-tertiary);
45
+ }
46
+ h1 {
47
+ font-size: 1.5em;
48
+ margin: 1rem 0;
49
+ color: var(--text-primary);
50
+ border-bottom: 1px solid var(--border-primary);
51
+ padding-bottom: 0.5rem;
52
+ }
53
+ ul {
54
+ list-style-type: none;
55
+ padding: 0;
56
+ }
57
+ li {
58
+ margin: 0;
59
+ border-bottom: 1px solid var(--border-primary);
60
+ }
61
+ li:last-child {
62
+ border-bottom: none;
63
+ }
64
+ a {
65
+ display: block;
66
+ padding: 0.75rem 0.5rem;
67
+ text-decoration: none;
68
+ color: var(--text-link);
69
+ transition: background 0.2s ease;
70
+ }
71
+ a:hover {
72
+ background: var(--bg-secondary);
73
+ }
74
+ .dir {
75
+ font-weight: 500;
76
+ }
77
+ </style>
78
+ </head>
79
+ <body>
80
+ <div class='controls'>
81
+ <a href='../index.html' class='back-button'>← back</a>
82
+ </div>
83
+ <h1>Index of /causal_conv1d</h1>
84
+ <ul>
85
+ <li><a href='impls/index.html' class='dir'>impls/</a></li>
86
+ <li><a href='results/index.html' class='dir'>results/</a></li>
87
+ </ul>
88
+ </body>
89
+ </html>
causal_conv1d/results/artifacts/combine/latency.svg ADDED

Git LFS Details

  • SHA256: a640783c4d5cb4dc1763b97fa9a3e0cf2d278599a3fc38ba2056846c760ec8fe
  • Pointer size: 130 Bytes
  • Size of remote file: 35.4 kB
causal_conv1d/results/cells/combine.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # /// script
2
+ # requires-python = ">=3.10"
3
+ # dependencies = [
4
+ # "numpy",
5
+ # "torch==2.8.0",
6
+ # "kernels-benchmark-tools",
7
+ # "matplotlib",
8
+ # ]
9
+ #
10
+ # [tool.uv.sources]
11
+ # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
12
+ # ///
13
+ from kernels_benchmark_tools.core.visuals import generate_combined_results
14
+
15
+ # Map display names to uvnote environment variables
16
+ cache_env_map = {
17
+ "HF Kernels Causal Conv1D": "UVNOTE_FILE_HF_KERNELS_CAUSAL_CONV1D_BENCHMARK",
18
+ "PyTorch Causal Conv1D": "UVNOTE_FILE_TORCH_CAUSAL_CONV1D_BENCHMARK",
19
+ }
20
+
21
+ # Generate combined results with visualization
22
+ generate_combined_results(
23
+ cache_env_map=cache_env_map,
24
+ output_filename="causal_conv1d.jsonl",
25
+ svg_filename="latency.svg"
26
+ )
causal_conv1d/results/combined_results.html ADDED
The diff for this file is too large to render. See raw diff
 
causal_conv1d/results/index.html ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <meta charset='UTF-8'>
5
+ <meta name='viewport' content='width=device-width, initial-scale=1.0'>
6
+ <title>Index of /causal_conv1d/results</title>
7
+ <style>
8
+ :root {
9
+ --bg-primary: #0a0a0a;
10
+ --bg-secondary: #121212;
11
+ --bg-tertiary: #181818;
12
+ --text-primary: #e0e0e0;
13
+ --text-secondary: #888888;
14
+ --text-link: #64b5f6;
15
+ --border-primary: #2a2a2a;
16
+ }
17
+ body {
18
+ font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
19
+ background: var(--bg-primary);
20
+ color: var(--text-primary);
21
+ margin: 0;
22
+ padding: 16px;
23
+ max-width: 900px;
24
+ margin: 0 auto;
25
+ }
26
+ .controls {
27
+ display: flex;
28
+ justify-content: flex-end;
29
+ margin-bottom: 1rem;
30
+ }
31
+ .back-button {
32
+ background: var(--bg-secondary);
33
+ border: 1px solid var(--border-primary);
34
+ padding: 8px 12px;
35
+ border-radius: 4px;
36
+ color: var(--text-secondary);
37
+ cursor: pointer;
38
+ font-size: 0.9rem;
39
+ text-decoration: none;
40
+ display: inline-block;
41
+ }
42
+ .back-button:hover {
43
+ color: var(--text-primary);
44
+ background: var(--bg-tertiary);
45
+ }
46
+ h1 {
47
+ font-size: 1.5em;
48
+ margin: 1rem 0;
49
+ color: var(--text-primary);
50
+ border-bottom: 1px solid var(--border-primary);
51
+ padding-bottom: 0.5rem;
52
+ }
53
+ ul {
54
+ list-style-type: none;
55
+ padding: 0;
56
+ }
57
+ li {
58
+ margin: 0;
59
+ border-bottom: 1px solid var(--border-primary);
60
+ }
61
+ li:last-child {
62
+ border-bottom: none;
63
+ }
64
+ a {
65
+ display: block;
66
+ padding: 0.75rem 0.5rem;
67
+ text-decoration: none;
68
+ color: var(--text-link);
69
+ transition: background 0.2s ease;
70
+ }
71
+ a:hover {
72
+ background: var(--bg-secondary);
73
+ }
74
+ .dir {
75
+ font-weight: 500;
76
+ }
77
+ </style>
78
+ </head>
79
+ <body>
80
+ <div class='controls'>
81
+ <a href='../index.html' class='back-button'>← back</a>
82
+ </div>
83
+ <h1>Index of /causal_conv1d/results</h1>
84
+ <ul>
85
+ <li><a href='combined_results.html' class='file'>combined_results.html</a></li>
86
+ </ul>
87
+ </body>
88
+ </html>
flash_attn/impls/artifacts/benchmark/attention.jsonl CHANGED
@@ -1,6 +1,6 @@
1
- {"ts": "2025-10-27T14:46:25Z", "run": "a317de5ffd144e2d917a7f8d53507ad4", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_12c766386675beb4' has no attribute 'fwd'"}}
2
- {"ts": "2025-10-27T14:46:25Z", "run": "a317de5ffd144e2d917a7f8d53507ad4", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_12c766386675beb4' has no attribute 'fwd'"}}
3
- {"ts": "2025-10-27T14:46:25Z", "run": "a317de5ffd144e2d917a7f8d53507ad4", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_12c766386675beb4' has no attribute 'fwd'"}}
4
- {"ts": "2025-10-27T14:46:25Z", "run": "a317de5ffd144e2d917a7f8d53507ad4", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_12c766386675beb4' has no attribute 'fwd'"}}
5
- {"ts": "2025-10-27T14:46:25Z", "run": "a317de5ffd144e2d917a7f8d53507ad4", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_12c766386675beb4' has no attribute 'fwd'"}}
6
- {"ts": "2025-10-27T14:46:25Z", "run": "a317de5ffd144e2d917a7f8d53507ad4", "impl": "sage_int8_fp16", "tags": {"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": null, "compile_ms": null, "peak_bytes": null, "ok": false, "absmax": null, "corr": {}, "err": {"type": "AttributeError", "msg": "module 'sage_attention_12c766386675beb4' has no attribute 'fwd'"}}
 
1
+ {"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.9106109999947876, "p50": 0.9171110000352201, "p90": 0.9204320000435473, "mean": 0.9179216000347878, "iqr": 0.005419999979494605, "raw_times": [0.9171110000352201, 0.9150120000640527, 0.9106109999947876, 0.9204320000435473, 0.9264420000363316], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.9176309999929799, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
+ {"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.9576329999845257, "p50": 0.960063999968952, "p90": 0.9623629999850891, "mean": 0.9611931999643275, "iqr": 0.0033900000744324643, "raw_times": [0.9589729999106567, 0.9576329999845257, 0.960063999968952, 0.9669329999724141, 0.9623629999850891], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.9673530000782193, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
+ {"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0019650000003821, "p50": 1.0193159999971613, "p90": 1.0211459999709405, "mean": 1.015251600006195, "iqr": 0.01198099994326185, "raw_times": [1.0019650000003821, 1.0091650000276786, 1.024666000034813, 1.0193159999971613, 1.0211459999709405], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.009233999980097, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
+ {"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0176959999625979, "p50": 1.0199849999708022, "p90": 1.025695000066662, "mean": 1.0218714000075124, "iqr": 0.006820000066909415, "raw_times": [1.0271060000377474, 1.0176959999625979, 1.0188749999997526, 1.0199849999708022, 1.025695000066662], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.027405000058934, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
+ {"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.1665810000067722, "p50": 1.1845809999613266, "p90": 1.185440999961429, "mean": 1.1787729999923613, "iqr": 0.01419000000169035, "raw_times": [1.1712509999597387, 1.1665810000067722, 1.18601100007254, 1.1845809999613266, 1.185440999961429], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1787800000320203, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
6
+ {"ts": "2025-10-28T14:08:53Z", "run": "5a355db38c804430a5f59b9c06c46f52", "impl": "hf_kernels_flash_attn3", "tags": {"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.1722899999995207, "p50": 1.1832310000272628, "p90": 1.1854509999693619, "mean": 1.181276799979969, "iqr": 0.008630000024822948, "raw_times": [1.1885909999591604, 1.1854509999693619, 1.176820999944539, 1.1832310000272628, 1.1722899999995207], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.1782799999764393, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.000362396240234375, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/cells/benchmark.py CHANGED
@@ -3,9 +3,8 @@
3
  # dependencies = [
4
  # "numpy",
5
  # "torch==2.8.0",
6
- # "kernels",
7
  # "kernels-benchmark-tools",
8
- # "sageattention",
9
  # ]
10
  #
11
  # [tool.uv.sources]
@@ -16,18 +15,17 @@ import sys
16
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
17
  from kernels import get_kernel
18
 
19
- # Load the sage attention kernel
20
- hf_kernels_sage_attn = get_kernel("kernels-community/sage_attention")
21
 
22
 
23
- def sage_attention(query, key, value):
24
- """SageAttention with INT8 Q/K quantization and FP16 P/V"""
25
- return hf_kernels_sage_attn.fwd(query, key, value, is_causal=False)[0]
26
 
27
 
28
  run_benchmark(
29
  kernel_type=KernelTypeEnum.ATTENTION,
30
- impl_name="sage_int8_fp16",
31
- impl_tags={"family": "sageattention", "backend": "int8_fp16_cuda", "compile": "none"},
32
- impl_func=sage_attention,
33
  )
 
3
  # dependencies = [
4
  # "numpy",
5
  # "torch==2.8.0",
 
6
  # "kernels-benchmark-tools",
7
+ # "kernels",
8
  # ]
9
  #
10
  # [tool.uv.sources]
 
15
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
16
  from kernels import get_kernel
17
 
18
+ # Load the flash attention 3 kernel
19
+ hf_kernels_flash_attn3 = get_kernel("kernels-community/flash-attn3")
20
 
21
 
22
+ def hf_flash_attention3(query, key, value):
23
+ return hf_kernels_flash_attn3.flash_attn_func(query, key, value, causal=False)[0]
 
24
 
25
 
26
  run_benchmark(
27
  kernel_type=KernelTypeEnum.ATTENTION,
28
+ impl_name="hf_kernels_flash_attn3",
29
+ impl_tags={"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"},
30
+ impl_func=hf_flash_attention3,
31
  )
flash_attn/impls/flash_attention.html CHANGED
@@ -3888,7 +3888,7 @@ Cell: nv | 0.26s
3888
  </div>
3889
  </div>
3890
  <div id="output-nv" class="cell-output">
3891
- <div class="cell-stdout"><pre class="stdout-text">Mon Oct 27 14:45:45 2025
3892
  +-----------------------------------------------------------------------------------------+
3893
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3894
  |-----------------------------------------+------------------------+----------------------+
@@ -3897,7 +3897,7 @@ Cell: nv | 0.26s
3897
  | | | MIG M. |
3898
  |=========================================+========================+======================|
3899
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3900
- | N/A 31C P0 135W / 350W | 0MiB / 46068MiB | 100% Default |
3901
  | | | N/A |
3902
  +-----------------------------------------+------------------------+----------------------+
3903
 
@@ -3921,7 +3921,7 @@ Cell: nv | 0.26s
3921
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3922
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3923
  </span> |
3924
- Cell: benchmark | 3.87s
3925
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3926
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3927
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3972,29 +3972,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L128_bfloat16
3972
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3973
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3974
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3975
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.610ms 101.76% 3.610ms 3.610ms 1
3976
- torch_flash_ma 6.54% 340.396us 46.01% 2.394ms 2.394ms 0.000us 0.00% 3.588ms 3.588ms 1
3977
- aten::scaled_dot_product_attention 0.84% 43.810us 4.24% 220.593us 73.531us 0.000us 0.00% 2.829ms 943.091us 3
3978
- aten::_scaled_dot_product_flash_attention 0.51% 26.609us 3.40% 176.783us 58.928us 0.000us 0.00% 2.829ms 943.091us 3
3979
- aten::_flash_attention_forward 0.74% 38.381us 2.45% 127.692us 42.564us 2.829ms 79.74% 2.829ms 943.091us 3
3980
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.829ms 79.74% 2.829ms 943.091us 3
3981
- aten::contiguous 0.29% 15.001us 33.86% 1.762ms 146.802us 0.000us 0.00% 759.072us 63.256us 12
3982
- aten::clone 0.76% 39.432us 33.57% 1.747ms 145.552us 0.000us 0.00% 759.072us 63.256us 12
3983
- aten::copy_ 1.71% 88.801us 31.26% 1.626ms 135.534us 718.688us 20.26% 759.072us 63.256us 12
3984
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 718.688us 20.26% 718.688us 59.891us 12
3985
- Activity Buffer Request 27.68% 1.440ms 27.68% 1.440ms 1.440ms 40.384us 1.14% 40.384us 40.384us 1
3986
- aten::transpose 1.34% 69.973us 1.80% 93.503us 3.896us 0.000us 0.00% 0.000us 0.000us 24
3987
- aten::as_strided 0.45% 23.530us 0.45% 23.530us 0.980us 0.000us 0.00% 0.000us 0.000us 24
3988
- aten::empty_like 0.50% 25.908us 1.97% 102.319us 6.821us 0.000us 0.00% 0.000us 0.000us 15
3989
- aten::empty 1.75% 91.041us 1.75% 91.041us 3.793us 0.000us 0.00% 0.000us 0.000us 24
3990
- cudaLaunchKernel 2.36% 123.031us 2.36% 123.031us 8.202us 0.000us 0.00% 0.000us 0.000us 15
3991
- aten::empty_strided 0.31% 16.010us 0.31% 16.010us 5.337us 0.000us 0.00% 0.000us 0.000us 3
3992
- cudaDeviceGetAttribute 0.05% 2.700us 0.05% 2.700us 0.450us 0.000us 0.00% 0.000us 0.000us 6
3993
- cudaFuncSetAttribute 0.17% 8.980us 0.17% 8.980us 2.993us 0.000us 0.00% 0.000us 0.000us 3
3994
- cudaDeviceSynchronize 53.99% 2.809ms 53.99% 2.809ms 2.809ms 0.000us 0.00% 0.000us 0.000us 1
3995
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3996
- Self CPU time total: 5.203ms
3997
- Self CUDA time total: 3.548ms
3998
 
3999
 
4000
 
@@ -4004,29 +4004,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L256_bfloat16
4004
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4005
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4006
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4007
- torch_flash_ma 5.17% 272.917us 42.06% 2.218ms 2.218ms 0.000us 0.00% 3.821ms 3.821ms 1
4008
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.777ms 100.28% 3.777ms 3.777ms 1
4009
- aten::scaled_dot_product_attention 0.53% 27.761us 3.55% 187.333us 62.444us 0.000us 0.00% 3.004ms 1.001ms 3
4010
- aten::_scaled_dot_product_flash_attention 0.37% 19.492us 3.03% 159.572us 53.191us 0.000us 0.00% 3.004ms 1.001ms 3
4011
- aten::_flash_attention_forward 0.75% 39.549us 2.23% 117.371us 39.124us 3.004ms 79.75% 3.004ms 1.001ms 3
4012
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.004ms 79.75% 3.004ms 1.001ms 3
4013
- aten::contiguous 0.20% 10.320us 32.06% 1.691ms 140.876us 0.000us 0.00% 817.314us 68.110us 12
4014
- aten::clone 0.55% 29.048us 31.86% 1.680ms 140.016us 0.000us 0.00% 817.314us 68.110us 12
4015
- aten::copy_ 1.64% 86.662us 30.11% 1.588ms 132.347us 762.658us 20.25% 817.314us 68.110us 12
4016
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 762.658us 20.25% 762.658us 63.555us 12
4017
- Activity Buffer Request 26.84% 1.415ms 26.84% 1.415ms 1.415ms 54.656us 1.45% 54.656us 54.656us 1
4018
- aten::transpose 1.36% 71.528us 1.71% 90.179us 3.757us 0.000us 0.00% 0.000us 0.000us 24
4019
- aten::as_strided 0.35% 18.651us 0.35% 18.651us 0.777us 0.000us 0.00% 0.000us 0.000us 24
4020
- aten::empty_like 0.38% 19.801us 1.55% 81.840us 5.456us 0.000us 0.00% 0.000us 0.000us 15
4021
- aten::empty 1.46% 77.040us 1.46% 77.040us 3.210us 0.000us 0.00% 0.000us 0.000us 24
4022
- cudaLaunchKernel 2.07% 108.973us 2.07% 108.973us 7.265us 0.000us 0.00% 0.000us 0.000us 15
4023
- aten::empty_strided 0.26% 13.940us 0.26% 13.940us 4.647us 0.000us 0.00% 0.000us 0.000us 3
4024
- cudaDeviceGetAttribute 0.06% 2.910us 0.06% 2.910us 0.485us 0.000us 0.00% 0.000us 0.000us 6
4025
- cudaFuncSetAttribute 0.08% 4.240us 0.08% 4.240us 1.413us 0.000us 0.00% 0.000us 0.000us 3
4026
- cudaDeviceSynchronize 57.94% 3.056ms 57.94% 3.056ms 3.056ms 0.000us 0.00% 0.000us 0.000us 1
4027
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4028
- Self CPU time total: 5.274ms
4029
- Self CUDA time total: 3.767ms
4030
 
4031
 
4032
 
@@ -4036,29 +4036,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L320_bfloat16
4036
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4037
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4038
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4039
- torch_flash_ma 4.99% 269.576us 41.89% 2.262ms 2.262ms 0.000us 0.00% 3.875ms 3.875ms 1
4040
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.827ms 100.29% 3.827ms 3.827ms 1
4041
- aten::scaled_dot_product_attention 0.50% 27.011us 3.47% 187.262us 62.421us 0.000us 0.00% 3.037ms 1.012ms 3
4042
- aten::_scaled_dot_product_flash_attention 0.35% 18.851us 2.97% 160.251us 53.417us 0.000us 0.00% 3.037ms 1.012ms 3
4043
- aten::_flash_attention_forward 0.72% 39.000us 2.20% 118.550us 39.517us 3.037ms 79.57% 3.037ms 1.012ms 3
4044
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.037ms 79.57% 3.037ms 1.012ms 3
4045
- aten::contiguous 0.18% 9.780us 32.51% 1.755ms 146.253us 0.000us 0.00% 838.461us 69.872us 12
4046
- aten::clone 0.54% 29.119us 32.32% 1.745ms 145.438us 0.000us 0.00% 838.461us 69.872us 12
4047
- aten::copy_ 1.56% 84.200us 30.52% 1.648ms 137.328us 779.741us 20.43% 838.461us 69.872us 12
4048
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 779.741us 20.43% 779.741us 64.978us 12
4049
- Activity Buffer Request 27.41% 1.480ms 27.41% 1.480ms 1.480ms 58.720us 1.54% 58.720us 58.720us 1
4050
- aten::transpose 1.00% 54.180us 1.34% 72.500us 3.021us 0.000us 0.00% 0.000us 0.000us 24
4051
- aten::as_strided 0.34% 18.320us 0.34% 18.320us 0.763us 0.000us 0.00% 0.000us 0.000us 24
4052
- aten::empty_like 0.36% 19.560us 1.66% 89.381us 5.959us 0.000us 0.00% 0.000us 0.000us 15
4053
- aten::empty 1.53% 82.821us 1.53% 82.821us 3.451us 0.000us 0.00% 0.000us 0.000us 24
4054
- cudaLaunchKernel 1.99% 107.272us 1.99% 107.272us 7.151us 0.000us 0.00% 0.000us 0.000us 15
4055
- aten::empty_strided 0.30% 16.380us 0.30% 16.380us 5.460us 0.000us 0.00% 0.000us 0.000us 3
4056
- cudaDeviceGetAttribute 0.03% 1.850us 0.03% 1.850us 0.308us 0.000us 0.00% 0.000us 0.000us 6
4057
- cudaFuncSetAttribute 0.07% 3.830us 0.07% 3.830us 1.277us 0.000us 0.00% 0.000us 0.000us 3
4058
- cudaDeviceSynchronize 58.11% 3.138ms 58.11% 3.138ms 3.138ms 0.000us 0.00% 0.000us 0.000us 1
4059
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4060
- Self CPU time total: 5.399ms
4061
- Self CUDA time total: 3.817ms
4062
 
4063
 
4064
 
@@ -4068,29 +4068,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L384_bfloat16
4068
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4069
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4070
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4071
- torch_flash_ma 4.76% 268.853us 43.13% 2.435ms 2.435ms 0.000us 0.00% 3.964ms 3.964ms 1
4072
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.917ms 100.30% 3.917ms 3.917ms 1
4073
- aten::scaled_dot_product_attention 0.49% 27.720us 3.46% 195.333us 65.111us 0.000us 0.00% 3.118ms 1.039ms 3
4074
- aten::_scaled_dot_product_flash_attention 0.34% 19.471us 2.97% 167.613us 55.871us 0.000us 0.00% 3.118ms 1.039ms 3
4075
- aten::_flash_attention_forward 0.70% 39.530us 2.23% 125.742us 41.914us 3.118ms 79.84% 3.118ms 1.039ms 3
4076
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.118ms 79.84% 3.118ms 1.039ms 3
4077
- aten::contiguous 0.17% 9.719us 34.03% 1.921ms 160.116us 0.000us 0.00% 845.599us 70.467us 12
4078
- aten::clone 0.52% 29.239us 33.85% 1.912ms 159.306us 0.000us 0.00% 845.599us 70.467us 12
4079
- aten::copy_ 1.54% 86.910us 32.19% 1.818ms 151.460us 787.167us 20.16% 845.599us 70.467us 12
4080
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 787.167us 20.16% 787.167us 65.597us 12
4081
- Activity Buffer Request 25.41% 1.435ms 25.41% 1.435ms 1.435ms 58.432us 1.50% 58.432us 58.432us 1
4082
- aten::transpose 0.96% 54.080us 1.28% 72.141us 3.006us 0.000us 0.00% 0.000us 0.000us 24
4083
- aten::as_strided 0.32% 18.061us 0.32% 18.061us 0.753us 0.000us 0.00% 0.000us 0.000us 24
4084
- aten::empty_like 0.35% 19.512us 1.49% 84.134us 5.609us 0.000us 0.00% 0.000us 0.000us 15
4085
- aten::empty 1.53% 86.581us 1.53% 86.581us 3.608us 0.000us 0.00% 0.000us 0.000us 24
4086
- cudaLaunchKernel 5.66% 319.547us 5.66% 319.547us 21.303us 0.000us 0.00% 0.000us 0.000us 15
4087
- aten::empty_strided 0.26% 14.430us 0.26% 14.430us 4.810us 0.000us 0.00% 0.000us 0.000us 3
4088
- cudaDeviceGetAttribute 0.05% 2.740us 0.05% 2.740us 0.457us 0.000us 0.00% 0.000us 0.000us 6
4089
- cudaFuncSetAttribute 0.07% 4.201us 0.07% 4.201us 1.400us 0.000us 0.00% 0.000us 0.000us 3
4090
- cudaDeviceSynchronize 56.87% 3.211ms 56.87% 3.211ms 3.211ms 0.000us 0.00% 0.000us 0.000us 1
4091
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4092
- Self CPU time total: 5.647ms
4093
- Self CUDA time total: 3.906ms
4094
 
4095
 
4096
 
@@ -4100,29 +4100,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L448_bfloat16
4100
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4101
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4102
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4103
- torch_flash_ma 5.25% 320.614us 40.80% 2.490ms 2.490ms 0.000us 0.00% 4.428ms 4.428ms 1
4104
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.377ms 100.25% 4.377ms 4.377ms 1
4105
- aten::scaled_dot_product_attention 0.44% 26.800us 3.27% 199.713us 66.571us 0.000us 0.00% 3.558ms 1.186ms 3
4106
- aten::_scaled_dot_product_flash_attention 0.32% 19.239us 2.83% 172.913us 57.638us 0.000us 0.00% 3.558ms 1.186ms 3
4107
- aten::_flash_attention_forward 0.64% 38.816us 2.13% 129.963us 43.321us 3.558ms 81.48% 3.558ms 1.186ms 3
4108
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.558ms 81.48% 3.558ms 1.186ms 3
4109
- aten::contiguous 0.17% 10.568us 31.48% 1.922ms 160.138us 0.000us 0.00% 870.015us 72.501us 12
4110
- aten::clone 0.48% 29.552us 31.31% 1.911ms 159.257us 0.000us 0.00% 870.015us 72.501us 12
4111
- aten::copy_ 1.37% 83.622us 29.71% 1.813ms 151.123us 808.479us 18.52% 870.015us 72.501us 12
4112
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 808.479us 18.52% 808.479us 67.373us 12
4113
- Activity Buffer Request 24.07% 1.469ms 24.07% 1.469ms 1.469ms 61.536us 1.41% 61.536us 61.536us 1
4114
- aten::transpose 0.88% 53.494us 1.18% 71.893us 2.996us 0.000us 0.00% 0.000us 0.000us 24
4115
- aten::as_strided 0.30% 18.399us 0.30% 18.399us 0.767us 0.000us 0.00% 0.000us 0.000us 24
4116
- aten::empty_like 0.45% 27.388us 1.61% 98.450us 6.563us 0.000us 0.00% 0.000us 0.000us 15
4117
- aten::empty 1.35% 82.243us 1.35% 82.243us 3.427us 0.000us 0.00% 0.000us 0.000us 24
4118
- cudaLaunchKernel 4.68% 285.943us 4.68% 285.943us 19.063us 0.000us 0.00% 0.000us 0.000us 15
4119
- aten::empty_strided 0.29% 17.820us 0.29% 17.820us 5.940us 0.000us 0.00% 0.000us 0.000us 3
4120
- cudaDeviceGetAttribute 0.04% 2.328us 0.04% 2.328us 0.388us 0.000us 0.00% 0.000us 0.000us 6
4121
- cudaFuncSetAttribute 0.07% 4.078us 0.07% 4.078us 1.359us 0.000us 0.00% 0.000us 0.000us 3
4122
- cudaDeviceSynchronize 59.20% 3.614ms 59.20% 3.614ms 3.614ms 0.000us 0.00% 0.000us 0.000us 1
4123
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4124
- Self CPU time total: 6.104ms
4125
- Self CUDA time total: 4.366ms
4126
 
4127
 
4128
 
@@ -4132,38 +4132,38 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L512_bfloat16
4132
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4133
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4134
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4135
- torch_flash_ma 4.45% 272.752us 38.96% 2.390ms 2.390ms 0.000us 0.00% 4.517ms 4.517ms 1
4136
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.467ms 100.24% 4.467ms 4.467ms 1
4137
- aten::scaled_dot_product_attention 0.45% 27.641us 3.22% 197.213us 65.738us 0.000us 0.00% 3.636ms 1.212ms 3
4138
- aten::_scaled_dot_product_flash_attention 0.32% 19.841us 2.76% 169.572us 56.524us 0.000us 0.00% 3.636ms 1.212ms 3
4139
- aten::_flash_attention_forward 0.71% 43.282us 2.06% 126.092us 42.031us 3.636ms 81.58% 3.636ms 1.212ms 3
4140
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.636ms 81.58% 3.636ms 1.212ms 3
4141
- aten::contiguous 0.18% 11.069us 30.46% 1.869ms 155.711us 0.000us 0.00% 881.085us 73.424us 12
4142
- aten::clone 0.50% 30.953us 30.28% 1.857ms 154.789us 0.000us 0.00% 881.085us 73.424us 12
4143
- aten::copy_ 1.39% 85.529us 28.66% 1.758ms 146.482us 820.670us 18.42% 881.085us 73.424us 12
4144
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 820.670us 18.42% 820.670us 68.389us 12
4145
- Activity Buffer Request 23.40% 1.435ms 23.40% 1.435ms 1.435ms 60.415us 1.36% 60.415us 60.415us 1
4146
- aten::transpose 0.92% 56.138us 1.22% 75.130us 3.130us 0.000us 0.00% 0.000us 0.000us 24
4147
- aten::as_strided 0.31% 18.992us 0.31% 18.992us 0.791us 0.000us 0.00% 0.000us 0.000us 24
4148
- aten::empty_like 0.33% 20.287us 1.48% 90.810us 6.054us 0.000us 0.00% 0.000us 0.000us 15
4149
- aten::empty 1.36% 83.613us 1.36% 83.613us 3.484us 0.000us 0.00% 0.000us 0.000us 24
4150
- cudaLaunchKernel 4.26% 261.175us 4.26% 261.175us 17.412us 0.000us 0.00% 0.000us 0.000us 15
4151
- aten::empty_strided 0.28% 17.260us 0.28% 17.260us 5.753us 0.000us 0.00% 0.000us 0.000us 3
4152
- cudaDeviceGetAttribute 0.03% 1.850us 0.03% 1.850us 0.308us 0.000us 0.00% 0.000us 0.000us 6
4153
- cudaFuncSetAttribute 0.07% 4.250us 0.07% 4.250us 1.417us 0.000us 0.00% 0.000us 0.000us 3
4154
- cudaDeviceSynchronize 61.04% 3.744ms 61.04% 3.744ms 3.744ms 0.000us 0.00% 0.000us 0.000us 1
4155
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4156
- Self CPU time total: 6.134ms
4157
- Self CUDA time total: 4.456ms
4158
 
4159
 
4160
  impl wl p50(ms) ok
4161
  torch_flash_ma cuda_attn_L128_bfloat16 1.22 True
4162
  torch_flash_ma cuda_attn_L256_bfloat16 1.27 True
4163
- torch_flash_ma cuda_attn_L320_bfloat16 1.31 True
4164
- torch_flash_ma cuda_attn_L384_bfloat16 1.34 True
4165
- torch_flash_ma cuda_attn_L448_bfloat16 1.48 True
4166
- torch_flash_ma cuda_attn_L512_bfloat16 1.52 True
4167
  </pre></div>
4168
  <div class="cell-artifacts">
4169
  <h4>Artifacts:</h4>
 
3888
  </div>
3889
  </div>
3890
  <div id="output-nv" class="cell-output">
3891
+ <div class="cell-stdout"><pre class="stdout-text">Tue Oct 28 14:08:39 2025
3892
  +-----------------------------------------------------------------------------------------+
3893
  | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 |
3894
  |-----------------------------------------+------------------------+----------------------+
 
3897
  | | | MIG M. |
3898
  |=========================================+========================+======================|
3899
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3900
+ | N/A 32C P0 153W / 350W | 0MiB / 46068MiB | 26% Default |
3901
  | | | N/A |
3902
  +-----------------------------------------+------------------------+----------------------+
3903
 
 
3921
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3922
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3923
  </span> |
3924
+ Cell: benchmark | 3.83s
3925
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3926
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3927
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3972
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3973
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3974
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3975
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.585ms 101.47% 3.585ms 3.585ms 1
3976
+ torch_flash_ma 6.34% 327.656us 45.53% 2.352ms 2.352ms 0.000us 0.00% 3.573ms 3.573ms 1
3977
+ aten::scaled_dot_product_attention 0.82% 42.312us 4.12% 213.057us 71.019us 0.000us 0.00% 2.820ms 940.062us 3
3978
+ aten::_scaled_dot_product_flash_attention 0.51% 26.321us 3.31% 170.745us 56.915us 0.000us 0.00% 2.820ms 940.062us 3
3979
+ aten::_flash_attention_forward 0.73% 37.527us 2.40% 124.015us 41.338us 2.820ms 79.83% 2.820ms 940.062us 3
3980
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.820ms 79.83% 2.820ms 940.062us 3
3981
+ aten::contiguous 0.27% 14.121us 33.79% 1.745ms 145.446us 0.000us 0.00% 752.928us 62.744us 12
3982
+ aten::clone 0.72% 37.329us 33.52% 1.731ms 144.269us 0.000us 0.00% 752.928us 62.744us 12
3983
+ aten::copy_ 1.68% 87.013us 31.25% 1.614ms 134.513us 712.672us 20.17% 752.928us 62.744us 12
3984
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 712.672us 20.17% 712.672us 59.389us 12
3985
+ Activity Buffer Request 27.64% 1.428ms 27.64% 1.428ms 1.428ms 40.256us 1.14% 40.256us 40.256us 1
3986
+ aten::transpose 1.24% 64.087us 1.67% 86.009us 3.584us 0.000us 0.00% 0.000us 0.000us 24
3987
+ aten::as_strided 0.42% 21.922us 0.42% 21.922us 0.913us 0.000us 0.00% 0.000us 0.000us 24
3988
+ aten::empty_like 0.48% 24.711us 1.99% 102.775us 6.852us 0.000us 0.00% 0.000us 0.000us 15
3989
+ aten::empty 1.74% 89.843us 1.74% 89.843us 3.743us 0.000us 0.00% 0.000us 0.000us 24
3990
+ cudaLaunchKernel 2.38% 122.771us 2.38% 122.771us 8.185us 0.000us 0.00% 0.000us 0.000us 15
3991
+ aten::empty_strided 0.34% 17.310us 0.34% 17.310us 5.770us 0.000us 0.00% 0.000us 0.000us 3
3992
+ cudaDeviceGetAttribute 0.04% 2.229us 0.04% 2.229us 0.372us 0.000us 0.00% 0.000us 0.000us 6
3993
+ cudaFuncSetAttribute 0.17% 8.900us 0.17% 8.900us 2.967us 0.000us 0.00% 0.000us 0.000us 3
3994
+ cudaDeviceSynchronize 54.47% 2.814ms 54.47% 2.814ms 2.814ms 0.000us 0.00% 0.000us 0.000us 1
3995
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3996
+ Self CPU time total: 5.165ms
3997
+ Self CUDA time total: 3.533ms
3998
 
3999
 
4000
 
 
4004
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4005
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4006
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4007
+ torch_flash_ma 4.84% 255.079us 41.49% 2.188ms 2.188ms 0.000us 0.00% 3.787ms 3.787ms 1
4008
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.743ms 100.29% 3.743ms 3.743ms 1
4009
+ aten::scaled_dot_product_attention 0.47% 24.640us 3.42% 180.356us 60.119us 0.000us 0.00% 2.967ms 989.106us 3
4010
+ aten::_scaled_dot_product_flash_attention 0.36% 19.241us 2.95% 155.716us 51.905us 0.000us 0.00% 2.967ms 989.106us 3
4011
+ aten::_flash_attention_forward 0.73% 38.683us 2.19% 115.525us 38.508us 2.967ms 79.51% 2.967ms 989.106us 3
4012
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.967ms 79.51% 2.967ms 989.106us 3
4013
+ aten::contiguous 0.17% 8.802us 32.41% 1.709ms 142.425us 0.000us 0.00% 819.868us 68.322us 12
4014
+ aten::clone 0.52% 27.349us 32.24% 1.700ms 141.692us 0.000us 0.00% 819.868us 68.322us 12
4015
+ aten::copy_ 1.56% 82.061us 30.60% 1.614ms 134.473us 764.892us 20.49% 819.868us 68.322us 12
4016
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 764.892us 20.49% 764.892us 63.741us 12
4017
+ Activity Buffer Request 27.50% 1.450ms 27.50% 1.450ms 1.450ms 54.976us 1.47% 54.976us 54.976us 1
4018
+ aten::transpose 0.91% 47.959us 1.22% 64.512us 2.688us 0.000us 0.00% 0.000us 0.000us 24
4019
+ aten::as_strided 0.31% 16.553us 0.31% 16.553us 0.690us 0.000us 0.00% 0.000us 0.000us 24
4020
+ aten::empty_like 0.39% 20.732us 1.52% 80.304us 5.354us 0.000us 0.00% 0.000us 0.000us 15
4021
+ aten::empty 1.38% 72.972us 1.38% 72.972us 3.040us 0.000us 0.00% 0.000us 0.000us 24
4022
+ cudaLaunchKernel 1.96% 103.146us 1.96% 103.146us 6.876us 0.000us 0.00% 0.000us 0.000us 15
4023
+ aten::empty_strided 0.28% 14.880us 0.28% 14.880us 4.960us 0.000us 0.00% 0.000us 0.000us 3
4024
+ cudaDeviceGetAttribute 0.03% 1.800us 0.03% 1.800us 0.300us 0.000us 0.00% 0.000us 0.000us 6
4025
+ cudaFuncSetAttribute 0.07% 3.830us 0.07% 3.830us 1.277us 0.000us 0.00% 0.000us 0.000us 3
4026
+ cudaDeviceSynchronize 58.51% 3.085ms 58.51% 3.085ms 3.085ms 0.000us 0.00% 0.000us 0.000us 1
4027
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4028
+ Self CPU time total: 5.273ms
4029
+ Self CUDA time total: 3.732ms
4030
 
4031
 
4032
 
 
4036
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4037
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4038
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4039
+ torch_flash_ma 4.77% 251.162us 41.45% 2.184ms 2.184ms 0.000us 0.00% 3.786ms 3.786ms 1
4040
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.738ms 100.28% 3.738ms 3.738ms 1
4041
+ aten::scaled_dot_product_attention 0.46% 24.280us 3.42% 180.086us 60.029us 0.000us 0.00% 2.949ms 982.872us 3
4042
+ aten::_scaled_dot_product_flash_attention 0.34% 18.160us 2.96% 155.806us 51.935us 0.000us 0.00% 2.949ms 982.872us 3
4043
+ aten::_flash_attention_forward 0.73% 38.599us 2.20% 115.865us 38.622us 2.949ms 79.09% 2.949ms 982.872us 3
4044
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.949ms 79.09% 2.949ms 982.872us 3
4045
+ aten::contiguous 0.17% 8.991us 32.44% 1.710ms 142.465us 0.000us 0.00% 837.719us 69.810us 12
4046
+ aten::clone 0.53% 27.728us 32.27% 1.701ms 141.715us 0.000us 0.00% 837.719us 69.810us 12
4047
+ aten::copy_ 1.52% 79.873us 30.57% 1.611ms 134.242us 779.480us 20.91% 837.719us 69.810us 12
4048
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 779.480us 20.91% 779.480us 64.957us 12
4049
+ Activity Buffer Request 27.50% 1.449ms 27.50% 1.449ms 1.449ms 58.239us 1.56% 58.239us 58.239us 1
4050
+ aten::transpose 0.92% 48.219us 1.24% 65.252us 2.719us 0.000us 0.00% 0.000us 0.000us 24
4051
+ aten::as_strided 0.32% 17.033us 0.32% 17.033us 0.710us 0.000us 0.00% 0.000us 0.000us 24
4052
+ aten::empty_like 0.37% 19.303us 1.55% 81.795us 5.453us 0.000us 0.00% 0.000us 0.000us 15
4053
+ aten::empty 1.44% 76.031us 1.44% 76.031us 3.168us 0.000us 0.00% 0.000us 0.000us 24
4054
+ cudaLaunchKernel 1.98% 104.564us 1.98% 104.564us 6.971us 0.000us 0.00% 0.000us 0.000us 15
4055
+ aten::empty_strided 0.28% 14.492us 0.28% 14.492us 4.831us 0.000us 0.00% 0.000us 0.000us 3
4056
+ cudaDeviceGetAttribute 0.04% 1.860us 0.04% 1.860us 0.310us 0.000us 0.00% 0.000us 0.000us 6
4057
+ cudaFuncSetAttribute 0.10% 5.030us 0.10% 5.030us 1.677us 0.000us 0.00% 0.000us 0.000us 3
4058
+ cudaDeviceSynchronize 58.55% 3.085ms 58.55% 3.085ms 3.085ms 0.000us 0.00% 0.000us 0.000us 1
4059
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4060
+ Self CPU time total: 5.269ms
4061
+ Self CUDA time total: 3.728ms
4062
 
4063
 
4064
 
 
4068
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4069
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4070
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4071
+ torch_flash_ma 5.01% 280.573us 44.17% 2.475ms 2.475ms 0.000us 0.00% 3.878ms 3.878ms 1
4072
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.831ms 100.27% 3.831ms 3.831ms 1
4073
+ aten::scaled_dot_product_attention 0.48% 26.630us 3.39% 189.956us 63.319us 0.000us 0.00% 3.032ms 1.011ms 3
4074
+ aten::_scaled_dot_product_flash_attention 0.34% 19.101us 2.91% 163.326us 54.442us 0.000us 0.00% 3.032ms 1.011ms 3
4075
+ aten::_flash_attention_forward 0.70% 39.063us 2.15% 120.325us 40.108us 3.032ms 79.37% 3.032ms 1.011ms 3
4076
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.032ms 79.37% 3.032ms 1.011ms 3
4077
+ aten::contiguous 0.17% 9.271us 34.98% 1.960ms 163.354us 0.000us 0.00% 845.820us 70.485us 12
4078
+ aten::clone 0.52% 28.974us 34.82% 1.951ms 162.581us 0.000us 0.00% 845.820us 70.485us 12
4079
+ aten::copy_ 1.48% 83.180us 33.17% 1.859ms 154.908us 788.284us 20.63% 845.820us 70.485us 12
4080
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 788.284us 20.63% 788.284us 65.690us 12
4081
+ Activity Buffer Request 26.18% 1.467ms 26.18% 1.467ms 1.467ms 57.536us 1.51% 57.536us 57.536us 1
4082
+ aten::transpose 0.89% 50.110us 1.21% 67.952us 2.831us 0.000us 0.00% 0.000us 0.000us 24
4083
+ aten::as_strided 0.32% 17.842us 0.32% 17.842us 0.743us 0.000us 0.00% 0.000us 0.000us 24
4084
+ aten::empty_like 0.36% 19.969us 1.53% 85.492us 5.699us 0.000us 0.00% 0.000us 0.000us 15
4085
+ aten::empty 1.37% 76.982us 1.37% 76.982us 3.208us 0.000us 0.00% 0.000us 0.000us 24
4086
+ cudaLaunchKernel 5.95% 333.480us 5.95% 333.480us 22.232us 0.000us 0.00% 0.000us 0.000us 15
4087
+ aten::empty_strided 0.30% 17.041us 0.30% 17.041us 5.680us 0.000us 0.00% 0.000us 0.000us 3
4088
+ cudaDeviceGetAttribute 0.03% 1.700us 0.03% 1.700us 0.283us 0.000us 0.00% 0.000us 0.000us 6
4089
+ cudaFuncSetAttribute 0.07% 4.040us 0.07% 4.040us 1.347us 0.000us 0.00% 0.000us 0.000us 3
4090
+ cudaDeviceSynchronize 55.83% 3.129ms 55.83% 3.129ms 3.129ms 0.000us 0.00% 0.000us 0.000us 1
4091
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4092
+ Self CPU time total: 5.603ms
4093
+ Self CUDA time total: 3.820ms
4094
 
4095
 
4096
 
 
4100
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4101
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4102
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4103
+ torch_flash_ma 5.07% 303.893us 39.93% 2.395ms 2.395ms 0.000us 0.00% 4.370ms 4.370ms 1
4104
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.320ms 100.25% 4.320ms 4.320ms 1
4105
+ aten::scaled_dot_product_attention 0.41% 24.650us 3.07% 184.006us 61.335us 0.000us 0.00% 3.503ms 1.168ms 3
4106
+ aten::_scaled_dot_product_flash_attention 0.32% 19.311us 2.66% 159.356us 53.119us 0.000us 0.00% 3.503ms 1.168ms 3
4107
+ aten::_flash_attention_forward 0.68% 40.911us 1.97% 118.205us 39.402us 3.503ms 81.28% 3.503ms 1.168ms 3
4108
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.503ms 81.28% 3.503ms 1.168ms 3
4109
+ aten::contiguous 0.15% 8.977us 31.04% 1.862ms 155.201us 0.000us 0.00% 867.581us 72.298us 12
4110
+ aten::clone 0.47% 28.114us 30.89% 1.853ms 154.453us 0.000us 0.00% 867.581us 72.298us 12
4111
+ aten::copy_ 1.36% 81.500us 29.40% 1.764ms 146.991us 806.749us 18.72% 867.581us 72.298us 12
4112
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 806.749us 18.72% 806.749us 67.229us 12
4113
+ Activity Buffer Request 23.82% 1.429ms 23.82% 1.429ms 1.429ms 60.832us 1.41% 60.832us 60.832us 1
4114
+ aten::transpose 0.82% 49.363us 1.11% 66.863us 2.786us 0.000us 0.00% 0.000us 0.000us 24
4115
+ aten::as_strided 0.29% 17.500us 0.29% 17.500us 0.729us 0.000us 0.00% 0.000us 0.000us 24
4116
+ aten::empty_like 0.33% 20.081us 1.37% 82.424us 5.495us 0.000us 0.00% 0.000us 0.000us 15
4117
+ aten::empty 1.26% 75.593us 1.26% 75.593us 3.150us 0.000us 0.00% 0.000us 0.000us 24
4118
+ cudaLaunchKernel 4.60% 275.759us 4.60% 275.759us 18.384us 0.000us 0.00% 0.000us 0.000us 15
4119
+ aten::empty_strided 0.25% 15.251us 0.25% 15.251us 5.084us 0.000us 0.00% 0.000us 0.000us 3
4120
+ cudaDeviceGetAttribute 0.03% 1.740us 0.03% 1.740us 0.290us 0.000us 0.00% 0.000us 0.000us 6
4121
+ cudaFuncSetAttribute 0.06% 3.680us 0.06% 3.680us 1.227us 0.000us 0.00% 0.000us 0.000us 3
4122
+ cudaDeviceSynchronize 60.07% 3.604ms 60.07% 3.604ms 3.604ms 0.000us 0.00% 0.000us 0.000us 1
4123
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4124
+ Self CPU time total: 5.999ms
4125
+ Self CUDA time total: 4.309ms
4126
 
4127
 
4128
 
 
4132
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4133
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4134
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4135
+ torch_flash_ma 3.83% 232.270us 37.82% 2.296ms 2.296ms 0.000us 0.00% 4.474ms 4.474ms 1
4136
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.423ms 100.25% 4.423ms 4.423ms 1
4137
+ aten::scaled_dot_product_attention 0.41% 24.850us 2.85% 172.746us 57.582us 0.000us 0.00% 3.595ms 1.198ms 3
4138
+ aten::_scaled_dot_product_flash_attention 0.30% 18.250us 2.44% 147.896us 49.299us 0.000us 0.00% 3.595ms 1.198ms 3
4139
+ aten::_flash_attention_forward 0.54% 32.692us 1.77% 107.224us 35.741us 3.595ms 81.48% 3.595ms 1.198ms 3
4140
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.595ms 81.48% 3.595ms 1.198ms 3
4141
+ aten::contiguous 0.14% 8.610us 30.41% 1.846ms 153.859us 0.000us 0.00% 878.139us 73.178us 12
4142
+ aten::clone 0.45% 27.368us 30.27% 1.838ms 153.142us 0.000us 0.00% 878.139us 73.178us 12
4143
+ aten::copy_ 1.35% 81.917us 28.83% 1.750ms 145.831us 817.083us 18.52% 878.139us 73.178us 12
4144
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 817.083us 18.52% 817.083us 68.090us 12
4145
+ Activity Buffer Request 23.72% 1.440ms 23.72% 1.440ms 1.440ms 61.056us 1.38% 61.056us 61.056us 1
4146
+ aten::transpose 0.82% 50.064us 1.10% 66.792us 2.783us 0.000us 0.00% 0.000us 0.000us 24
4147
+ aten::as_strided 0.28% 16.728us 0.28% 16.728us 0.697us 0.000us 0.00% 0.000us 0.000us 24
4148
+ aten::empty_like 0.32% 19.431us 1.31% 79.591us 5.306us 0.000us 0.00% 0.000us 0.000us 15
4149
+ aten::empty 1.21% 73.220us 1.21% 73.220us 3.051us 0.000us 0.00% 0.000us 0.000us 24
4150
+ cudaLaunchKernel 4.12% 249.950us 4.12% 249.950us 16.663us 0.000us 0.00% 0.000us 0.000us 15
4151
+ aten::empty_strided 0.24% 14.270us 0.24% 14.270us 4.757us 0.000us 0.00% 0.000us 0.000us 3
4152
+ cudaDeviceGetAttribute 0.03% 1.680us 0.03% 1.680us 0.280us 0.000us 0.00% 0.000us 0.000us 6
4153
+ cudaFuncSetAttribute 0.07% 4.380us 0.07% 4.380us 1.460us 0.000us 0.00% 0.000us 0.000us 3
4154
+ cudaDeviceSynchronize 62.18% 3.775ms 62.18% 3.775ms 3.775ms 0.000us 0.00% 0.000us 0.000us 1
4155
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4156
+ Self CPU time total: 6.071ms
4157
+ Self CUDA time total: 4.413ms
4158
 
4159
 
4160
  impl wl p50(ms) ok
4161
  torch_flash_ma cuda_attn_L128_bfloat16 1.22 True
4162
  torch_flash_ma cuda_attn_L256_bfloat16 1.27 True
4163
+ torch_flash_ma cuda_attn_L320_bfloat16 1.28 True
4164
+ torch_flash_ma cuda_attn_L384_bfloat16 1.31 True
4165
+ torch_flash_ma cuda_attn_L448_bfloat16 1.47 True
4166
+ torch_flash_ma cuda_attn_L512_bfloat16 1.50 True
4167
  </pre></div>
4168
  <div class="cell-artifacts">
4169
  <h4>Artifacts:</h4>
flash_attn/impls/hf_kernels_flash_attn.html CHANGED
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: benchmark | 35.44s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3926,21 +3926,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16
3926
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3927
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3928
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3929
- hf_kernels_flash_attn 3.89% 173.532us 41.54% 1.852ms 1.852ms 0.000us 0.00% 3.821ms 3.821ms 1
3930
- _flash_attn_9e27194::fwd 1.71% 76.382us 37.65% 1.679ms 559.513us 2.851ms 100.00% 3.821ms 1.274ms 3
3931
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.852ms 100.05% 2.852ms 2.852ms 1
3932
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.851ms 100.00% 2.851ms 950.289us 3
3933
- Activity Buffer Request 32.53% 1.450ms 32.53% 1.450ms 1.450ms 970.364us 34.04% 970.364us 970.364us 1
3934
- cudaDeviceGetAttribute 0.10% 4.520us 0.10% 4.520us 0.301us 0.000us 0.00% 0.000us 0.000us 15
3935
- aten::empty_like 0.46% 20.440us 1.29% 57.461us 19.154us 0.000us 0.00% 0.000us 0.000us 3
3936
- aten::empty_strided 0.83% 37.021us 0.83% 37.021us 12.340us 0.000us 0.00% 0.000us 0.000us 3
3937
- aten::empty 0.76% 33.730us 0.76% 33.730us 3.748us 0.000us 0.00% 0.000us 0.000us 9
3938
- cudaFuncSetAttribute 0.29% 12.870us 0.29% 12.870us 4.290us 0.000us 0.00% 0.000us 0.000us 3
3939
- cudaLaunchKernel 0.97% 43.280us 0.97% 43.280us 14.427us 0.000us 0.00% 0.000us 0.000us 3
3940
- cudaDeviceSynchronize 58.46% 2.606ms 58.46% 2.606ms 2.606ms 0.000us 0.00% 0.000us 0.000us 1
3941
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3942
- Self CPU time total: 4.458ms
3943
- Self CUDA time total: 2.851ms
3944
 
3945
 
3946
 
@@ -3950,21 +3950,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16
3950
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3951
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3952
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3953
- hf_kernels_flash_attn 2.32% 104.162us 37.24% 1.676ms 1.676ms 0.000us 0.00% 4.000ms 4.000ms 1
3954
- _flash_attn_9e27194::fwd 1.05% 47.052us 34.93% 1.571ms 523.812us 2.988ms 100.00% 4.000ms 1.333ms 3
3955
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.989ms 100.04% 2.989ms 2.989ms 1
3956
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.988ms 100.00% 2.988ms 995.942us 3
3957
- Activity Buffer Request 32.02% 1.441ms 32.02% 1.441ms 1.441ms 1.012ms 33.87% 1.012ms 1.012ms 1
3958
- cudaDeviceGetAttribute 0.10% 4.331us 0.10% 4.331us 0.289us 0.000us 0.00% 0.000us 0.000us 15
3959
- aten::empty_like 0.16% 7.210us 0.52% 23.350us 7.783us 0.000us 0.00% 0.000us 0.000us 3
3960
- aten::empty_strided 0.36% 16.140us 0.36% 16.140us 5.380us 0.000us 0.00% 0.000us 0.000us 3
3961
- aten::empty 0.47% 21.320us 0.47% 21.320us 2.369us 0.000us 0.00% 0.000us 0.000us 9
3962
- cudaFuncSetAttribute 0.10% 4.349us 0.10% 4.349us 1.450us 0.000us 0.00% 0.000us 0.000us 3
3963
- cudaLaunchKernel 0.67% 30.329us 0.67% 30.329us 10.110us 0.000us 0.00% 0.000us 0.000us 3
3964
- cudaDeviceSynchronize 62.76% 2.824ms 62.76% 2.824ms 2.824ms 0.000us 0.00% 0.000us 0.000us 1
3965
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3966
- Self CPU time total: 4.499ms
3967
- Self CUDA time total: 2.988ms
3968
 
3969
 
3970
 
@@ -3974,21 +3974,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16
3974
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3975
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3976
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3977
- hf_kernels_flash_attn 2.58% 116.241us 37.17% 1.677ms 1.677ms 0.000us 0.00% 4.040ms 4.040ms 1
3978
- _flash_attn_9e27194::fwd 1.11% 49.909us 34.60% 1.561ms 520.326us 3.012ms 100.00% 4.040ms 1.347ms 3
3979
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.013ms 100.04% 3.013ms 3.013ms 1
3980
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.012ms 100.00% 3.012ms 1.004ms 3
3981
- Activity Buffer Request 31.60% 1.426ms 31.60% 1.426ms 1.426ms 1.029ms 34.16% 1.029ms 1.029ms 1
3982
- cudaDeviceGetAttribute 0.08% 3.801us 0.08% 3.801us 0.253us 0.000us 0.00% 0.000us 0.000us 15
3983
- aten::empty_like 0.18% 8.151us 0.55% 24.960us 8.320us 0.000us 0.00% 0.000us 0.000us 3
3984
- aten::empty_strided 0.37% 16.809us 0.37% 16.809us 5.603us 0.000us 0.00% 0.000us 0.000us 3
3985
- aten::empty 0.47% 21.201us 0.47% 21.201us 2.356us 0.000us 0.00% 0.000us 0.000us 9
3986
- cudaFuncSetAttribute 0.09% 3.950us 0.09% 3.950us 1.317us 0.000us 0.00% 0.000us 0.000us 3
3987
- cudaLaunchKernel 0.69% 31.260us 0.69% 31.260us 10.420us 0.000us 0.00% 0.000us 0.000us 3
3988
- cudaDeviceSynchronize 62.83% 2.835ms 62.83% 2.835ms 2.835ms 0.000us 0.00% 0.000us 0.000us 1
3989
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3990
- Self CPU time total: 4.512ms
3991
- Self CUDA time total: 3.012ms
3992
 
3993
 
3994
 
@@ -3998,21 +3998,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16
3998
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3999
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4000
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4001
- hf_kernels_flash_attn 2.01% 99.212us 38.53% 1.898ms 1.898ms 0.000us 0.00% 4.264ms 4.264ms 1
4002
- _flash_attn_9e27194::fwd 1.06% 52.152us 36.51% 1.799ms 599.723us 3.190ms 100.00% 4.264ms 1.421ms 3
4003
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.191ms 100.05% 3.191ms 3.191ms 1
4004
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.190ms 100.00% 3.190ms 1.063ms 3
4005
- Activity Buffer Request 28.82% 1.420ms 28.82% 1.420ms 1.420ms 1.074ms 33.68% 1.074ms 1.074ms 1
4006
- cudaDeviceGetAttribute 0.09% 4.479us 0.09% 4.479us 0.299us 0.000us 0.00% 0.000us 0.000us 15
4007
- aten::empty_like 0.16% 7.900us 0.54% 26.470us 8.823us 0.000us 0.00% 0.000us 0.000us 3
4008
- aten::empty_strided 0.38% 18.570us 0.38% 18.570us 6.190us 0.000us 0.00% 0.000us 0.000us 3
4009
- aten::empty 0.46% 22.430us 0.46% 22.430us 2.492us 0.000us 0.00% 0.000us 0.000us 9
4010
- cudaFuncSetAttribute 0.08% 3.830us 0.08% 3.830us 1.277us 0.000us 0.00% 0.000us 0.000us 3
4011
- cudaLaunchKernel 5.47% 269.763us 5.47% 269.763us 89.921us 0.000us 0.00% 0.000us 0.000us 3
4012
- cudaDeviceSynchronize 61.47% 3.029ms 61.47% 3.029ms 3.029ms 0.000us 0.00% 0.000us 0.000us 1
4013
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4014
- Self CPU time total: 4.928ms
4015
- Self CUDA time total: 3.190ms
4016
 
4017
 
4018
 
@@ -4022,21 +4022,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16
4022
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4023
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4024
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4025
- hf_kernels_flash_attn 2.16% 88.971us 14.91% 614.057us 614.057us 0.000us 0.00% 4.875ms 4.875ms 1
4026
- _flash_attn_9e27194::fwd 1.23% 50.539us 12.75% 525.086us 175.029us 3.652ms 100.00% 4.875ms 1.625ms 3
4027
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.653ms 100.04% 3.653ms 3.653ms 1
4028
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.652ms 100.00% 3.652ms 1.217ms 3
4029
- Activity Buffer Request 5.08% 209.112us 5.08% 209.112us 209.112us 1.223ms 33.50% 1.223ms 1.223ms 1
4030
- cudaDeviceGetAttribute 0.10% 3.960us 0.10% 3.960us 0.264us 0.000us 0.00% 0.000us 0.000us 15
4031
- aten::empty_like 0.19% 7.749us 0.60% 24.700us 8.233us 0.000us 0.00% 0.000us 0.000us 3
4032
- aten::empty_strided 0.41% 16.951us 0.41% 16.951us 5.650us 0.000us 0.00% 0.000us 0.000us 3
4033
- aten::empty 0.54% 22.121us 0.54% 22.121us 2.458us 0.000us 0.00% 0.000us 0.000us 9
4034
- cudaFuncSetAttribute 0.10% 4.190us 0.10% 4.190us 1.397us 0.000us 0.00% 0.000us 0.000us 3
4035
- cudaLaunchKernel 5.11% 210.464us 5.11% 210.464us 70.155us 0.000us 0.00% 0.000us 0.000us 3
4036
- cudaDeviceSynchronize 85.09% 3.504ms 85.09% 3.504ms 3.504ms 0.000us 0.00% 0.000us 0.000us 1
4037
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4038
- Self CPU time total: 4.118ms
4039
- Self CUDA time total: 3.652ms
4040
 
4041
 
4042
 
@@ -4046,88 +4046,41 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16
4046
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4047
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4048
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4049
- hf_kernels_flash_attn 2.23% 91.402us 14.65% 600.857us 600.857us 0.000us 0.00% 4.881ms 4.881ms 1
4050
- _flash_attn_9e27194::fwd 1.15% 47.191us 12.42% 509.455us 169.818us 3.654ms 100.00% 4.881ms 1.627ms 3
4051
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.655ms 100.04% 3.655ms 3.655ms 1
4052
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.654ms 100.00% 3.654ms 1.218ms 3
4053
- Activity Buffer Request 5.38% 220.623us 5.38% 220.623us 220.623us 1.227ms 33.59% 1.227ms 1.227ms 1
4054
- cudaDeviceGetAttribute 0.09% 3.601us 0.09% 3.601us 0.240us 0.000us 0.00% 0.000us 0.000us 15
4055
- aten::empty_like 0.18% 7.230us 0.58% 23.840us 7.947us 0.000us 0.00% 0.000us 0.000us 3
4056
- aten::empty_strided 0.40% 16.610us 0.40% 16.610us 5.537us 0.000us 0.00% 0.000us 0.000us 3
4057
- aten::empty 0.51% 20.851us 0.51% 20.851us 2.317us 0.000us 0.00% 0.000us 0.000us 9
4058
- cudaFuncSetAttribute 0.09% 3.688us 0.09% 3.688us 1.229us 0.000us 0.00% 0.000us 0.000us 3
4059
- cudaLaunchKernel 4.62% 189.661us 4.62% 189.661us 63.220us 0.000us 0.00% 0.000us 0.000us 3
4060
- cudaDeviceSynchronize 85.35% 3.502ms 85.35% 3.502ms 3.502ms 0.000us 0.00% 0.000us 0.000us 1
4061
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4062
- Self CPU time total: 4.103ms
4063
- Self CUDA time total: 3.654ms
4064
 
4065
 
4066
  impl wl p50(ms) ok
4067
- hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.98 True
4068
- hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.02 True
4069
- hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.05 True
4070
- hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.07 True
4071
- hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.23 True
4072
- hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.23 True
4073
  </pre></div>
4074
  <div class="uv-install-logs" id="uv-logs-benchmark">
4075
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4076
  <div class="uv-logs-content" style="display: none;">
4077
- Building kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
4078
- Downloading hf-xet (3.2MiB)
4079
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4080
- Downloading networkx (1.9MiB)
4081
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
4082
- Downloading nvidia-nccl-cu12 (307.4MiB)
4083
- Downloading kiwisolver (1.4MiB)
4084
- Downloading pillow (6.7MiB)
4085
- Downloading nvidia-curand-cu12 (60.7MiB)
4086
- Downloading nvidia-cublas-cu12 (566.8MiB)
4087
- Downloading sympy (6.0MiB)
4088
- Downloading setuptools (1.1MiB)
4089
- Downloading matplotlib (8.3MiB)
4090
- Downloading numpy (16.2MiB)
4091
- Downloading triton (148.3MiB)
4092
- Downloading nvidia-cudnn-cu12 (674.0MiB)
4093
- Downloading fonttools (4.7MiB)
4094
- Downloading nvidia-cusparse-cu12 (274.9MiB)
4095
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4096
- Downloading nvidia-cufile-cu12 (1.1MiB)
4097
- Downloading nvidia-cusolver-cu12 (255.1MiB)
4098
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4099
- Downloading nvidia-cufft-cu12 (184.2MiB)
4100
- Downloading torch (846.9MiB)
4101
- Downloading nvidia-cufile-cu12
4102
- Downloading kiwisolver
4103
- Downloading hf-xet
4104
- Downloading setuptools
4105
- Downloading networkx
4106
- Downloading fonttools
4107
- Downloading pillow
4108
- Built kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
4109
- Downloading nvidia-cuda-cupti-cu12
4110
- Downloading matplotlib
4111
- Downloading numpy
4112
- Downloading sympy
4113
- Downloading nvidia-nvjitlink-cu12
4114
- Downloading nvidia-curand-cu12
4115
- Downloading nvidia-cuda-nvrtc-cu12
4116
- Downloading triton
4117
- Downloading nvidia-cufft-cu12
4118
- Downloading nvidia-cusolver-cu12
4119
- Downloading nvidia-cusparselt-cu12
4120
- Downloading nvidia-cusparse-cu12
4121
- Downloading nvidia-nccl-cu12
4122
- Downloading nvidia-cublas-cu12
4123
- Downloading nvidia-cudnn-cu12
4124
- Downloading torch
4125
- Installed 52 packages in 223ms
4126
  </div>
4127
  </div>
4128
  <div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00&lt;?, ?it/s]
4129
- Fetching 20 files: 10%|█ | 2/20 [00:01&lt;00:12, 1.43it/s]
4130
- Fetching 20 files: 100%|██████████| 20/20 [00:01&lt;00:00, 14.34it/s]</div>
 
4131
  <div class="cell-artifacts">
4132
  <h4>Artifacts:</h4>
4133
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
 
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: benchmark | 6.08s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3926
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3927
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3928
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3929
+ hf_kernels_flash_attn 3.64% 160.058us 41.50% 1.823ms 1.823ms 0.000us 0.00% 3.744ms 3.744ms 1
3930
+ _flash_attn_9e27194::fwd 1.78% 78.347us 37.86% 1.663ms 554.208us 2.792ms 100.00% 3.744ms 1.248ms 3
3931
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.794ms 100.05% 2.794ms 2.794ms 1
3932
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.792ms 100.00% 2.792ms 930.800us 3
3933
+ Activity Buffer Request 33.00% 1.449ms 33.00% 1.449ms 1.449ms 951.685us 34.08% 951.685us 951.685us 1
3934
+ cudaDeviceGetAttribute 0.13% 5.638us 0.13% 5.638us 0.376us 0.000us 0.00% 0.000us 0.000us 15
3935
+ aten::empty_like 0.40% 17.551us 1.19% 52.122us 17.374us 0.000us 0.00% 0.000us 0.000us 3
3936
+ aten::empty_strided 0.79% 34.571us 0.79% 34.571us 11.524us 0.000us 0.00% 0.000us 0.000us 3
3937
+ aten::empty 0.57% 24.890us 0.57% 24.890us 2.766us 0.000us 0.00% 0.000us 0.000us 9
3938
+ cudaFuncSetAttribute 0.28% 12.210us 0.28% 12.210us 4.070us 0.000us 0.00% 0.000us 0.000us 3
3939
+ cudaLaunchKernel 0.92% 40.292us 0.92% 40.292us 13.431us 0.000us 0.00% 0.000us 0.000us 3
3940
+ cudaDeviceSynchronize 58.50% 2.569ms 58.50% 2.569ms 2.569ms 0.000us 0.00% 0.000us 0.000us 1
3941
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3942
+ Self CPU time total: 4.392ms
3943
+ Self CUDA time total: 2.792ms
3944
 
3945
 
3946
 
 
3950
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3951
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3952
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3953
+ hf_kernels_flash_attn 2.22% 99.144us 37.48% 1.673ms 1.673ms 0.000us 0.00% 3.949ms 3.949ms 1
3954
+ _flash_attn_9e27194::fwd 1.20% 53.462us 35.26% 1.574ms 524.654us 2.953ms 100.00% 3.949ms 1.316ms 3
3955
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.955ms 100.05% 2.955ms 2.955ms 1
3956
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.953ms 100.00% 2.953ms 984.436us 3
3957
+ Activity Buffer Request 32.23% 1.439ms 32.23% 1.439ms 1.439ms 995.807us 33.72% 995.807us 995.807us 1
3958
+ cudaDeviceGetAttribute 0.10% 4.621us 0.10% 4.621us 0.308us 0.000us 0.00% 0.000us 0.000us 15
3959
+ aten::empty_like 0.17% 7.710us 0.56% 24.861us 8.287us 0.000us 0.00% 0.000us 0.000us 3
3960
+ aten::empty_strided 0.38% 17.151us 0.38% 17.151us 5.717us 0.000us 0.00% 0.000us 0.000us 3
3961
+ aten::empty 0.47% 21.122us 0.47% 21.122us 2.347us 0.000us 0.00% 0.000us 0.000us 9
3962
+ cudaFuncSetAttribute 0.08% 3.791us 0.08% 3.791us 1.264us 0.000us 0.00% 0.000us 0.000us 3
3963
+ cudaLaunchKernel 0.61% 27.380us 0.61% 27.380us 9.127us 0.000us 0.00% 0.000us 0.000us 3
3964
+ cudaDeviceSynchronize 62.52% 2.791ms 62.52% 2.791ms 2.791ms 0.000us 0.00% 0.000us 0.000us 1
3965
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3966
+ Self CPU time total: 4.464ms
3967
+ Self CUDA time total: 2.953ms
3968
 
3969
 
3970
 
 
3974
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3975
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3976
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3977
+ hf_kernels_flash_attn 2.58% 116.955us 37.54% 1.702ms 1.702ms 0.000us 0.00% 4.041ms 4.041ms 1
3978
+ _flash_attn_9e27194::fwd 1.53% 69.255us 34.96% 1.585ms 528.314us 3.010ms 100.00% 4.041ms 1.347ms 3
3979
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.012ms 100.05% 3.012ms 3.012ms 1
3980
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.010ms 100.00% 3.010ms 1.003ms 3
3981
+ Activity Buffer Request 31.53% 1.430ms 31.53% 1.430ms 1.430ms 1.031ms 34.26% 1.031ms 1.031ms 1
3982
+ cudaDeviceGetAttribute 0.10% 4.450us 0.10% 4.450us 0.297us 0.000us 0.00% 0.000us 0.000us 15
3983
+ aten::empty_like 0.18% 8.151us 0.57% 25.801us 8.600us 0.000us 0.00% 0.000us 0.000us 3
3984
+ aten::empty_strided 0.39% 17.650us 0.39% 17.650us 5.883us 0.000us 0.00% 0.000us 0.000us 3
3985
+ aten::empty 0.48% 21.771us 0.48% 21.771us 2.419us 0.000us 0.00% 0.000us 0.000us 9
3986
+ cudaFuncSetAttribute 0.10% 4.360us 0.10% 4.360us 1.453us 0.000us 0.00% 0.000us 0.000us 3
3987
+ cudaLaunchKernel 0.66% 29.790us 0.66% 29.790us 9.930us 0.000us 0.00% 0.000us 0.000us 3
3988
+ cudaDeviceSynchronize 62.46% 2.832ms 62.46% 2.832ms 2.832ms 0.000us 0.00% 0.000us 0.000us 1
3989
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3990
+ Self CPU time total: 4.534ms
3991
+ Self CUDA time total: 3.010ms
3992
 
3993
 
3994
 
 
3998
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3999
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4000
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4001
+ hf_kernels_flash_attn 2.39% 114.805us 40.03% 1.925ms 1.925ms 0.000us 0.00% 4.094ms 4.094ms 1
4002
+ _flash_attn_9e27194::fwd 1.09% 52.653us 37.65% 1.810ms 603.407us 3.063ms 100.00% 4.094ms 1.365ms 3
4003
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.065ms 100.05% 3.065ms 3.065ms 1
4004
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.063ms 100.00% 3.063ms 1.021ms 3
4005
+ Activity Buffer Request 29.78% 1.432ms 29.78% 1.432ms 1.432ms 1.031ms 33.65% 1.031ms 1.031ms 1
4006
+ cudaDeviceGetAttribute 0.10% 4.861us 0.10% 4.861us 0.324us 0.000us 0.00% 0.000us 0.000us 15
4007
+ aten::empty_like 0.16% 7.720us 0.55% 26.331us 8.777us 0.000us 0.00% 0.000us 0.000us 3
4008
+ aten::empty_strided 0.39% 18.611us 0.39% 18.611us 6.204us 0.000us 0.00% 0.000us 0.000us 3
4009
+ aten::empty 0.45% 21.731us 0.45% 21.731us 2.415us 0.000us 0.00% 0.000us 0.000us 9
4010
+ cudaFuncSetAttribute 0.08% 3.728us 0.08% 3.728us 1.243us 0.000us 0.00% 0.000us 0.000us 3
4011
+ cudaLaunchKernel 5.59% 268.862us 5.59% 268.862us 89.621us 0.000us 0.00% 0.000us 0.000us 3
4012
+ cudaDeviceSynchronize 59.97% 2.884ms 59.97% 2.884ms 2.884ms 0.000us 0.00% 0.000us 0.000us 1
4013
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4014
+ Self CPU time total: 4.809ms
4015
+ Self CUDA time total: 3.063ms
4016
 
4017
 
4018
 
 
4022
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4023
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4024
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4025
+ hf_kernels_flash_attn 2.13% 113.755us 35.84% 1.918ms 1.918ms 0.000us 0.00% 4.786ms 4.786ms 1
4026
+ _flash_attn_9e27194::fwd 1.02% 54.483us 33.71% 1.804ms 601.364us 3.588ms 100.00% 4.786ms 1.595ms 3
4027
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.590ms 100.04% 3.590ms 3.590ms 1
4028
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.588ms 100.00% 3.588ms 1.196ms 3
4029
+ Activity Buffer Request 26.99% 1.445ms 26.99% 1.445ms 1.445ms 1.198ms 33.38% 1.198ms 1.198ms 1
4030
+ cudaDeviceGetAttribute 0.08% 4.270us 0.08% 4.270us 0.285us 0.000us 0.00% 0.000us 0.000us 15
4031
+ aten::empty_like 0.15% 8.039us 0.48% 25.640us 8.547us 0.000us 0.00% 0.000us 0.000us 3
4032
+ aten::empty_strided 0.33% 17.601us 0.33% 17.601us 5.867us 0.000us 0.00% 0.000us 0.000us 3
4033
+ aten::empty 0.40% 21.582us 0.40% 21.582us 2.398us 0.000us 0.00% 0.000us 0.000us 9
4034
+ cudaFuncSetAttribute 0.07% 3.700us 0.07% 3.700us 1.233us 0.000us 0.00% 0.000us 0.000us 3
4035
+ cudaLaunchKernel 4.67% 249.891us 4.67% 249.891us 83.297us 0.000us 0.00% 0.000us 0.000us 3
4036
+ cudaDeviceSynchronize 64.16% 3.434ms 64.16% 3.434ms 3.434ms 0.000us 0.00% 0.000us 0.000us 1
4037
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4038
+ Self CPU time total: 5.351ms
4039
+ Self CUDA time total: 3.588ms
4040
 
4041
 
4042
 
 
4046
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4047
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4048
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4049
+ hf_kernels_flash_attn 2.08% 111.044us 35.25% 1.879ms 1.879ms 0.000us 0.00% 4.816ms 4.816ms 1
4050
+ _flash_attn_9e27194::fwd 0.99% 52.834us 33.17% 1.768ms 589.427us 3.606ms 100.00% 4.816ms 1.605ms 3
4051
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.607ms 100.05% 3.607ms 3.607ms 1
4052
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.606ms 100.00% 3.606ms 1.202ms 3
4053
+ Activity Buffer Request 26.56% 1.416ms 26.56% 1.416ms 1.416ms 1.210ms 33.55% 1.210ms 1.210ms 1
4054
+ cudaDeviceGetAttribute 0.08% 4.460us 0.08% 4.460us 0.297us 0.000us 0.00% 0.000us 0.000us 15
4055
+ aten::empty_like 0.14% 7.500us 0.49% 26.051us 8.684us 0.000us 0.00% 0.000us 0.000us 3
4056
+ aten::empty_strided 0.35% 18.551us 0.35% 18.551us 6.184us 0.000us 0.00% 0.000us 0.000us 3
4057
+ aten::empty 0.41% 21.960us 0.41% 21.960us 2.440us 0.000us 0.00% 0.000us 0.000us 9
4058
+ cudaFuncSetAttribute 0.08% 4.009us 0.08% 4.009us 1.336us 0.000us 0.00% 0.000us 0.000us 3
4059
+ cudaLaunchKernel 4.55% 242.792us 4.55% 242.792us 80.931us 0.000us 0.00% 0.000us 0.000us 3
4060
+ cudaDeviceSynchronize 64.75% 3.452ms 64.75% 3.452ms 3.452ms 0.000us 0.00% 0.000us 0.000us 1
4061
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4062
+ Self CPU time total: 5.332ms
4063
+ Self CUDA time total: 3.606ms
4064
 
4065
 
4066
  impl wl p50(ms) ok
4067
+ hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.96 True
4068
+ hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.01 True
4069
+ hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.06 True
4070
+ hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.05 True
4071
+ hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.22 True
4072
+ hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.21 True
4073
  </pre></div>
4074
  <div class="uv-install-logs" id="uv-logs-benchmark">
4075
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4076
  <div class="uv-logs-content" style="display: none;">
4077
+ Installed 15 packages in 13ms
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4078
  </div>
4079
  </div>
4080
  <div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00&lt;?, ?it/s]
4081
+ Fetching 20 files: 5%|▌ | 1/20 [00:00&lt;00:04, 4.26it/s]
4082
+ Fetching 20 files: 10%|█ | 2/20 [00:01&lt;00:17, 1.03it/s]
4083
+ Fetching 20 files: 100%|██████████| 20/20 [00:01&lt;00:00, 11.64it/s]</div>
4084
  <div class="cell-artifacts">
4085
  <h4>Artifacts:</h4>
4086
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
flash_attn/impls/hf_kernels_flash_attn3.html CHANGED
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: benchmark | 5.62s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3925,19 +3925,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16
3925
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3926
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3927
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3928
- hf_kernels_flash_attn3 3.90% 171.143us 44.22% 1.941ms 1.941ms 0.000us 0.00% 3.653ms 3.653ms 1
3929
- FlashAttnFunc 2.92% 128.011us 40.32% 1.769ms 589.788us 0.000us 0.00% 3.653ms 1.218ms 3
3930
- _flash_attn3_48fe103_dirty::fwd 1.90% 83.422us 37.41% 1.641ms 547.118us 2.755ms 100.00% 3.653ms 1.218ms 3
3931
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.756ms 100.05% 2.756ms 2.756ms 1
3932
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.755ms 100.00% 2.755ms 918.306us 3
3933
- Activity Buffer Request 33.13% 1.454ms 33.13% 1.454ms 1.454ms 898.082us 32.60% 898.082us 898.082us 1
3934
- aten::empty 1.02% 44.762us 1.02% 44.762us 7.460us 0.000us 0.00% 0.000us 0.000us 6
3935
- cudaFuncSetAttribute 0.33% 14.660us 0.33% 14.660us 4.887us 0.000us 0.00% 0.000us 0.000us 3
3936
- cudaLaunchKernel 1.02% 44.660us 1.02% 44.660us 14.887us 0.000us 0.00% 0.000us 0.000us 3
3937
- cudaDeviceSynchronize 55.78% 2.447ms 55.78% 2.447ms 2.447ms 0.000us 0.00% 0.000us 0.000us 1
3938
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3939
- Self CPU time total: 4.388ms
3940
- Self CUDA time total: 2.755ms
3941
 
3942
 
3943
 
@@ -3947,19 +3947,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16
3947
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3948
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3949
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3950
- hf_kernels_flash_attn3 2.42% 105.470us 40.03% 1.743ms 1.743ms 0.000us 0.00% 3.784ms 3.784ms 1
3951
- FlashAttnFunc 2.12% 92.121us 37.61% 1.638ms 546.005us 0.000us 0.00% 3.784ms 1.261ms 3
3952
- _flash_attn3_48fe103_dirty::fwd 1.23% 53.460us 35.49% 1.546ms 515.298us 2.836ms 100.00% 3.784ms 1.261ms 3
3953
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.838ms 100.05% 2.838ms 2.838ms 1
3954
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.836ms 100.00% 2.836ms 945.359us 3
3955
- Activity Buffer Request 32.85% 1.431ms 32.85% 1.431ms 1.431ms 947.652us 33.41% 947.652us 947.652us 1
3956
- aten::empty 0.62% 27.052us 0.62% 27.052us 4.509us 0.000us 0.00% 0.000us 0.000us 6
3957
- cudaFuncSetAttribute 0.11% 4.721us 0.11% 4.721us 1.574us 0.000us 0.00% 0.000us 0.000us 3
3958
- cudaLaunchKernel 0.68% 29.730us 0.68% 29.730us 9.910us 0.000us 0.00% 0.000us 0.000us 3
3959
- cudaDeviceSynchronize 59.97% 2.612ms 59.97% 2.612ms 2.612ms 0.000us 0.00% 0.000us 0.000us 1
3960
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3961
- Self CPU time total: 4.355ms
3962
- Self CUDA time total: 2.836ms
3963
 
3964
 
3965
 
@@ -3969,19 +3969,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16
3969
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3970
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3971
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3972
- hf_kernels_flash_attn3 2.34% 104.112us 39.68% 1.767ms 1.767ms 0.000us 0.00% 3.931ms 3.931ms 1
3973
- FlashAttnFunc 2.59% 115.143us 37.35% 1.662ms 554.155us 0.000us 0.00% 3.931ms 1.310ms 3
3974
- _flash_attn3_48fe103_dirty::fwd 1.23% 54.772us 34.76% 1.547ms 515.774us 2.932ms 100.00% 3.931ms 1.310ms 3
3975
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.934ms 100.05% 2.934ms 2.934ms 1
3976
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.932ms 100.00% 2.932ms 977.432us 3
3977
- Activity Buffer Request 32.05% 1.427ms 32.05% 1.427ms 1.427ms 998.487us 34.05% 998.487us 998.487us 1
3978
- aten::empty 0.66% 29.309us 0.66% 29.309us 4.885us 0.000us 0.00% 0.000us 0.000us 6
3979
- cudaFuncSetAttribute 0.11% 4.840us 0.11% 4.840us 1.613us 0.000us 0.00% 0.000us 0.000us 3
3980
- cudaLaunchKernel 0.71% 31.520us 0.71% 31.520us 10.507us 0.000us 0.00% 0.000us 0.000us 3
3981
- cudaDeviceSynchronize 60.32% 2.685ms 60.32% 2.685ms 2.685ms 0.000us 0.00% 0.000us 0.000us 1
3982
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3983
- Self CPU time total: 4.452ms
3984
- Self CUDA time total: 2.932ms
3985
 
3986
 
3987
 
@@ -3991,19 +3991,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16
3991
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3992
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3993
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3994
- hf_kernels_flash_attn3 2.48% 118.391us 41.58% 1.983ms 1.983ms 0.000us 0.00% 4.029ms 4.029ms 1
3995
- FlashAttnFunc 2.00% 95.232us 39.09% 1.865ms 621.579us 0.000us 0.00% 4.029ms 1.343ms 3
3996
- _flash_attn3_48fe103_dirty::fwd 1.18% 56.301us 37.10% 1.770ms 589.835us 3.014ms 100.00% 4.029ms 1.343ms 3
3997
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.016ms 100.06% 3.016ms 3.016ms 1
3998
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.014ms 100.00% 3.014ms 1.005ms 3
3999
- Activity Buffer Request 30.19% 1.440ms 30.19% 1.440ms 1.440ms 1.015ms 33.67% 1.015ms 1.015ms 1
4000
- aten::empty 0.58% 27.710us 0.58% 27.710us 4.618us 0.000us 0.00% 0.000us 0.000us 6
4001
- cudaFuncSetAttribute 0.10% 4.771us 0.10% 4.771us 1.590us 0.000us 0.00% 0.000us 0.000us 3
4002
- cudaLaunchKernel 5.05% 240.873us 5.05% 240.873us 80.291us 0.000us 0.00% 0.000us 0.000us 3
4003
- cudaDeviceSynchronize 58.42% 2.787ms 58.42% 2.787ms 2.787ms 0.000us 0.00% 0.000us 0.000us 1
4004
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4005
- Self CPU time total: 4.770ms
4006
- Self CUDA time total: 3.014ms
4007
 
4008
 
4009
 
@@ -4013,19 +4013,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16
4013
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4014
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4015
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4016
- hf_kernels_flash_attn3 2.45% 127.821us 37.14% 1.937ms 1.937ms 0.000us 0.00% 4.669ms 4.669ms 1
4017
- FlashAttnFunc 1.78% 92.961us 34.69% 1.809ms 603.079us 0.000us 0.00% 4.669ms 1.556ms 3
4018
- _flash_attn3_48fe103_dirty::fwd 0.98% 50.990us 32.91% 1.716ms 572.092us 3.496ms 100.00% 4.669ms 1.556ms 3
4019
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.498ms 100.05% 3.498ms 3.498ms 1
4020
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.496ms 100.00% 3.496ms 1.165ms 3
4021
- Activity Buffer Request 27.66% 1.443ms 27.66% 1.443ms 1.443ms 1.173ms 33.56% 1.173ms 1.173ms 1
4022
- aten::empty 0.56% 28.951us 0.56% 28.951us 4.825us 0.000us 0.00% 0.000us 0.000us 6
4023
- cudaFuncSetAttribute 0.09% 4.870us 0.09% 4.870us 1.623us 0.000us 0.00% 0.000us 0.000us 3
4024
- cudaLaunchKernel 3.62% 188.673us 3.62% 188.673us 62.891us 0.000us 0.00% 0.000us 0.000us 3
4025
- cudaDeviceSynchronize 62.86% 3.279ms 62.86% 3.279ms 3.279ms 0.000us 0.00% 0.000us 0.000us 1
4026
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4027
- Self CPU time total: 5.216ms
4028
- Self CUDA time total: 3.496ms
4029
 
4030
 
4031
 
@@ -4035,33 +4035,33 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16
4035
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4036
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4037
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4038
- hf_kernels_flash_attn3 2.26% 115.651us 36.11% 1.844ms 1.844ms 0.000us 0.00% 4.648ms 4.648ms 1
4039
- FlashAttnFunc 1.78% 91.130us 33.84% 1.728ms 576.085us 0.000us 0.00% 4.648ms 1.549ms 3
4040
- _flash_attn3_48fe103_dirty::fwd 1.06% 54.250us 32.06% 1.637ms 545.708us 3.480ms 100.00% 4.648ms 1.549ms 3
4041
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.481ms 100.04% 3.481ms 3.481ms 1
4042
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.480ms 100.00% 3.480ms 1.160ms 3
4043
- Activity Buffer Request 27.00% 1.379ms 27.00% 1.379ms 1.379ms 1.168ms 33.58% 1.168ms 1.168ms 1
4044
- aten::empty 0.55% 28.142us 0.55% 28.142us 4.690us 0.000us 0.00% 0.000us 0.000us 6
4045
- cudaFuncSetAttribute 0.10% 5.261us 0.10% 5.261us 1.754us 0.000us 0.00% 0.000us 0.000us 3
4046
- cudaLaunchKernel 3.35% 170.883us 3.35% 170.883us 56.961us 0.000us 0.00% 0.000us 0.000us 3
4047
- cudaDeviceSynchronize 63.89% 3.263ms 63.89% 3.263ms 3.263ms 0.000us 0.00% 0.000us 0.000us 1
4048
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4049
- Self CPU time total: 5.107ms
4050
- Self CUDA time total: 3.480ms
4051
 
4052
 
4053
  impl wl p50(ms) ok
4054
- hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.95 True
4055
- hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.98 True
4056
- hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.03 True
4057
- hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.04 True
4058
- hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.21 True
4059
  hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
4060
  </pre></div>
4061
  <div class="cell-stderr">
4062
  Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]
4063
- Fetching 4 files: 50%|█████ | 2/4 [00:01&lt;00:01, 1.33it/s]
4064
- Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00, 2.66it/s]
4065
  </div>
4066
  <div class="cell-artifacts">
4067
  <h4>Artifacts:</h4>
 
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: benchmark | 5.68s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3925
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3926
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3927
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3928
+ hf_kernels_flash_attn3 3.89% 167.076us 44.49% 1.911ms 1.911ms 0.000us 0.00% 3.576ms 3.576ms 1
3929
+ FlashAttnFunc 3.00% 128.934us 40.60% 1.744ms 581.290us 0.000us 0.00% 3.576ms 1.192ms 3
3930
+ _flash_attn3_48fe103_dirty::fwd 1.82% 78.184us 37.60% 1.615ms 538.312us 2.688ms 100.00% 3.576ms 1.192ms 3
3931
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.690ms 100.05% 2.690ms 2.690ms 1
3932
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.688ms 100.00% 2.688ms 896.117us 3
3933
+ Activity Buffer Request 33.29% 1.430ms 33.29% 1.430ms 1.430ms 887.327us 33.01% 887.327us 887.327us 1
3934
+ aten::empty 1.08% 46.281us 1.08% 46.281us 7.714us 0.000us 0.00% 0.000us 0.000us 6
3935
+ cudaFuncSetAttribute 0.37% 15.900us 0.37% 15.900us 5.300us 0.000us 0.00% 0.000us 0.000us 3
3936
+ cudaLaunchKernel 1.04% 44.671us 1.04% 44.671us 14.890us 0.000us 0.00% 0.000us 0.000us 3
3937
+ cudaDeviceSynchronize 55.51% 2.384ms 55.51% 2.384ms 2.384ms 0.000us 0.00% 0.000us 0.000us 1
3938
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3939
+ Self CPU time total: 4.295ms
3940
+ Self CUDA time total: 2.688ms
3941
 
3942
 
3943
 
 
3947
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3948
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3949
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3950
+ hf_kernels_flash_attn3 3.06% 130.754us 41.10% 1.758ms 1.758ms 0.000us 0.00% 3.668ms 3.668ms 1
3951
+ FlashAttnFunc 2.23% 95.572us 38.05% 1.627ms 542.455us 0.000us 0.00% 3.668ms 1.223ms 3
3952
+ _flash_attn3_48fe103_dirty::fwd 1.23% 52.754us 35.81% 1.532ms 510.598us 2.747ms 100.00% 3.668ms 1.223ms 3
3953
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.748ms 100.05% 2.748ms 2.748ms 1
3954
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.747ms 100.00% 2.747ms 915.501us 3
3955
+ Activity Buffer Request 33.10% 1.416ms 33.10% 1.416ms 1.416ms 921.272us 33.54% 921.272us 921.272us 1
3956
+ aten::empty 0.63% 26.890us 0.63% 26.890us 4.482us 0.000us 0.00% 0.000us 0.000us 6
3957
+ cudaFuncSetAttribute 0.12% 4.970us 0.12% 4.970us 1.657us 0.000us 0.00% 0.000us 0.000us 3
3958
+ cudaLaunchKernel 0.73% 31.351us 0.73% 31.351us 10.450us 0.000us 0.00% 0.000us 0.000us 3
3959
+ cudaDeviceSynchronize 58.90% 2.519ms 58.90% 2.519ms 2.519ms 0.000us 0.00% 0.000us 0.000us 1
3960
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3961
+ Self CPU time total: 4.277ms
3962
+ Self CUDA time total: 2.747ms
3963
 
3964
 
3965
 
 
3969
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3970
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3971
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3972
+ hf_kernels_flash_attn3 2.33% 101.653us 39.53% 1.727ms 1.727ms 0.000us 0.00% 3.829ms 3.829ms 1
3973
+ FlashAttnFunc 2.05% 89.593us 37.20% 1.625ms 541.619us 0.000us 0.00% 3.829ms 1.276ms 3
3974
+ _flash_attn3_48fe103_dirty::fwd 1.17% 51.051us 35.15% 1.535ms 511.754us 2.856ms 100.00% 3.829ms 1.276ms 3
3975
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.858ms 100.06% 2.858ms 2.858ms 1
3976
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.856ms 100.00% 2.856ms 952.136us 3
3977
+ Activity Buffer Request 32.54% 1.421ms 32.54% 1.421ms 1.421ms 972.574us 34.05% 972.574us 972.574us 1
3978
+ aten::empty 0.62% 27.231us 0.62% 27.231us 4.538us 0.000us 0.00% 0.000us 0.000us 6
3979
+ cudaFuncSetAttribute 0.12% 5.411us 0.12% 5.411us 1.804us 0.000us 0.00% 0.000us 0.000us 3
3980
+ cudaLaunchKernel 0.69% 30.341us 0.69% 30.341us 10.114us 0.000us 0.00% 0.000us 0.000us 3
3981
+ cudaDeviceSynchronize 60.47% 2.642ms 60.47% 2.642ms 2.642ms 0.000us 0.00% 0.000us 0.000us 1
3982
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3983
+ Self CPU time total: 4.368ms
3984
+ Self CUDA time total: 2.856ms
3985
 
3986
 
3987
 
 
3991
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3992
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3993
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3994
+ hf_kernels_flash_attn3 2.61% 122.474us 42.62% 2.001ms 2.001ms 0.000us 0.00% 3.906ms 3.906ms 1
3995
+ FlashAttnFunc 1.99% 93.683us 40.01% 1.879ms 626.332us 0.000us 0.00% 3.906ms 1.302ms 3
3996
+ _flash_attn3_48fe103_dirty::fwd 1.17% 54.872us 38.02% 1.785ms 595.104us 2.915ms 100.00% 3.906ms 1.302ms 3
3997
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.917ms 100.05% 2.917ms 2.917ms 1
3998
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.915ms 100.00% 2.915ms 971.727us 3
3999
+ Activity Buffer Request 31.11% 1.461ms 31.11% 1.461ms 1.461ms 991.129us 34.00% 991.129us 991.129us 1
4000
+ aten::empty 0.59% 27.622us 0.59% 27.622us 4.604us 0.000us 0.00% 0.000us 0.000us 6
4001
+ cudaFuncSetAttribute 0.12% 5.820us 0.12% 5.820us 1.940us 0.000us 0.00% 0.000us 0.000us 3
4002
+ cudaLaunchKernel 5.03% 236.178us 5.03% 236.178us 78.726us 0.000us 0.00% 0.000us 0.000us 3
4003
+ cudaDeviceSynchronize 57.38% 2.695ms 57.38% 2.695ms 2.695ms 0.000us 0.00% 0.000us 0.000us 1
4004
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4005
+ Self CPU time total: 4.696ms
4006
+ Self CUDA time total: 2.915ms
4007
 
4008
 
4009
 
 
4013
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4014
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4015
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4016
+ hf_kernels_flash_attn3 2.45% 124.235us 37.18% 1.882ms 1.882ms 0.000us 0.00% 4.537ms 4.537ms 1
4017
+ FlashAttnFunc 1.83% 92.522us 34.73% 1.758ms 585.897us 0.000us 0.00% 4.537ms 1.512ms 3
4018
+ _flash_attn3_48fe103_dirty::fwd 1.03% 52.313us 32.90% 1.665ms 555.056us 3.398ms 100.00% 4.537ms 1.512ms 3
4019
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.399ms 100.05% 3.399ms 3.399ms 1
4020
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.398ms 100.00% 3.398ms 1.133ms 3
4021
+ Activity Buffer Request 27.82% 1.408ms 27.82% 1.408ms 1.408ms 1.139ms 33.52% 1.139ms 1.139ms 1
4022
+ aten::empty 0.54% 27.441us 0.54% 27.441us 4.573us 0.000us 0.00% 0.000us 0.000us 6
4023
+ cudaFuncSetAttribute 0.12% 5.839us 0.12% 5.839us 1.946us 0.000us 0.00% 0.000us 0.000us 3
4024
+ cudaLaunchKernel 3.39% 171.646us 3.39% 171.646us 57.215us 0.000us 0.00% 0.000us 0.000us 3
4025
+ cudaDeviceSynchronize 62.82% 3.179ms 62.82% 3.179ms 3.179ms 0.000us 0.00% 0.000us 0.000us 1
4026
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4027
+ Self CPU time total: 5.061ms
4028
+ Self CUDA time total: 3.398ms
4029
 
4030
 
4031
 
 
4035
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4036
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4037
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4038
+ hf_kernels_flash_attn3 2.74% 138.223us 36.95% 1.864ms 1.864ms 0.000us 0.00% 4.557ms 4.557ms 1
4039
+ FlashAttnFunc 1.84% 92.725us 34.21% 1.726ms 575.197us 0.000us 0.00% 4.557ms 1.519ms 3
4040
+ _flash_attn3_48fe103_dirty::fwd 1.03% 52.171us 32.37% 1.633ms 544.289us 3.424ms 100.00% 4.557ms 1.519ms 3
4041
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.425ms 100.04% 3.425ms 3.425ms 1
4042
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.424ms 100.00% 3.424ms 1.141ms 3
4043
+ Activity Buffer Request 27.34% 1.379ms 27.34% 1.379ms 1.379ms 1.133ms 33.10% 1.133ms 1.133ms 1
4044
+ aten::empty 0.57% 28.661us 0.57% 28.661us 4.777us 0.000us 0.00% 0.000us 0.000us 6
4045
+ cudaFuncSetAttribute 0.10% 5.240us 0.10% 5.240us 1.747us 0.000us 0.00% 0.000us 0.000us 3
4046
+ cudaLaunchKernel 3.33% 167.776us 3.33% 167.776us 55.925us 0.000us 0.00% 0.000us 0.000us 3
4047
+ cudaDeviceSynchronize 63.05% 3.181ms 63.05% 3.181ms 3.181ms 0.000us 0.00% 0.000us 0.000us 1
4048
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4049
+ Self CPU time total: 5.045ms
4050
+ Self CUDA time total: 3.424ms
4051
 
4052
 
4053
  impl wl p50(ms) ok
4054
+ hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.92 True
4055
+ hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.96 True
4056
+ hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.02 True
4057
+ hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.02 True
4058
+ hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.18 True
4059
  hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
4060
  </pre></div>
4061
  <div class="cell-stderr">
4062
  Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]
4063
+ Fetching 4 files: 50%|█████ | 2/4 [00:01&lt;00:01, 1.27it/s]
4064
+ Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00, 2.55it/s]
4065
  </div>
4066
  <div class="cell-artifacts">
4067
  <h4>Artifacts:</h4>
flash_attn/impls/mem_efficient_attention.html CHANGED
@@ -3869,9 +3869,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3869
  <span class="collapse-indicators">
3870
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
- <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: benchmark | 4.02s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3924,28 +3924,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L128_bfloat16
3924
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3925
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3926
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3927
- torch_mem_eff 4.61% 329.029us 32.49% 2.320ms 2.320ms 0.000us 0.00% 5.545ms 5.545ms 1
3928
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.524ms 100.54% 5.524ms 5.524ms 1
3929
- aten::scaled_dot_product_attention 0.42% 29.860us 2.75% 196.242us 65.414us 0.000us 0.00% 4.878ms 1.626ms 3
3930
- aten::_scaled_dot_product_efficient_attention 0.35% 25.230us 2.33% 166.382us 55.461us 0.000us 0.00% 4.878ms 1.626ms 3
3931
- aten::_efficient_attention_forward 0.73% 52.049us 1.68% 119.861us 39.954us 4.878ms 88.79% 4.878ms 1.626ms 3
3932
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.878ms 88.79% 4.878ms 1.626ms 3
3933
- aten::contiguous 0.18% 13.143us 24.28% 1.734ms 192.643us 0.000us 0.00% 666.300us 74.033us 9
3934
- aten::clone 0.50% 35.608us 24.09% 1.721ms 191.183us 0.000us 0.00% 666.300us 74.033us 9
3935
- aten::copy_ 1.01% 71.952us 22.59% 1.613ms 179.214us 615.708us 11.21% 666.300us 74.033us 9
3936
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 615.708us 11.21% 615.708us 68.412us 9
3937
- Activity Buffer Request 20.33% 1.452ms 20.33% 1.452ms 1.452ms 50.592us 0.92% 50.592us 50.592us 1
3938
- aten::transpose 0.87% 61.994us 1.16% 82.494us 3.437us 0.000us 0.00% 0.000us 0.000us 24
3939
- aten::as_strided 0.29% 20.500us 0.29% 20.500us 0.854us 0.000us 0.00% 0.000us 0.000us 24
3940
- aten::empty_like 0.25% 17.742us 1.01% 72.112us 8.012us 0.000us 0.00% 0.000us 0.000us 9
3941
- aten::empty 1.17% 83.610us 1.17% 83.610us 3.981us 0.000us 0.00% 0.000us 0.000us 21
3942
- cudaLaunchKernel 1.60% 114.582us 1.60% 114.582us 9.548us 0.000us 0.00% 0.000us 0.000us 12
3943
- cudaStreamIsCapturing 0.04% 3.180us 0.04% 3.180us 1.060us 0.000us 0.00% 0.000us 0.000us 3
3944
- cudaFuncSetAttribute 0.14% 10.280us 0.14% 10.280us 3.427us 0.000us 0.00% 0.000us 0.000us 3
3945
- cudaDeviceSynchronize 67.51% 4.821ms 67.51% 4.821ms 4.821ms 0.000us 0.00% 0.000us 0.000us 1
3946
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3947
- Self CPU time total: 7.141ms
3948
- Self CUDA time total: 5.494ms
3949
 
3950
 
3951
 
@@ -3955,28 +3955,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L256_bfloat16
3955
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3956
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3957
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3958
- torch_mem_eff 3.39% 253.102us 28.13% 2.097ms 2.097ms 0.000us 0.00% 5.972ms 5.972ms 1
3959
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.926ms 100.15% 5.926ms 5.926ms 1
3960
- aten::scaled_dot_product_attention 0.26% 19.190us 1.92% 143.113us 47.704us 0.000us 0.00% 5.278ms 1.759ms 3
3961
- aten::_scaled_dot_product_efficient_attention 0.26% 19.540us 1.66% 123.923us 41.308us 0.000us 0.00% 5.278ms 1.759ms 3
3962
- aten::_efficient_attention_forward 0.37% 27.385us 1.10% 81.652us 27.217us 5.278ms 89.20% 5.278ms 1.759ms 3
3963
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.278ms 89.20% 5.278ms 1.759ms 3
3964
- aten::contiguous 0.09% 6.999us 22.26% 1.660ms 184.423us 0.000us 0.00% 693.503us 77.056us 9
3965
- aten::clone 0.31% 23.031us 22.17% 1.653ms 183.645us 0.000us 0.00% 693.503us 77.056us 9
3966
- aten::copy_ 0.83% 61.989us 21.18% 1.579ms 175.477us 638.911us 10.80% 693.503us 77.056us 9
3967
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 638.911us 10.80% 638.911us 70.990us 9
3968
- Activity Buffer Request 19.45% 1.450ms 19.45% 1.450ms 1.450ms 54.592us 0.92% 54.592us 54.592us 1
3969
- aten::transpose 0.64% 47.641us 0.86% 64.101us 2.671us 0.000us 0.00% 0.000us 0.000us 24
3970
- aten::as_strided 0.22% 16.460us 0.22% 16.460us 0.686us 0.000us 0.00% 0.000us 0.000us 24
3971
- aten::empty_like 0.16% 11.730us 0.68% 50.483us 5.609us 0.000us 0.00% 0.000us 0.000us 9
3972
- aten::empty 0.86% 64.470us 0.86% 64.470us 3.070us 0.000us 0.00% 0.000us 0.000us 21
3973
- cudaLaunchKernel 1.21% 90.240us 1.21% 90.240us 7.520us 0.000us 0.00% 0.000us 0.000us 12
3974
- cudaStreamIsCapturing 0.03% 2.290us 0.03% 2.290us 0.763us 0.000us 0.00% 0.000us 0.000us 3
3975
- cudaFuncSetAttribute 0.04% 3.130us 0.04% 3.130us 1.043us 0.000us 0.00% 0.000us 0.000us 3
3976
- cudaDeviceSynchronize 71.87% 5.359ms 71.87% 5.359ms 5.359ms 0.000us 0.00% 0.000us 0.000us 1
3977
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3978
- Self CPU time total: 7.456ms
3979
- Self CUDA time total: 5.917ms
3980
 
3981
 
3982
 
@@ -3986,28 +3986,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L320_bfloat16
3986
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3987
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3988
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3989
- torch_mem_eff 3.16% 240.823us 26.89% 2.051ms 2.051ms 0.000us 0.00% 6.167ms 6.167ms 1
3990
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.117ms 100.14% 6.117ms 6.117ms 1
3991
- aten::scaled_dot_product_attention 0.24% 18.220us 1.81% 137.732us 45.911us 0.000us 0.00% 5.453ms 1.818ms 3
3992
- aten::_scaled_dot_product_efficient_attention 0.24% 18.402us 1.57% 119.512us 39.837us 0.000us 0.00% 5.453ms 1.818ms 3
3993
- aten::_efficient_attention_forward 0.35% 26.389us 1.04% 79.670us 26.557us 5.453ms 89.28% 5.453ms 1.818ms 3
3994
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.453ms 89.28% 5.453ms 1.818ms 3
3995
- aten::contiguous 0.09% 6.950us 21.38% 1.630ms 181.132us 0.000us 0.00% 713.534us 79.282us 9
3996
- aten::clone 0.28% 21.189us 21.28% 1.623ms 180.360us 0.000us 0.00% 713.534us 79.282us 9
3997
- aten::copy_ 0.81% 62.032us 20.34% 1.551ms 172.330us 655.038us 10.72% 713.534us 79.282us 9
3998
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 655.038us 10.72% 655.038us 72.782us 9
3999
- Activity Buffer Request 18.63% 1.421ms 18.63% 1.421ms 1.421ms 58.496us 0.96% 58.496us 58.496us 1
4000
- aten::transpose 0.62% 47.348us 0.84% 63.699us 2.654us 0.000us 0.00% 0.000us 0.000us 24
4001
- aten::as_strided 0.21% 16.351us 0.21% 16.351us 0.681us 0.000us 0.00% 0.000us 0.000us 24
4002
- aten::empty_like 0.15% 11.091us 0.67% 51.081us 5.676us 0.000us 0.00% 0.000us 0.000us 9
4003
- aten::empty 0.86% 65.760us 0.86% 65.760us 3.131us 0.000us 0.00% 0.000us 0.000us 21
4004
- cudaLaunchKernel 1.18% 89.982us 1.18% 89.982us 7.498us 0.000us 0.00% 0.000us 0.000us 12
4005
- cudaStreamIsCapturing 0.03% 2.210us 0.03% 2.210us 0.737us 0.000us 0.00% 0.000us 0.000us 3
4006
- cudaFuncSetAttribute 0.04% 3.100us 0.04% 3.100us 1.033us 0.000us 0.00% 0.000us 0.000us 3
4007
- cudaDeviceSynchronize 73.11% 5.575ms 73.11% 5.575ms 5.575ms 0.000us 0.00% 0.000us 0.000us 1
4008
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4009
- Self CPU time total: 7.626ms
4010
- Self CUDA time total: 6.108ms
4011
 
4012
 
4013
 
@@ -4017,28 +4017,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L384_bfloat16
4017
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4018
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4019
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4020
- torch_mem_eff 4.44% 356.182us 33.00% 2.648ms 2.648ms 0.000us 0.00% 6.210ms 6.210ms 1
4021
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.165ms 100.21% 6.165ms 6.165ms 1
4022
- aten::scaled_dot_product_attention 0.29% 23.400us 2.31% 185.263us 61.754us 0.000us 0.00% 5.497ms 1.832ms 3
4023
- aten::_scaled_dot_product_efficient_attention 0.29% 23.202us 2.02% 161.863us 53.954us 0.000us 0.00% 5.497ms 1.832ms 3
4024
- aten::_efficient_attention_forward 0.44% 35.239us 1.36% 108.811us 36.270us 5.497ms 89.36% 5.497ms 1.832ms 3
4025
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.497ms 89.36% 5.497ms 1.832ms 3
4026
- aten::contiguous 0.11% 9.040us 25.54% 2.050ms 227.726us 0.000us 0.00% 712.735us 79.193us 9
4027
- aten::clone 0.35% 28.461us 25.43% 2.040ms 226.722us 0.000us 0.00% 712.735us 79.193us 9
4028
- aten::copy_ 1.02% 82.020us 24.22% 1.944ms 215.993us 654.527us 10.64% 712.735us 79.193us 9
4029
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 654.527us 10.64% 654.527us 72.725us 9
4030
- Activity Buffer Request 19.35% 1.553ms 19.35% 1.553ms 1.553ms 58.208us 0.95% 58.208us 58.208us 1
4031
- aten::transpose 0.81% 64.960us 1.09% 87.330us 3.639us 0.000us 0.00% 0.000us 0.000us 24
4032
- aten::as_strided 0.28% 22.370us 0.28% 22.370us 0.932us 0.000us 0.00% 0.000us 0.000us 24
4033
- aten::empty_like 0.19% 15.081us 0.85% 68.092us 7.566us 0.000us 0.00% 0.000us 0.000us 9
4034
- aten::empty 1.09% 87.522us 1.09% 87.522us 4.168us 0.000us 0.00% 0.000us 0.000us 21
4035
- cudaLaunchKernel 4.25% 341.154us 4.25% 341.154us 28.429us 0.000us 0.00% 0.000us 0.000us 12
4036
- cudaStreamIsCapturing 0.04% 2.841us 0.04% 2.841us 0.947us 0.000us 0.00% 0.000us 0.000us 3
4037
- cudaFuncSetAttribute 0.05% 4.120us 0.05% 4.120us 1.373us 0.000us 0.00% 0.000us 0.000us 3
4038
- cudaDeviceSynchronize 67.00% 5.376ms 67.00% 5.376ms 5.376ms 0.000us 0.00% 0.000us 0.000us 1
4039
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4040
- Self CPU time total: 8.025ms
4041
- Self CUDA time total: 6.152ms
4042
 
4043
 
4044
 
@@ -4048,28 +4048,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L448_bfloat16
4048
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4049
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4050
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4051
- torch_mem_eff 3.33% 272.217us 28.45% 2.323ms 2.323ms 0.000us 0.00% 6.452ms 6.452ms 1
4052
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.401ms 100.14% 6.401ms 6.401ms 1
4053
- aten::scaled_dot_product_attention 0.25% 20.040us 1.74% 141.700us 47.233us 0.000us 0.00% 5.729ms 1.910ms 3
4054
- aten::_scaled_dot_product_efficient_attention 0.23% 18.560us 1.49% 121.660us 40.553us 0.000us 0.00% 5.729ms 1.910ms 3
4055
- aten::_efficient_attention_forward 0.34% 27.420us 1.00% 81.440us 27.147us 5.729ms 89.62% 5.729ms 1.910ms 3
4056
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.729ms 89.62% 5.729ms 1.910ms 3
4057
- aten::contiguous 0.09% 7.310us 22.83% 1.865ms 207.177us 0.000us 0.00% 723.614us 80.402us 9
4058
- aten::clone 0.27% 22.438us 22.75% 1.857ms 206.364us 0.000us 0.00% 723.614us 80.402us 9
4059
- aten::copy_ 0.75% 61.292us 21.84% 1.783ms 198.108us 663.806us 10.38% 723.614us 80.402us 9
4060
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 663.806us 10.38% 663.806us 73.756us 9
4061
- Activity Buffer Request 18.13% 1.481ms 18.13% 1.481ms 1.481ms 59.808us 0.94% 59.808us 59.808us 1
4062
- aten::transpose 0.61% 49.591us 0.81% 66.019us 2.751us 0.000us 0.00% 0.000us 0.000us 24
4063
- aten::as_strided 0.20% 16.428us 0.20% 16.428us 0.684us 0.000us 0.00% 0.000us 0.000us 24
4064
- aten::empty_like 0.14% 11.501us 0.64% 51.871us 5.763us 0.000us 0.00% 0.000us 0.000us 9
4065
- aten::empty 0.80% 65.620us 0.80% 65.620us 3.125us 0.000us 0.00% 0.000us 0.000us 21
4066
- cudaLaunchKernel 3.24% 264.473us 3.24% 264.473us 22.039us 0.000us 0.00% 0.000us 0.000us 12
4067
- cudaStreamIsCapturing 0.03% 2.310us 0.03% 2.310us 0.770us 0.000us 0.00% 0.000us 0.000us 3
4068
- cudaFuncSetAttribute 0.04% 3.060us 0.04% 3.060us 1.020us 0.000us 0.00% 0.000us 0.000us 3
4069
- cudaDeviceSynchronize 71.55% 5.843ms 71.55% 5.843ms 5.843ms 0.000us 0.00% 0.000us 0.000us 1
4070
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4071
- Self CPU time total: 8.166ms
4072
- Self CUDA time total: 6.392ms
4073
 
4074
 
4075
 
@@ -4079,38 +4079,90 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L512_bfloat16
4079
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4080
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4081
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4082
- torch_mem_eff 2.84% 238.921us 26.25% 2.206ms 2.206ms 0.000us 0.00% 6.803ms 6.803ms 1
4083
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.751ms 100.13% 6.751ms 6.751ms 1
4084
- aten::scaled_dot_product_attention 0.23% 19.080us 1.67% 140.122us 46.707us 0.000us 0.00% 6.072ms 2.024ms 3
4085
- aten::_scaled_dot_product_efficient_attention 0.22% 18.680us 1.44% 121.042us 40.347us 0.000us 0.00% 6.072ms 2.024ms 3
4086
- aten::_efficient_attention_forward 0.32% 27.009us 0.95% 79.840us 26.613us 6.072ms 90.07% 6.072ms 2.024ms 3
4087
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 6.072ms 90.07% 6.072ms 2.024ms 3
4088
- aten::contiguous 0.09% 7.439us 21.24% 1.785ms 198.324us 0.000us 0.00% 731.099us 81.233us 9
4089
- aten::clone 0.26% 21.852us 21.15% 1.777ms 197.498us 0.000us 0.00% 731.099us 81.233us 9
4090
- aten::copy_ 0.77% 64.769us 20.27% 1.703ms 189.239us 669.820us 9.93% 731.099us 81.233us 9
4091
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 669.820us 9.93% 669.820us 74.424us 9
4092
- Activity Buffer Request 16.92% 1.422ms 16.92% 1.422ms 1.422ms 61.279us 0.91% 61.279us 61.279us 1
4093
- aten::transpose 0.57% 48.271us 0.77% 64.334us 2.681us 0.000us 0.00% 0.000us 0.000us 24
4094
- aten::as_strided 0.19% 16.063us 0.19% 16.063us 0.669us 0.000us 0.00% 0.000us 0.000us 24
4095
- aten::empty_like 0.14% 11.440us 0.62% 52.480us 5.831us 0.000us 0.00% 0.000us 0.000us 9
4096
- aten::empty 0.79% 66.661us 0.79% 66.661us 3.174us 0.000us 0.00% 0.000us 0.000us 21
4097
- cudaLaunchKernel 2.84% 238.383us 2.84% 238.383us 19.865us 0.000us 0.00% 0.000us 0.000us 12
4098
- cudaStreamIsCapturing 0.03% 2.270us 0.03% 2.270us 0.757us 0.000us 0.00% 0.000us 0.000us 3
4099
- cudaFuncSetAttribute 0.04% 3.090us 0.04% 3.090us 1.030us 0.000us 0.00% 0.000us 0.000us 3
4100
- cudaDeviceSynchronize 73.75% 6.196ms 73.75% 6.196ms 6.196ms 0.000us 0.00% 0.000us 0.000us 1
4101
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4102
- Self CPU time total: 8.402ms
4103
- Self CUDA time total: 6.742ms
4104
 
4105
 
4106
  impl wl p50(ms) ok
4107
- torch_mem_eff cuda_attn_L128_bfloat16 1.89 True
4108
- torch_mem_eff cuda_attn_L256_bfloat16 1.95 True
4109
- torch_mem_eff cuda_attn_L320_bfloat16 2.05 True
4110
- torch_mem_eff cuda_attn_L384_bfloat16 2.08 True
4111
- torch_mem_eff cuda_attn_L448_bfloat16 2.13 True
4112
- torch_mem_eff cuda_attn_L512_bfloat16 2.27 True
4113
  </pre></div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4114
  <div class="cell-artifacts">
4115
  <h4>Artifacts:</h4>
4116
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
 
3869
  <span class="collapse-indicators">
3870
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
+ <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: benchmark | 32.68s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3924
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3925
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3926
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3927
+ torch_mem_eff 4.77% 340.490us 32.91% 2.350ms 2.350ms 0.000us 0.00% 5.530ms 5.530ms 1
3928
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.523ms 100.81% 5.523ms 5.523ms 1
3929
+ aten::scaled_dot_product_attention 0.44% 31.421us 2.67% 190.938us 63.646us 0.000us 0.00% 4.861ms 1.620ms 3
3930
+ aten::_scaled_dot_product_efficient_attention 0.35% 24.771us 2.23% 159.517us 53.172us 0.000us 0.00% 4.861ms 1.620ms 3
3931
+ aten::_efficient_attention_forward 0.51% 36.163us 1.50% 107.413us 35.804us 4.861ms 88.73% 4.861ms 1.620ms 3
3932
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.861ms 88.73% 4.861ms 1.620ms 3
3933
+ aten::contiguous 0.17% 12.232us 24.52% 1.751ms 194.525us 0.000us 0.00% 668.128us 74.236us 9
3934
+ aten::clone 0.48% 34.579us 24.35% 1.738ms 193.165us 0.000us 0.00% 668.128us 74.236us 9
3935
+ aten::copy_ 1.16% 82.494us 22.79% 1.628ms 180.845us 617.312us 11.27% 668.128us 74.236us 9
3936
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 617.312us 11.27% 617.312us 68.590us 9
3937
+ Activity Buffer Request 20.35% 1.453ms 20.35% 1.453ms 1.453ms 50.816us 0.93% 50.816us 50.816us 1
3938
+ aten::transpose 1.00% 71.754us 1.33% 95.065us 3.961us 0.000us 0.00% 0.000us 0.000us 24
3939
+ aten::as_strided 0.33% 23.311us 0.33% 23.311us 0.971us 0.000us 0.00% 0.000us 0.000us 24
3940
+ aten::empty_like 0.27% 19.481us 1.07% 76.301us 8.478us 0.000us 0.00% 0.000us 0.000us 9
3941
+ aten::empty 1.26% 89.759us 1.26% 89.759us 4.274us 0.000us 0.00% 0.000us 0.000us 21
3942
+ cudaLaunchKernel 1.62% 115.656us 1.62% 115.656us 9.638us 0.000us 0.00% 0.000us 0.000us 12
3943
+ cudaStreamIsCapturing 0.04% 2.980us 0.04% 2.980us 0.993us 0.000us 0.00% 0.000us 0.000us 3
3944
+ cudaFuncSetAttribute 0.16% 11.490us 0.16% 11.490us 3.830us 0.000us 0.00% 0.000us 0.000us 3
3945
+ cudaDeviceSynchronize 67.09% 4.790ms 67.09% 4.790ms 4.790ms 0.000us 0.00% 0.000us 0.000us 1
3946
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3947
+ Self CPU time total: 7.140ms
3948
+ Self CUDA time total: 5.479ms
3949
 
3950
 
3951
 
 
3955
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3956
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3957
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3958
+ torch_mem_eff 3.38% 251.986us 27.98% 2.086ms 2.086ms 0.000us 0.00% 6.014ms 6.014ms 1
3959
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.969ms 100.15% 5.969ms 5.969ms 1
3960
+ aten::scaled_dot_product_attention 0.27% 19.962us 1.97% 146.646us 48.882us 0.000us 0.00% 5.323ms 1.774ms 3
3961
+ aten::_scaled_dot_product_efficient_attention 0.26% 19.141us 1.70% 126.684us 42.228us 0.000us 0.00% 5.323ms 1.774ms 3
3962
+ aten::_efficient_attention_forward 0.39% 29.281us 1.12% 83.514us 27.838us 5.323ms 89.32% 5.323ms 1.774ms 3
3963
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.323ms 89.32% 5.323ms 1.774ms 3
3964
+ aten::contiguous 0.10% 7.510us 22.05% 1.644ms 182.655us 0.000us 0.00% 690.909us 76.768us 9
3965
+ aten::clone 0.31% 23.251us 21.95% 1.636ms 181.821us 0.000us 0.00% 690.909us 76.768us 9
3966
+ aten::copy_ 0.91% 68.131us 20.95% 1.562ms 173.540us 636.478us 10.68% 690.909us 76.768us 9
3967
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 636.478us 10.68% 636.478us 70.720us 9
3968
+ Activity Buffer Request 19.09% 1.423ms 19.09% 1.423ms 1.423ms 54.431us 0.91% 54.431us 54.431us 1
3969
+ aten::transpose 0.68% 50.542us 0.90% 67.292us 2.804us 0.000us 0.00% 0.000us 0.000us 24
3970
+ aten::as_strided 0.22% 16.750us 0.22% 16.750us 0.698us 0.000us 0.00% 0.000us 0.000us 24
3971
+ aten::empty_like 0.17% 12.371us 0.69% 51.272us 5.697us 0.000us 0.00% 0.000us 0.000us 9
3972
+ aten::empty 0.87% 64.771us 0.87% 64.771us 3.084us 0.000us 0.00% 0.000us 0.000us 21
3973
+ cudaLaunchKernel 1.25% 93.466us 1.25% 93.466us 7.789us 0.000us 0.00% 0.000us 0.000us 12
3974
+ cudaStreamIsCapturing 0.03% 2.400us 0.03% 2.400us 0.800us 0.000us 0.00% 0.000us 0.000us 3
3975
+ cudaFuncSetAttribute 0.05% 3.371us 0.05% 3.371us 1.124us 0.000us 0.00% 0.000us 0.000us 3
3976
+ cudaDeviceSynchronize 72.02% 5.368ms 72.02% 5.368ms 5.368ms 0.000us 0.00% 0.000us 0.000us 1
3977
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3978
+ Self CPU time total: 7.454ms
3979
+ Self CUDA time total: 5.959ms
3980
 
3981
 
3982
 
 
3986
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3987
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3988
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3989
+ torch_mem_eff 3.08% 235.490us 27.25% 2.083ms 2.083ms 0.000us 0.00% 6.182ms 6.182ms 1
3990
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.132ms 100.15% 6.132ms 6.132ms 1
3991
+ aten::scaled_dot_product_attention 0.24% 18.220us 1.86% 142.046us 47.349us 0.000us 0.00% 5.466ms 1.822ms 3
3992
+ aten::_scaled_dot_product_efficient_attention 0.24% 18.131us 1.62% 123.826us 41.275us 0.000us 0.00% 5.466ms 1.822ms 3
3993
+ aten::_efficient_attention_forward 0.37% 27.940us 1.08% 82.291us 27.430us 5.466ms 89.28% 5.466ms 1.822ms 3
3994
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.466ms 89.28% 5.466ms 1.822ms 3
3995
+ aten::contiguous 0.10% 7.272us 21.47% 1.642ms 182.409us 0.000us 0.00% 715.197us 79.466us 9
3996
+ aten::clone 0.29% 22.290us 21.38% 1.634ms 181.601us 0.000us 0.00% 715.197us 79.466us 9
3997
+ aten::copy_ 0.83% 63.251us 20.39% 1.559ms 173.182us 656.318us 10.72% 715.197us 79.466us 9
3998
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 656.318us 10.72% 656.318us 72.924us 9
3999
+ Activity Buffer Request 18.70% 1.430ms 18.70% 1.430ms 1.430ms 58.879us 0.96% 58.879us 58.879us 1
4000
+ aten::transpose 0.93% 71.209us 1.15% 87.625us 3.651us 0.000us 0.00% 0.000us 0.000us 24
4001
+ aten::as_strided 0.21% 16.416us 0.21% 16.416us 0.684us 0.000us 0.00% 0.000us 0.000us 24
4002
+ aten::empty_like 0.15% 11.741us 0.70% 53.481us 5.942us 0.000us 0.00% 0.000us 0.000us 9
4003
+ aten::empty 0.89% 67.840us 0.89% 67.840us 3.230us 0.000us 0.00% 0.000us 0.000us 21
4004
+ cudaLaunchKernel 1.15% 88.022us 1.15% 88.022us 7.335us 0.000us 0.00% 0.000us 0.000us 12
4005
+ cudaStreamIsCapturing 0.03% 2.651us 0.03% 2.651us 0.884us 0.000us 0.00% 0.000us 0.000us 3
4006
+ cudaFuncSetAttribute 0.04% 3.370us 0.04% 3.370us 1.123us 0.000us 0.00% 0.000us 0.000us 3
4007
+ cudaDeviceSynchronize 72.75% 5.562ms 72.75% 5.562ms 5.562ms 0.000us 0.00% 0.000us 0.000us 1
4008
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4009
+ Self CPU time total: 7.646ms
4010
+ Self CUDA time total: 6.123ms
4011
 
4012
 
4013
 
 
4017
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4018
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4019
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4020
+ torch_mem_eff 2.84% 224.838us 29.78% 2.354ms 2.354ms 0.000us 0.00% 6.170ms 6.170ms 1
4021
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.121ms 100.15% 6.121ms 6.121ms 1
4022
+ aten::scaled_dot_product_attention 0.24% 18.891us 1.82% 143.646us 47.882us 0.000us 0.00% 5.458ms 1.819ms 3
4023
+ aten::_scaled_dot_product_efficient_attention 0.24% 19.093us 1.58% 124.755us 41.585us 0.000us 0.00% 5.458ms 1.819ms 3
4024
+ aten::_efficient_attention_forward 0.36% 28.140us 1.04% 82.213us 27.404us 5.458ms 89.30% 5.458ms 1.819ms 3
4025
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.458ms 89.30% 5.458ms 1.819ms 3
4026
+ aten::contiguous 0.10% 7.739us 24.57% 1.942ms 215.806us 0.000us 0.00% 711.998us 79.111us 9
4027
+ aten::clone 0.31% 24.450us 24.47% 1.935ms 214.946us 0.000us 0.00% 711.998us 79.111us 9
4028
+ aten::copy_ 0.86% 68.064us 23.51% 1.859ms 206.523us 653.982us 10.70% 711.998us 79.111us 9
4029
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 653.982us 10.70% 653.982us 72.665us 9
4030
+ Activity Buffer Request 18.84% 1.489ms 18.84% 1.489ms 1.489ms 58.016us 0.95% 58.016us 58.016us 1
4031
+ aten::transpose 0.62% 49.288us 0.84% 66.489us 2.770us 0.000us 0.00% 0.000us 0.000us 24
4032
+ aten::as_strided 0.22% 17.201us 0.22% 17.201us 0.717us 0.000us 0.00% 0.000us 0.000us 24
4033
+ aten::empty_like 0.15% 12.041us 0.65% 51.362us 5.707us 0.000us 0.00% 0.000us 0.000us 9
4034
+ aten::empty 0.83% 65.351us 0.83% 65.351us 3.112us 0.000us 0.00% 0.000us 0.000us 21
4035
+ cudaLaunchKernel 4.09% 323.234us 4.09% 323.234us 26.936us 0.000us 0.00% 0.000us 0.000us 12
4036
+ cudaStreamIsCapturing 0.03% 2.670us 0.03% 2.670us 0.890us 0.000us 0.00% 0.000us 0.000us 3
4037
+ cudaFuncSetAttribute 0.04% 3.430us 0.04% 3.430us 1.143us 0.000us 0.00% 0.000us 0.000us 3
4038
+ cudaDeviceSynchronize 70.22% 5.551ms 70.22% 5.551ms 5.551ms 0.000us 0.00% 0.000us 0.000us 1
4039
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4040
+ Self CPU time total: 7.905ms
4041
+ Self CUDA time total: 6.112ms
4042
 
4043
 
4044
 
 
4048
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4049
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4050
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4051
+ torch_mem_eff 2.78% 220.799us 28.42% 2.258ms 2.258ms 0.000us 0.00% 6.296ms 6.296ms 1
4052
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.245ms 100.15% 6.245ms 6.245ms 1
4053
+ aten::scaled_dot_product_attention 0.24% 19.311us 1.79% 142.116us 47.372us 0.000us 0.00% 5.574ms 1.858ms 3
4054
+ aten::_scaled_dot_product_efficient_attention 0.23% 17.909us 1.55% 122.805us 40.935us 0.000us 0.00% 5.574ms 1.858ms 3
4055
+ aten::_efficient_attention_forward 0.36% 28.682us 1.03% 82.073us 27.358us 5.574ms 89.39% 5.574ms 1.858ms 3
4056
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.574ms 89.39% 5.574ms 1.858ms 3
4057
+ aten::contiguous 0.09% 7.009us 23.32% 1.852ms 205.811us 0.000us 0.00% 721.599us 80.178us 9
4058
+ aten::clone 0.28% 22.450us 23.23% 1.845ms 205.033us 0.000us 0.00% 721.599us 80.178us 9
4059
+ aten::copy_ 0.87% 68.713us 22.33% 1.774ms 197.096us 661.695us 10.61% 721.599us 80.178us 9
4060
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 661.695us 10.61% 661.695us 73.522us 9
4061
+ Activity Buffer Request 17.91% 1.422ms 17.91% 1.422ms 1.422ms 59.904us 0.96% 59.904us 59.904us 1
4062
+ aten::transpose 0.61% 48.435us 0.82% 65.304us 2.721us 0.000us 0.00% 0.000us 0.000us 24
4063
+ aten::as_strided 0.21% 16.869us 0.21% 16.869us 0.703us 0.000us 0.00% 0.000us 0.000us 24
4064
+ aten::empty_like 0.14% 11.511us 0.62% 48.982us 5.442us 0.000us 0.00% 0.000us 0.000us 9
4065
+ aten::empty 0.78% 61.691us 0.78% 61.691us 2.938us 0.000us 0.00% 0.000us 0.000us 21
4066
+ cudaLaunchKernel 3.85% 305.580us 3.85% 305.580us 25.465us 0.000us 0.00% 0.000us 0.000us 12
4067
+ cudaStreamIsCapturing 0.03% 2.440us 0.03% 2.440us 0.813us 0.000us 0.00% 0.000us 0.000us 3
4068
+ cudaFuncSetAttribute 0.05% 3.920us 0.05% 3.920us 1.307us 0.000us 0.00% 0.000us 0.000us 3
4069
+ cudaDeviceSynchronize 71.58% 5.685ms 71.58% 5.685ms 5.685ms 0.000us 0.00% 0.000us 0.000us 1
4070
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4071
+ Self CPU time total: 7.943ms
4072
+ Self CUDA time total: 6.236ms
4073
 
4074
 
4075
 
 
4079
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4080
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4081
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4082
+ torch_mem_eff 3.27% 267.711us 29.30% 2.401ms 2.401ms 0.000us 0.00% 6.459ms 6.459ms 1
4083
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.406ms 100.13% 6.406ms 6.406ms 1
4084
+ aten::scaled_dot_product_attention 0.24% 19.643us 1.85% 151.176us 50.392us 0.000us 0.00% 5.726ms 1.909ms 3
4085
+ aten::_scaled_dot_product_efficient_attention 0.26% 20.920us 1.61% 131.533us 43.844us 0.000us 0.00% 5.726ms 1.909ms 3
4086
+ aten::_efficient_attention_forward 0.37% 30.563us 1.03% 84.603us 28.201us 5.726ms 89.50% 5.726ms 1.909ms 3
4087
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.726ms 89.50% 5.726ms 1.909ms 3
4088
+ aten::contiguous 0.09% 7.670us 23.58% 1.932ms 214.647us 0.000us 0.00% 733.247us 81.472us 9
4089
+ aten::clone 0.31% 25.042us 23.48% 1.924ms 213.795us 0.000us 0.00% 733.247us 81.472us 9
4090
+ aten::copy_ 0.88% 72.162us 22.52% 1.845ms 205.052us 671.711us 10.50% 733.247us 81.472us 9
4091
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 671.711us 10.50% 671.711us 74.635us 9
4092
+ Activity Buffer Request 17.78% 1.456ms 17.78% 1.456ms 1.456ms 61.536us 0.96% 61.536us 61.536us 1
4093
+ aten::transpose 0.71% 58.110us 0.93% 75.842us 3.160us 0.000us 0.00% 0.000us 0.000us 24
4094
+ aten::as_strided 0.22% 17.732us 0.22% 17.732us 0.739us 0.000us 0.00% 0.000us 0.000us 24
4095
+ aten::empty_like 0.15% 12.319us 0.65% 53.641us 5.960us 0.000us 0.00% 0.000us 0.000us 9
4096
+ aten::empty 0.81% 66.513us 0.81% 66.513us 3.167us 0.000us 0.00% 0.000us 0.000us 21
4097
+ cudaLaunchKernel 4.14% 339.159us 4.14% 339.159us 28.263us 0.000us 0.00% 0.000us 0.000us 12
4098
+ cudaStreamIsCapturing 0.03% 2.379us 0.03% 2.379us 0.793us 0.000us 0.00% 0.000us 0.000us 3
4099
+ cudaFuncSetAttribute 0.05% 4.230us 0.05% 4.230us 1.410us 0.000us 0.00% 0.000us 0.000us 3
4100
+ cudaDeviceSynchronize 70.70% 5.793ms 70.70% 5.793ms 5.793ms 0.000us 0.00% 0.000us 0.000us 1
4101
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4102
+ Self CPU time total: 8.193ms
4103
+ Self CUDA time total: 6.398ms
4104
 
4105
 
4106
  impl wl p50(ms) ok
4107
+ torch_mem_eff cuda_attn_L128_bfloat16 1.86 True
4108
+ torch_mem_eff cuda_attn_L256_bfloat16 1.97 True
4109
+ torch_mem_eff cuda_attn_L320_bfloat16 2.04 True
4110
+ torch_mem_eff cuda_attn_L384_bfloat16 2.06 True
4111
+ torch_mem_eff cuda_attn_L448_bfloat16 2.03 True
4112
+ torch_mem_eff cuda_attn_L512_bfloat16 2.19 True
4113
  </pre></div>
4114
+ <div class="uv-install-logs" id="uv-logs-benchmark">
4115
+ <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4116
+ <div class="uv-logs-content" style="display: none;">
4117
+ Building kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
4118
+ Downloading networkx (1.9MiB)
4119
+ Downloading matplotlib (8.3MiB)
4120
+ Downloading nvidia-cufft-cu12 (184.2MiB)
4121
+ Downloading sympy (6.0MiB)
4122
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
4123
+ Downloading nvidia-cublas-cu12 (566.8MiB)
4124
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4125
+ Downloading numpy (16.2MiB)
4126
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4127
+ Downloading setuptools (1.1MiB)
4128
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
4129
+ Downloading nvidia-curand-cu12 (60.7MiB)
4130
+ Downloading nvidia-nccl-cu12 (307.4MiB)
4131
+ Downloading kiwisolver (1.4MiB)
4132
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4133
+ Downloading fonttools (4.7MiB)
4134
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
4135
+ Downloading pillow (6.7MiB)
4136
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
4137
+ Downloading nvidia-cufile-cu12 (1.1MiB)
4138
+ Downloading triton (148.3MiB)
4139
+ Downloading torch (846.9MiB)
4140
+ Downloading nvidia-cufile-cu12
4141
+ Downloading kiwisolver
4142
+ Downloading setuptools
4143
+ Downloading fonttools
4144
+ Downloading networkx
4145
+ Downloading pillow
4146
+ Built kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
4147
+ Downloading nvidia-cuda-cupti-cu12
4148
+ Downloading matplotlib
4149
+ Downloading numpy
4150
+ Downloading sympy
4151
+ Downloading nvidia-nvjitlink-cu12
4152
+ Downloading nvidia-curand-cu12
4153
+ Downloading nvidia-cuda-nvrtc-cu12
4154
+ Downloading triton
4155
+ Downloading nvidia-cufft-cu12
4156
+ Downloading nvidia-cusolver-cu12
4157
+ Downloading nvidia-cusparse-cu12
4158
+ Downloading nvidia-cusparselt-cu12
4159
+ Downloading nvidia-nccl-cu12
4160
+ Downloading nvidia-cublas-cu12
4161
+ Downloading nvidia-cudnn-cu12
4162
+ Downloading torch
4163
+ Installed 37 packages in 216ms
4164
+ </div>
4165
+ </div>
4166
  <div class="cell-artifacts">
4167
  <h4>Artifacts:</h4>
4168
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
flash_attn/impls/sage_attention.html CHANGED
@@ -3869,15 +3869,15 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3869
  <span class="collapse-indicators">
3870
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: benchmark | 4.37s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
3878
  <a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/sage_attention.md" target="_blank" class="github-btn">GitHub</a>
3879
  </div>
3880
- <div id="code-benchmark" class="cell-code" data-lines="33">
3881
  <div class="code-wrap">
3882
  <div class="highlight"><pre><span></span><span class="c1"># /// script</span>
3883
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
@@ -3886,7 +3886,6 @@ Cell: benchmark | 4.37s
3886
  <span class="c1"># &quot;torch==2.8.0&quot;,</span>
3887
  <span class="c1"># &quot;kernels&quot;,</span>
3888
  <span class="c1"># &quot;kernels-benchmark-tools&quot;,</span>
3889
- <span class="c1"># &quot;sageattention&quot;,</span>
3890
  <span class="c1"># ]</span>
3891
  <span class="c1">#</span>
3892
  <span class="c1"># [tool.uv.sources]</span>
@@ -3921,28 +3920,23 @@ Cell: benchmark | 4.37s
3921
  <div class="cell-stdout"><pre class="stdout-text">Running attention benchmark on cuda with 6 workloads.
3922
  impl wl p50(ms) ok
3923
  sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
3924
- Error: module &#x27;sage_attention_12c766386675beb4&#x27; has no attribute &#x27;fwd&#x27;
3925
  sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
3926
- Error: module &#x27;sage_attention_12c766386675beb4&#x27; has no attribute &#x27;fwd&#x27;
3927
  sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
3928
- Error: module &#x27;sage_attention_12c766386675beb4&#x27; has no attribute &#x27;fwd&#x27;
3929
  sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
3930
- Error: module &#x27;sage_attention_12c766386675beb4&#x27; has no attribute &#x27;fwd&#x27;
3931
  sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
3932
- Error: module &#x27;sage_attention_12c766386675beb4&#x27; has no attribute &#x27;fwd&#x27;
3933
  sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
3934
- Error: module &#x27;sage_attention_12c766386675beb4&#x27; has no attribute &#x27;fwd&#x27;
3935
  </pre></div>
3936
- <div class="uv-install-logs" id="uv-logs-benchmark">
3937
- <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3938
- <div class="uv-logs-content" style="display: none;">
3939
- Installed 1 package in 11ms
3940
  </div>
3941
- </div>
3942
- <div class="cell-stderr">Fetching 11 files: 0%| | 0/11 [00:00&lt;?, ?it/s]
3943
- Fetching 11 files: 27%|██▋ | 3/11 [00:00&lt;00:00, 14.92it/s]
3944
- Fetching 11 files: 73%|███████▎ | 8/11 [00:00&lt;00:00, 14.19it/s]
3945
- Fetching 11 files: 100%|██████████| 11/11 [00:00&lt;00:00, 19.60it/s]</div>
3946
  <div class="cell-artifacts">
3947
  <h4>Artifacts:</h4>
3948
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
 
3869
  <span class="collapse-indicators">
3870
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: benchmark | 4.22s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
3878
  <a href="https://github.com/huggingface/kernels-uvnotes/blob/main/flash_attn/impls/sage_attention.md" target="_blank" class="github-btn">GitHub</a>
3879
  </div>
3880
+ <div id="code-benchmark" class="cell-code" data-lines="32">
3881
  <div class="code-wrap">
3882
  <div class="highlight"><pre><span></span><span class="c1"># /// script</span>
3883
  <span class="c1"># requires-python = &quot;&gt;=3.10&quot;</span>
 
3886
  <span class="c1"># &quot;torch==2.8.0&quot;,</span>
3887
  <span class="c1"># &quot;kernels&quot;,</span>
3888
  <span class="c1"># &quot;kernels-benchmark-tools&quot;,</span>
 
3889
  <span class="c1"># ]</span>
3890
  <span class="c1">#</span>
3891
  <span class="c1"># [tool.uv.sources]</span>
 
3920
  <div class="cell-stdout"><pre class="stdout-text">Running attention benchmark on cuda with 6 workloads.
3921
  impl wl p50(ms) ok
3922
  sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
3923
+ Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
3924
  sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
3925
+ Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
3926
  sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
3927
+ Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
3928
  sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
3929
+ Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
3930
  sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
3931
+ Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
3932
  sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
3933
+ Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
3934
  </pre></div>
3935
+ <div class="cell-stderr">
3936
+ Fetching 11 files: 0%| | 0/11 [00:00&lt;?, ?it/s]
3937
+ Fetching 11 files: 73%|███████▎ | 8/11 [00:00&lt;00:00, 13.92it/s]
3938
+ Fetching 11 files: 100%|██████████| 11/11 [00:00&lt;00:00, 19.13it/s]
3939
  </div>
 
 
 
 
 
3940
  <div class="cell-artifacts">
3941
  <h4>Artifacts:</h4>
3942
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
flash_attn/impls/xformers.html CHANGED
@@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3873
  </span> |
3874
- Cell: benchmark | 5.09s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3923,21 +3923,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16
3923
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3924
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3925
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3926
- xformers_meff 10.73% 481.606us 51.24% 2.299ms 2.299ms 0.000us 0.00% 3.630ms 3.630ms 1
3927
- xformers_flash3::flash_fwd 4.33% 194.084us 39.70% 1.781ms 593.782us 0.000us 0.00% 3.630ms 1.210ms 3
3928
- flash_attn_3::fwd 1.76% 78.961us 35.37% 1.587ms 529.087us 2.729ms 100.00% 3.630ms 1.210ms 3
3929
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.730ms 100.05% 2.730ms 2.730ms 1
3930
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.729ms 100.00% 2.729ms 909.588us 3
3931
- Activity Buffer Request 31.70% 1.423ms 31.70% 1.423ms 1.423ms 901.535us 33.04% 901.535us 901.535us 1
3932
- aten::empty 0.75% 33.761us 0.75% 33.761us 5.627us 0.000us 0.00% 0.000us 0.000us 6
3933
- cudaFuncSetAttribute 0.28% 12.380us 0.28% 12.380us 4.127us 0.000us 0.00% 0.000us 0.000us 3
3934
- cudaLaunchKernel 0.88% 39.570us 0.88% 39.570us 13.190us 0.000us 0.00% 0.000us 0.000us 3
3935
- aten::reshape 0.30% 13.520us 0.80% 36.080us 6.013us 0.000us 0.00% 0.000us 0.000us 6
3936
- aten::view 0.50% 22.560us 0.50% 22.560us 3.760us 0.000us 0.00% 0.000us 0.000us 6
3937
- cudaDeviceSynchronize 48.76% 2.188ms 48.76% 2.188ms 2.188ms 0.000us 0.00% 0.000us 0.000us 1
3938
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3939
- Self CPU time total: 4.487ms
3940
- Self CUDA time total: 2.729ms
3941
 
3942
 
3943
 
@@ -3947,21 +3947,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16
3947
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3948
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3949
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3950
- xformers_meff 7.10% 312.113us 46.81% 2.059ms 2.059ms 0.000us 0.00% 3.744ms 3.744ms 1
3951
- xformers_flash3::flash_fwd 3.88% 170.673us 39.17% 1.723ms 574.405us 0.000us 0.00% 3.744ms 1.248ms 3
3952
- flash_attn_3::fwd 1.28% 56.171us 35.29% 1.553ms 517.514us 2.795ms 100.00% 3.744ms 1.248ms 3
3953
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.796ms 100.05% 2.796ms 2.796ms 1
3954
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.795ms 100.00% 2.795ms 931.630us 3
3955
- Activity Buffer Request 32.47% 1.428ms 32.47% 1.428ms 1.428ms 948.729us 33.95% 948.729us 948.729us 1
3956
- aten::empty 0.66% 29.091us 0.66% 29.091us 4.848us 0.000us 0.00% 0.000us 0.000us 6
3957
- cudaFuncSetAttribute 0.13% 5.590us 0.13% 5.590us 1.863us 0.000us 0.00% 0.000us 0.000us 3
3958
- cudaLaunchKernel 0.76% 33.440us 0.76% 33.440us 11.147us 0.000us 0.00% 0.000us 0.000us 3
3959
- aten::reshape 0.20% 8.951us 0.54% 23.831us 3.972us 0.000us 0.00% 0.000us 0.000us 6
3960
- aten::view 0.34% 14.880us 0.34% 14.880us 2.480us 0.000us 0.00% 0.000us 0.000us 6
3961
- cudaDeviceSynchronize 53.19% 2.340ms 53.19% 2.340ms 2.340ms 0.000us 0.00% 0.000us 0.000us 1
3962
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3963
- Self CPU time total: 4.399ms
3964
- Self CUDA time total: 2.795ms
3965
 
3966
 
3967
 
@@ -3971,21 +3971,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16
3971
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3972
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3973
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3974
- xformers_meff 6.52% 299.466us 45.41% 2.085ms 2.085ms 0.000us 0.00% 3.907ms 3.907ms 1
3975
- xformers_flash3::flash_fwd 3.09% 142.061us 38.39% 1.763ms 587.558us 0.000us 0.00% 3.907ms 1.302ms 3
3976
- flash_attn_3::fwd 1.15% 53.012us 35.30% 1.621ms 540.204us 2.913ms 100.00% 3.907ms 1.302ms 3
3977
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.915ms 100.06% 2.915ms 2.915ms 1
3978
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.913ms 100.00% 2.913ms 971.158us 3
3979
- Activity Buffer Request 32.68% 1.500ms 32.68% 1.500ms 1.500ms 993.281us 34.09% 993.281us 993.281us 1
3980
- aten::empty 0.62% 28.380us 0.62% 28.380us 4.730us 0.000us 0.00% 0.000us 0.000us 6
3981
- cudaFuncSetAttribute 0.11% 5.270us 0.11% 5.270us 1.757us 0.000us 0.00% 0.000us 0.000us 3
3982
- cudaLaunchKernel 0.73% 33.640us 0.73% 33.640us 11.213us 0.000us 0.00% 0.000us 0.000us 3
3983
- aten::reshape 0.18% 8.421us 0.49% 22.660us 3.777us 0.000us 0.00% 0.000us 0.000us 6
3984
- aten::view 0.31% 14.239us 0.31% 14.239us 2.373us 0.000us 0.00% 0.000us 0.000us 6
3985
- cudaDeviceSynchronize 54.59% 2.507ms 54.59% 2.507ms 2.507ms 0.000us 0.00% 0.000us 0.000us 1
3986
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3987
- Self CPU time total: 4.591ms
3988
- Self CUDA time total: 2.913ms
3989
 
3990
 
3991
 
@@ -3995,21 +3995,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16
3995
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3996
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3997
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3998
- xformers_meff 6.26% 300.335us 46.54% 2.234ms 2.234ms 0.000us 0.00% 3.980ms 3.980ms 1
3999
- xformers_flash3::flash_fwd 3.08% 147.673us 39.81% 1.911ms 637.009us 0.000us 0.00% 3.980ms 1.327ms 3
4000
- flash_attn_3::fwd 1.12% 53.571us 36.74% 1.763ms 587.785us 2.981ms 100.00% 3.980ms 1.327ms 3
4001
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.982ms 100.05% 2.982ms 2.982ms 1
4002
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.981ms 100.00% 2.981ms 993.631us 3
4003
- Activity Buffer Request 29.81% 1.431ms 29.81% 1.431ms 1.431ms 999.263us 33.52% 999.263us 999.263us 1
4004
- aten::empty 0.60% 28.930us 0.60% 28.930us 4.822us 0.000us 0.00% 0.000us 0.000us 6
4005
- cudaFuncSetAttribute 0.12% 5.610us 0.12% 5.610us 1.870us 0.000us 0.00% 0.000us 0.000us 3
4006
- cudaLaunchKernel 5.09% 244.533us 5.09% 244.533us 81.511us 0.000us 0.00% 0.000us 0.000us 3
4007
- aten::reshape 0.18% 8.489us 0.47% 22.530us 3.755us 0.000us 0.00% 0.000us 0.000us 6
4008
- aten::view 0.29% 14.041us 0.29% 14.041us 2.340us 0.000us 0.00% 0.000us 0.000us 6
4009
- cudaDeviceSynchronize 53.46% 2.566ms 53.46% 2.566ms 2.566ms 0.000us 0.00% 0.000us 0.000us 1
4010
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4011
- Self CPU time total: 4.800ms
4012
- Self CUDA time total: 2.981ms
4013
 
4014
 
4015
 
@@ -4019,21 +4019,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16
4019
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4020
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4021
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4022
- xformers_meff 5.98% 313.865us 42.05% 2.207ms 2.207ms 0.000us 0.00% 4.635ms 4.635ms 1
4023
- xformers_flash3::flash_fwd 2.80% 146.723us 35.63% 1.870ms 623.176us 0.000us 0.00% 4.635ms 1.545ms 3
4024
- flash_attn_3::fwd 0.99% 51.861us 32.83% 1.723ms 574.268us 3.467ms 100.00% 4.635ms 1.545ms 3
4025
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.469ms 100.05% 3.469ms 3.469ms 1
4026
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.467ms 100.00% 3.467ms 1.156ms 3
4027
- Activity Buffer Request 27.82% 1.460ms 27.82% 1.460ms 1.460ms 1.168ms 33.68% 1.168ms 1.168ms 1
4028
- aten::empty 0.56% 29.260us 0.56% 29.260us 4.877us 0.000us 0.00% 0.000us 0.000us 6
4029
- cudaFuncSetAttribute 0.12% 6.040us 0.12% 6.040us 2.013us 0.000us 0.00% 0.000us 0.000us 3
4030
- cudaLaunchKernel 3.35% 175.903us 3.35% 175.903us 58.634us 0.000us 0.00% 0.000us 0.000us 3
4031
- aten::reshape 0.16% 8.638us 0.44% 23.169us 3.862us 0.000us 0.00% 0.000us 0.000us 6
4032
- aten::view 0.28% 14.531us 0.28% 14.531us 2.422us 0.000us 0.00% 0.000us 0.000us 6
4033
- cudaDeviceSynchronize 57.95% 3.041ms 57.95% 3.041ms 3.041ms 0.000us 0.00% 0.000us 0.000us 1
4034
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4035
- Self CPU time total: 5.247ms
4036
- Self CUDA time total: 3.467ms
4037
 
4038
 
4039
 
@@ -4043,30 +4043,30 @@ PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16
4043
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4044
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4045
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4046
- xformers_meff 5.97% 309.094us 41.86% 2.166ms 2.166ms 0.000us 0.00% 4.567ms 4.567ms 1
4047
- xformers_flash3::flash_fwd 2.75% 142.242us 35.45% 1.834ms 611.405us 0.000us 0.00% 4.567ms 1.522ms 3
4048
- flash_attn_3::fwd 1.04% 53.951us 32.70% 1.692ms 563.991us 3.419ms 100.00% 4.567ms 1.522ms 3
4049
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.421ms 100.05% 3.421ms 3.421ms 1
4050
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.419ms 100.00% 3.419ms 1.140ms 3
4051
- Activity Buffer Request 27.74% 1.436ms 27.74% 1.436ms 1.436ms 1.148ms 33.59% 1.148ms 1.148ms 1
4052
- aten::empty 0.58% 29.770us 0.58% 29.770us 4.962us 0.000us 0.00% 0.000us 0.000us 6
4053
- cudaFuncSetAttribute 0.11% 5.591us 0.11% 5.591us 1.864us 0.000us 0.00% 0.000us 0.000us 3
4054
- cudaLaunchKernel 3.23% 167.152us 3.23% 167.152us 55.717us 0.000us 0.00% 0.000us 0.000us 3
4055
- aten::reshape 0.16% 8.371us 0.44% 22.751us 3.792us 0.000us 0.00% 0.000us 0.000us 6
4056
- aten::view 0.28% 14.380us 0.28% 14.380us 2.397us 0.000us 0.00% 0.000us 0.000us 6
4057
- cudaDeviceSynchronize 58.14% 3.008ms 58.14% 3.008ms 3.008ms 0.000us 0.00% 0.000us 0.000us 1
4058
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4059
- Self CPU time total: 5.174ms
4060
- Self CUDA time total: 3.419ms
4061
 
4062
 
4063
  impl wl p50(ms) ok
4064
- xformers_meff cuda_attn_L128_bfloat16 1.00 True
4065
- xformers_meff cuda_attn_L256_bfloat16 1.04 True
4066
- xformers_meff cuda_attn_L320_bfloat16 1.09 True
4067
- xformers_meff cuda_attn_L384_bfloat16 1.11 True
4068
- xformers_meff cuda_attn_L448_bfloat16 1.26 True
4069
- xformers_meff cuda_attn_L512_bfloat16 1.25 True
4070
  </pre></div>
4071
  <div class="uv-install-logs" id="uv-logs-benchmark">
4072
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 
3871
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3872
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3873
  </span> |
3874
+ Cell: benchmark | 5.02s
3875
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3876
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3877
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3923
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3924
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3925
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3926
+ xformers_meff 9.93% 451.937us 49.71% 2.262ms 2.262ms 0.000us 0.00% 3.695ms 3.695ms 1
3927
+ xformers_flash3::flash_fwd 4.26% 193.656us 38.96% 1.773ms 590.904us 0.000us 0.00% 3.695ms 1.232ms 3
3928
+ flash_attn_3::fwd 1.62% 73.841us 34.71% 1.579ms 526.352us 2.795ms 100.00% 3.695ms 1.232ms 3
3929
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.797ms 100.05% 2.797ms 2.797ms 1
3930
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.795ms 100.00% 2.795ms 931.773us 3
3931
+ Activity Buffer Request 31.17% 1.418ms 31.17% 1.418ms 1.418ms 899.421us 32.18% 899.421us 899.421us 1
3932
+ aten::empty 0.76% 34.741us 0.76% 34.741us 5.790us 0.000us 0.00% 0.000us 0.000us 6
3933
+ cudaFuncSetAttribute 0.30% 13.732us 0.30% 13.732us 4.577us 0.000us 0.00% 0.000us 0.000us 3
3934
+ cudaLaunchKernel 0.85% 38.662us 0.85% 38.662us 12.887us 0.000us 0.00% 0.000us 0.000us 3
3935
+ aten::reshape 0.35% 15.860us 0.82% 37.181us 6.197us 0.000us 0.00% 0.000us 0.000us 6
3936
+ aten::view 0.47% 21.321us 0.47% 21.321us 3.553us 0.000us 0.00% 0.000us 0.000us 6
3937
+ cudaDeviceSynchronize 50.29% 2.288ms 50.29% 2.288ms 2.288ms 0.000us 0.00% 0.000us 0.000us 1
3938
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3939
+ Self CPU time total: 4.550ms
3940
+ Self CUDA time total: 2.795ms
3941
 
3942
 
3943
 
 
3947
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3948
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3949
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3950
+ xformers_meff 6.95% 312.321us 44.96% 2.021ms 2.021ms 0.000us 0.00% 3.832ms 3.832ms 1
3951
+ xformers_flash3::flash_fwd 3.14% 141.315us 37.51% 1.686ms 561.970us 0.000us 0.00% 3.832ms 1.277ms 3
3952
+ flash_attn_3::fwd 1.18% 53.030us 34.37% 1.545ms 514.865us 2.890ms 100.00% 3.832ms 1.277ms 3
3953
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.892ms 100.05% 2.892ms 2.892ms 1
3954
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.890ms 100.00% 2.890ms 963.329us 3
3955
+ Activity Buffer Request 31.64% 1.422ms 31.64% 1.422ms 1.422ms 942.465us 32.61% 942.465us 942.465us 1
3956
+ aten::empty 0.68% 30.660us 0.68% 30.660us 5.110us 0.000us 0.00% 0.000us 0.000us 6
3957
+ cudaFuncSetAttribute 0.12% 5.592us 0.12% 5.592us 1.864us 0.000us 0.00% 0.000us 0.000us 3
3958
+ cudaLaunchKernel 0.74% 33.432us 0.74% 33.432us 11.144us 0.000us 0.00% 0.000us 0.000us 3
3959
+ aten::reshape 0.20% 8.951us 0.50% 22.691us 3.782us 0.000us 0.00% 0.000us 0.000us 6
3960
+ aten::view 0.31% 13.740us 0.31% 13.740us 2.290us 0.000us 0.00% 0.000us 0.000us 6
3961
+ cudaDeviceSynchronize 55.04% 2.474ms 55.04% 2.474ms 2.474ms 0.000us 0.00% 0.000us 0.000us 1
3962
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3963
+ Self CPU time total: 4.495ms
3964
+ Self CUDA time total: 2.890ms
3965
 
3966
 
3967
 
 
3971
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3972
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3973
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3974
+ xformers_meff 6.65% 298.008us 44.73% 2.006ms 2.006ms 0.000us 0.00% 3.867ms 3.867ms 1
3975
+ xformers_flash3::flash_fwd 3.15% 141.235us 37.58% 1.685ms 561.690us 0.000us 0.00% 3.867ms 1.289ms 3
3976
+ flash_attn_3::fwd 1.18% 53.120us 34.43% 1.544ms 514.611us 2.888ms 100.00% 3.867ms 1.289ms 3
3977
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.890ms 100.06% 2.890ms 2.890ms 1
3978
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.888ms 100.00% 2.888ms 962.683us 3
3979
+ Activity Buffer Request 31.72% 1.422ms 31.72% 1.422ms 1.422ms 978.939us 33.90% 978.939us 978.939us 1
3980
+ aten::empty 0.67% 30.192us 0.67% 30.192us 5.032us 0.000us 0.00% 0.000us 0.000us 6
3981
+ cudaFuncSetAttribute 0.12% 5.491us 0.12% 5.491us 1.830us 0.000us 0.00% 0.000us 0.000us 3
3982
+ cudaLaunchKernel 0.73% 32.901us 0.73% 32.901us 10.967us 0.000us 0.00% 0.000us 0.000us 3
3983
+ aten::reshape 0.20% 8.773us 0.50% 22.603us 3.767us 0.000us 0.00% 0.000us 0.000us 6
3984
+ aten::view 0.31% 13.830us 0.31% 13.830us 2.305us 0.000us 0.00% 0.000us 0.000us 6
3985
+ cudaDeviceSynchronize 55.27% 2.478ms 55.27% 2.478ms 2.478ms 0.000us 0.00% 0.000us 0.000us 1
3986
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3987
+ Self CPU time total: 4.484ms
3988
+ Self CUDA time total: 2.888ms
3989
 
3990
 
3991
 
 
3995
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3996
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3997
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3998
+ xformers_meff 6.31% 299.042us 46.56% 2.205ms 2.205ms 0.000us 0.00% 3.936ms 3.936ms 1
3999
+ xformers_flash3::flash_fwd 2.97% 140.784us 39.75% 1.883ms 627.609us 0.000us 0.00% 3.936ms 1.312ms 3
4000
+ flash_attn_3::fwd 1.10% 52.191us 36.78% 1.742ms 580.681us 2.941ms 100.00% 3.936ms 1.312ms 3
4001
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.943ms 100.05% 2.943ms 2.943ms 1
4002
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.941ms 100.00% 2.941ms 980.445us 3
4003
+ Activity Buffer Request 30.11% 1.426ms 30.11% 1.426ms 1.426ms 994.973us 33.83% 994.973us 994.973us 1
4004
+ aten::empty 0.64% 30.333us 0.64% 30.333us 5.055us 0.000us 0.00% 0.000us 0.000us 6
4005
+ cudaFuncSetAttribute 0.11% 5.440us 0.11% 5.440us 1.813us 0.000us 0.00% 0.000us 0.000us 3
4006
+ cudaLaunchKernel 4.81% 227.898us 4.81% 227.898us 75.966us 0.000us 0.00% 0.000us 0.000us 3
4007
+ aten::reshape 0.19% 8.769us 0.49% 23.220us 3.870us 0.000us 0.00% 0.000us 0.000us 6
4008
+ aten::view 0.31% 14.451us 0.31% 14.451us 2.409us 0.000us 0.00% 0.000us 0.000us 6
4009
+ cudaDeviceSynchronize 53.44% 2.531ms 53.44% 2.531ms 2.531ms 0.000us 0.00% 0.000us 0.000us 1
4010
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4011
+ Self CPU time total: 4.736ms
4012
+ Self CUDA time total: 2.941ms
4013
 
4014
 
4015
 
 
4019
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4020
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4021
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4022
+ xformers_meff 5.82% 299.962us 41.73% 2.152ms 2.152ms 0.000us 0.00% 4.566ms 4.566ms 1
4023
+ xformers_flash3::flash_fwd 2.76% 142.114us 35.47% 1.829ms 609.751us 0.000us 0.00% 4.566ms 1.522ms 3
4024
+ flash_attn_3::fwd 1.04% 53.631us 32.71% 1.687ms 562.380us 3.419ms 100.00% 4.566ms 1.522ms 3
4025
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.420ms 100.05% 3.420ms 3.420ms 1
4026
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.419ms 100.00% 3.419ms 1.140ms 3
4027
+ Activity Buffer Request 27.56% 1.422ms 27.56% 1.422ms 1.422ms 1.148ms 33.58% 1.148ms 1.148ms 1
4028
+ aten::empty 0.60% 31.172us 0.60% 31.172us 5.195us 0.000us 0.00% 0.000us 0.000us 6
4029
+ cudaFuncSetAttribute 0.11% 5.431us 0.11% 5.431us 1.810us 0.000us 0.00% 0.000us 0.000us 3
4030
+ cudaLaunchKernel 3.40% 175.366us 3.40% 175.366us 58.455us 0.000us 0.00% 0.000us 0.000us 3
4031
+ aten::reshape 0.17% 8.849us 0.45% 23.030us 3.838us 0.000us 0.00% 0.000us 0.000us 6
4032
+ aten::view 0.27% 14.181us 0.27% 14.181us 2.363us 0.000us 0.00% 0.000us 0.000us 6
4033
+ cudaDeviceSynchronize 58.27% 3.005ms 58.27% 3.005ms 3.005ms 0.000us 0.00% 0.000us 0.000us 1
4034
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4035
+ Self CPU time total: 5.157ms
4036
+ Self CUDA time total: 3.419ms
4037
 
4038
 
4039
 
 
4043
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4044
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4045
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4046
+ xformers_meff 5.76% 295.800us 41.67% 2.139ms 2.139ms 0.000us 0.00% 4.557ms 4.557ms 1
4047
+ xformers_flash3::flash_fwd 2.75% 141.044us 35.47% 1.821ms 606.924us 0.000us 0.00% 4.557ms 1.519ms 3
4048
+ flash_attn_3::fwd 1.04% 53.523us 32.72% 1.680ms 559.910us 3.405ms 100.00% 4.557ms 1.519ms 3
4049
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.407ms 100.05% 3.407ms 3.407ms 1
4050
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.405ms 100.00% 3.405ms 1.135ms 3
4051
+ Activity Buffer Request 27.67% 1.420ms 27.67% 1.420ms 1.420ms 1.152ms 33.82% 1.152ms 1.152ms 1
4052
+ aten::empty 0.60% 30.610us 0.60% 30.610us 5.102us 0.000us 0.00% 0.000us 0.000us 6
4053
+ cudaFuncSetAttribute 0.12% 6.310us 0.12% 6.310us 2.103us 0.000us 0.00% 0.000us 0.000us 3
4054
+ cudaLaunchKernel 3.29% 168.946us 3.29% 168.946us 56.315us 0.000us 0.00% 0.000us 0.000us 3
4055
+ aten::reshape 0.17% 8.721us 0.44% 22.392us 3.732us 0.000us 0.00% 0.000us 0.000us 6
4056
+ aten::view 0.27% 13.671us 0.27% 13.671us 2.279us 0.000us 0.00% 0.000us 0.000us 6
4057
+ cudaDeviceSynchronize 58.33% 2.994ms 58.33% 2.994ms 2.994ms 0.000us 0.00% 0.000us 0.000us 1
4058
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4059
+ Self CPU time total: 5.133ms
4060
+ Self CUDA time total: 3.405ms
4061
 
4062
 
4063
  impl wl p50(ms) ok
4064
+ xformers_meff cuda_attn_L128_bfloat16 0.98 True
4065
+ xformers_meff cuda_attn_L256_bfloat16 1.03 True
4066
+ xformers_meff cuda_attn_L320_bfloat16 1.08 True
4067
+ xformers_meff cuda_attn_L384_bfloat16 1.10 True
4068
+ xformers_meff cuda_attn_L448_bfloat16 1.23 True
4069
+ xformers_meff cuda_attn_L512_bfloat16 1.22 True
4070
  </pre></div>
4071
  <div class="uv-install-logs" id="uv-logs-benchmark">
4072
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
flash_attn/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: a94beca550ea0b3ff8a0f0eef062da6a6179ae09e78edc24cbacb71d8bd623a2
  • Pointer size: 130 Bytes
  • Size of remote file: 24.8 kB

Git LFS Details

  • SHA256: 168c229932ad06a68508a4a77b66485ff9bcf48ed736a5ffdd003f5cb9e8e639
  • Pointer size: 130 Bytes
  • Size of remote file: 24.8 kB
flash_attn/results/combined_results.html CHANGED
@@ -3872,7 +3872,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3872
  <rdf:RDF>
3873
  <ns2:Work>
3874
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3875
- <dc:date>2025-10-27T14:46:38.946915</dc:date>
3876
  <dc:format>image/svg+xml</dc:format>
3877
  <dc:creator>
3878
  <ns2:Agent>
@@ -3982,96 +3982,96 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3982
  <g id="matplotlib.axis_2">
3983
  <g id="ytick_1">
3984
  <g id="grid-y--2" class="grid grid-y">
3985
- <path d="M 47.81 413.210177 L 835.361742 413.210177 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3986
  </g>
3987
  <g id="line2d_7">
3988
  <defs>
3989
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
3990
  </defs>
3991
  <g>
3992
- <use ns4:href="#m0fca2865ba" x="47.81" y="413.210177" style="stroke: #000000; stroke-width: 0.8" />
3993
  </g>
3994
  </g>
3995
  <g id="text_7">
3996
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="417.009396" transform="rotate(-0 40.81 417.009396)">1.0</text>
3997
  </g>
3998
  </g>
3999
  <g id="ytick_2">
4000
  <g id="grid-y--3" class="grid grid-y">
4001
- <path d="M 47.81 355.233116 L 835.361742 355.233116 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4002
  </g>
4003
  <g id="line2d_8">
4004
  <g>
4005
- <use ns4:href="#m0fca2865ba" x="47.81" y="355.233116" style="stroke: #000000; stroke-width: 0.8" />
4006
  </g>
4007
  </g>
4008
  <g id="text_8">
4009
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="359.032335" transform="rotate(-0 40.81 359.032335)">1.2</text>
4010
  </g>
4011
  </g>
4012
  <g id="ytick_3">
4013
  <g id="grid-y--4" class="grid grid-y">
4014
- <path d="M 47.81 297.256055 L 835.361742 297.256055 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4015
  </g>
4016
  <g id="line2d_9">
4017
  <g>
4018
- <use ns4:href="#m0fca2865ba" x="47.81" y="297.256055" style="stroke: #000000; stroke-width: 0.8" />
4019
  </g>
4020
  </g>
4021
  <g id="text_9">
4022
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="301.055273" transform="rotate(-0 40.81 301.055273)">1.4</text>
4023
  </g>
4024
  </g>
4025
  <g id="ytick_4">
4026
  <g id="grid-y--5" class="grid grid-y">
4027
- <path d="M 47.81 239.278993 L 835.361742 239.278993 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4028
  </g>
4029
  <g id="line2d_10">
4030
  <g>
4031
- <use ns4:href="#m0fca2865ba" x="47.81" y="239.278993" style="stroke: #000000; stroke-width: 0.8" />
4032
  </g>
4033
  </g>
4034
  <g id="text_10">
4035
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="243.078212" transform="rotate(-0 40.81 243.078212)">1.6</text>
4036
  </g>
4037
  </g>
4038
  <g id="ytick_5">
4039
  <g id="grid-y--6" class="grid grid-y">
4040
- <path d="M 47.81 181.301932 L 835.361742 181.301932 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4041
  </g>
4042
  <g id="line2d_11">
4043
  <g>
4044
- <use ns4:href="#m0fca2865ba" x="47.81" y="181.301932" style="stroke: #000000; stroke-width: 0.8" />
4045
  </g>
4046
  </g>
4047
  <g id="text_11">
4048
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="185.101151" transform="rotate(-0 40.81 185.101151)">1.8</text>
4049
  </g>
4050
  </g>
4051
  <g id="ytick_6">
4052
  <g id="grid-y--7" class="grid grid-y">
4053
- <path d="M 47.81 123.324871 L 835.361742 123.324871 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4054
  </g>
4055
  <g id="line2d_12">
4056
  <g>
4057
- <use ns4:href="#m0fca2865ba" x="47.81" y="123.324871" style="stroke: #000000; stroke-width: 0.8" />
4058
  </g>
4059
  </g>
4060
  <g id="text_12">
4061
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="127.124089" transform="rotate(-0 40.81 127.124089)">2.0</text>
4062
  </g>
4063
  </g>
4064
  <g id="ytick_7">
4065
  <g id="grid-y--8" class="grid grid-y">
4066
- <path d="M 47.81 65.347809 L 835.361742 65.347809 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4067
  </g>
4068
  <g id="line2d_13">
4069
  <g>
4070
- <use ns4:href="#m0fca2865ba" x="47.81" y="65.347809" style="stroke: #000000; stroke-width: 0.8" />
4071
  </g>
4072
  </g>
4073
  <g id="text_13">
4074
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="69.147028" transform="rotate(-0 40.81 69.147028)">2.2</text>
4075
  </g>
4076
  </g>
4077
  <g id="label--y" class="ylabel">
@@ -4079,73 +4079,73 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4079
  </g>
4080
  </g>
4081
  <g id="series--torch-flash-ma" class="series">
4082
- <path d="M 83.607806 349.439178 L 226.799032 333.602454 L 369.990258 324.473676 L 513.181484 316.069901 L 656.37271 272.899601 L 799.563935 261.559288 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4083
  <defs>
4084
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4085
  </defs>
4086
  <g clip-path="url(#p09feef2583)">
4087
- <use ns4:href="#md7efaf3aec" x="83.607806" y="349.439178" style="fill: #1f77b4; stroke: #1f77b4" />
4088
- <use ns4:href="#md7efaf3aec" x="226.799032" y="333.602454" style="fill: #1f77b4; stroke: #1f77b4" />
4089
- <use ns4:href="#md7efaf3aec" x="369.990258" y="324.473676" style="fill: #1f77b4; stroke: #1f77b4" />
4090
- <use ns4:href="#md7efaf3aec" x="513.181484" y="316.069901" style="fill: #1f77b4; stroke: #1f77b4" />
4091
- <use ns4:href="#md7efaf3aec" x="656.37271" y="272.899601" style="fill: #1f77b4; stroke: #1f77b4" />
4092
- <use ns4:href="#md7efaf3aec" x="799.563935" y="261.559288" style="fill: #1f77b4; stroke: #1f77b4" />
4093
  </g>
4094
  </g>
4095
  <g id="series--torch-mem-eff" class="series">
4096
- <path d="M 83.607806 156.020744 L 226.799032 138.969401 L 369.990258 109.128607 L 513.181484 99.249026 L 656.37271 87.05645 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4097
  <defs>
4098
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4099
  </defs>
4100
  <g clip-path="url(#p09feef2583)">
4101
- <use ns4:href="#m9b8c54d372" x="83.607806" y="156.020744" style="fill: #ff7f0e; stroke: #ff7f0e" />
4102
- <use ns4:href="#m9b8c54d372" x="226.799032" y="138.969401" style="fill: #ff7f0e; stroke: #ff7f0e" />
4103
- <use ns4:href="#m9b8c54d372" x="369.990258" y="109.128607" style="fill: #ff7f0e; stroke: #ff7f0e" />
4104
- <use ns4:href="#m9b8c54d372" x="513.181484" y="99.249026" style="fill: #ff7f0e; stroke: #ff7f0e" />
4105
- <use ns4:href="#m9b8c54d372" x="656.37271" y="87.05645" style="fill: #ff7f0e; stroke: #ff7f0e" />
4106
  <use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
4107
  </g>
4108
  </g>
4109
  <g id="series--xformers-meff" class="series">
4110
- <path d="M 83.607806 414.345368 L 226.799032 400.181572 L 369.990258 385.808769 L 513.181484 380.581847 L 656.37271 338.122056 L 799.563935 339.866876 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
4111
  <defs>
4112
  <path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
4113
  </defs>
4114
  <g clip-path="url(#p09feef2583)">
4115
- <use ns4:href="#mc655281e0b" x="83.607806" y="414.345368" style="fill: #2ca02c; stroke: #2ca02c" />
4116
- <use ns4:href="#mc655281e0b" x="226.799032" y="400.181572" style="fill: #2ca02c; stroke: #2ca02c" />
4117
- <use ns4:href="#mc655281e0b" x="369.990258" y="385.808769" style="fill: #2ca02c; stroke: #2ca02c" />
4118
- <use ns4:href="#mc655281e0b" x="513.181484" y="380.581847" style="fill: #2ca02c; stroke: #2ca02c" />
4119
- <use ns4:href="#mc655281e0b" x="656.37271" y="338.122056" style="fill: #2ca02c; stroke: #2ca02c" />
4120
- <use ns4:href="#mc655281e0b" x="799.563935" y="339.866876" style="fill: #2ca02c; stroke: #2ca02c" />
4121
  </g>
4122
  </g>
4123
  <g id="series--hf-kernels-flash-attn" class="series">
4124
- <path d="M 83.607806 420.20395 L 226.799032 407.432473 L 369.990258 399.40236 L 513.181484 392.590345 L 656.37271 345.709514 L 799.563935 346.355668 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
4125
  <defs>
4126
  <path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
4127
  </defs>
4128
  <g clip-path="url(#p09feef2583)">
4129
- <use ns4:href="#m61c8040d7e" x="83.607806" y="420.20395" style="fill: #d62728; stroke: #d62728" />
4130
- <use ns4:href="#m61c8040d7e" x="226.799032" y="407.432473" style="fill: #d62728; stroke: #d62728" />
4131
- <use ns4:href="#m61c8040d7e" x="369.990258" y="399.40236" style="fill: #d62728; stroke: #d62728" />
4132
- <use ns4:href="#m61c8040d7e" x="513.181484" y="392.590345" style="fill: #d62728; stroke: #d62728" />
4133
- <use ns4:href="#m61c8040d7e" x="656.37271" y="345.709514" style="fill: #d62728; stroke: #d62728" />
4134
- <use ns4:href="#m61c8040d7e" x="799.563935" y="346.355668" style="fill: #d62728; stroke: #d62728" />
4135
  </g>
4136
  </g>
4137
  <g id="series--hf-kernels-flash-attn3" class="series">
4138
- <path d="M 83.607806 428.387702 L 226.799032 420.061906 L 369.990258 405.625328 L 513.181484 401.010644 L 656.37271 352.807645 L 799.563935 359.622849 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
4139
  <defs>
4140
  <path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
4141
  </defs>
4142
  <g clip-path="url(#p09feef2583)">
4143
  <use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
4144
- <use ns4:href="#m7cd35be9cc" x="226.799032" y="420.061906" style="fill: #9467bd; stroke: #9467bd" />
4145
- <use ns4:href="#m7cd35be9cc" x="369.990258" y="405.625328" style="fill: #9467bd; stroke: #9467bd" />
4146
- <use ns4:href="#m7cd35be9cc" x="513.181484" y="401.010644" style="fill: #9467bd; stroke: #9467bd" />
4147
- <use ns4:href="#m7cd35be9cc" x="656.37271" y="352.807645" style="fill: #9467bd; stroke: #9467bd" />
4148
- <use ns4:href="#m7cd35be9cc" x="799.563935" y="359.622849" style="fill: #9467bd; stroke: #9467bd" />
4149
  </g>
4150
  </g>
4151
  <g id="patch_3">
@@ -4230,7 +4230,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
4230
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4231
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4232
  </span> |
4233
- Cell: combine | 4.50s
4234
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4235
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4236
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4315,7 +4315,7 @@ LOADING BENCHMARK DATA
4315
  ✓ xFormers : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/269846603898e0ee1872d7a8b40fca43ba558b2f3400f8a7bedb1ee79df7da58
4316
  ✓ HF Kernels Flash Attn : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/c1c92a22d205ca145ffb0083188c0f8eef512cfd6aa091b1e49d6329fbd08849
4317
  ✓ HF Kernels Flash Attn3 : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/8d741e4aa09c527ddf0f50ffa03a7e840559990c66178bfb9cf04bd97f3efd20
4318
- ✓ SageAttention : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/1355120a3e88bcb74f4130be51dfe8b03e7dc2b7823f2a53b20da7899570a16f
4319
 
4320
  ✓ Found Flash (PyTorch SDPA)
4321
  Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/4b81c2b991fc4a0f70c4117e933abc4007fd7f3f55394d7778a4074adf29df04/attention.jsonl
@@ -4328,7 +4328,7 @@ LOADING BENCHMARK DATA
4328
  ✓ Found HF Kernels Flash Attn3
4329
  Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/8d741e4aa09c527ddf0f50ffa03a7e840559990c66178bfb9cf04bd97f3efd20/attention.jsonl
4330
  ✓ Found SageAttention
4331
- Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/1355120a3e88bcb74f4130be51dfe8b03e7dc2b7823f2a53b20da7899570a16f/attention.jsonl
4332
 
4333
  ======================================================================
4334
  Summary: 6 found, 0 skipped, 0 missing
@@ -4337,48 +4337,48 @@ Summary: 6 found, 0 skipped, 0 missing
4337
  COMBINED BENCHMARK SUMMARY
4338
 
4339
  impl wl p50(ms) ok
4340
- hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.98 True
4341
- hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.02 True
4342
- hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.05 True
4343
- hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.07 True
4344
- hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.23 True
4345
- hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.23 True
4346
- hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.95 True
4347
- hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.98 True
4348
- hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.03 True
4349
- hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.04 True
4350
- hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.21 True
4351
  hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
4352
  sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
4353
- Error: module &#x27;sage_attention_12c766386675beb4&#x27; has no attribute &#x27;fwd&#x27;
4354
  sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
4355
- Error: module &#x27;sage_attention_12c766386675beb4&#x27; has no attribute &#x27;fwd&#x27;
4356
  sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
4357
- Error: module &#x27;sage_attention_12c766386675beb4&#x27; has no attribute &#x27;fwd&#x27;
4358
  sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
4359
- Error: module &#x27;sage_attention_12c766386675beb4&#x27; has no attribute &#x27;fwd&#x27;
4360
  sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
4361
- Error: module &#x27;sage_attention_12c766386675beb4&#x27; has no attribute &#x27;fwd&#x27;
4362
  sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
4363
- Error: module &#x27;sage_attention_12c766386675beb4&#x27; has no attribute &#x27;fwd&#x27;
4364
  torch_flash_ma cuda_attn_L128_bfloat16 1.22 True
4365
  torch_flash_ma cuda_attn_L256_bfloat16 1.27 True
4366
- torch_flash_ma cuda_attn_L320_bfloat16 1.31 True
4367
- torch_flash_ma cuda_attn_L384_bfloat16 1.34 True
4368
- torch_flash_ma cuda_attn_L448_bfloat16 1.48 True
4369
- torch_flash_ma cuda_attn_L512_bfloat16 1.52 True
4370
- torch_mem_eff cuda_attn_L128_bfloat16 1.89 True
4371
- torch_mem_eff cuda_attn_L256_bfloat16 1.95 True
4372
- torch_mem_eff cuda_attn_L320_bfloat16 2.05 True
4373
- torch_mem_eff cuda_attn_L384_bfloat16 2.08 True
4374
- torch_mem_eff cuda_attn_L448_bfloat16 2.13 True
4375
- torch_mem_eff cuda_attn_L512_bfloat16 2.27 True
4376
- xformers_meff cuda_attn_L128_bfloat16 1.00 True
4377
- xformers_meff cuda_attn_L256_bfloat16 1.04 True
4378
- xformers_meff cuda_attn_L320_bfloat16 1.09 True
4379
- xformers_meff cuda_attn_L384_bfloat16 1.11 True
4380
- xformers_meff cuda_attn_L448_bfloat16 1.26 True
4381
- xformers_meff cuda_attn_L512_bfloat16 1.25 True
4382
 
4383
  GENERATING COMBINED VISUALIZATION
4384
 
@@ -4402,7 +4402,7 @@ Implementations included:
4402
  <div class="uv-install-logs" id="uv-logs-combine">
4403
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4404
  <div class="uv-logs-content" style="display: none;">
4405
- Installed 37 packages in 259ms
4406
  </div>
4407
  </div>
4408
  <div class="cell-artifacts">
@@ -4415,7 +4415,7 @@ Installed 37 packages in 259ms
4415
  <rdf:RDF>
4416
  <ns2:Work>
4417
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4418
- <dc:date>2025-10-27T14:46:38.946915</dc:date>
4419
  <dc:format>image/svg+xml</dc:format>
4420
  <dc:creator>
4421
  <ns2:Agent>
@@ -4525,96 +4525,96 @@ Installed 37 packages in 259ms
4525
  <g id="matplotlib.axis_2">
4526
  <g id="ytick_1">
4527
  <g id="grid-y--2" class="grid grid-y">
4528
- <path d="M 47.81 413.210177 L 835.361742 413.210177 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4529
  </g>
4530
  <g id="line2d_7">
4531
  <defs>
4532
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4533
  </defs>
4534
  <g>
4535
- <use ns4:href="#m0fca2865ba" x="47.81" y="413.210177" style="stroke: #000000; stroke-width: 0.8" />
4536
  </g>
4537
  </g>
4538
  <g id="text_7">
4539
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="417.009396" transform="rotate(-0 40.81 417.009396)">1.0</text>
4540
  </g>
4541
  </g>
4542
  <g id="ytick_2">
4543
  <g id="grid-y--3" class="grid grid-y">
4544
- <path d="M 47.81 355.233116 L 835.361742 355.233116 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4545
  </g>
4546
  <g id="line2d_8">
4547
  <g>
4548
- <use ns4:href="#m0fca2865ba" x="47.81" y="355.233116" style="stroke: #000000; stroke-width: 0.8" />
4549
  </g>
4550
  </g>
4551
  <g id="text_8">
4552
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="359.032335" transform="rotate(-0 40.81 359.032335)">1.2</text>
4553
  </g>
4554
  </g>
4555
  <g id="ytick_3">
4556
  <g id="grid-y--4" class="grid grid-y">
4557
- <path d="M 47.81 297.256055 L 835.361742 297.256055 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4558
  </g>
4559
  <g id="line2d_9">
4560
  <g>
4561
- <use ns4:href="#m0fca2865ba" x="47.81" y="297.256055" style="stroke: #000000; stroke-width: 0.8" />
4562
  </g>
4563
  </g>
4564
  <g id="text_9">
4565
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="301.055273" transform="rotate(-0 40.81 301.055273)">1.4</text>
4566
  </g>
4567
  </g>
4568
  <g id="ytick_4">
4569
  <g id="grid-y--5" class="grid grid-y">
4570
- <path d="M 47.81 239.278993 L 835.361742 239.278993 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4571
  </g>
4572
  <g id="line2d_10">
4573
  <g>
4574
- <use ns4:href="#m0fca2865ba" x="47.81" y="239.278993" style="stroke: #000000; stroke-width: 0.8" />
4575
  </g>
4576
  </g>
4577
  <g id="text_10">
4578
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="243.078212" transform="rotate(-0 40.81 243.078212)">1.6</text>
4579
  </g>
4580
  </g>
4581
  <g id="ytick_5">
4582
  <g id="grid-y--6" class="grid grid-y">
4583
- <path d="M 47.81 181.301932 L 835.361742 181.301932 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4584
  </g>
4585
  <g id="line2d_11">
4586
  <g>
4587
- <use ns4:href="#m0fca2865ba" x="47.81" y="181.301932" style="stroke: #000000; stroke-width: 0.8" />
4588
  </g>
4589
  </g>
4590
  <g id="text_11">
4591
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="185.101151" transform="rotate(-0 40.81 185.101151)">1.8</text>
4592
  </g>
4593
  </g>
4594
  <g id="ytick_6">
4595
  <g id="grid-y--7" class="grid grid-y">
4596
- <path d="M 47.81 123.324871 L 835.361742 123.324871 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4597
  </g>
4598
  <g id="line2d_12">
4599
  <g>
4600
- <use ns4:href="#m0fca2865ba" x="47.81" y="123.324871" style="stroke: #000000; stroke-width: 0.8" />
4601
  </g>
4602
  </g>
4603
  <g id="text_12">
4604
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="127.124089" transform="rotate(-0 40.81 127.124089)">2.0</text>
4605
  </g>
4606
  </g>
4607
  <g id="ytick_7">
4608
  <g id="grid-y--8" class="grid grid-y">
4609
- <path d="M 47.81 65.347809 L 835.361742 65.347809 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4610
  </g>
4611
  <g id="line2d_13">
4612
  <g>
4613
- <use ns4:href="#m0fca2865ba" x="47.81" y="65.347809" style="stroke: #000000; stroke-width: 0.8" />
4614
  </g>
4615
  </g>
4616
  <g id="text_13">
4617
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="69.147028" transform="rotate(-0 40.81 69.147028)">2.2</text>
4618
  </g>
4619
  </g>
4620
  <g id="label--y" class="ylabel">
@@ -4622,73 +4622,73 @@ Installed 37 packages in 259ms
4622
  </g>
4623
  </g>
4624
  <g id="series--torch-flash-ma" class="series">
4625
- <path d="M 83.607806 349.439178 L 226.799032 333.602454 L 369.990258 324.473676 L 513.181484 316.069901 L 656.37271 272.899601 L 799.563935 261.559288 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4626
  <defs>
4627
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4628
  </defs>
4629
  <g clip-path="url(#p09feef2583)">
4630
- <use ns4:href="#md7efaf3aec" x="83.607806" y="349.439178" style="fill: #1f77b4; stroke: #1f77b4" />
4631
- <use ns4:href="#md7efaf3aec" x="226.799032" y="333.602454" style="fill: #1f77b4; stroke: #1f77b4" />
4632
- <use ns4:href="#md7efaf3aec" x="369.990258" y="324.473676" style="fill: #1f77b4; stroke: #1f77b4" />
4633
- <use ns4:href="#md7efaf3aec" x="513.181484" y="316.069901" style="fill: #1f77b4; stroke: #1f77b4" />
4634
- <use ns4:href="#md7efaf3aec" x="656.37271" y="272.899601" style="fill: #1f77b4; stroke: #1f77b4" />
4635
- <use ns4:href="#md7efaf3aec" x="799.563935" y="261.559288" style="fill: #1f77b4; stroke: #1f77b4" />
4636
  </g>
4637
  </g>
4638
  <g id="series--torch-mem-eff" class="series">
4639
- <path d="M 83.607806 156.020744 L 226.799032 138.969401 L 369.990258 109.128607 L 513.181484 99.249026 L 656.37271 87.05645 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4640
  <defs>
4641
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4642
  </defs>
4643
  <g clip-path="url(#p09feef2583)">
4644
- <use ns4:href="#m9b8c54d372" x="83.607806" y="156.020744" style="fill: #ff7f0e; stroke: #ff7f0e" />
4645
- <use ns4:href="#m9b8c54d372" x="226.799032" y="138.969401" style="fill: #ff7f0e; stroke: #ff7f0e" />
4646
- <use ns4:href="#m9b8c54d372" x="369.990258" y="109.128607" style="fill: #ff7f0e; stroke: #ff7f0e" />
4647
- <use ns4:href="#m9b8c54d372" x="513.181484" y="99.249026" style="fill: #ff7f0e; stroke: #ff7f0e" />
4648
- <use ns4:href="#m9b8c54d372" x="656.37271" y="87.05645" style="fill: #ff7f0e; stroke: #ff7f0e" />
4649
  <use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
4650
  </g>
4651
  </g>
4652
  <g id="series--xformers-meff" class="series">
4653
- <path d="M 83.607806 414.345368 L 226.799032 400.181572 L 369.990258 385.808769 L 513.181484 380.581847 L 656.37271 338.122056 L 799.563935 339.866876 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
4654
  <defs>
4655
  <path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
4656
  </defs>
4657
  <g clip-path="url(#p09feef2583)">
4658
- <use ns4:href="#mc655281e0b" x="83.607806" y="414.345368" style="fill: #2ca02c; stroke: #2ca02c" />
4659
- <use ns4:href="#mc655281e0b" x="226.799032" y="400.181572" style="fill: #2ca02c; stroke: #2ca02c" />
4660
- <use ns4:href="#mc655281e0b" x="369.990258" y="385.808769" style="fill: #2ca02c; stroke: #2ca02c" />
4661
- <use ns4:href="#mc655281e0b" x="513.181484" y="380.581847" style="fill: #2ca02c; stroke: #2ca02c" />
4662
- <use ns4:href="#mc655281e0b" x="656.37271" y="338.122056" style="fill: #2ca02c; stroke: #2ca02c" />
4663
- <use ns4:href="#mc655281e0b" x="799.563935" y="339.866876" style="fill: #2ca02c; stroke: #2ca02c" />
4664
  </g>
4665
  </g>
4666
  <g id="series--hf-kernels-flash-attn" class="series">
4667
- <path d="M 83.607806 420.20395 L 226.799032 407.432473 L 369.990258 399.40236 L 513.181484 392.590345 L 656.37271 345.709514 L 799.563935 346.355668 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
4668
  <defs>
4669
  <path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
4670
  </defs>
4671
  <g clip-path="url(#p09feef2583)">
4672
- <use ns4:href="#m61c8040d7e" x="83.607806" y="420.20395" style="fill: #d62728; stroke: #d62728" />
4673
- <use ns4:href="#m61c8040d7e" x="226.799032" y="407.432473" style="fill: #d62728; stroke: #d62728" />
4674
- <use ns4:href="#m61c8040d7e" x="369.990258" y="399.40236" style="fill: #d62728; stroke: #d62728" />
4675
- <use ns4:href="#m61c8040d7e" x="513.181484" y="392.590345" style="fill: #d62728; stroke: #d62728" />
4676
- <use ns4:href="#m61c8040d7e" x="656.37271" y="345.709514" style="fill: #d62728; stroke: #d62728" />
4677
- <use ns4:href="#m61c8040d7e" x="799.563935" y="346.355668" style="fill: #d62728; stroke: #d62728" />
4678
  </g>
4679
  </g>
4680
  <g id="series--hf-kernels-flash-attn3" class="series">
4681
- <path d="M 83.607806 428.387702 L 226.799032 420.061906 L 369.990258 405.625328 L 513.181484 401.010644 L 656.37271 352.807645 L 799.563935 359.622849 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
4682
  <defs>
4683
  <path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
4684
  </defs>
4685
  <g clip-path="url(#p09feef2583)">
4686
  <use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
4687
- <use ns4:href="#m7cd35be9cc" x="226.799032" y="420.061906" style="fill: #9467bd; stroke: #9467bd" />
4688
- <use ns4:href="#m7cd35be9cc" x="369.990258" y="405.625328" style="fill: #9467bd; stroke: #9467bd" />
4689
- <use ns4:href="#m7cd35be9cc" x="513.181484" y="401.010644" style="fill: #9467bd; stroke: #9467bd" />
4690
- <use ns4:href="#m7cd35be9cc" x="656.37271" y="352.807645" style="fill: #9467bd; stroke: #9467bd" />
4691
- <use ns4:href="#m7cd35be9cc" x="799.563935" y="359.622849" style="fill: #9467bd; stroke: #9467bd" />
4692
  </g>
4693
  </g>
4694
  <g id="patch_3">
 
3872
  <rdf:RDF>
3873
  <ns2:Work>
3874
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3875
+ <dc:date>2025-10-28T14:09:17.505622</dc:date>
3876
  <dc:format>image/svg+xml</dc:format>
3877
  <dc:creator>
3878
  <ns2:Agent>
 
3982
  <g id="matplotlib.axis_2">
3983
  <g id="ytick_1">
3984
  <g id="grid-y--2" class="grid grid-y">
3985
+ <path d="M 47.81 403.521712 L 835.361742 403.521712 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3986
  </g>
3987
  <g id="line2d_7">
3988
  <defs>
3989
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
3990
  </defs>
3991
  <g>
3992
+ <use ns4:href="#m0fca2865ba" x="47.81" y="403.521712" style="stroke: #000000; stroke-width: 0.8" />
3993
  </g>
3994
  </g>
3995
  <g id="text_7">
3996
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="407.320931" transform="rotate(-0 40.81 407.320931)">1.0</text>
3997
  </g>
3998
  </g>
3999
  <g id="ytick_2">
4000
  <g id="grid-y--3" class="grid grid-y">
4001
+ <path d="M 47.81 343.523424 L 835.361742 343.523424 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4002
  </g>
4003
  <g id="line2d_8">
4004
  <g>
4005
+ <use ns4:href="#m0fca2865ba" x="47.81" y="343.523424" style="stroke: #000000; stroke-width: 0.8" />
4006
  </g>
4007
  </g>
4008
  <g id="text_8">
4009
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="347.322643" transform="rotate(-0 40.81 347.322643)">1.2</text>
4010
  </g>
4011
  </g>
4012
  <g id="ytick_3">
4013
  <g id="grid-y--4" class="grid grid-y">
4014
+ <path d="M 47.81 283.525136 L 835.361742 283.525136 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4015
  </g>
4016
  <g id="line2d_9">
4017
  <g>
4018
+ <use ns4:href="#m0fca2865ba" x="47.81" y="283.525136" style="stroke: #000000; stroke-width: 0.8" />
4019
  </g>
4020
  </g>
4021
  <g id="text_9">
4022
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="287.324355" transform="rotate(-0 40.81 287.324355)">1.4</text>
4023
  </g>
4024
  </g>
4025
  <g id="ytick_4">
4026
  <g id="grid-y--5" class="grid grid-y">
4027
+ <path d="M 47.81 223.526848 L 835.361742 223.526848 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4028
  </g>
4029
  <g id="line2d_10">
4030
  <g>
4031
+ <use ns4:href="#m0fca2865ba" x="47.81" y="223.526848" style="stroke: #000000; stroke-width: 0.8" />
4032
  </g>
4033
  </g>
4034
  <g id="text_10">
4035
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="227.326067" transform="rotate(-0 40.81 227.326067)">1.6</text>
4036
  </g>
4037
  </g>
4038
  <g id="ytick_5">
4039
  <g id="grid-y--6" class="grid grid-y">
4040
+ <path d="M 47.81 163.52856 L 835.361742 163.52856 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4041
  </g>
4042
  <g id="line2d_11">
4043
  <g>
4044
+ <use ns4:href="#m0fca2865ba" x="47.81" y="163.52856" style="stroke: #000000; stroke-width: 0.8" />
4045
  </g>
4046
  </g>
4047
  <g id="text_11">
4048
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="167.327779" transform="rotate(-0 40.81 167.327779)">1.8</text>
4049
  </g>
4050
  </g>
4051
  <g id="ytick_6">
4052
  <g id="grid-y--7" class="grid grid-y">
4053
+ <path d="M 47.81 103.530273 L 835.361742 103.530273 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4054
  </g>
4055
  <g id="line2d_12">
4056
  <g>
4057
+ <use ns4:href="#m0fca2865ba" x="47.81" y="103.530273" style="stroke: #000000; stroke-width: 0.8" />
4058
  </g>
4059
  </g>
4060
  <g id="text_12">
4061
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="107.329491" transform="rotate(-0 40.81 107.329491)">2.0</text>
4062
  </g>
4063
  </g>
4064
  <g id="ytick_7">
4065
  <g id="grid-y--8" class="grid grid-y">
4066
+ <path d="M 47.81 43.531985 L 835.361742 43.531985 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4067
  </g>
4068
  <g id="line2d_13">
4069
  <g>
4070
+ <use ns4:href="#m0fca2865ba" x="47.81" y="43.531985" style="stroke: #000000; stroke-width: 0.8" />
4071
  </g>
4072
  </g>
4073
  <g id="text_13">
4074
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="47.331204" transform="rotate(-0 40.81 47.331204)">2.2</text>
4075
  </g>
4076
  </g>
4077
  <g id="label--y" class="ylabel">
 
4079
  </g>
4080
  </g>
4081
  <g id="series--torch-flash-ma" class="series">
4082
+ <path d="M 83.607806 337.456697 L 226.799032 322.330829 L 369.990258 318.592935 L 513.181484 311.965825 L 656.37271 262.663131 L 799.563935 254.692359 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4083
  <defs>
4084
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4085
  </defs>
4086
  <g clip-path="url(#p09feef2583)">
4087
+ <use ns4:href="#md7efaf3aec" x="83.607806" y="337.456697" style="fill: #1f77b4; stroke: #1f77b4" />
4088
+ <use ns4:href="#md7efaf3aec" x="226.799032" y="322.330829" style="fill: #1f77b4; stroke: #1f77b4" />
4089
+ <use ns4:href="#md7efaf3aec" x="369.990258" y="318.592935" style="fill: #1f77b4; stroke: #1f77b4" />
4090
+ <use ns4:href="#md7efaf3aec" x="513.181484" y="311.965825" style="fill: #1f77b4; stroke: #1f77b4" />
4091
+ <use ns4:href="#md7efaf3aec" x="656.37271" y="262.663131" style="fill: #1f77b4; stroke: #1f77b4" />
4092
+ <use ns4:href="#md7efaf3aec" x="799.563935" y="254.692359" style="fill: #1f77b4; stroke: #1f77b4" />
4093
  </g>
4094
  </g>
4095
  <g id="series--torch-mem-eff" class="series">
4096
+ <path d="M 83.607806 144.033917 L 226.799032 111.747638 L 369.990258 92.42159 L 513.181484 85.353791 L 656.37271 94.728524 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4097
  <defs>
4098
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4099
  </defs>
4100
  <g clip-path="url(#p09feef2583)">
4101
+ <use ns4:href="#m9b8c54d372" x="83.607806" y="144.033917" style="fill: #ff7f0e; stroke: #ff7f0e" />
4102
+ <use ns4:href="#m9b8c54d372" x="226.799032" y="111.747638" style="fill: #ff7f0e; stroke: #ff7f0e" />
4103
+ <use ns4:href="#m9b8c54d372" x="369.990258" y="92.42159" style="fill: #ff7f0e; stroke: #ff7f0e" />
4104
+ <use ns4:href="#m9b8c54d372" x="513.181484" y="85.353791" style="fill: #ff7f0e; stroke: #ff7f0e" />
4105
+ <use ns4:href="#m9b8c54d372" x="656.37271" y="94.728524" style="fill: #ff7f0e; stroke: #ff7f0e" />
4106
  <use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
4107
  </g>
4108
  </g>
4109
  <g id="series--xformers-meff" class="series">
4110
+ <path d="M 83.607806 408.245077 L 226.799032 395.990127 L 369.990258 378.455027 L 513.181484 373.43287 L 656.37271 333.571508 L 799.563935 337.423698 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
4111
  <defs>
4112
  <path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
4113
  </defs>
4114
  <g clip-path="url(#p09feef2583)">
4115
+ <use ns4:href="#mc655281e0b" x="83.607806" y="408.245077" style="fill: #2ca02c; stroke: #2ca02c" />
4116
+ <use ns4:href="#mc655281e0b" x="226.799032" y="395.990127" style="fill: #2ca02c; stroke: #2ca02c" />
4117
+ <use ns4:href="#mc655281e0b" x="369.990258" y="378.455027" style="fill: #2ca02c; stroke: #2ca02c" />
4118
+ <use ns4:href="#mc655281e0b" x="513.181484" y="373.43287" style="fill: #2ca02c; stroke: #2ca02c" />
4119
+ <use ns4:href="#mc655281e0b" x="656.37271" y="333.571508" style="fill: #2ca02c; stroke: #2ca02c" />
4120
+ <use ns4:href="#mc655281e0b" x="799.563935" y="337.423698" style="fill: #2ca02c; stroke: #2ca02c" />
4121
  </g>
4122
  </g>
4123
  <g id="series--hf-kernels-flash-attn" class="series">
4124
+ <path d="M 83.607806 415.568468 L 226.799032 400.735991 L 369.990258 386.008812 L 513.181484 387.284075 L 656.37271 338.461368 L 799.563935 341.493982 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
4125
  <defs>
4126
  <path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
4127
  </defs>
4128
  <g clip-path="url(#p09feef2583)">
4129
+ <use ns4:href="#m61c8040d7e" x="83.607806" y="415.568468" style="fill: #d62728; stroke: #d62728" />
4130
+ <use ns4:href="#m61c8040d7e" x="226.799032" y="400.735991" style="fill: #d62728; stroke: #d62728" />
4131
+ <use ns4:href="#m61c8040d7e" x="369.990258" y="386.008812" style="fill: #d62728; stroke: #d62728" />
4132
+ <use ns4:href="#m61c8040d7e" x="513.181484" y="387.284075" style="fill: #d62728; stroke: #d62728" />
4133
+ <use ns4:href="#m61c8040d7e" x="656.37271" y="338.461368" style="fill: #d62728; stroke: #d62728" />
4134
+ <use ns4:href="#m61c8040d7e" x="799.563935" y="341.493982" style="fill: #d62728; stroke: #d62728" />
4135
  </g>
4136
  </g>
4137
  <g id="series--hf-kernels-flash-attn3" class="series">
4138
+ <path d="M 83.607806 428.387702 L 226.799032 415.50217 L 369.990258 397.727077 L 513.181484 397.526383 L 656.37271 348.148992 L 799.563935 348.55398 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
4139
  <defs>
4140
  <path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
4141
  </defs>
4142
  <g clip-path="url(#p09feef2583)">
4143
  <use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
4144
+ <use ns4:href="#m7cd35be9cc" x="226.799032" y="415.50217" style="fill: #9467bd; stroke: #9467bd" />
4145
+ <use ns4:href="#m7cd35be9cc" x="369.990258" y="397.727077" style="fill: #9467bd; stroke: #9467bd" />
4146
+ <use ns4:href="#m7cd35be9cc" x="513.181484" y="397.526383" style="fill: #9467bd; stroke: #9467bd" />
4147
+ <use ns4:href="#m7cd35be9cc" x="656.37271" y="348.148992" style="fill: #9467bd; stroke: #9467bd" />
4148
+ <use ns4:href="#m7cd35be9cc" x="799.563935" y="348.55398" style="fill: #9467bd; stroke: #9467bd" />
4149
  </g>
4150
  </g>
4151
  <g id="patch_3">
 
4230
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4231
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4232
  </span> |
4233
+ Cell: combine | 4.25s
4234
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4235
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4236
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
4315
  ✓ xFormers : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/269846603898e0ee1872d7a8b40fca43ba558b2f3400f8a7bedb1ee79df7da58
4316
  ✓ HF Kernels Flash Attn : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/c1c92a22d205ca145ffb0083188c0f8eef512cfd6aa091b1e49d6329fbd08849
4317
  ✓ HF Kernels Flash Attn3 : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/8d741e4aa09c527ddf0f50ffa03a7e840559990c66178bfb9cf04bd97f3efd20
4318
+ ✓ SageAttention : /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/f6be24aff45575cad8d1df490ac5fe9ec944103fb255665c71719ca2d7efea4e
4319
 
4320
  ✓ Found Flash (PyTorch SDPA)
4321
  Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/4b81c2b991fc4a0f70c4117e933abc4007fd7f3f55394d7778a4074adf29df04/attention.jsonl
 
4328
  ✓ Found HF Kernels Flash Attn3
4329
  Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/8d741e4aa09c527ddf0f50ffa03a7e840559990c66178bfb9cf04bd97f3efd20/attention.jsonl
4330
  ✓ Found SageAttention
4331
+ Path: /__w/kernels-benchmarks/kernels-benchmarks/benches/flash_attn/impls/.uvnote/cache/f6be24aff45575cad8d1df490ac5fe9ec944103fb255665c71719ca2d7efea4e/attention.jsonl
4332
 
4333
  ======================================================================
4334
  Summary: 6 found, 0 skipped, 0 missing
 
4337
  COMBINED BENCHMARK SUMMARY
4338
 
4339
  impl wl p50(ms) ok
4340
+ hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.96 True
4341
+ hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.01 True
4342
+ hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.06 True
4343
+ hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.05 True
4344
+ hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.22 True
4345
+ hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.21 True
4346
+ hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.92 True
4347
+ hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.96 True
4348
+ hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.02 True
4349
+ hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.02 True
4350
+ hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.18 True
4351
  hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.18 True
4352
  sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
4353
+ Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
4354
  sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
4355
+ Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
4356
  sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
4357
+ Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
4358
  sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
4359
+ Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
4360
  sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
4361
+ Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
4362
  sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
4363
+ Error: module &#x27;sage_attention_fd11035eb4318b27&#x27; has no attribute &#x27;fwd&#x27;
4364
  torch_flash_ma cuda_attn_L128_bfloat16 1.22 True
4365
  torch_flash_ma cuda_attn_L256_bfloat16 1.27 True
4366
+ torch_flash_ma cuda_attn_L320_bfloat16 1.28 True
4367
+ torch_flash_ma cuda_attn_L384_bfloat16 1.31 True
4368
+ torch_flash_ma cuda_attn_L448_bfloat16 1.47 True
4369
+ torch_flash_ma cuda_attn_L512_bfloat16 1.50 True
4370
+ torch_mem_eff cuda_attn_L128_bfloat16 1.86 True
4371
+ torch_mem_eff cuda_attn_L256_bfloat16 1.97 True
4372
+ torch_mem_eff cuda_attn_L320_bfloat16 2.04 True
4373
+ torch_mem_eff cuda_attn_L384_bfloat16 2.06 True
4374
+ torch_mem_eff cuda_attn_L448_bfloat16 2.03 True
4375
+ torch_mem_eff cuda_attn_L512_bfloat16 2.19 True
4376
+ xformers_meff cuda_attn_L128_bfloat16 0.98 True
4377
+ xformers_meff cuda_attn_L256_bfloat16 1.03 True
4378
+ xformers_meff cuda_attn_L320_bfloat16 1.08 True
4379
+ xformers_meff cuda_attn_L384_bfloat16 1.10 True
4380
+ xformers_meff cuda_attn_L448_bfloat16 1.23 True
4381
+ xformers_meff cuda_attn_L512_bfloat16 1.22 True
4382
 
4383
  GENERATING COMBINED VISUALIZATION
4384
 
 
4402
  <div class="uv-install-logs" id="uv-logs-combine">
4403
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4404
  <div class="uv-logs-content" style="display: none;">
4405
+ Installed 37 packages in 187ms
4406
  </div>
4407
  </div>
4408
  <div class="cell-artifacts">
 
4415
  <rdf:RDF>
4416
  <ns2:Work>
4417
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4418
+ <dc:date>2025-10-28T14:09:17.505622</dc:date>
4419
  <dc:format>image/svg+xml</dc:format>
4420
  <dc:creator>
4421
  <ns2:Agent>
 
4525
  <g id="matplotlib.axis_2">
4526
  <g id="ytick_1">
4527
  <g id="grid-y--2" class="grid grid-y">
4528
+ <path d="M 47.81 403.521712 L 835.361742 403.521712 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4529
  </g>
4530
  <g id="line2d_7">
4531
  <defs>
4532
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4533
  </defs>
4534
  <g>
4535
+ <use ns4:href="#m0fca2865ba" x="47.81" y="403.521712" style="stroke: #000000; stroke-width: 0.8" />
4536
  </g>
4537
  </g>
4538
  <g id="text_7">
4539
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="407.320931" transform="rotate(-0 40.81 407.320931)">1.0</text>
4540
  </g>
4541
  </g>
4542
  <g id="ytick_2">
4543
  <g id="grid-y--3" class="grid grid-y">
4544
+ <path d="M 47.81 343.523424 L 835.361742 343.523424 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4545
  </g>
4546
  <g id="line2d_8">
4547
  <g>
4548
+ <use ns4:href="#m0fca2865ba" x="47.81" y="343.523424" style="stroke: #000000; stroke-width: 0.8" />
4549
  </g>
4550
  </g>
4551
  <g id="text_8">
4552
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="347.322643" transform="rotate(-0 40.81 347.322643)">1.2</text>
4553
  </g>
4554
  </g>
4555
  <g id="ytick_3">
4556
  <g id="grid-y--4" class="grid grid-y">
4557
+ <path d="M 47.81 283.525136 L 835.361742 283.525136 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4558
  </g>
4559
  <g id="line2d_9">
4560
  <g>
4561
+ <use ns4:href="#m0fca2865ba" x="47.81" y="283.525136" style="stroke: #000000; stroke-width: 0.8" />
4562
  </g>
4563
  </g>
4564
  <g id="text_9">
4565
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="287.324355" transform="rotate(-0 40.81 287.324355)">1.4</text>
4566
  </g>
4567
  </g>
4568
  <g id="ytick_4">
4569
  <g id="grid-y--5" class="grid grid-y">
4570
+ <path d="M 47.81 223.526848 L 835.361742 223.526848 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4571
  </g>
4572
  <g id="line2d_10">
4573
  <g>
4574
+ <use ns4:href="#m0fca2865ba" x="47.81" y="223.526848" style="stroke: #000000; stroke-width: 0.8" />
4575
  </g>
4576
  </g>
4577
  <g id="text_10">
4578
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="227.326067" transform="rotate(-0 40.81 227.326067)">1.6</text>
4579
  </g>
4580
  </g>
4581
  <g id="ytick_5">
4582
  <g id="grid-y--6" class="grid grid-y">
4583
+ <path d="M 47.81 163.52856 L 835.361742 163.52856 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4584
  </g>
4585
  <g id="line2d_11">
4586
  <g>
4587
+ <use ns4:href="#m0fca2865ba" x="47.81" y="163.52856" style="stroke: #000000; stroke-width: 0.8" />
4588
  </g>
4589
  </g>
4590
  <g id="text_11">
4591
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="167.327779" transform="rotate(-0 40.81 167.327779)">1.8</text>
4592
  </g>
4593
  </g>
4594
  <g id="ytick_6">
4595
  <g id="grid-y--7" class="grid grid-y">
4596
+ <path d="M 47.81 103.530273 L 835.361742 103.530273 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4597
  </g>
4598
  <g id="line2d_12">
4599
  <g>
4600
+ <use ns4:href="#m0fca2865ba" x="47.81" y="103.530273" style="stroke: #000000; stroke-width: 0.8" />
4601
  </g>
4602
  </g>
4603
  <g id="text_12">
4604
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="107.329491" transform="rotate(-0 40.81 107.329491)">2.0</text>
4605
  </g>
4606
  </g>
4607
  <g id="ytick_7">
4608
  <g id="grid-y--8" class="grid grid-y">
4609
+ <path d="M 47.81 43.531985 L 835.361742 43.531985 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4610
  </g>
4611
  <g id="line2d_13">
4612
  <g>
4613
+ <use ns4:href="#m0fca2865ba" x="47.81" y="43.531985" style="stroke: #000000; stroke-width: 0.8" />
4614
  </g>
4615
  </g>
4616
  <g id="text_13">
4617
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="47.331204" transform="rotate(-0 40.81 47.331204)">2.2</text>
4618
  </g>
4619
  </g>
4620
  <g id="label--y" class="ylabel">
 
4622
  </g>
4623
  </g>
4624
  <g id="series--torch-flash-ma" class="series">
4625
+ <path d="M 83.607806 337.456697 L 226.799032 322.330829 L 369.990258 318.592935 L 513.181484 311.965825 L 656.37271 262.663131 L 799.563935 254.692359 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4626
  <defs>
4627
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4628
  </defs>
4629
  <g clip-path="url(#p09feef2583)">
4630
+ <use ns4:href="#md7efaf3aec" x="83.607806" y="337.456697" style="fill: #1f77b4; stroke: #1f77b4" />
4631
+ <use ns4:href="#md7efaf3aec" x="226.799032" y="322.330829" style="fill: #1f77b4; stroke: #1f77b4" />
4632
+ <use ns4:href="#md7efaf3aec" x="369.990258" y="318.592935" style="fill: #1f77b4; stroke: #1f77b4" />
4633
+ <use ns4:href="#md7efaf3aec" x="513.181484" y="311.965825" style="fill: #1f77b4; stroke: #1f77b4" />
4634
+ <use ns4:href="#md7efaf3aec" x="656.37271" y="262.663131" style="fill: #1f77b4; stroke: #1f77b4" />
4635
+ <use ns4:href="#md7efaf3aec" x="799.563935" y="254.692359" style="fill: #1f77b4; stroke: #1f77b4" />
4636
  </g>
4637
  </g>
4638
  <g id="series--torch-mem-eff" class="series">
4639
+ <path d="M 83.607806 144.033917 L 226.799032 111.747638 L 369.990258 92.42159 L 513.181484 85.353791 L 656.37271 94.728524 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4640
  <defs>
4641
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4642
  </defs>
4643
  <g clip-path="url(#p09feef2583)">
4644
+ <use ns4:href="#m9b8c54d372" x="83.607806" y="144.033917" style="fill: #ff7f0e; stroke: #ff7f0e" />
4645
+ <use ns4:href="#m9b8c54d372" x="226.799032" y="111.747638" style="fill: #ff7f0e; stroke: #ff7f0e" />
4646
+ <use ns4:href="#m9b8c54d372" x="369.990258" y="92.42159" style="fill: #ff7f0e; stroke: #ff7f0e" />
4647
+ <use ns4:href="#m9b8c54d372" x="513.181484" y="85.353791" style="fill: #ff7f0e; stroke: #ff7f0e" />
4648
+ <use ns4:href="#m9b8c54d372" x="656.37271" y="94.728524" style="fill: #ff7f0e; stroke: #ff7f0e" />
4649
  <use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
4650
  </g>
4651
  </g>
4652
  <g id="series--xformers-meff" class="series">
4653
+ <path d="M 83.607806 408.245077 L 226.799032 395.990127 L 369.990258 378.455027 L 513.181484 373.43287 L 656.37271 333.571508 L 799.563935 337.423698 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
4654
  <defs>
4655
  <path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
4656
  </defs>
4657
  <g clip-path="url(#p09feef2583)">
4658
+ <use ns4:href="#mc655281e0b" x="83.607806" y="408.245077" style="fill: #2ca02c; stroke: #2ca02c" />
4659
+ <use ns4:href="#mc655281e0b" x="226.799032" y="395.990127" style="fill: #2ca02c; stroke: #2ca02c" />
4660
+ <use ns4:href="#mc655281e0b" x="369.990258" y="378.455027" style="fill: #2ca02c; stroke: #2ca02c" />
4661
+ <use ns4:href="#mc655281e0b" x="513.181484" y="373.43287" style="fill: #2ca02c; stroke: #2ca02c" />
4662
+ <use ns4:href="#mc655281e0b" x="656.37271" y="333.571508" style="fill: #2ca02c; stroke: #2ca02c" />
4663
+ <use ns4:href="#mc655281e0b" x="799.563935" y="337.423698" style="fill: #2ca02c; stroke: #2ca02c" />
4664
  </g>
4665
  </g>
4666
  <g id="series--hf-kernels-flash-attn" class="series">
4667
+ <path d="M 83.607806 415.568468 L 226.799032 400.735991 L 369.990258 386.008812 L 513.181484 387.284075 L 656.37271 338.461368 L 799.563935 341.493982 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
4668
  <defs>
4669
  <path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
4670
  </defs>
4671
  <g clip-path="url(#p09feef2583)">
4672
+ <use ns4:href="#m61c8040d7e" x="83.607806" y="415.568468" style="fill: #d62728; stroke: #d62728" />
4673
+ <use ns4:href="#m61c8040d7e" x="226.799032" y="400.735991" style="fill: #d62728; stroke: #d62728" />
4674
+ <use ns4:href="#m61c8040d7e" x="369.990258" y="386.008812" style="fill: #d62728; stroke: #d62728" />
4675
+ <use ns4:href="#m61c8040d7e" x="513.181484" y="387.284075" style="fill: #d62728; stroke: #d62728" />
4676
+ <use ns4:href="#m61c8040d7e" x="656.37271" y="338.461368" style="fill: #d62728; stroke: #d62728" />
4677
+ <use ns4:href="#m61c8040d7e" x="799.563935" y="341.493982" style="fill: #d62728; stroke: #d62728" />
4678
  </g>
4679
  </g>
4680
  <g id="series--hf-kernels-flash-attn3" class="series">
4681
+ <path d="M 83.607806 428.387702 L 226.799032 415.50217 L 369.990258 397.727077 L 513.181484 397.526383 L 656.37271 348.148992 L 799.563935 348.55398 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
4682
  <defs>
4683
  <path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
4684
  </defs>
4685
  <g clip-path="url(#p09feef2583)">
4686
  <use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
4687
+ <use ns4:href="#m7cd35be9cc" x="226.799032" y="415.50217" style="fill: #9467bd; stroke: #9467bd" />
4688
+ <use ns4:href="#m7cd35be9cc" x="369.990258" y="397.727077" style="fill: #9467bd; stroke: #9467bd" />
4689
+ <use ns4:href="#m7cd35be9cc" x="513.181484" y="397.526383" style="fill: #9467bd; stroke: #9467bd" />
4690
+ <use ns4:href="#m7cd35be9cc" x="656.37271" y="348.148992" style="fill: #9467bd; stroke: #9467bd" />
4691
+ <use ns4:href="#m7cd35be9cc" x="799.563935" y="348.55398" style="fill: #9467bd; stroke: #9467bd" />
4692
  </g>
4693
  </g>
4694
  <g id="patch_3">
index.html CHANGED
@@ -80,8 +80,10 @@
80
  <h1>Index of /</h1>
81
  <ul>
82
  <li><a href='activation/index.html' class='dir'>activation/</a></li>
 
83
  <li><a href='flash_attn/index.html' class='dir'>flash_attn/</a></li>
84
  <li><a href='layer_norm/index.html' class='dir'>layer_norm/</a></li>
 
85
  </ul>
86
  </body>
87
  </html>
 
80
  <h1>Index of /</h1>
81
  <ul>
82
  <li><a href='activation/index.html' class='dir'>activation/</a></li>
83
+ <li><a href='causal_conv1d/index.html' class='dir'>causal_conv1d/</a></li>
84
  <li><a href='flash_attn/index.html' class='dir'>flash_attn/</a></li>
85
  <li><a href='layer_norm/index.html' class='dir'>layer_norm/</a></li>
86
+ <li><a href='rotary/index.html' class='dir'>rotary/</a></li>
87
  </ul>
88
  </body>
89
  </html>
layer_norm/impls/artifacts/benchmark/layer_norm.jsonl CHANGED
@@ -1,48 +1,4 @@
1
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S128_D1024", "batch": 1, "seq_len": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.03865100006805733, "p50": 0.03903099991475756, "p90": 0.04018100003122527, "mean": 0.03959079995183856, "iqr": 0.001300000121773337, "raw_times": [0.03888099990945193, 0.03903099991475756, 0.04018100003122527, 0.04120999983570073, 0.03865100006805733], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05060100011178292, "peak_bytes": 2363392, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
2
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S128_D2048", "batch": 1, "seq_len": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04517999991549004, "p50": 0.04712100007964182, "p90": 0.04805000003216264, "mean": 0.04695459997492435, "iqr": 0.001779000058377278, "raw_times": [0.04517999991549004, 0.04805000003216264, 0.04712100007964182, 0.046270999973785365, 0.04815099987354188], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05504099999598111, "peak_bytes": 4726784, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1324882507324219e-05, "ref": "layer_norm_fp32"}, "err": null}
3
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S128_D4096", "batch": 1, "seq_len": 128, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04581999996844388, "p50": 0.04766099982589367, "p90": 0.04786099998455029, "mean": 0.047156599976005964, "iqr": 0.0017899999420478707, "raw_times": [0.04766099982589367, 0.04786099998455029, 0.04837000005863956, 0.04607100004250242, 0.04581999996844388], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05361099988476781, "peak_bytes": 9453568, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00146484375, "mse": 1.049041748046875e-05, "ref": "layer_norm_fp32"}, "err": null}
4
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S128_D8192", "batch": 1, "seq_len": 128, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.045190999799160636, "p50": 0.04684100008489622, "p90": 0.04752099994220771, "mean": 0.046596999982284615, "iqr": 0.00227999998969608, "raw_times": [0.04524099995251163, 0.04752099994220771, 0.04819100013264688, 0.04684100008489622, 0.045190999799160636], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.052801000038016355, "peak_bytes": 18907136, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0016326904296875, "mse": 1.1801719665527344e-05, "ref": "layer_norm_fp32"}, "err": null}
5
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S512_D1024", "batch": 1, "seq_len": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04395000019030704, "p50": 0.045061000037094345, "p90": 0.046920999920985196, "mean": 0.04563460001918429, "iqr": 0.0018609998733154498, "raw_times": [0.04718099989986513, 0.046920999920985196, 0.045060000047669746, 0.045061000037094345, 0.04395000019030704], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05090100012239418, "peak_bytes": 9441280, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.138448715209961e-05, "ref": "layer_norm_fp32"}, "err": null}
6
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S512_D2048", "batch": 1, "seq_len": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04656000010072603, "p50": 0.046920999920985196, "p90": 0.04878100003224972, "mean": 0.04884479999418545, "iqr": 0.0020300001324358163, "raw_times": [0.04656000010072603, 0.046750999899813905, 0.04878100003224972, 0.0552110000171524, 0.046920999920985196], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0497399998948822, "peak_bytes": 18882560, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00154876708984375, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
7
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S512_D4096", "batch": 1, "seq_len": 512, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04567099995256285, "p50": 0.04622100004780805, "p90": 0.04798100007974426, "mean": 0.047496800016233465, "iqr": 0.0018200000795332016, "raw_times": [0.04567099995256285, 0.0514500000008411, 0.04616100000021106, 0.04798100007974426, 0.04622100004780805], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04885000021204178, "peak_bytes": 37765120, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
8
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S512_D8192", "batch": 1, "seq_len": 512, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04487000001063279, "p50": 0.045961000068928115, "p90": 0.046200000042517786, "mean": 0.04860060003011313, "iqr": 0.000509000074089272, "raw_times": [0.06028100006005843, 0.04487000001063279, 0.045690999968428514, 0.045961000068928115, 0.046200000042517786], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05061100000602892, "peak_bytes": 75530240, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null}
9
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S1024_D1024", "batch": 1, "seq_len": 1024, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.043511000058060745, "p50": 0.046270999973785365, "p90": 0.04790999992110301, "mean": 0.047574600012012525, "iqr": 0.002919999815276242, "raw_times": [0.044990000105826766, 0.04790999992110301, 0.043511000058060745, 0.05519100000128674, 0.046270999973785365], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.048970999841913, "peak_bytes": 21008384, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.138448715209961e-05, "ref": "layer_norm_fp32"}, "err": null}
10
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S1024_D2048", "batch": 1, "seq_len": 1024, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.043170000026293565, "p50": 0.04767099994751334, "p90": 0.0476899999739544, "mean": 0.04691639996963204, "iqr": 0.0009390000741404947, "raw_times": [0.043170000026293565, 0.04930000000058499, 0.04767099994751334, 0.046750999899813905, 0.0476899999739544], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05084099984742352, "peak_bytes": 37756928, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
11
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S1024_D4096", "batch": 1, "seq_len": 1024, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.044720999994751764, "p50": 0.045860000000175205, "p90": 0.046411000084845, "mean": 0.04585680003401649, "iqr": 0.0012000000424450263, "raw_times": [0.044720999994751764, 0.04708100004791049, 0.046411000084845, 0.045860000000175205, 0.045211000042399974], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05302099998516496, "peak_bytes": 75513856, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0016021728515625, "mse": 1.1682510375976562e-05, "ref": "layer_norm_fp32"}, "err": null}
12
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S1024_D8192", "batch": 1, "seq_len": 1024, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04476999993130448, "p50": 0.04614999988916679, "p90": 0.04633100002138235, "mean": 0.04639259996110923, "iqr": 0.00019000003703695256, "raw_times": [0.04476999993130448, 0.04614999988916679, 0.04857099997934711, 0.0461409999843454, 0.04633100002138235], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.047730999995110324, "peak_bytes": 151027712, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00156402587890625, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
13
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S2048_D1024", "batch": 1, "seq_len": 2048, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04606099992088275, "p50": 0.046679999968546326, "p90": 0.04687099999500788, "mean": 0.0466285999664251, "iqr": 0.0006509999366244301, "raw_times": [0.04606099992088275, 0.047310999889305094, 0.04687099999500788, 0.046679999968546326, 0.04622000005838345], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.050389999842082034, "peak_bytes": 41979904, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00159454345703125, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null}
14
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S2048_D2048", "batch": 1, "seq_len": 2048, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04560100001071987, "p50": 0.04617999979927845, "p90": 0.04656999999497202, "mean": 0.0462445999346528, "iqr": 0.0007090000053722179, "raw_times": [0.04617999979927845, 0.045860999989599804, 0.04560100001071987, 0.04701099987869384, 0.04656999999497202], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.049061000026995316, "peak_bytes": 75505664, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00159454345703125, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null}
15
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S2048_D4096", "batch": 1, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04474100001061743, "p50": 0.04615000011654047, "p90": 0.04696099995271652, "mean": 0.046176800060493406, "iqr": 0.0009599998520570807, "raw_times": [0.047031000121933175, 0.04474100001061743, 0.04600100010065944, 0.04615000011654047, 0.04696099995271652], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.051490000032572425, "peak_bytes": 151011328, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00156402587890625, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
16
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B1_S2048_D8192", "batch": 1, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051341000016691396, "p50": 0.05155100006959401, "p90": 0.05226099983701715, "mean": 0.051880799992432, "iqr": 0.0007709998044447275, "raw_times": [0.051341000016691396, 0.051490000032572425, 0.05226099983701715, 0.05155100006959401, 0.05276100000628503], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053531000048678834, "peak_bytes": 302022656, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1146068572998047e-05, "ref": "layer_norm_fp32"}, "err": null}
17
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S128_D1024", "batch": 4, "seq_len": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.044059999936507666, "p50": 0.04549100003714557, "p90": 0.045540999963122886, "mean": 0.04540859999906388, "iqr": 0.0004099999841855606, "raw_times": [0.04549100003714557, 0.044059999936507666, 0.04682000007960596, 0.045130999978937325, 0.045540999963122886], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.048860999868338695, "peak_bytes": 69242880, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.138448715209961e-05, "ref": "layer_norm_fp32"}, "err": null}
18
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S128_D2048", "batch": 4, "seq_len": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04604099990501709, "p50": 0.04642099997909099, "p90": 0.04698099996858218, "mean": 0.05290099998092046, "iqr": 0.0009299999419454252, "raw_times": [0.07901100002527528, 0.04698099996858218, 0.04604099990501709, 0.04605100002663676, 0.04642099997909099], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.048481000021638465, "peak_bytes": 18882560, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00154876708984375, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
19
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S128_D4096", "batch": 4, "seq_len": 128, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04422100005285756, "p50": 0.045961000068928115, "p90": 0.04607100004250242, "mean": 0.04557280003609776, "iqr": 0.0010700000530050602, "raw_times": [0.04500099998949736, 0.045961000068928115, 0.04422100005285756, 0.046610000026703347, 0.04607100004250242], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05000100009056041, "peak_bytes": 37765120, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
20
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S128_D8192", "batch": 4, "seq_len": 128, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.044550999973580474, "p50": 0.04615100010596507, "p90": 0.04661999992094934, "mean": 0.04619880000973353, "iqr": 0.0006089999260439072, "raw_times": [0.04661999992094934, 0.047661000053267344, 0.04615100010596507, 0.04601099999490543, 0.044550999973580474], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05021999982091074, "peak_bytes": 75530240, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null}
21
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S512_D1024", "batch": 4, "seq_len": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04479100016396842, "p50": 0.04570999999486958, "p90": 0.04578100015351083, "mean": 0.04546060008578934, "iqr": 0.0006410000423784368, "raw_times": [0.045881000005465467, 0.04578100015351083, 0.045140000111132395, 0.04479100016396842, 0.04570999999486958], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05074099999546888, "peak_bytes": 37752832, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00159454345703125, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null}
22
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S512_D2048", "batch": 4, "seq_len": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04388000002109038, "p50": 0.046260999852165696, "p90": 0.047070999926290824, "mean": 0.046070799999142764, "iqr": 0.0010899998414970469, "raw_times": [0.04716100011137314, 0.04598100008479378, 0.04388000002109038, 0.046260999852165696, 0.047070999926290824], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05007100003240339, "peak_bytes": 75505664, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00159454345703125, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null}
23
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S512_D4096", "batch": 4, "seq_len": 512, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04435100004229753, "p50": 0.045130999978937325, "p90": 0.04698099996858218, "mean": 0.04562479998639901, "iqr": 0.0023600000531587284, "raw_times": [0.044620999915423454, 0.04698099996858218, 0.04704000002675457, 0.04435100004229753, 0.045130999978937325], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04849099991588446, "peak_bytes": 151011328, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00156402587890625, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
24
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S512_D8192", "batch": 4, "seq_len": 512, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05119000002196117, "p50": 0.05123000005369249, "p90": 0.05150099991624302, "mean": 0.051574400004028575, "iqr": 0.00027999999474559445, "raw_times": [0.05122099992149742, 0.05150099991624302, 0.05123000005369249, 0.052730000106748776, 0.05119000002196117], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05633099999613478, "peak_bytes": 302022656, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1146068572998047e-05, "ref": "layer_norm_fp32"}, "err": null}
25
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S1024_D1024", "batch": 4, "seq_len": 1024, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04580100016937649, "p50": 0.04708999995273189, "p90": 0.04770099985762499, "mean": 0.05188039999666216, "iqr": 0.00096099984148168, "raw_times": [0.07206999998743413, 0.04674000001614331, 0.04580100016937649, 0.04708999995273189, 0.04770099985762499], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04944000011164462, "peak_bytes": 83922944, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015869140625, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null}
26
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S1024_D2048", "batch": 4, "seq_len": 1024, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04320099992582982, "p50": 0.04512100008469133, "p90": 0.04604099990501709, "mean": 0.04527500000222062, "iqr": 0.001329999804511317, "raw_times": [0.04320099992582982, 0.04604099990501709, 0.04471100010050577, 0.0473009999950591, 0.04512100008469133], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.051181000117139774, "peak_bytes": 151003136, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
27
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S1024_D4096", "batch": 4, "seq_len": 1024, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04984999986845651, "p50": 0.050290999979552, "p90": 0.050490999910834944, "mean": 0.050288599959458224, "iqr": 0.0005399999736255268, "raw_times": [0.04995099993720942, 0.050490999910834944, 0.050290999979552, 0.050860000101238256, 0.04984999986845651], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.052241000048525166, "peak_bytes": 302006272, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
28
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S1024_D8192", "batch": 4, "seq_len": 1024, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2064129998871067, "p50": 0.2123330000358692, "p90": 0.218262999851504, "mean": 0.2148927999769512, "iqr": 0.010130999726243317, "raw_times": [0.20813200012526067, 0.218262999851504, 0.2123330000358692, 0.2064129998871067, 0.22932299998501549], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21481299995684822, "peak_bytes": 604012544, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
29
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S2048_D1024", "batch": 4, "seq_len": 2048, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04353000008450181, "p50": 0.04543000000012398, "p90": 0.04657099998439662, "mean": 0.04557060001388891, "iqr": 0.001390000079481979, "raw_times": [0.04543000000012398, 0.04518099990491464, 0.04714100009550748, 0.04657099998439662, 0.04353000008450181], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0490809998154873, "peak_bytes": 167809024, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015869140625, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null}
30
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S2048_D2048", "batch": 4, "seq_len": 2048, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.054420999958892935, "p50": 0.05506100001184677, "p90": 0.055460999874412664, "mean": 0.055042999929355574, "iqr": 0.0008699998943484388, "raw_times": [0.054420999958892935, 0.055460999874412664, 0.05568099982156127, 0.054590999980064225, 0.05506100001184677], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05802099985885434, "peak_bytes": 301998080, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
31
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S2048_D4096", "batch": 4, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.20996300008846447, "p50": 0.2102230000673444, "p90": 0.21053299997220165, "mean": 0.21050080003988114, "iqr": 0.0004209998678561533, "raw_times": [0.20996300008846447, 0.21053299997220165, 0.2102230000673444, 0.2101120001043455, 0.2116729999670497], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21157300011509506, "peak_bytes": 603996160, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1265277862548828e-05, "ref": "layer_norm_fp32"}, "err": null}
32
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B4_S2048_D8192", "batch": 4, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.4341660001045966, "p50": 0.4372359999251785, "p90": 0.4383160000998032, "mean": 0.437980000015159, "iqr": 0.004120000085094944, "raw_times": [0.4383160000998032, 0.4372359999251785, 0.4341660001045966, 0.43419600001470826, 0.44598599993150856], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.44448700009525055, "peak_bytes": 1207992320, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
33
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S128_D1024", "batch": 16, "seq_len": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04514099987318332, "p50": 0.0465299999632407, "p90": 0.04655099996853096, "mean": 0.04629059999388119, "iqr": 0.0011309998626529705, "raw_times": [0.04514099987318332, 0.04655099996853096, 0.04781100005857297, 0.04542000010587799, 0.0465299999632407], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04811100006918423, "peak_bytes": 276860928, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00159454345703125, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null}
34
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S128_D2048", "batch": 16, "seq_len": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.043820999962917995, "p50": 0.045551000084742554, "p90": 0.04633000003195775, "mean": 0.04580079998959263, "iqr": 0.0007890000688348664, "raw_times": [0.043820999962917995, 0.04776099990522198, 0.045551000084742554, 0.045540999963122886, 0.04633000003195775], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05054000007476134, "peak_bytes": 75505664, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00159454345703125, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null}
35
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S128_D4096", "batch": 16, "seq_len": 128, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04511099996307166, "p50": 0.04610100017998775, "p90": 0.04624100006367371, "mean": 0.04598500008796691, "iqr": 0.0004099999841855606, "raw_times": [0.04583100007948815, 0.04664100015361328, 0.04511099996307166, 0.04610100017998775, 0.04624100006367371], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04932000001645065, "peak_bytes": 151011328, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00156402587890625, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
36
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S128_D8192", "batch": 16, "seq_len": 128, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05165100014892232, "p50": 0.052130999847577186, "p90": 0.05317099999047059, "mean": 0.05250480003269331, "iqr": 0.0012309999419812812, "raw_times": [0.052130999847577186, 0.05165100014892232, 0.053631000128007145, 0.05317099999047059, 0.05194000004848931], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055000999964249786, "peak_bytes": 302022656, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1146068572998047e-05, "ref": "layer_norm_fp32"}, "err": null}
37
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S512_D1024", "batch": 16, "seq_len": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.045381000063571264, "p50": 0.045759999920846894, "p90": 0.04781100005857297, "mean": 0.04770240002471837, "iqr": 0.00238100005844899, "raw_times": [0.045759999920846894, 0.04781100005857297, 0.045381000063571264, 0.04543000000012398, 0.05413000008047675], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04919000002701068, "peak_bytes": 150999040, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015869140625, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null}
38
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S512_D2048", "batch": 16, "seq_len": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05421099990599032, "p50": 0.054861000080563826, "p90": 0.05564100001720362, "mean": 0.05508300000656163, "iqr": 0.0010100000054080738, "raw_times": [0.056071000017254846, 0.054861000080563826, 0.05564100001720362, 0.05421099990599032, 0.05463100001179555], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05805000000691507, "peak_bytes": 301998080, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00157928466796875, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
39
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S512_D4096", "batch": 16, "seq_len": 512, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.20916299990858533, "p50": 0.21016300001974741, "p90": 0.21141399997759436, "mean": 0.21107719999235997, "iqr": 0.0015210000583465444, "raw_times": [0.21141399997759436, 0.2147530001366249, 0.2098929999192478, 0.21016300001974741, 0.20916299990858533], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21191299993006396, "peak_bytes": 603996160, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1265277862548828e-05, "ref": "layer_norm_fp32"}, "err": null}
40
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S512_D8192", "batch": 16, "seq_len": 512, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.43155599996680394, "p50": 0.43475600000419945, "p90": 0.4373360000045068, "mean": 0.43558200000006764, "iqr": 0.003800000058618025, "raw_times": [0.43475600000419945, 0.44072600007893925, 0.4373360000045068, 0.43353599994588876, 0.43155599996680394], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.44892699997944874, "peak_bytes": 1207992320, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00152587890625, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
41
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S1024_D1024", "batch": 16, "seq_len": 1024, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0483500000427739, "p50": 0.049099999841928366, "p90": 0.04950099992129253, "mean": 0.050544599935165024, "iqr": 0.0011199999789823778, "raw_times": [0.048380999942310154, 0.04950099992129253, 0.05739099992752017, 0.049099999841928366, 0.0483500000427739], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05153099982635467, "peak_bytes": 335581184, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015869140625, "mse": 1.150369644165039e-05, "ref": "layer_norm_fp32"}, "err": null}
42
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S1024_D2048", "batch": 16, "seq_len": 1024, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2181429999836837, "p50": 0.2215729998624738, "p90": 0.2217329999893991, "mean": 0.22086119997766218, "iqr": 0.003440000000409782, "raw_times": [0.2181429999836837, 0.2217329999893991, 0.21829299998898932, 0.2215729998624738, 0.22456400006376498], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22583300005862839, "peak_bytes": 603987968, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
43
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S1024_D4096", "batch": 16, "seq_len": 1024, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.43596600016826415, "p50": 0.4398270000365301, "p90": 0.4409260000102222, "mean": 0.4390922000766295, "iqr": 0.003549999973984086, "raw_times": [0.4398270000365301, 0.4409260000102222, 0.4413660001318931, 0.43596600016826415, 0.4373760000362381], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.44040700004188693, "peak_bytes": 1207975936, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
44
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S1024_D8192", "batch": 16, "seq_len": 1024, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8329219999723136, "p50": 0.8419220000632777, "p90": 0.8434520000264456, "mean": 0.84072780000497, "iqr": 0.002130000211764127, "raw_times": [0.8329219999723136, 0.8419220000632777, 0.8440210001481319, 0.8434520000264456, 0.8413219998146815], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8442119999472197, "peak_bytes": 2415951872, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.00151824951171875, "mse": 1.1146068572998047e-05, "ref": "layer_norm_fp32"}, "err": null}
45
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D1024", "batch": 16, "seq_len": 2048, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21230300012575753, "p50": 0.2135429999725602, "p90": 0.2142630000889767, "mean": 0.21426700000120036, "iqr": 0.0008800002433417831, "raw_times": [0.21230300012575753, 0.2133829998456349, 0.2135429999725602, 0.2142630000889767, 0.21784299997307244], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22175300000526477, "peak_bytes": 671125504, "ok": false, "absmax": 0.0625, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.0625, "mae": 0.0015869140625, "mse": 1.1563301086425781e-05, "ref": "layer_norm_fp32"}, "err": null}
46
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D2048", "batch": 16, "seq_len": 2048, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.4536460000963416, "p50": 0.45670700001210207, "p90": 0.4569770001126017, "mean": 0.45669080004699936, "iqr": 0.00113999999484804, "raw_times": [0.4536460000963416, 0.4569770001126017, 0.45583700011775363, 0.45670700001210207, 0.4602869998961978], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.4546860000118613, "peak_bytes": 1207967744, "ok": false, "absmax": 0.0625, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.0625, "mae": 0.0015716552734375, "mse": 1.1444091796875e-05, "ref": "layer_norm_fp32"}, "err": null}
47
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8352710001418018, "p50": 0.8370320001631626, "p90": 0.8388319999994565, "mean": 0.8375798000997747, "iqr": 0.0019899998733308166, "raw_times": [0.8352710001418018, 0.8368420001261256, 0.8399220000683272, 0.8370320001631626, 0.8388319999994565], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.849921000053655, "peak_bytes": 2415935488, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_fp32"}, "err": null}
48
- {"ts": "2025-10-27T14:46:20Z", "run": "9dc250db88854e28964e0fea9bac9f3c", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6451530000267667, "p50": 1.6546740000649152, "p90": 1.6553830000702874, "mean": 1.6516054000476288, "iqr": 0.008870000101524056, "raw_times": [1.6553830000702874, 1.6465129999687633, 1.6563040001074114, 1.6546740000649152, 1.6451530000267667], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.655194000022675, "peak_bytes": 4831870976, "ok": false, "absmax": 0.03125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_fp32"}, "err": null}
 
1
+ {"ts": "2025-10-28T14:08:59Z", "run": "c74bca3f7fd14f779c98e3d8b69c0098", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8268990000033227, "p50": 0.8360890000176369, "p90": 0.8378790000733716, "mean": 0.8358750000070359, "iqr": 0.002010000116570154, "raw_times": [0.8426389999840467, 0.8268990000033227, 0.8378790000733716, 0.8360890000176369, 0.8358689999568014], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8452999999235544, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
2
+ {"ts": "2025-10-28T14:09:00Z", "run": "c74bca3f7fd14f779c98e3d8b69c0098", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6477070000746608, "p50": 1.6516379999984565, "p90": 1.6565669999408783, "mean": 1.6533151999965412, "iqr": 0.006360999918797461, "raw_times": [1.6565669999408783, 1.6516379999984565, 1.6477070000746608, 1.6604579999466296, 1.6502060000220808], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6544470000781075, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null}
3
+ {"ts": "2025-10-28T14:09:00Z", "run": "c74bca3f7fd14f779c98e3d8b69c0098", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6441269999631913, "p50": 1.6532669999378413, "p90": 1.6534970000066096, "mean": 1.6500411999913922, "iqr": 0.009149999982582813, "raw_times": [1.6441269999631913, 1.6534970000066096, 1.6532669999378413, 1.6443470000240268, 1.654968000025292], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6665570000213847, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
4
+ {"ts": "2025-10-28T14:09:00Z", "run": "c74bca3f7fd14f779c98e3d8b69c0098", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.251962999911484, "p50": 3.270412999995642, "p90": 3.2735430000911947, "mean": 3.2660931999998866, "iqr": 0.01632000009976764, "raw_times": [3.2735430000911947, 3.251962999911484, 3.257222999991427, 3.277324000009685, 3.270412999995642], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.2640430000583365, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
layer_norm/impls/hf_kernels_layer_norm.html CHANGED
The diff for this file is too large to render. See raw diff
 
layer_norm/impls/torch_layer_norm.html CHANGED
The diff for this file is too large to render. See raw diff
 
layer_norm/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: e7883bd5f88a9163cc9fdaeec2076ca6319f97d413c6bea136db33612dc2b864
  • Pointer size: 128 Bytes
  • Size of remote file: 947 Bytes

Git LFS Details

  • SHA256: 1e41c135df9f0b506fa1ac950b90bd609d850f01d79b3171b3678c24fdab066a
  • Pointer size: 130 Bytes
  • Size of remote file: 14.6 kB
layer_norm/results/combined_results.html CHANGED
@@ -3867,12 +3867,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3867
  <h2>Combined Summary and Visualization</h2>
3868
  <div class="artifact-preview">
3869
  <?xml version='1.0' encoding='utf-8'?>
3870
- <svg xmlns="http://www.w3.org/2000/svg" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:ns2="http://creativecommons.org/ns#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" width="460.8pt" height="345.6pt" viewBox="0 0 460.8 345.6" version="1.1">
3871
  <metadata>
3872
  <rdf:RDF>
3873
  <ns2:Work>
3874
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3875
- <dc:date>2025-10-27T14:46:34.455868</dc:date>
3876
  <dc:format>image/svg+xml</dc:format>
3877
  <dc:creator>
3878
  <ns2:Agent>
@@ -3887,9 +3887,214 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3887
  </defs>
3888
  <g id="figure--latency" class="figure">
3889
  <g id="patch_1">
3890
- <path d="M 0 345.6 L 460.8 345.6 L 460.8 0 L 0 0 L 0 345.6 z " style="fill: none" />
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3891
  </g>
3892
  </g>
 
 
 
 
 
3893
  </svg>
3894
  </div>
3895
 
@@ -3900,7 +4105,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3900
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
3901
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
3902
  </span> |
3903
- Cell: combine | 4.28s
3904
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
3905
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
3906
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -3987,107 +4192,20 @@ Summary: 2 found, 0 skipped, 0 missing
3987
  COMBINED BENCHMARK SUMMARY
3988
 
3989
  impl wl p50(ms) ok
3990
- hf_kernels_layer_norm LN_B16_S1024_D1024 0.05 False
3991
- hf_kernels_layer_norm LN_B16_S1024_D2048 0.22 False
3992
- hf_kernels_layer_norm LN_B16_S1024_D4096 0.44 False
3993
- hf_kernels_layer_norm LN_B16_S1024_D8192 0.84 False
3994
- hf_kernels_layer_norm LN_B16_S128_D1024 0.05 False
3995
- hf_kernels_layer_norm LN_B16_S128_D2048 0.05 False
3996
- hf_kernels_layer_norm LN_B16_S128_D4096 0.05 False
3997
- hf_kernels_layer_norm LN_B16_S128_D8192 0.05 False
3998
- hf_kernels_layer_norm LN_B16_S2048_D1024 0.21 False
3999
- hf_kernels_layer_norm LN_B16_S2048_D2048 0.46 False
4000
- hf_kernels_layer_norm LN_B16_S2048_D4096 0.84 False
4001
- hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 False
4002
- hf_kernels_layer_norm LN_B16_S512_D1024 0.05 False
4003
- hf_kernels_layer_norm LN_B16_S512_D2048 0.05 False
4004
- hf_kernels_layer_norm LN_B16_S512_D4096 0.21 False
4005
- hf_kernels_layer_norm LN_B16_S512_D8192 0.43 False
4006
- hf_kernels_layer_norm LN_B1_S1024_D1024 0.05 False
4007
- hf_kernels_layer_norm LN_B1_S1024_D2048 0.05 False
4008
- hf_kernels_layer_norm LN_B1_S1024_D4096 0.05 False
4009
- hf_kernels_layer_norm LN_B1_S1024_D8192 0.05 False
4010
- hf_kernels_layer_norm LN_B1_S128_D1024 0.04 False
4011
- hf_kernels_layer_norm LN_B1_S128_D2048 0.05 False
4012
- hf_kernels_layer_norm LN_B1_S128_D4096 0.05 False
4013
- hf_kernels_layer_norm LN_B1_S128_D8192 0.05 False
4014
- hf_kernels_layer_norm LN_B1_S2048_D1024 0.05 False
4015
- hf_kernels_layer_norm LN_B1_S2048_D2048 0.05 False
4016
- hf_kernels_layer_norm LN_B1_S2048_D4096 0.05 False
4017
- hf_kernels_layer_norm LN_B1_S2048_D8192 0.05 False
4018
- hf_kernels_layer_norm LN_B1_S512_D1024 0.05 False
4019
- hf_kernels_layer_norm LN_B1_S512_D2048 0.05 False
4020
- hf_kernels_layer_norm LN_B1_S512_D4096 0.05 False
4021
- hf_kernels_layer_norm LN_B1_S512_D8192 0.05 False
4022
- hf_kernels_layer_norm LN_B4_S1024_D1024 0.05 False
4023
- hf_kernels_layer_norm LN_B4_S1024_D2048 0.05 False
4024
- hf_kernels_layer_norm LN_B4_S1024_D4096 0.05 False
4025
- hf_kernels_layer_norm LN_B4_S1024_D8192 0.21 False
4026
- hf_kernels_layer_norm LN_B4_S128_D1024 0.05 False
4027
- hf_kernels_layer_norm LN_B4_S128_D2048 0.05 False
4028
- hf_kernels_layer_norm LN_B4_S128_D4096 0.05 False
4029
- hf_kernels_layer_norm LN_B4_S128_D8192 0.05 False
4030
- hf_kernels_layer_norm LN_B4_S2048_D1024 0.05 False
4031
- hf_kernels_layer_norm LN_B4_S2048_D2048 0.06 False
4032
- hf_kernels_layer_norm LN_B4_S2048_D4096 0.21 False
4033
- hf_kernels_layer_norm LN_B4_S2048_D8192 0.44 False
4034
- hf_kernels_layer_norm LN_B4_S512_D1024 0.05 False
4035
- hf_kernels_layer_norm LN_B4_S512_D2048 0.05 False
4036
- hf_kernels_layer_norm LN_B4_S512_D4096 0.05 False
4037
- hf_kernels_layer_norm LN_B4_S512_D8192 0.05 False
4038
- torch_layer_norm LN_B16_S1024_D1024 0.05 False
4039
- torch_layer_norm LN_B16_S1024_D2048 0.21 False
4040
- torch_layer_norm LN_B16_S1024_D4096 0.42 False
4041
- torch_layer_norm LN_B16_S1024_D8192 0.85 False
4042
- torch_layer_norm LN_B16_S128_D1024 0.03 False
4043
- torch_layer_norm LN_B16_S128_D2048 0.03 False
4044
- torch_layer_norm LN_B16_S128_D4096 0.04 False
4045
- torch_layer_norm LN_B16_S128_D8192 0.05 False
4046
- torch_layer_norm LN_B16_S2048_D1024 0.21 False
4047
- torch_layer_norm LN_B16_S2048_D2048 0.42 False
4048
- torch_layer_norm LN_B16_S2048_D4096 0.82 False
4049
- torch_layer_norm LN_B16_S2048_D8192 1.68 False
4050
- torch_layer_norm LN_B16_S512_D1024 0.04 False
4051
- torch_layer_norm LN_B16_S512_D2048 0.05 False
4052
- torch_layer_norm LN_B16_S512_D4096 0.21 False
4053
- torch_layer_norm LN_B16_S512_D8192 0.43 False
4054
- torch_layer_norm LN_B1_S1024_D1024 0.03 False
4055
- torch_layer_norm LN_B1_S1024_D2048 0.03 False
4056
- torch_layer_norm LN_B1_S1024_D4096 0.03 False
4057
- torch_layer_norm LN_B1_S1024_D8192 0.04 False
4058
- torch_layer_norm LN_B1_S128_D1024 0.02 False
4059
- torch_layer_norm LN_B1_S128_D2048 0.03 False
4060
- torch_layer_norm LN_B1_S128_D4096 0.03 False
4061
- torch_layer_norm LN_B1_S128_D8192 0.03 False
4062
- torch_layer_norm LN_B1_S2048_D1024 0.03 False
4063
- torch_layer_norm LN_B1_S2048_D2048 0.03 False
4064
- torch_layer_norm LN_B1_S2048_D4096 0.04 False
4065
- torch_layer_norm LN_B1_S2048_D8192 0.05 False
4066
- torch_layer_norm LN_B1_S512_D1024 0.03 False
4067
- torch_layer_norm LN_B1_S512_D2048 0.03 False
4068
- torch_layer_norm LN_B1_S512_D4096 0.03 False
4069
- torch_layer_norm LN_B1_S512_D8192 0.03 False
4070
- torch_layer_norm LN_B4_S1024_D1024 0.03 False
4071
- torch_layer_norm LN_B4_S1024_D2048 0.04 False
4072
- torch_layer_norm LN_B4_S1024_D4096 0.05 False
4073
- torch_layer_norm LN_B4_S1024_D8192 0.20 False
4074
- torch_layer_norm LN_B4_S128_D1024 0.03 False
4075
- torch_layer_norm LN_B4_S128_D2048 0.03 False
4076
- torch_layer_norm LN_B4_S128_D4096 0.03 False
4077
- torch_layer_norm LN_B4_S128_D8192 0.03 False
4078
- torch_layer_norm LN_B4_S2048_D1024 0.04 False
4079
- torch_layer_norm LN_B4_S2048_D2048 0.05 False
4080
- torch_layer_norm LN_B4_S2048_D4096 0.21 False
4081
- torch_layer_norm LN_B4_S2048_D8192 0.44 False
4082
- torch_layer_norm LN_B4_S512_D1024 0.03 False
4083
- torch_layer_norm LN_B4_S512_D2048 0.03 False
4084
- torch_layer_norm LN_B4_S512_D4096 0.04 False
4085
- torch_layer_norm LN_B4_S512_D8192 0.05 False
4086
 
4087
  GENERATING COMBINED VISUALIZATION
4088
 
4089
- Loaded 96 records
4090
- No valid records found
 
4091
  ✓ Visualization saved as latency.svg
4092
  ✓ SVG visualization ready!
4093
 
@@ -4101,7 +4219,7 @@ Implementations included:
4101
  <div class="uv-install-logs" id="uv-logs-combine">
4102
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4103
  <div class="uv-logs-content" style="display: none;">
4104
- Installed 37 packages in 260ms
4105
  </div>
4106
  </div>
4107
  <div class="cell-artifacts">
@@ -4109,12 +4227,12 @@ Installed 37 packages in 260ms
4109
  <a href="artifacts/combine/latency.svg" class="artifact" target="_blank">latency.svg</a>
4110
  <div class="artifact-preview">
4111
  <?xml version='1.0' encoding='utf-8'?>
4112
- <svg xmlns="http://www.w3.org/2000/svg" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:ns2="http://creativecommons.org/ns#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" width="460.8pt" height="345.6pt" viewBox="0 0 460.8 345.6" version="1.1">
4113
  <metadata>
4114
  <rdf:RDF>
4115
  <ns2:Work>
4116
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4117
- <dc:date>2025-10-27T14:46:34.455868</dc:date>
4118
  <dc:format>image/svg+xml</dc:format>
4119
  <dc:creator>
4120
  <ns2:Agent>
@@ -4129,9 +4247,214 @@ Installed 37 packages in 260ms
4129
  </defs>
4130
  <g id="figure--latency" class="figure">
4131
  <g id="patch_1">
4132
- <path d="M 0 345.6 L 460.8 345.6 L 460.8 0 L 0 0 L 0 345.6 z " style="fill: none" />
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4133
  </g>
4134
  </g>
 
 
 
 
 
4135
  </svg>
4136
  </div>
4137
  </div>
 
3867
  <h2>Combined Summary and Visualization</h2>
3868
  <div class="artifact-preview">
3869
  <?xml version='1.0' encoding='utf-8'?>
3870
+ <svg xmlns="http://www.w3.org/2000/svg" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:ns2="http://creativecommons.org/ns#" xmlns:ns4="http://www.w3.org/1999/xlink" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" width="864pt" height="576pt" viewBox="0 0 864 576" version="1.1">
3871
  <metadata>
3872
  <rdf:RDF>
3873
  <ns2:Work>
3874
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3875
+ <dc:date>2025-10-28T14:09:21.825978</dc:date>
3876
  <dc:format>image/svg+xml</dc:format>
3877
  <dc:creator>
3878
  <ns2:Agent>
 
3887
  </defs>
3888
  <g id="figure--latency" class="figure">
3889
  <g id="patch_1">
3890
+ <path d="M 0 576 L 864 576 L 864 0 L 0 0 L 0 576 z " style="fill: none" />
3891
+ </g>
3892
+ <g id="axes--1" class="axes">
3893
+ <g id="patch_2">
3894
+ <path d="M 47.72 457.251932 L 840.20233 457.251932 L 840.20233 26.88 L 47.72 26.88 L 47.72 457.251932 z " style="fill: none" />
3895
+ </g>
3896
+ <g id="matplotlib.axis_1">
3897
+ <g id="xtick_1">
3898
+ <g id="grid-x--1" class="grid grid-x">
3899
+ <path d="M 83.741924 457.251932 L 83.741924 26.88 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3900
+ </g>
3901
+ <g id="line2d_1">
3902
+ <defs>
3903
+ <path id="mafb3703e5b" d="M 0 0 L 0 3.5 " style="stroke: #000000; stroke-width: 0.8" />
3904
+ </defs>
3905
+ <g>
3906
+ <use ns4:href="#mafb3703e5b" x="83.741924" y="457.251932" style="stroke: #000000; stroke-width: 0.8" />
3907
+ </g>
3908
+ </g>
3909
+ <g id="text_1">
3910
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(45.726648 549.111375) rotate(-45)">LN_B16_S2048_D4096</text>
3911
+ </g>
3912
+ </g>
3913
+ <g id="xtick_2">
3914
+ <g id="grid-x--2" class="grid grid-x">
3915
+ <path d="M 323.888085 457.251932 L 323.888085 26.88 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3916
+ </g>
3917
+ <g id="line2d_2">
3918
+ <g>
3919
+ <use ns4:href="#mafb3703e5b" x="323.888085" y="457.251932" style="stroke: #000000; stroke-width: 0.8" />
3920
+ </g>
3921
+ </g>
3922
+ <g id="text_2">
3923
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(285.872809 549.111375) rotate(-45)">LN_B16_S2048_D8192</text>
3924
+ </g>
3925
+ </g>
3926
+ <g id="xtick_3">
3927
+ <g id="grid-x--3" class="grid grid-x">
3928
+ <path d="M 564.034245 457.251932 L 564.034245 26.88 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3929
+ </g>
3930
+ <g id="line2d_3">
3931
+ <g>
3932
+ <use ns4:href="#mafb3703e5b" x="564.034245" y="457.251932" style="stroke: #000000; stroke-width: 0.8" />
3933
+ </g>
3934
+ </g>
3935
+ <g id="text_3">
3936
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(526.018969 549.111375) rotate(-45)">LN_B16_S4096_D4096</text>
3937
+ </g>
3938
+ </g>
3939
+ <g id="xtick_4">
3940
+ <g id="grid-x--4" class="grid grid-x">
3941
+ <path d="M 804.180406 457.251932 L 804.180406 26.88 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3942
+ </g>
3943
+ <g id="line2d_4">
3944
+ <g>
3945
+ <use ns4:href="#mafb3703e5b" x="804.180406" y="457.251932" style="stroke: #000000; stroke-width: 0.8" />
3946
+ </g>
3947
+ </g>
3948
+ <g id="text_4">
3949
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(766.16513 549.111375) rotate(-45)">LN_B16_S4096_D8192</text>
3950
+ </g>
3951
+ </g>
3952
+ <g id="label--x" class="xlabel">
3953
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="443.961165" y="562.377038" transform="rotate(-0 443.961165 562.377038)">Workload</text>
3954
+ </g>
3955
+ </g>
3956
+ <g id="matplotlib.axis_2">
3957
+ <g id="ytick_1">
3958
+ <g id="grid-y--2" class="grid grid-y">
3959
+ <path d="M 47.72 409.029804 L 840.20233 409.029804 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3960
+ </g>
3961
+ <g id="line2d_5">
3962
+ <defs>
3963
+ <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
3964
+ </defs>
3965
+ <g>
3966
+ <use ns4:href="#m0fca2865ba" x="47.72" y="409.029804" style="stroke: #000000; stroke-width: 0.8" />
3967
+ </g>
3968
+ </g>
3969
+ <g id="text_5">
3970
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="412.829023" transform="rotate(-0 40.72 412.829023)">1.0</text>
3971
+ </g>
3972
+ </g>
3973
+ <g id="ytick_2">
3974
+ <g id="grid-y--3" class="grid grid-y">
3975
+ <path d="M 47.72 331.290271 L 840.20233 331.290271 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3976
+ </g>
3977
+ <g id="line2d_6">
3978
+ <g>
3979
+ <use ns4:href="#m0fca2865ba" x="47.72" y="331.290271" style="stroke: #000000; stroke-width: 0.8" />
3980
+ </g>
3981
+ </g>
3982
+ <g id="text_6">
3983
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="335.08949" transform="rotate(-0 40.72 335.08949)">1.5</text>
3984
+ </g>
3985
+ </g>
3986
+ <g id="ytick_3">
3987
+ <g id="grid-y--4" class="grid grid-y">
3988
+ <path d="M 47.72 253.550738 L 840.20233 253.550738 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3989
+ </g>
3990
+ <g id="line2d_7">
3991
+ <g>
3992
+ <use ns4:href="#m0fca2865ba" x="47.72" y="253.550738" style="stroke: #000000; stroke-width: 0.8" />
3993
+ </g>
3994
+ </g>
3995
+ <g id="text_7">
3996
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="257.349957" transform="rotate(-0 40.72 257.349957)">2.0</text>
3997
+ </g>
3998
+ </g>
3999
+ <g id="ytick_4">
4000
+ <g id="grid-y--5" class="grid grid-y">
4001
+ <path d="M 47.72 175.811205 L 840.20233 175.811205 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4002
+ </g>
4003
+ <g id="line2d_8">
4004
+ <g>
4005
+ <use ns4:href="#m0fca2865ba" x="47.72" y="175.811205" style="stroke: #000000; stroke-width: 0.8" />
4006
+ </g>
4007
+ </g>
4008
+ <g id="text_8">
4009
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="179.610424" transform="rotate(-0 40.72 179.610424)">2.5</text>
4010
+ </g>
4011
+ </g>
4012
+ <g id="ytick_5">
4013
+ <g id="grid-y--6" class="grid grid-y">
4014
+ <path d="M 47.72 98.071672 L 840.20233 98.071672 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4015
+ </g>
4016
+ <g id="line2d_9">
4017
+ <g>
4018
+ <use ns4:href="#m0fca2865ba" x="47.72" y="98.071672" style="stroke: #000000; stroke-width: 0.8" />
4019
+ </g>
4020
+ </g>
4021
+ <g id="text_9">
4022
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="101.870891" transform="rotate(-0 40.72 101.870891)">3.0</text>
4023
+ </g>
4024
+ </g>
4025
+ <g id="label--y" class="ylabel">
4026
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.737188" y="242.065966" transform="rotate(-90 18.737188 242.065966)">Latency P50 (ms)</text>
4027
+ </g>
4028
+ </g>
4029
+ <g id="series--torch-layer-norm" class="series">
4030
+ <path d="M 83.741924 437.689571 L 323.888085 303.094453 L 564.034245 314.534914 L 804.180406 46.442361 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4031
+ <defs>
4032
+ <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4033
+ </defs>
4034
+ <g clip-path="url(#p2214f54723)">
4035
+ <use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
4036
+ <use ns4:href="#md7efaf3aec" x="323.888085" y="303.094453" style="fill: #1f77b4; stroke: #1f77b4" />
4037
+ <use ns4:href="#md7efaf3aec" x="564.034245" y="314.534914" style="fill: #1f77b4; stroke: #1f77b4" />
4038
+ <use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
4039
+ </g>
4040
+ </g>
4041
+ <g id="series--hf-kernels-layer-norm" class="series">
4042
+ <path d="M 83.741924 434.514533 L 323.888085 307.713737 L 564.034245 307.460461 L 804.180406 56.028111 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4043
+ <defs>
4044
+ <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4045
+ </defs>
4046
+ <g clip-path="url(#p2214f54723)">
4047
+ <use ns4:href="#m9b8c54d372" x="83.741924" y="434.514533" style="fill: #ff7f0e; stroke: #ff7f0e" />
4048
+ <use ns4:href="#m9b8c54d372" x="323.888085" y="307.713737" style="fill: #ff7f0e; stroke: #ff7f0e" />
4049
+ <use ns4:href="#m9b8c54d372" x="564.034245" y="307.460461" style="fill: #ff7f0e; stroke: #ff7f0e" />
4050
+ <use ns4:href="#m9b8c54d372" x="804.180406" y="56.028111" style="fill: #ff7f0e; stroke: #ff7f0e" />
4051
+ </g>
4052
+ </g>
4053
+ <g id="patch_3">
4054
+ <path d="M 47.72 457.251932 L 47.72 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4055
+ </g>
4056
+ <g id="patch_4">
4057
+ <path d="M 840.20233 457.251932 L 840.20233 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4058
+ </g>
4059
+ <g id="patch_5">
4060
+ <path d="M 47.72 457.251932 L 840.20233 457.251932 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4061
+ </g>
4062
+ <g id="patch_6">
4063
+ <path d="M 47.72 26.88 L 840.20233 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4064
+ </g>
4065
+ <g id="text_10">
4066
+ <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="443.961165" y="20.88" transform="rotate(-0 443.961165 20.88)">Attention Implementation Latency</text>
4067
+ </g>
4068
+ <g id="legend" class="legend">
4069
+ <g id="patch_7">
4070
+ <path d="M 54.72 64.7925 L 198.795 64.7925 Q 200.795 64.7925 200.795 62.7925 L 200.795 33.88 Q 200.795 31.88 198.795 31.88 L 54.72 31.88 Q 52.72 31.88 52.72 33.88 L 52.72 62.7925 Q 52.72 64.7925 54.72 64.7925 L 54.72 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4071
+ </g>
4072
+ <g id="line2d_10">
4073
+ <path d="M 56.72 39.978438 L 66.72 39.978438 L 76.72 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4074
+ <g>
4075
+ <use ns4:href="#md7efaf3aec" x="66.72" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
4076
+ </g>
4077
+ </g>
4078
+ <g id="legend-label--torch-layer-norm" class="legend">
4079
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.72" y="43.478438" transform="rotate(-0 84.72 43.478438)">torch_layer_norm</text>
4080
+ </g>
4081
+ <g id="line2d_11">
4082
+ <path d="M 56.72 54.934687 L 66.72 54.934687 L 76.72 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4083
+ <g>
4084
+ <use ns4:href="#m9b8c54d372" x="66.72" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
4085
+ </g>
4086
+ </g>
4087
+ <g id="legend-label--hf-kernels-layer-norm" class="legend">
4088
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.72" y="58.434687" transform="rotate(-0 84.72 58.434687)">hf_kernels_layer_norm</text>
4089
+ </g>
4090
+ </g>
4091
  </g>
4092
  </g>
4093
+ <defs>
4094
+ <clipPath id="p2214f54723">
4095
+ <rect x="47.72" y="26.88" width="792.48233" height="430.371932" />
4096
+ </clipPath>
4097
+ </defs>
4098
  </svg>
4099
  </div>
4100
 
 
4105
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4106
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4107
  </span> |
4108
+ Cell: combine | 4.25s
4109
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4110
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4111
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
4192
  COMBINED BENCHMARK SUMMARY
4193
 
4194
  impl wl p50(ms) ok
4195
+ hf_kernels_layer_norm LN_B16_S2048_D4096 0.84 True
4196
+ hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 True
4197
+ hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
4198
+ hf_kernels_layer_norm LN_B16_S4096_D8192 3.27 True
4199
+ torch_layer_norm LN_B16_S2048_D4096 0.82 True
4200
+ torch_layer_norm LN_B16_S2048_D8192 1.68 True
4201
+ torch_layer_norm LN_B16_S4096_D4096 1.61 True
4202
+ torch_layer_norm LN_B16_S4096_D8192 3.33 True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4203
 
4204
  GENERATING COMBINED VISUALIZATION
4205
 
4206
+ Loaded 8 records
4207
+ Visualization saved as latency.svg
4208
+ Saved latency.png
4209
  ✓ Visualization saved as latency.svg
4210
  ✓ SVG visualization ready!
4211
 
 
4219
  <div class="uv-install-logs" id="uv-logs-combine">
4220
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4221
  <div class="uv-logs-content" style="display: none;">
4222
+ Installed 37 packages in 219ms
4223
  </div>
4224
  </div>
4225
  <div class="cell-artifacts">
 
4227
  <a href="artifacts/combine/latency.svg" class="artifact" target="_blank">latency.svg</a>
4228
  <div class="artifact-preview">
4229
  <?xml version='1.0' encoding='utf-8'?>
4230
+ <svg xmlns="http://www.w3.org/2000/svg" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:ns2="http://creativecommons.org/ns#" xmlns:ns4="http://www.w3.org/1999/xlink" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" width="864pt" height="576pt" viewBox="0 0 864 576" version="1.1">
4231
  <metadata>
4232
  <rdf:RDF>
4233
  <ns2:Work>
4234
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4235
+ <dc:date>2025-10-28T14:09:21.825978</dc:date>
4236
  <dc:format>image/svg+xml</dc:format>
4237
  <dc:creator>
4238
  <ns2:Agent>
 
4247
  </defs>
4248
  <g id="figure--latency" class="figure">
4249
  <g id="patch_1">
4250
+ <path d="M 0 576 L 864 576 L 864 0 L 0 0 L 0 576 z " style="fill: none" />
4251
+ </g>
4252
+ <g id="axes--1" class="axes">
4253
+ <g id="patch_2">
4254
+ <path d="M 47.72 457.251932 L 840.20233 457.251932 L 840.20233 26.88 L 47.72 26.88 L 47.72 457.251932 z " style="fill: none" />
4255
+ </g>
4256
+ <g id="matplotlib.axis_1">
4257
+ <g id="xtick_1">
4258
+ <g id="grid-x--1" class="grid grid-x">
4259
+ <path d="M 83.741924 457.251932 L 83.741924 26.88 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4260
+ </g>
4261
+ <g id="line2d_1">
4262
+ <defs>
4263
+ <path id="mafb3703e5b" d="M 0 0 L 0 3.5 " style="stroke: #000000; stroke-width: 0.8" />
4264
+ </defs>
4265
+ <g>
4266
+ <use ns4:href="#mafb3703e5b" x="83.741924" y="457.251932" style="stroke: #000000; stroke-width: 0.8" />
4267
+ </g>
4268
+ </g>
4269
+ <g id="text_1">
4270
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(45.726648 549.111375) rotate(-45)">LN_B16_S2048_D4096</text>
4271
+ </g>
4272
+ </g>
4273
+ <g id="xtick_2">
4274
+ <g id="grid-x--2" class="grid grid-x">
4275
+ <path d="M 323.888085 457.251932 L 323.888085 26.88 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4276
+ </g>
4277
+ <g id="line2d_2">
4278
+ <g>
4279
+ <use ns4:href="#mafb3703e5b" x="323.888085" y="457.251932" style="stroke: #000000; stroke-width: 0.8" />
4280
+ </g>
4281
+ </g>
4282
+ <g id="text_2">
4283
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(285.872809 549.111375) rotate(-45)">LN_B16_S2048_D8192</text>
4284
+ </g>
4285
+ </g>
4286
+ <g id="xtick_3">
4287
+ <g id="grid-x--3" class="grid grid-x">
4288
+ <path d="M 564.034245 457.251932 L 564.034245 26.88 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4289
+ </g>
4290
+ <g id="line2d_3">
4291
+ <g>
4292
+ <use ns4:href="#mafb3703e5b" x="564.034245" y="457.251932" style="stroke: #000000; stroke-width: 0.8" />
4293
+ </g>
4294
+ </g>
4295
+ <g id="text_3">
4296
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(526.018969 549.111375) rotate(-45)">LN_B16_S4096_D4096</text>
4297
+ </g>
4298
+ </g>
4299
+ <g id="xtick_4">
4300
+ <g id="grid-x--4" class="grid grid-x">
4301
+ <path d="M 804.180406 457.251932 L 804.180406 26.88 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4302
+ </g>
4303
+ <g id="line2d_4">
4304
+ <g>
4305
+ <use ns4:href="#mafb3703e5b" x="804.180406" y="457.251932" style="stroke: #000000; stroke-width: 0.8" />
4306
+ </g>
4307
+ </g>
4308
+ <g id="text_4">
4309
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(766.16513 549.111375) rotate(-45)">LN_B16_S4096_D8192</text>
4310
+ </g>
4311
+ </g>
4312
+ <g id="label--x" class="xlabel">
4313
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="443.961165" y="562.377038" transform="rotate(-0 443.961165 562.377038)">Workload</text>
4314
+ </g>
4315
+ </g>
4316
+ <g id="matplotlib.axis_2">
4317
+ <g id="ytick_1">
4318
+ <g id="grid-y--2" class="grid grid-y">
4319
+ <path d="M 47.72 409.029804 L 840.20233 409.029804 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4320
+ </g>
4321
+ <g id="line2d_5">
4322
+ <defs>
4323
+ <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4324
+ </defs>
4325
+ <g>
4326
+ <use ns4:href="#m0fca2865ba" x="47.72" y="409.029804" style="stroke: #000000; stroke-width: 0.8" />
4327
+ </g>
4328
+ </g>
4329
+ <g id="text_5">
4330
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="412.829023" transform="rotate(-0 40.72 412.829023)">1.0</text>
4331
+ </g>
4332
+ </g>
4333
+ <g id="ytick_2">
4334
+ <g id="grid-y--3" class="grid grid-y">
4335
+ <path d="M 47.72 331.290271 L 840.20233 331.290271 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4336
+ </g>
4337
+ <g id="line2d_6">
4338
+ <g>
4339
+ <use ns4:href="#m0fca2865ba" x="47.72" y="331.290271" style="stroke: #000000; stroke-width: 0.8" />
4340
+ </g>
4341
+ </g>
4342
+ <g id="text_6">
4343
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="335.08949" transform="rotate(-0 40.72 335.08949)">1.5</text>
4344
+ </g>
4345
+ </g>
4346
+ <g id="ytick_3">
4347
+ <g id="grid-y--4" class="grid grid-y">
4348
+ <path d="M 47.72 253.550738 L 840.20233 253.550738 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4349
+ </g>
4350
+ <g id="line2d_7">
4351
+ <g>
4352
+ <use ns4:href="#m0fca2865ba" x="47.72" y="253.550738" style="stroke: #000000; stroke-width: 0.8" />
4353
+ </g>
4354
+ </g>
4355
+ <g id="text_7">
4356
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="257.349957" transform="rotate(-0 40.72 257.349957)">2.0</text>
4357
+ </g>
4358
+ </g>
4359
+ <g id="ytick_4">
4360
+ <g id="grid-y--5" class="grid grid-y">
4361
+ <path d="M 47.72 175.811205 L 840.20233 175.811205 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4362
+ </g>
4363
+ <g id="line2d_8">
4364
+ <g>
4365
+ <use ns4:href="#m0fca2865ba" x="47.72" y="175.811205" style="stroke: #000000; stroke-width: 0.8" />
4366
+ </g>
4367
+ </g>
4368
+ <g id="text_8">
4369
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="179.610424" transform="rotate(-0 40.72 179.610424)">2.5</text>
4370
+ </g>
4371
+ </g>
4372
+ <g id="ytick_5">
4373
+ <g id="grid-y--6" class="grid grid-y">
4374
+ <path d="M 47.72 98.071672 L 840.20233 98.071672 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4375
+ </g>
4376
+ <g id="line2d_9">
4377
+ <g>
4378
+ <use ns4:href="#m0fca2865ba" x="47.72" y="98.071672" style="stroke: #000000; stroke-width: 0.8" />
4379
+ </g>
4380
+ </g>
4381
+ <g id="text_9">
4382
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="101.870891" transform="rotate(-0 40.72 101.870891)">3.0</text>
4383
+ </g>
4384
+ </g>
4385
+ <g id="label--y" class="ylabel">
4386
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.737188" y="242.065966" transform="rotate(-90 18.737188 242.065966)">Latency P50 (ms)</text>
4387
+ </g>
4388
+ </g>
4389
+ <g id="series--torch-layer-norm" class="series">
4390
+ <path d="M 83.741924 437.689571 L 323.888085 303.094453 L 564.034245 314.534914 L 804.180406 46.442361 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4391
+ <defs>
4392
+ <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4393
+ </defs>
4394
+ <g clip-path="url(#p2214f54723)">
4395
+ <use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
4396
+ <use ns4:href="#md7efaf3aec" x="323.888085" y="303.094453" style="fill: #1f77b4; stroke: #1f77b4" />
4397
+ <use ns4:href="#md7efaf3aec" x="564.034245" y="314.534914" style="fill: #1f77b4; stroke: #1f77b4" />
4398
+ <use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
4399
+ </g>
4400
+ </g>
4401
+ <g id="series--hf-kernels-layer-norm" class="series">
4402
+ <path d="M 83.741924 434.514533 L 323.888085 307.713737 L 564.034245 307.460461 L 804.180406 56.028111 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4403
+ <defs>
4404
+ <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4405
+ </defs>
4406
+ <g clip-path="url(#p2214f54723)">
4407
+ <use ns4:href="#m9b8c54d372" x="83.741924" y="434.514533" style="fill: #ff7f0e; stroke: #ff7f0e" />
4408
+ <use ns4:href="#m9b8c54d372" x="323.888085" y="307.713737" style="fill: #ff7f0e; stroke: #ff7f0e" />
4409
+ <use ns4:href="#m9b8c54d372" x="564.034245" y="307.460461" style="fill: #ff7f0e; stroke: #ff7f0e" />
4410
+ <use ns4:href="#m9b8c54d372" x="804.180406" y="56.028111" style="fill: #ff7f0e; stroke: #ff7f0e" />
4411
+ </g>
4412
+ </g>
4413
+ <g id="patch_3">
4414
+ <path d="M 47.72 457.251932 L 47.72 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4415
+ </g>
4416
+ <g id="patch_4">
4417
+ <path d="M 840.20233 457.251932 L 840.20233 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4418
+ </g>
4419
+ <g id="patch_5">
4420
+ <path d="M 47.72 457.251932 L 840.20233 457.251932 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4421
+ </g>
4422
+ <g id="patch_6">
4423
+ <path d="M 47.72 26.88 L 840.20233 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4424
+ </g>
4425
+ <g id="text_10">
4426
+ <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="443.961165" y="20.88" transform="rotate(-0 443.961165 20.88)">Attention Implementation Latency</text>
4427
+ </g>
4428
+ <g id="legend" class="legend">
4429
+ <g id="patch_7">
4430
+ <path d="M 54.72 64.7925 L 198.795 64.7925 Q 200.795 64.7925 200.795 62.7925 L 200.795 33.88 Q 200.795 31.88 198.795 31.88 L 54.72 31.88 Q 52.72 31.88 52.72 33.88 L 52.72 62.7925 Q 52.72 64.7925 54.72 64.7925 L 54.72 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4431
+ </g>
4432
+ <g id="line2d_10">
4433
+ <path d="M 56.72 39.978438 L 66.72 39.978438 L 76.72 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4434
+ <g>
4435
+ <use ns4:href="#md7efaf3aec" x="66.72" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
4436
+ </g>
4437
+ </g>
4438
+ <g id="legend-label--torch-layer-norm" class="legend">
4439
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.72" y="43.478438" transform="rotate(-0 84.72 43.478438)">torch_layer_norm</text>
4440
+ </g>
4441
+ <g id="line2d_11">
4442
+ <path d="M 56.72 54.934687 L 66.72 54.934687 L 76.72 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4443
+ <g>
4444
+ <use ns4:href="#m9b8c54d372" x="66.72" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
4445
+ </g>
4446
+ </g>
4447
+ <g id="legend-label--hf-kernels-layer-norm" class="legend">
4448
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="84.72" y="58.434687" transform="rotate(-0 84.72 58.434687)">hf_kernels_layer_norm</text>
4449
+ </g>
4450
+ </g>
4451
  </g>
4452
  </g>
4453
+ <defs>
4454
+ <clipPath id="p2214f54723">
4455
+ <rect x="47.72" y="26.88" width="792.48233" height="430.371932" />
4456
+ </clipPath>
4457
+ </defs>
4458
  </svg>
4459
  </div>
4460
  </div>
rotary/impls/artifacts/benchmark/rotary.jsonl ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.1724160000549091, "p50": 0.17308600001797458, "p90": 0.1756759999125279, "mean": 0.1760500000045795, "iqr": 0.0032199998258874984, "raw_times": [0.17245600008664042, 0.1756759999125279, 0.1724160000549091, 0.17308600001797458, 0.18661599995084543], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.17975699995531613, "peak_bytes": 1720320, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
2
+ {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22642799990535423, "p50": 0.2294280000114668, "p90": 0.23093799995876907, "mean": 0.23135619996992318, "iqr": 0.0026599999500831473, "raw_times": [0.23093799995876907, 0.22642799990535423, 0.22827800000868592, 0.2417089999653399, 0.2294280000114668], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23494799995660287, "peak_bytes": 3440640, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
3
+ {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21688800006813835, "p50": 0.21992799997860857, "p90": 0.2219079999576934, "mean": 0.22172000001319248, "iqr": 0.004439999884198187, "raw_times": [0.2174680000734952, 0.2219079999576934, 0.21688800006813835, 0.23240799998802686, 0.21992799997860857], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.225418000013633, "peak_bytes": 6832128, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
4
+ {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21487700007583044, "p50": 0.21964699999443837, "p90": 0.22132800006602338, "mean": 0.21978760003094067, "iqr": 0.005100000066704524, "raw_times": [0.21487700007583044, 0.21622799999931885, 0.21964699999443837, 0.22132800006602338, 0.2268580000190923], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.24882799993974913, "peak_bytes": 13664256, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
5
+ {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21332699998311, "p50": 0.21615699995436444, "p90": 0.21744800005762954, "mean": 0.21590960000139603, "iqr": 0.0025000000505315256, "raw_times": [0.21332699998311, 0.21744800005762954, 0.21494800000709802, 0.21766800000477815, 0.21615699995436444], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22001800005000405, "peak_bytes": 6881280, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
6
+ {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21038799991401902, "p50": 0.21561700009442575, "p90": 0.21720800009461527, "mean": 0.22098599999935686, "iqr": 0.004100000182916119, "raw_times": [0.21038799991401902, 0.21720800009461527, 0.21561700009442575, 0.24860899998202513, 0.21310799991169915], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2178580000418151, "peak_bytes": 13762560, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
7
+ {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21404700009952649, "p50": 0.21557699994900759, "p90": 0.2158679999411106, "mean": 0.2152116000161186, "iqr": 0.0011999999287581886, "raw_times": [0.2158679999411106, 0.21589800007859594, 0.21404700009952649, 0.21466800001235242, 0.21557699994900759], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.21567799990407366, "peak_bytes": 27328512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
8
+ {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21701799994389148, "p50": 0.21822700000484474, "p90": 0.22002800005793688, "mean": 0.2237478000097326, "iqr": 0.002031000008173578, "raw_times": [0.22002800005793688, 0.2179970000497633, 0.2454689999922266, 0.21822700000484474, 0.21701799994389148], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22291799996310147, "peak_bytes": 54657024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
9
+ {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21712800003115262, "p50": 0.21885700004986575, "p90": 0.2196080000658185, "mean": 0.22401780001928273, "iqr": 0.001630000042496249, "raw_times": [0.21797800002332224, 0.2196080000658185, 0.24651799992625456, 0.21885700004986575, 0.21712800003115262], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2206780000051367, "peak_bytes": 27525120, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
10
+ {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21462800009430794, "p50": 0.21782799990432977, "p90": 0.21795700001803198, "mean": 0.21911359999648994, "iqr": 0.0030300000162242213, "raw_times": [0.21462800009430794, 0.23022799996397225, 0.21782799990432977, 0.21492700000180776, 0.21795700001803198], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2186980000260519, "peak_bytes": 55050240, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
11
+ {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21579799999926763, "p50": 0.21701699995446688, "p90": 0.22130799993647088, "mean": 0.2237457999626713, "iqr": 0.004450000005817856, "raw_times": [0.21701699995446688, 0.22130799993647088, 0.21579799999926763, 0.24774799999249808, 0.21685799993065302], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22235700009787251, "peak_bytes": 109314048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
12
+ {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22434800007431477, "p50": 0.2248280000003433, "p90": 0.22490799995011912, "mean": 0.22479799997654482, "iqr": 0.00031000001854408765, "raw_times": [0.2248280000003433, 0.22490799995011912, 0.22459799993157503, 0.22434800007431477, 0.22530799992637185], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23522799995134847, "peak_bytes": 218628096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
13
+ {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21574699997017888, "p50": 0.21802799994929956, "p90": 0.21904799996264046, "mean": 0.22033179998288688, "iqr": 0.0018999999156221747, "raw_times": [0.21714800004701829, 0.21802799994929956, 0.2316879999852972, 0.21904799996264046, 0.21574699997017888], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22168800001054478, "peak_bytes": 68698112, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
14
+ {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21753800001533818, "p50": 0.21888800006308884, "p90": 0.22129700005280029, "mean": 0.22190180004599824, "iqr": 0.003358999947522534, "raw_times": [0.21753800001533818, 0.23384799999348616, 0.21793800010527775, 0.21888800006308884, 0.22129700005280029], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22266799999215436, "peak_bytes": 6848512, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
15
+ {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2166670000178783, "p50": 0.21850699999959033, "p90": 0.21964699999443837, "mean": 0.21864339998955984, "iqr": 0.001419000000169035, "raw_times": [0.21850699999959033, 0.2166670000178783, 0.22016799994162284, 0.21822799999426934, 0.21964699999443837], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23552799996195972, "peak_bytes": 13647872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
16
+ {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21624800001518452, "p50": 0.21773700007088337, "p90": 0.21802799994929956, "mean": 0.21774760000425886, "iqr": 0.0013409999155555852, "raw_times": [0.21668700003374397, 0.21773700007088337, 0.22003799995218287, 0.21624800001518452, 0.21802799994929956], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2226780000000872, "peak_bytes": 27295744, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
17
+ {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21748699998624943, "p50": 0.22014700005001941, "p90": 0.22206799997093185, "mean": 0.22232159999475698, "iqr": 0.0019999999949504854, "raw_times": [0.22014700005001941, 0.23183799999060284, 0.22206799997093185, 0.21748699998624943, 0.22006799997598137], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22040800001832395, "peak_bytes": 13697024, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
18
+ {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21601800006010308, "p50": 0.21957800004202, "p90": 0.22023799999715266, "mean": 0.2213318000030995, "iqr": 0.0024510000002919696, "raw_times": [0.23303799991936103, 0.21601800006010308, 0.2177869999968607, 0.21957800004202, 0.22023799999715266], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.220787999978711, "peak_bytes": 27394048, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
19
+ {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21692799998618284, "p50": 0.22003699996275827, "p90": 0.2230679999684071, "mean": 0.222287800011145, "iqr": 0.0031599998919773498, "raw_times": [0.21692799998618284, 0.21990800007642974, 0.2314980000619471, 0.2230679999684071, 0.22003699996275827], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22102700006598752, "peak_bytes": 54591488, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
20
+ {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2160679999860804, "p50": 0.21972700005790102, "p90": 0.22029800004474964, "mean": 0.21970960001453932, "iqr": 0.0024610000082248007, "raw_times": [0.2160679999860804, 0.2246179999474407, 0.22029800004474964, 0.21972700005790102, 0.21783700003652484], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22191799996562622, "peak_bytes": 109182976, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
21
+ {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2172279999967941, "p50": 0.21847799996521644, "p90": 0.22105800007921061, "mean": 0.22193580000475777, "iqr": 0.0035110000453641987, "raw_times": [0.21847799996521644, 0.22105800007921061, 0.23536799994872126, 0.21754700003384642, 0.2172279999967941], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.22206799997093185, "peak_bytes": 54788096, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
22
+ {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.21436800000174117, "p50": 0.21785799992812827, "p90": 0.2195579999124675, "mean": 0.2202379999744153, "iqr": 0.0030299999025373836, "raw_times": [0.21436800000174117, 0.21785799992812827, 0.2195579999124675, 0.2165280000099301, 0.2328780000198094], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.25353900002755836, "peak_bytes": 109576192, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
23
+ {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.22968799999034673, "p50": 0.23015800002212927, "p90": 0.23064800006977748, "mean": 0.23369620002995362, "iqr": 0.0006600000688194996, "raw_times": [0.23015800002212927, 0.24799900006655662, 0.22968799999034673, 0.22998800000095798, 0.23064800006977748], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.23042800000894204, "peak_bytes": 218365952, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
24
+ {"ts": "2025-10-28T14:08:27Z", "run": "19f83b2c64b6453c8956e46baabe2e54", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.6347319999804313, "p50": 0.6375930000785957, "p90": 0.639283000055002, "mean": 0.6376124000325945, "iqr": 0.003270999968663091, "raw_times": [0.6375930000785957, 0.636012000086339, 0.6404419999626043, 0.6347319999804313, 0.639283000055002], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.639422999938688, "peak_bytes": 436731904, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax_q": 0.0, "absmax_k": 0.0, "mae_q": 0.0, "mae_k": 0.0, "mse_q": 0.0, "mse_k": 0.0, "ref": "rotary_torch"}, "err": null}
rotary/impls/cells/benchmark.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # /// script
2
+ # requires-python = ">=3.10"
3
+ # dependencies = [
4
+ # "numpy",
5
+ # "torch==2.8.0",
6
+ # "kernels-benchmark-tools",
7
+ # ]
8
+ #
9
+ # [tool.uv.sources]
10
+ # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
11
+ # ///
12
+ import torch
13
+ import sys
14
+ from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
15
+
16
+
17
+ def apply_rotary_torch(x1, x2, cos, sin, conj=False):
18
+ """Reference rotary implementation."""
19
+ if not conj:
20
+ out1 = x1 * cos - x2 * sin
21
+ out2 = x1 * sin + x2 * cos
22
+ else:
23
+ out1 = x1 * cos + x2 * sin
24
+ out2 = -x1 * sin + x2 * cos
25
+ return out1, out2
26
+
27
+
28
+ def torch_rotary(query, key, cos, sin, conj=False):
29
+ rotary_dim = cos.shape[-1]
30
+
31
+ # Clone inputs to avoid modifying them
32
+ q_out = query.clone()
33
+ k_out = key.clone()
34
+
35
+ # Apply rotation to query
36
+ q1 = q_out[..., :rotary_dim]
37
+ q2 = q_out[..., rotary_dim : 2 * rotary_dim]
38
+ q_out_1, q_out_2 = apply_rotary_torch(q1, q2, cos, sin, conj)
39
+ q_out[..., :rotary_dim] = q_out_1
40
+ q_out[..., rotary_dim : 2 * rotary_dim] = q_out_2
41
+
42
+ # Apply rotation to key
43
+ k1 = k_out[..., :rotary_dim]
44
+ k2 = k_out[..., rotary_dim : 2 * rotary_dim]
45
+ k_out_1, k_out_2 = apply_rotary_torch(k1, k2, cos, sin, conj)
46
+ k_out[..., :rotary_dim] = k_out_1
47
+ k_out[..., rotary_dim : 2 * rotary_dim] = k_out_2
48
+
49
+ return q_out, k_out
50
+
51
+
52
+ run_benchmark(
53
+ kernel_type=KernelTypeEnum.ROTARY,
54
+ impl_name="torch_eager",
55
+ impl_tags={"family": "pytorch", "backend": "eager"},
56
+ impl_func=torch_rotary,
57
+ )
rotary/impls/cells/nv.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ import subprocess
2
+ print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
rotary/impls/hf_kernels_rotary.html ADDED
The diff for this file is too large to render. See raw diff
 
rotary/impls/index.html ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <meta charset='UTF-8'>
5
+ <meta name='viewport' content='width=device-width, initial-scale=1.0'>
6
+ <title>Index of /rotary/impls</title>
7
+ <style>
8
+ :root {
9
+ --bg-primary: #0a0a0a;
10
+ --bg-secondary: #121212;
11
+ --bg-tertiary: #181818;
12
+ --text-primary: #e0e0e0;
13
+ --text-secondary: #888888;
14
+ --text-link: #64b5f6;
15
+ --border-primary: #2a2a2a;
16
+ }
17
+ body {
18
+ font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
19
+ background: var(--bg-primary);
20
+ color: var(--text-primary);
21
+ margin: 0;
22
+ padding: 16px;
23
+ max-width: 900px;
24
+ margin: 0 auto;
25
+ }
26
+ .controls {
27
+ display: flex;
28
+ justify-content: flex-end;
29
+ margin-bottom: 1rem;
30
+ }
31
+ .back-button {
32
+ background: var(--bg-secondary);
33
+ border: 1px solid var(--border-primary);
34
+ padding: 8px 12px;
35
+ border-radius: 4px;
36
+ color: var(--text-secondary);
37
+ cursor: pointer;
38
+ font-size: 0.9rem;
39
+ text-decoration: none;
40
+ display: inline-block;
41
+ }
42
+ .back-button:hover {
43
+ color: var(--text-primary);
44
+ background: var(--bg-tertiary);
45
+ }
46
+ h1 {
47
+ font-size: 1.5em;
48
+ margin: 1rem 0;
49
+ color: var(--text-primary);
50
+ border-bottom: 1px solid var(--border-primary);
51
+ padding-bottom: 0.5rem;
52
+ }
53
+ ul {
54
+ list-style-type: none;
55
+ padding: 0;
56
+ }
57
+ li {
58
+ margin: 0;
59
+ border-bottom: 1px solid var(--border-primary);
60
+ }
61
+ li:last-child {
62
+ border-bottom: none;
63
+ }
64
+ a {
65
+ display: block;
66
+ padding: 0.75rem 0.5rem;
67
+ text-decoration: none;
68
+ color: var(--text-link);
69
+ transition: background 0.2s ease;
70
+ }
71
+ a:hover {
72
+ background: var(--bg-secondary);
73
+ }
74
+ .dir {
75
+ font-weight: 500;
76
+ }
77
+ </style>
78
+ </head>
79
+ <body>
80
+ <div class='controls'>
81
+ <a href='../index.html' class='back-button'>← back</a>
82
+ </div>
83
+ <h1>Index of /rotary/impls</h1>
84
+ <ul>
85
+ <li><a href='hf_kernels_rotary.html' class='file'>hf_kernels_rotary.html</a></li>
86
+ <li><a href='torch_rotary.html' class='file'>torch_rotary.html</a></li>
87
+ </ul>
88
+ </body>
89
+ </html>
rotary/impls/torch_rotary.html ADDED
The diff for this file is too large to render. See raw diff
 
rotary/index.html ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <meta charset='UTF-8'>
5
+ <meta name='viewport' content='width=device-width, initial-scale=1.0'>
6
+ <title>Index of /rotary</title>
7
+ <style>
8
+ :root {
9
+ --bg-primary: #0a0a0a;
10
+ --bg-secondary: #121212;
11
+ --bg-tertiary: #181818;
12
+ --text-primary: #e0e0e0;
13
+ --text-secondary: #888888;
14
+ --text-link: #64b5f6;
15
+ --border-primary: #2a2a2a;
16
+ }
17
+ body {
18
+ font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
19
+ background: var(--bg-primary);
20
+ color: var(--text-primary);
21
+ margin: 0;
22
+ padding: 16px;
23
+ max-width: 900px;
24
+ margin: 0 auto;
25
+ }
26
+ .controls {
27
+ display: flex;
28
+ justify-content: flex-end;
29
+ margin-bottom: 1rem;
30
+ }
31
+ .back-button {
32
+ background: var(--bg-secondary);
33
+ border: 1px solid var(--border-primary);
34
+ padding: 8px 12px;
35
+ border-radius: 4px;
36
+ color: var(--text-secondary);
37
+ cursor: pointer;
38
+ font-size: 0.9rem;
39
+ text-decoration: none;
40
+ display: inline-block;
41
+ }
42
+ .back-button:hover {
43
+ color: var(--text-primary);
44
+ background: var(--bg-tertiary);
45
+ }
46
+ h1 {
47
+ font-size: 1.5em;
48
+ margin: 1rem 0;
49
+ color: var(--text-primary);
50
+ border-bottom: 1px solid var(--border-primary);
51
+ padding-bottom: 0.5rem;
52
+ }
53
+ ul {
54
+ list-style-type: none;
55
+ padding: 0;
56
+ }
57
+ li {
58
+ margin: 0;
59
+ border-bottom: 1px solid var(--border-primary);
60
+ }
61
+ li:last-child {
62
+ border-bottom: none;
63
+ }
64
+ a {
65
+ display: block;
66
+ padding: 0.75rem 0.5rem;
67
+ text-decoration: none;
68
+ color: var(--text-link);
69
+ transition: background 0.2s ease;
70
+ }
71
+ a:hover {
72
+ background: var(--bg-secondary);
73
+ }
74
+ .dir {
75
+ font-weight: 500;
76
+ }
77
+ </style>
78
+ </head>
79
+ <body>
80
+ <div class='controls'>
81
+ <a href='../index.html' class='back-button'>← back</a>
82
+ </div>
83
+ <h1>Index of /rotary</h1>
84
+ <ul>
85
+ <li><a href='impls/index.html' class='dir'>impls/</a></li>
86
+ <li><a href='results/index.html' class='dir'>results/</a></li>
87
+ </ul>
88
+ </body>
89
+ </html>
rotary/results/artifacts/combine/latency.svg ADDED

Git LFS Details

  • SHA256: 0517a426384d0bc9df1932ace04595ea1867cb036e7fbeced61eb044cff2e335
  • Pointer size: 130 Bytes
  • Size of remote file: 31 kB
rotary/results/cells/combine.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # /// script
2
+ # requires-python = ">=3.10"
3
+ # dependencies = [
4
+ # "numpy",
5
+ # "torch==2.8.0",
6
+ # "kernels-benchmark-tools",
7
+ # "matplotlib",
8
+ # ]
9
+ #
10
+ # [tool.uv.sources]
11
+ # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
12
+ # ///
13
+ from kernels_benchmark_tools.core.visuals import generate_combined_results
14
+
15
+ # Map display names to uvnote environment variables
16
+ cache_env_map = {
17
+ "HF Kernels Rotary": "UVNOTE_FILE_HF_KERNELS_ROTARY_BENCHMARK",
18
+ "PyTorch Rotary": "UVNOTE_FILE_TORCH_ROTARY_BENCHMARK",
19
+ }
20
+
21
+ # Generate combined results with visualization
22
+ generate_combined_results(
23
+ cache_env_map=cache_env_map,
24
+ output_filename="rotary.jsonl",
25
+ svg_filename="latency.svg"
26
+ )
rotary/results/combined_results.html ADDED
The diff for this file is too large to render. See raw diff
 
rotary/results/index.html ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <meta charset='UTF-8'>
5
+ <meta name='viewport' content='width=device-width, initial-scale=1.0'>
6
+ <title>Index of /rotary/results</title>
7
+ <style>
8
+ :root {
9
+ --bg-primary: #0a0a0a;
10
+ --bg-secondary: #121212;
11
+ --bg-tertiary: #181818;
12
+ --text-primary: #e0e0e0;
13
+ --text-secondary: #888888;
14
+ --text-link: #64b5f6;
15
+ --border-primary: #2a2a2a;
16
+ }
17
+ body {
18
+ font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
19
+ background: var(--bg-primary);
20
+ color: var(--text-primary);
21
+ margin: 0;
22
+ padding: 16px;
23
+ max-width: 900px;
24
+ margin: 0 auto;
25
+ }
26
+ .controls {
27
+ display: flex;
28
+ justify-content: flex-end;
29
+ margin-bottom: 1rem;
30
+ }
31
+ .back-button {
32
+ background: var(--bg-secondary);
33
+ border: 1px solid var(--border-primary);
34
+ padding: 8px 12px;
35
+ border-radius: 4px;
36
+ color: var(--text-secondary);
37
+ cursor: pointer;
38
+ font-size: 0.9rem;
39
+ text-decoration: none;
40
+ display: inline-block;
41
+ }
42
+ .back-button:hover {
43
+ color: var(--text-primary);
44
+ background: var(--bg-tertiary);
45
+ }
46
+ h1 {
47
+ font-size: 1.5em;
48
+ margin: 1rem 0;
49
+ color: var(--text-primary);
50
+ border-bottom: 1px solid var(--border-primary);
51
+ padding-bottom: 0.5rem;
52
+ }
53
+ ul {
54
+ list-style-type: none;
55
+ padding: 0;
56
+ }
57
+ li {
58
+ margin: 0;
59
+ border-bottom: 1px solid var(--border-primary);
60
+ }
61
+ li:last-child {
62
+ border-bottom: none;
63
+ }
64
+ a {
65
+ display: block;
66
+ padding: 0.75rem 0.5rem;
67
+ text-decoration: none;
68
+ color: var(--text-link);
69
+ transition: background 0.2s ease;
70
+ }
71
+ a:hover {
72
+ background: var(--bg-secondary);
73
+ }
74
+ .dir {
75
+ font-weight: 500;
76
+ }
77
+ </style>
78
+ </head>
79
+ <body>
80
+ <div class='controls'>
81
+ <a href='../index.html' class='back-button'>← back</a>
82
+ </div>
83
+ <h1>Index of /rotary/results</h1>
84
+ <ul>
85
+ <li><a href='combined_results.html' class='file'>combined_results.html</a></li>
86
+ </ul>
87
+ </body>
88
+ </html>